In [None]:
import ast
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from plotnine import *

## **Unindo os datasets**

In [None]:
# The links table holds the IMDb and TMDB ids of each movie.
links_df = pd.read_csv('datasets/movie-dataset/links.csv')

bechdel_df = (
    pd.read_csv('datasets/Bechdel_detailed.csv')
    # Drop the columns that are not used.
    .drop(columns=['Unnamed: 0', 'submitterid', 'date', 'visible', 'id'])
    # Rename rating so it does not clash with the rating of the other datasets.
    .rename(columns={'rating': 'bt_score'})
    # Inner-join the two frames on the IMDb id, then discard the duplicated
    # key column and the movieId column.
    .merge(links_df, left_on='imdbid', right_on='imdbId', how='inner')
    .drop(columns=['imdbId', 'movieId'])
)

bechdel_df.head()

In [None]:
# Order by Bechdel score (highest first), breaking ties by year (oldest
# first), and renumber the rows from zero.
bechdel_df = (
    bechdel_df
    .sort_values(by=['bt_score', 'year'], ascending=[False, True])
    .reset_index(drop=True)
)
bechdel_df.head()

## **Tratamento dos dados**

### **Retirada de nulos**

In [None]:
# Summary statistics of the merged frame, before any rows are removed.
bechdel_df.describe()

In [None]:
# Keep only the rows that have no missing value in any column; .copy() makes
# the result an independent frame that later cells can modify.
non_null_data = bechdel_df.dropna().copy()

In [None]:
# Summary statistics of the frame after the incomplete rows were dropped.
non_null_data.describe()

### **Transformando tipos de dados**

In [None]:
# Cast the year, score, flag and id columns to integers, one column at a time.
for column in ('year', 'bt_score', 'dubious', 'imdbid', 'tmdbId'):
    non_null_data[column] = non_null_data[column].astype('int')

### **API**

#### **Raspar dados dos filmes**

In [None]:
# Pre-create every column that the TMDB request loop fills in, so that movies
# whose request fails are left with missing values in all of them.
# 'budget' is written by that loop as well; it is listed last, which is the
# position it gets when the loop creates it implicitly.
tmdb_columns = [
    'genres',
    'popularity',
    'production_companies',
    'production_countries',
    'release_date',
    'revenue',
    'title',
    'vote_average',
    'vote_count',
    'cast',
    'crew',
    'budget',
]

for column in tmdb_columns:
    non_null_data[column] = pd.NA

In [None]:
# NOTE(review): the literal below is a credential committed to the notebook.
# It is kept only as a fallback so the cell keeps running as before; prefer
# setting the TMDB_API_KEY environment variable, then rotate and remove it.
api_key = os.environ.get('TMDB_API_KEY', 'b12a91c6b0aa37015f0b966b61efd74d')

TMDB_MOVIE_URL = 'https://api.themoviedb.org/3/movie'
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled request blocks forever

# Values stored as returned by the API.
PLAIN_FIELDS = ('budget', 'popularity', 'release_date', 'revenue', 'title',
                'vote_average', 'vote_count')
# Nested values stored as their str() form, which later cells parse back.
SERIALISED_FIELDS = ('genres', 'production_companies', 'production_countries',
                     'cast', 'crew')

# A single session reuses the connection across all requests.
with requests.Session() as session:
    for index, row in non_null_data.iterrows():
        tmdb_id = row['tmdbId']
        try:
            info = session.get(f'{TMDB_MOVIE_URL}/{tmdb_id}',
                               params={'api_key': api_key},
                               timeout=REQUEST_TIMEOUT)
            info.raise_for_status()
            movie_credits = session.get(f'{TMDB_MOVIE_URL}/{tmdb_id}/credits',
                                        params={'api_key': api_key},
                                        timeout=REQUEST_TIMEOUT)
            movie_credits.raise_for_status()

            details = info.json()
            credits_payload = movie_credits.json()  # parsed once, used twice
            details['cast'] = credits_payload['cast']
            details['crew'] = credits_payload['crew']

            # Build the full record before writing anything, so a missing key
            # cannot leave the row half-filled.
            record = {field: details[field] for field in PLAIN_FIELDS}
            record.update({field: str(details[field]) for field in SERIALISED_FIELDS})
        except (requests.RequestException, KeyError, ValueError) as error:
            # Best effort: report the movie that could not be fetched and go on.
            # Only the exception type is printed because the message of an HTTP
            # error contains the request URL, and therefore the API key.
            print(tmdb_id, type(error).__name__)
            continue

        for column, value in record.items():
            non_null_data.loc[index, column] = value

non_null_data.to_csv('api_data.csv')

In [None]:
# Reload the saved API results, using the first CSV column as the index, and
# keep only the rows with no missing value in any column.
data = pd.read_csv('api_data.csv', index_col=0).dropna().copy()

data.head()

#### **Tratamento das colunas**

In [None]:
# Show the column names, non-null counts and dtypes of the reloaded data.
data.info()

In [None]:
def _genre_names(value):
    """Return the list of genre names stored in one cell of the 'genres' column.

    The cell holds the str() form of a list of dicts with a 'name' key. It is
    parsed with ast.literal_eval instead of eval: the text comes from a CSV
    built from API responses and must not be executed as code.
    A value that is not a string (already parsed) is returned unchanged, so
    re-running the cell does not fail.
    """
    if not isinstance(value, str):
        return value
    return [genre['name'] for genre in ast.literal_eval(value)]


data['genres'] = data['genres'].apply(_genre_names)

In [None]:
def _company_names(value):
    """Return the list of company names stored in one 'production_companies' cell.

    The cell holds the str() form of a list of dicts with a 'name' key. It is
    parsed with ast.literal_eval instead of eval: the text comes from a CSV
    built from API responses and must not be executed as code.
    A value that is not a string (already parsed) is returned unchanged, so
    re-running the cell does not fail.
    """
    if not isinstance(value, str):
        return value
    return [company['name'] for company in ast.literal_eval(value)]


data['production_companies'] = data['production_companies'].apply(_company_names)

In [None]:
def _country_names(value):
    """Return the list of country names stored in one 'production_countries' cell.

    The cell holds the str() form of a list of dicts with a 'name' key. It is
    parsed with ast.literal_eval instead of eval: the text comes from a CSV
    built from API responses and must not be executed as code.
    A value that is not a string (already parsed) is returned unchanged, so
    re-running the cell does not fail.
    """
    if not isinstance(value, str):
        return value
    return [country['name'] for country in ast.literal_eval(value)]


data['production_countries'] = data['production_countries'].apply(_country_names)

In [None]:
# Rewrite every release date from 'YYYY-MM-DD' to 'DD/MM/YYYY' (still text).
data['release_date'] = data['release_date'].map(
    lambda raw_date: datetime.strptime(raw_date, '%Y-%m-%d').strftime('%d/%m/%Y')
)

In [None]:
def _cast_genders(value):
    """Return the 'gender' code of every cast member stored in one 'cast' cell.

    The cell holds the str() form of a list of dicts. It is parsed with
    ast.literal_eval instead of eval: the text comes from a CSV built from API
    responses and must not be executed as code.
    """
    return [cast_member['gender'] for cast_member in ast.literal_eval(value)]


# One list of gender codes per movie, in the order the cast is stored.
data['cast_gender'] = data['cast'].apply(_cast_genders)

In [None]:
def _crew_genders(value):
    """Return the 'gender' code of every crew member stored in one 'crew' cell.

    The cell holds the str() form of a list of dicts. It is parsed with
    ast.literal_eval instead of eval: the text comes from a CSV built from API
    responses and must not be executed as code.
    """
    return [crew_member['gender'] for crew_member in ast.literal_eval(value)]


# One list of gender codes per movie, in the order the crew is stored.
data['crew_gender'] = data['crew'].apply(_crew_genders)

In [None]:
# GENDER INFO
# 0: -
# 1: Female
# 2: Male
# 3: Non-binary


def _female_percentage(genders):
    """Return the percentage of entries equal to 1 in a list of gender codes.

    An empty list yields pd.NA, so the movie is removed by the dropna below.
    """
    if len(genders) == 0:
        return pd.NA
    return 100*(genders.count(1)/len(genders))


data['cast_female_representation'] = data['cast_gender'].apply(_female_percentage)
data['crew_female_representation'] = data['crew_gender'].apply(_female_percentage)

# Drop the movies left without a percentage, then store both as floats.
data = data.dropna().copy()
for column in ('cast_female_representation', 'crew_female_representation'):
    data[column] = data[column].astype('float64')

In [None]:
# Summary statistics of the data after the column treatment.
data.describe()

In [None]:
# Preview the first rows of the data after the column treatment.
data.head()

In [None]:
# Write the treated frame, index included, to cleaned_data.csv.
data.to_csv('cleaned_data.csv')

In [None]:
def plot(axs, data, title):
    """Draw a Spearman correlation heatmap of the numeric columns of a frame.

    Parameters
    ----------
    axs : matplotlib Axes
        Axes the heatmap is drawn on.
    data : pandas.DataFrame
        Frame to correlate; columns that are neither numeric nor boolean are
        ignored.
    title : str
        Title placed above the heatmap.
    """
    # Select the numeric/boolean columns explicitly: on pandas >= 2.0
    # DataFrame.corr no longer drops text or list columns and raises instead.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    corr = numeric_data.corr(method='spearman')
    cmap = sns.diverging_palette(255, 255, sep=1, as_cmap=True)
    # Mask the upper triangle and the diagonal so each pair is shown once.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    graph = sns.heatmap(corr, annot=True, vmin=-1, vmax=1, fmt='.1%', cmap=cmap, mask=mask, ax=axs)
    graph.set_title(title, pad=10)


fig, axs = plt.subplots(figsize=(10, 8), constrained_layout=True)

# The id columns are identifiers, not measurements, so they are left out.
plot(axs, data.drop(['imdbid', 'tmdbId'], axis=1), 'Mapa de correlação entre variáveis')
plt.show()