# Desafio de Ciência de Dados - IMDb 

In [79]:
# Importando as bibliotecas necessárias 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

### 1 - Pré-processamento e limpeza dos dados

In [80]:
# Load the dataset from CSV and preview its first five rows
df = pd.read_csv("../data/desafio_indicium_imdb.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
1,2,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
2,3,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
3,4,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000
4,5,The Lord of the Rings: The Return of the King,2003,U,201 min,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,Viggo Mortensen,Ian McKellen,Orlando Bloom,1642758,377845905


In [81]:
# Show the column names
df.columns

Index(['Unnamed: 0', 'Series_Title', 'Released_Year', 'Certificate', 'Runtime',
       'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director', 'Star1',
       'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [82]:
# Show the dataframe's dimensions as (rows, columns)
df.shape

(999, 16)

In [83]:
# Show the dataframe summary: per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     999 non-null    int64  
 1   Series_Title   999 non-null    object 
 2   Released_Year  999 non-null    object 
 3   Certificate    898 non-null    object 
 4   Runtime        999 non-null    object 
 5   Genre          999 non-null    object 
 6   IMDB_Rating    999 non-null    float64
 7   Overview       999 non-null    object 
 8   Meta_score     842 non-null    float64
 9   Director       999 non-null    object 
 10  Star1          999 non-null    object 
 11  Star2          999 non-null    object 
 12  Star3          999 non-null    object 
 13  Star4          999 non-null    object 
 14  No_of_Votes    999 non-null    int64  
 15  Gross          830 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 125.0+ KB


In [84]:
# Convert the "Released_Year", "Runtime" and "Gross" columns to numeric dtypes.
#
# "Runtime" and "Gross" are cast to str before the .str accessor is used so this
# cell can be re-run: once the columns have been converted to numbers, calling
# .str on them directly raises AttributeError.

# "Runtime" holds text such as "175 min": keep only the first run of digits.
# expand=False makes extract return a Series instead of a one-column DataFrame.
df["Runtime"] = df["Runtime"].astype(str).str.extract(r'(\d+)', expand=False)
# Strip commas from "Gross" so that only digits are left to parse.
df["Gross"] = df["Gross"].astype(str).str.replace(",", "", regex=False)

# Parse to numbers; anything that cannot be parsed becomes NaN
df["Released_Year"] = pd.to_numeric(df["Released_Year"], errors="coerce")
df["Runtime"] = pd.to_numeric(df["Runtime"], errors="coerce")
df["Gross"] = pd.to_numeric(df["Gross"], errors="coerce")

# Compute and print the medians of "Meta_score" and "Gross"
# (median() skips NaN, so missing values do not affect the result)
mediana_meta_score = df["Meta_score"].median()
print(f"Mediana Meta Score: {mediana_meta_score}")
mediana_gross = df["Gross"].median()
print(f"Mediana faturamento: {mediana_gross}")

Mediana Meta Score: 79.0
Mediana faturamento: 23457439.5


In [85]:
# Impute the missing values of "Meta_score" and "Gross".
# Both columns have many nulls, so they are filled instead of dropping the affected rows.

# Median of each column, used as that column's fill value
mediana_meta_score = df["Meta_score"].median()
mediana_gross = df["Gross"].median()
print(f"Mediana Meta Score: {mediana_meta_score}")
print(f"Mediana faturamento: {mediana_gross}")

# Fill the nulls of each column with its own median (other columns are untouched)
df = df.fillna({"Meta_score": mediana_meta_score, "Gross": mediana_gross})

Mediana Meta Score: 79.0
Mediana faturamento: 23457439.5


In [89]:
# Finish the missing-value handling for "Certificate" and "Released_Year":
#   1. drop the rows whose "Released_Year" is null;
#   2. fill the missing "Certificate" values with a new "Not Rated" category,
#      since the column is categorical text (object dtype);
#   3. cast "Released_Year" to int, which is possible once its nulls are gone.
# The index is not reset, so the surviving rows keep their original labels.
df = (
    df
    .dropna(subset=["Released_Year"])
    .assign(
        Certificate=lambda frame: frame["Certificate"].fillna("Not Rated"),
        Released_Year=lambda frame: frame["Released_Year"].astype(int),
    )
)

In [90]:
# Show the dataframe summary again, now that the type conversions and missing-value handling have been applied
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 998 entries, 0 to 998
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     998 non-null    int64  
 1   Series_Title   998 non-null    object 
 2   Released_Year  998 non-null    int64  
 3   Certificate    998 non-null    object 
 4   Runtime        998 non-null    int64  
 5   Genre          998 non-null    object 
 6   IMDB_Rating    998 non-null    float64
 7   Overview       998 non-null    object 
 8   Meta_score     998 non-null    float64
 9   Director       998 non-null    object 
 10  Star1          998 non-null    object 
 11  Star2          998 non-null    object 
 12  Star3          998 non-null    object 
 13  Star4          998 non-null    object 
 14  No_of_Votes    998 non-null    int64  
 15  Gross          998 non-null    float64
dtypes: float64(3), int64(4), object(9)
memory usage: 132.5+ KB


In [91]:
# Show descriptive statistics for the dataframe's numeric columns
df.describe()

Unnamed: 0.1,Unnamed: 0,Released_Year,Runtime,IMDB_Rating,Meta_score,No_of_Votes,Gross
count,998.0,998.0,998.0,998.0,998.0,998.0,998.0
mean,499.533066,1991.214429,122.854709,7.948297,78.132265,271623.9,60419850.0
std,288.297542,23.308539,28.110078,0.272203,11.379412,321073.5,101456800.0
min,1.0,1920.0,45.0,7.6,28.0,25088.0,1305.0
25%,250.25,1976.0,103.0,7.7,72.0,55416.75,5010758.0
50%,499.5,1999.0,119.0,7.9,79.0,138168.5,23457440.0
75%,748.75,2009.0,136.75,8.1,85.75,373506.2,61446630.0
max,999.0,2020.0,321.0,9.2,100.0,2303232.0,936662200.0
