In [81]:
# Atividade Prática – Manipulação de Dados com Pandas

## Análise de Filmes - Dataset IMDb

#### Objetivo: Analisar um conjunto de filmes usando DataFrames do Pandas para responder perguntas sobre desempenho, gêneros, diretores e características das produções.

In [82]:
# Importação das bibliotecas necessárias
import pandas as pd
import numpy as np
import os

## 1. Carregamento e Inspeção

### Objetivos desta seção:
- Carregar o arquivo CSV em um DataFrame
- Explorar a estrutura básica dos dados
- Identificar valores ausentes
- 00:51 Segundos

In [83]:
# 1.1 Carregamento do arquivo CSV
# TODO: Carregue o arquivo 'imdb.csv' em um DataFrame chamado 'df'
df = pd.read_csv("./datasets/imdb.csv")
# TODO: Exiba informações básicas sobre o dataset (número de linhas, colunas)
df.info()
# TODO: Liste os nomes das colunas disponíveis
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33600 entries, 0 to 33599
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     33600 non-null  object 
 1   title                  33600 non-null  object 
 2   link                   33600 non-null  object 
 3   year                   33600 non-null  int64  
 4   duration               33379 non-null  object 
 5   rating_mpa             25624 non-null  object 
 6   rating_imdb            33462 non-null  float64
 7   vote                   33462 non-null  float64
 8   budget                 11815 non-null  float64
 9   gross_world_wide       18222 non-null  float64
 10  gross_us_canada        17571 non-null  float64
 11  gross_opening_weekend  15523 non-null  float64
 12  director               33241 non-null  object 
 13  writer                 32024 non-null  object 
 14  star                   33127 non-null  object 
 15  ge

Index(['id', 'title', 'link', 'year', 'duration', 'rating_mpa', 'rating_imdb',
       'vote', 'budget', 'gross_world_wide', 'gross_us_canada',
       'gross_opening_weekend', 'director', 'writer', 'star', 'genre',
       'country_origin', 'filming_location', 'production_company', 'language',
       'win', 'nomination', 'oscar'],
      dtype='object')

In [84]:
# 1.2 Exibição das primeiras linhas e informações básicas
# TODO: Use df.head() para exibir as primeiras 5 linhas
df.head()
# TODO: Use df.info() para exibir informações gerais sobre o DataFrame
df.info()
# TODO: Use df.describe() para exibir estatísticas descritivas
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33600 entries, 0 to 33599
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     33600 non-null  object 
 1   title                  33600 non-null  object 
 2   link                   33600 non-null  object 
 3   year                   33600 non-null  int64  
 4   duration               33379 non-null  object 
 5   rating_mpa             25624 non-null  object 
 6   rating_imdb            33462 non-null  float64
 7   vote                   33462 non-null  float64
 8   budget                 11815 non-null  float64
 9   gross_world_wide       18222 non-null  float64
 10  gross_us_canada        17571 non-null  float64
 11  gross_opening_weekend  15523 non-null  float64
 12  director               33241 non-null  object 
 13  writer                 32024 non-null  object 
 14  star                   33127 non-null  object 
 15  ge

Unnamed: 0,year,rating_imdb,vote,budget,gross_world_wide,gross_us_canada,gross_opening_weekend,win,nomination,oscar
count,33600.0,33462.0,33462.0,11815.0,18222.0,17571.0,15523.0,33600.0,33600.0,33600.0
mean,1992.393631,6.155158,66372.11,84543200.0,38149610.0,18082360.0,5110082.0,0.0,4.850357,0.102619
std,18.957395,1.14607,665314.7,2866281000.0,121010500.0,48531810.0,14883190.0,0.0,17.719188,0.508687
min,1960.0,1.1,6.0,1.0,1.0,64.0,11.0,0.0,0.0,0.0
25%,1976.0,5.5,517.0,2000000.0,158993.8,86036.5,13996.5,0.0,0.0,0.0
50%,1993.0,6.3,14000.0,9000000.0,2311544.0,909411.0,107536.0,0.0,0.0,0.0
75%,2009.0,7.0,40000.0,27000000.0,20929310.0,14051370.0,3772558.0,0.0,3.0,0.0
max,2024.0,9.6,26000000.0,300000000000.0,2923706000.0,936662200.0,357115000.0,0.0,433.0,11.0


In [85]:
# 1.3 Identificação de valores ausentes
# TODO: Use df.isnull().sum() para contar valores ausentes por coluna
total_nulls = df.isnull().sum()
# TODO: Calcule o percentual de valores ausentes
print(total_nulls / len(df))
# TODO: Crie um DataFrame organizado mostrando colunas com valores ausentes
#df[df.isnull()]

id                       0.000000
title                    0.000000
link                     0.000000
year                     0.000000
duration                 0.006577
rating_mpa               0.237381
rating_imdb              0.004107
vote                     0.004107
budget                   0.648363
gross_world_wide         0.457679
gross_us_canada          0.477054
gross_opening_weekend    0.538006
director                 0.010685
writer                   0.046905
star                     0.014077
genre                    0.011369
country_origin           0.010893
filming_location         0.200268
production_company       0.041012
language                 0.014613
win                      0.000000
nomination               0.000000
oscar                    0.000000
dtype: float64


## 2. Limpeza e Transformação

### Objetivos desta seção:
- Converter colunas para tipos numéricos apropriados
- Criar nova coluna de lucro
- Padronizar nomes das colunas

In [86]:
# 2.1 Conversão de tipos de dados
# TODO: Crie uma função para converter duration (ex: "1h 38m" -> 98 minutos)
def convert_time(time_string:str):
    if pd.isna(time_string):
        return None
    hour = 0
    minute = 0

    if "h" in time_string:
        split_time = time_string.split(" ")[0]
        hours = int(split_time[0].split("h")[0])
        hour = hours * 60
        if "m" in time_string:
            split_time = time_string.split(" ")[1]
            minute = int(split_time[0].split("m")[0])        
        return hour + minute
    elif "m" in time_string:
        split_time = time_string.split(" ")[0]
        minute = int(split_time[0].split("m")[0])
        return minute
    else:
        return None
    



# TODO: Aplique as conversões usando pd.to_numeric() com errors='coerce':
cols_to_change = ["year", "rating_imdb", "vote", "budget", "gross_world_wide", "gross_us_canada", "oscar"]
#pd.to_numeric(df[cols_to_change], errors="coerce")
for col in cols_to_change:
    pd.to_numeric(df[col], errors="coerce")
# - year para numérico
# - duration_minutes usando a função criada
df["duration_minutes"] = df["duration"].apply(convert_time)
# - rating_imdb para numérico
# - vote para numérico
# - budget para numérico
# - gross_world_wide para numérico
# - gross_us_canada para numérico
# - oscar para numérico

# TODO: Exiba uma amostra das conversões realizadas
df

Unnamed: 0,id,title,link,year,duration,rating_mpa,rating_imdb,vote,budget,gross_world_wide,...,star,genre,country_origin,filming_location,production_company,language,win,nomination,oscar,duration_minutes
0,tt0073470,Strip Nude for Your Killer,https://www.imdb.com/title/tt0073470,1975,1h 38m,Not Rated,5.6,33000.0,,,...,"Edwige Fenech, Nino Castelnuovo, Femi Benussi","Erotic Thriller, Giallo, Slasher Horror, Horro...",Italy,"Milan, Lombardia, Italy",FRAL Cinematografica,Italian,0,0,0,63.0
1,tt0072764,Carry on Behind,https://www.imdb.com/title/tt0072764,1975,1h 30m,Not Rated,5.6,31000.0,,,...,"Elke Sommer, Kenneth Williams, Bernard Bresslaw","Parody, Slapstick, Comedy",United Kingdom,"Bad Godesberg Way, Maidenhead, Berkshire, Engl...","The Rank Organisation, Peter Rogers Productions",English,0,0,0,63.0
2,tt0131526,"So Young, So Lovely, So Vicious...",https://www.imdb.com/title/tt0131526,1975,1h 30m,,4.9,487.0,,,...,"Gloria Guida, Dagmar Lassander, Fred Robsahm","Drama, Mystery, Thriller",Italy,"Sardinia, Italy",Domiziana Internazionale Cinematografica,Italian,0,0,0,63.0
3,tt0073335,Mahogany,https://www.imdb.com/title/tt0073335,1975,1h 49m,PG,6.1,23000.0,,,...,"Diana Ross, Billy Dee Williams, Anthony Perkins","Drama, Romance",United States,Marshall Field Co Department Store 111 N Sta...,"Motown Productions, Nikor Productions, Paramou...","English, Italian",0,0,1,64.0
4,tt0073006,Foreplay,https://www.imdb.com/title/tt0073006,1975,1h 15m,R,4.0,185.0,,,...,"Irwin Corey, Pat Paulsen, Deborah Loomis","Parody, Satire, Sketch Comedy, Comedy",United States,,SynFrank Enterprises,English,0,0,0,61.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33595,tt0095428,Kansas,https://www.imdb.com/title/tt0095428,1988,1h 50m,R,5.5,14000.0,,2432536.0,...,"Matt Dillon, Andrew McCarthy, Leslie Hope","Crime, Drama, Romance, Thriller",United States,"Lawrence, Kansas, USA",Trans World Entertainment TWE,English,0,0,0,65.0
33596,tt0094980,The Decline of Western Civilization Part II: T...,https://www.imdb.com/title/tt0094980,1988,1h 33m,R,7.2,45000.0,500000.0,373743.0,...,"Joe Perry, Steven Tyler, Gene Simmons","Documentary, Music",United States,"Los Angeles, California, USA",IRS World Media,English,0,0,0,63.0
33597,tt0093170,Man Behind the Sun,https://www.imdb.com/title/tt0093170,1988,1h 45m,Not Rated,6.1,76000.0,,,...,"Gang Wang, Dai Yao Wu, Runshen Wang","Drama, History, Horror, War",Hong Kong,,SilMetropole Organisation,Mandarin,0,0,0,64.0
33598,tt0170783,Yesterday,https://www.imdb.com/title/tt0170783,1988,1h 24m,,8.9,36000.0,,,...,"Hristo Shopov, Georgi Staykov, Sofiya Kuzeva",Drama,Bulgaria,"Plovdiv, Bulgaria",,"Bulgarian, English",0,0,0,62.0


In [87]:
# 2.2 Criação da coluna profit
# TODO: Crie a coluna 'profit' = gross_world_wide - budget
df["profit"] = df["gross_world_wide"] - df["budget"]
# TODO: Exiba estatísticas da nova coluna (quantos filmes têm dados de lucro, etc.)
df[df["profit"] > 0]

Unnamed: 0,id,title,link,year,duration,rating_mpa,rating_imdb,vote,budget,gross_world_wide,...,genre,country_origin,filming_location,production_company,language,win,nomination,oscar,duration_minutes,profit
12,tt0076138,Heroes,https://www.imdb.com/title/tt0076138,1977,1h 52m,PG,6.0,22000.0,3200000.0,3.350000e+07,...,"Psychological Drama, Comedy, Drama",United States,"Times Square Army Recruiting Station, Manhatta...","David Foster Productions, Universal Pictures","English, Spanish",0,0,0,65.0,3.030000e+07
63,tt1611224,Abraham Lincoln: Vampire Hunter,https://www.imdb.com/title/tt1611224,2012,1h 45m,R,5.9,164000.0,69000000.0,1.164716e+08,...,"Dark Fantasy, Vampire Horror, Action, Fantasy,...","United States, Russia","French Quarter, New Orleans, Louisiana, USA","Abraham Productions, Genre Films, Tim Burton P...",English,0,0,0,64.0,4.747158e+07
64,tt1649419,The Impossible,https://www.imdb.com/title/tt1649419,2012,1h 54m,PG-13,7.5,252000.0,45000000.0,1.980872e+08,...,"Disaster, Tragedy, Drama, History, Thriller","Spain, Thailand, United States","Khao Lak Orchid Beach Resort, Phang Nga, Thailand","Mediaset Espaa, Summit Entertainment, Apaches ...","English, Thai, German, Swedish, Spanish",0,70,1,65.0,1.530872e+08
65,tt1568338,Man on a Ledge,https://www.imdb.com/title/tt1568338,2012,1h 42m,PG-13,6.6,160000.0,42000000.0,4.763603e+07,...,"Dark Comedy, Action, Crime, Thriller",United States,"Roosevelt Hotel 45th Street Madison Avenue, ...","Summit Entertainment, Di Bonaventura Pictures","English, Spanish",0,0,0,64.0,5.636031e+06
68,tt2527336,Star Wars: Episode VIII - The Last Jedi,https://www.imdb.com/title/tt2527336,2017,2h 32m,PG-13,6.9,687000.0,317000000.0,1.334408e+09,...,"Action Epic, Adventure Epic, Fantasy Epic, Sci...",United States,"Salar de Uyuni, Bolivia","Lucasfilm, Ram Bergman Productions, Bad Robot",English,0,99,4,123.0,1.017408e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33545,tt5690360,Slender Man,https://www.imdb.com/title/tt5690360,2018,1h 33m,PG-13,3.3,40000.0,10000000.0,5.173855e+07,...,"Psychological Horror, Supernatural Horror, Tee...",United States,"New England Studios, Devens, Massachusetts, USA","Screen Gems, Mythology Entertainment, Madhouse...",English,0,0,0,63.0,4.173855e+07
33546,tt5670152,Traffik,https://www.imdb.com/title/tt5670152,2018,1h 36m,R,5.9,22000.0,4000000.0,9.515914e+06,...,"Action, Thriller",United States,"1510 Farmham Rd, Ojai, California, USA","Summit Entertainment, Codeblack Films, Hidden ...",English,0,0,0,63.0,5.515914e+06
33560,tt0255094,The Circle,https://www.imdb.com/title/tt0255094,2000,1h 30m,Not Rated,7.4,68000.0,10000.0,7.560350e+05,...,Drama,"Iran, Italy, Switzerland","Tehran, Iran",Direction du Dveloppement et de la Coopration ...,Persian,0,7,0,63.0,7.460350e+05
33567,tt0109444,Clear and Present Danger,https://www.imdb.com/title/tt0109444,1994,2h 21m,PG-13,6.9,111000.0,62000000.0,2.158877e+08,...,"Conspiracy Thriller, Political Thriller, Spy, ...","United States, Canada","Cuernavaca, Morelos, Mexico","Mace Neufeld Productions, Paramount Pictures","English, Spanish",0,11,2,122.0,1.538877e+08


In [88]:
# 2.3 Padronização dos nomes das colunas
# TODO: Padronize os nomes das colunas (minúsculas, sem espaços, substituir por _)
cols = df.columns
for col in cols:
    col = col.lower().replace(" ", "_")
df = df.set_axis(cols, axis='columns')
# TODO: Exiba os nomes das colunas após padronização
df.columns

Index(['id', 'title', 'link', 'year', 'duration', 'rating_mpa', 'rating_imdb',
       'vote', 'budget', 'gross_world_wide', 'gross_us_canada',
       'gross_opening_weekend', 'director', 'writer', 'star', 'genre',
       'country_origin', 'filming_location', 'production_company', 'language',
       'win', 'nomination', 'oscar', 'duration_minutes', 'profit'],
      dtype='object')

## 3. Exploração e Agregação

### Objetivos desta seção:
- Identificar filmes com maiores notas IMDb
- Calcular estatísticas por década
- Encontrar o melhor diretor
- Identificar gênero mais comum

In [89]:
# 3.1 Top 5 filmes com maior nota IMDb
# TODO: Use sort_values() para ordenar por 'rating_imdb' (decrescente)
df.sort_values("rating_imdb", ascending=False)
# TODO: Selecione os primeiros 5 filmes
top5_movies = df.sort_values("rating_imdb", ascending=False).head()
# TODO: Exiba colunas relevantes: title, year, rating_imdb, vote, director, genre
top5_movies[["title", "year", "rating_imdb", "vote", "director", "genre"]]

Unnamed: 0,title,year,rating_imdb,vote,director,genre
11166,Day by Day: The Dynasty,2023,9.6,9.0,Justin Le Pera,Documentary
20986,Love Goes Public,2019,9.6,15.0,Kevin J Flannagan,Documentary
12225,Water Brother,2024,9.5,26.0,"Charles Kinnane, Daniel Kinnane",Documentary
14532,SEVENTEEN TOUR 'FOLLOW' to JAPAN: LIVE VIEWING,2023,9.5,37.0,,"Documentary, Music"
27302,The Cowboy and the Queen,2023,9.4,71.0,Andrea Blaugrund Nevins,"Biography, Documentary"


In [90]:
# 3.2 Média de duração por década
# TODO: Crie uma coluna 'decade' usando (year // 10) * 10
df["decade"] = (df["year"] // 10) * 10
# TODO: Use groupby('decade') para calcular a média de 'duration_minutes'
df_grouped = df.groupby("decade")["duration_minutes"].mean()
# TODO: Exiba os resultados
print(df_grouped)
# TODO: EXTRA - Calcule outras estatísticas por década:
# - Média de rating_imdb por década
# - Média de votos por década  
# - Quantidade de filmes por década
df_grouped = df.groupby("decade").agg(
    mean_rating_imdb = ("rating_imdb", "mean"),
    mean_votes = ("vote", "mean"),
    total_movies = ("title", "count"),
)
df_grouped

decade
1960    71.321133
1970    70.427601
1980    71.009598
1990    73.283615
2000    73.472857
2010    76.351751
2020    76.493997
Name: duration_minutes, dtype: float64


Unnamed: 0_level_0,mean_rating_imdb,mean_votes,total_movies
decade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,6.179788,12085.734954,5149
1970,5.935427,27782.298211,5050
1980,5.913377,36688.839197,5050
1990,6.047076,107496.14157,5001
2000,6.335423,110709.429266,5349
2010,6.457151,114801.467707,5151
2020,6.234367,41643.711315,2850


In [91]:
# 3.3 Diretor com maior média de nota IMDb (mínimo 3 filmes)
# TODO: Use groupby('director') para agrupar por diretor
df_grouped = df.groupby("director")
# TODO: Calcule estatísticas: média de rating_imdb, contagem de filmes, lista de títulos
df_grouped = df.groupby("director").agg(
    mean_rating_imdb = ("rating_imdb", "mean"),
    title_count = ("title", "count"),
    movies = ("title", list)
)

# TODO: Filtre apenas diretores com 3 ou mais filmes
df_grouped = df_grouped[df_grouped["title_count"] >= 3]
# TODO: Ordene por nota média (decrescente)
df_grouped = df_grouped.sort_values("mean_rating_imdb", ascending=False)
# TODO: Exiba o top 10 e identifique o melhor diretor
df_grouped.head(10)

Unnamed: 0_level_0,mean_rating_imdb,title_count,movies
director,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jim Brown,8.633333,3,"[The Weavers: Wasn't That a Time, Isn't This a..."
Sachin Pilgaonkar,8.533333,3,"[Atmavishwas, Ashi Hi Banwa Banwi, Aayatya Gha..."
Upendra,8.514286,7,"[Shhh!, Operation Antha, Om, Swasthik, UI, Upe..."
Slobodan Sijan,8.5,3,"[Maratonci trce pocasni krug, Strangler vs. St..."
Kamalakara Kameshwara Rao,8.433333,3,"[Pandava Vanavasam, Narthanasala, Gundamma Katha]"
Mircea Dragan,8.4,5,"[Columna, The Miscellaneous Brigade on the Wat..."
Moustapha Akkad,8.4,3,"[The Message, The Message, Lion of the Desert]"
Sam Wrench,8.4,3,"[Billie Eilish Live at the O2, Laufey's A Nigh..."
Bharathan,8.3,4,"[Vaishali, Thevar Magan, Thazhvaram, Amaram]"
Srdjan Dragojevic,8.3,3,"[The Wounds, We Are Not Angels, Pretty Village..."


In [92]:
# 3.4 Gênero mais recorrente no dataset
# TODO: Separar gêneros combinados (ex: "Drama, Romance" -> ["Drama", "Romance"])
df["genre"] = df["genre"].str.split(", ")
# TODO: Criar uma lista com todos os gêneros individuais
genres = df["genre"].explode()

# TODO: Use pd.Series().value_counts() para contar frequências
print(genres.value_counts())
# TODO: Exiba o top 10 gêneros com percentuais
top_listed = round((genres.value_counts(normalize=True).head(10)) * 100, 2)

# TODO: Identifique o gênero mais recorrente
top_listed.head(1)

genre
Drama         18343
Comedy        11021
Thriller       6873
Romance        6451
Action         5472
              ...  
Talk Show         1
Reality TV        1
Game Show         1
Josei             1
Soap Opera        1
Name: count, Length: 191, dtype: int64


genre
Drama    17.64
Name: proportion, dtype: float64

## 4. Joins e Relações

### Objetivos desta seção:
- Criar DataFrame separado com informações de Oscar
- Realizar merge entre DataFrames
- Filtrar filmes ganhadores de Oscar

In [93]:
# 4.1 Carregue o arquivo 'oscars.csv' em um DataFrame chamado 'oscars'.
# Certifique-se de utilizar a codificação correta ao ler o arquivo.
oscars = pd.read_csv("./datasets/oscars.csv", delimiter="\t")
oscars.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation
0,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,The Noose|The Patent Leather Kid,tt0019217|tt0018253,Richard Barthelmess,Richard Barthelmess,nm0001932,,Nickie Elkins|The Patent Leather Kid,,
1,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,The Last Command|The Way of All Flesh,tt0019071|tt0019553,Emil Jannings,Emil Jannings,nm0417837,True,General Dolgorucki [Grand Duke Sergius Alexand...,,
2,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,A Ship Comes In,tt0018389,Louise Dresser,Louise Dresser,nm0237571,,Mrs. Pleznik,,
3,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,7th Heaven|Street Angel|Sunrise,tt0018379|tt0019429|tt0018455,Janet Gaynor,Janet Gaynor,nm0310980,True,Diane|Angela|The Wife,,
4,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,Sadie Thompson,tt0019344,Gloria Swanson,Gloria Swanson,nm0841797,,Sadie Thompson,,


In [94]:
oscars = oscars.dropna(subset="FilmId")
# 4.2 Realize o merge entre o DataFrame principal (df) e o DataFrame 'oscars',
# utilizando as colunas apropriadas para identificar os filmes.
id_test = set(df["id"])
test_set = set(oscars["FilmId"])

matches = []
for oscar_id in test_set:
    for imdb_id in id_test:
        if imdb_id in oscar_id:
            matches.append(imdb_id)

#FAILED


In [95]:
# 4.3 Identifique o diretor com o maior número de prêmios de BEST PICTURE,
# considerando apenas os filmes vencedores dessa categoria.
oscars_directors = oscars
oscars_directors = oscars_directors[oscars_directors["Category"] == "BEST PICTURE"]
oscars_directors = oscars_directors[~oscars_directors["Winner"].isna()]
oscars_directors["Nominees"] = oscars_directors["Nominees"].str.split("|")
Nominees = oscars_directors["Nominees"].explode()
Nominees.value_counts()

Nominees
Saul Zaentz          3
Clint Eastwood       2
Albert S. Ruddy      2
Jeremy Kleiner       2
Dede Gardner         2
                    ..
Kevin Costner        1
Jim Wilson           1
Lili Fini Zanuck     1
Richard D. Zanuck    1
Sean Baker           1
Name: count, Length: 127, dtype: int64

## 5. Pivot e Reshaping

### Objetivos desta seção:
- Criar pivot table com dados de rating por país e década
- Usar melt para reorganizar dados de bilheteria

In [96]:
# 5.1 Pivot table: média de rating_imdb por país e década
# TODO: Use pivot_table() com:
# - values='rating_imdb'
# - index='country_origin' 
# - columns='decade'
# - aggfunc='mean'
# - fill_value=0
df["decade"] = (df["year"] // 10) * 10
pivoted_imdb = pd.pivot_table(df, values="rating_imdb", index="country_origin",
                              columns="decade", aggfunc="mean", fill_value=0)
# TODO: Exiba as dimensões e primeiras linhas da pivot table
pivoted_imdb.head()
pivoted_imdb["country"] = pivoted_imdb.index
# TODO: EXTRA: Identifique os países com melhores médias por década
pivoted_imdb = pivoted_imdb.melt(id_vars="country", value_vars=pivoted_imdb.columns) #(Yugoslavia?????)
pivoted_imdb.groupby("decade").agg(
    country_list = ("country", list),
    max_value = ("value", "max")
)

Unnamed: 0_level_0,country_list,max_value
decade,Unnamed: 1_level_1,Unnamed: 2_level_1
1960,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",9.1
1970,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",8.6
1980,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",8.9
1990,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",9.2
2000,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",8.9
2010,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",8.8
2020,"[Afghanistan, Ireland, Japan, Netherlands, Ira...",8.9


In [97]:
# 5.2 Melt: reorganizar colunas de bilheteria em formato longo
# TODO: Use melt() para reorganizar colunas de bilheteria:
# - id_vars=['id', 'title', 'year']
# - value_vars=['gross_us_canada', 'gross_world_wide'] 
# - var_name='gross_type'
# - value_name='gross_value'
df_melted = pd.melt(df, id_vars=['id', 'title', 'year'], value_vars=['gross_us_canada', 'gross_world_wide'],
                   var_name='gross_type', value_name='gross_value')
# TODO: Remova valores nulos da coluna gross_value
df_melted = df_melted.dropna(subset="gross_value")
# TODO: Compare shapes antes e depois do melt
print(df_melted.shape)
print(df.shape)
# TODO: Calcule estatísticas por tipo de bilheteria (count, mean, std)

df_melted.groupby("gross_type").agg(
    amount_of_titles = ("title", "count"),
    mean_value = ("gross_value", "mean"),
    std_value = ("gross_value", "std")
) #Not very clear what should be done

(35793, 5)
(33600, 26)


Unnamed: 0_level_0,amount_of_titles,mean_value,std_value
gross_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gross_us_canada,17571,18082360.0,48531810.0
gross_world_wide,18222,38149610.0,121010500.0


## 6. Exportação

### Objetivo desta seção:
- Salvar tabela final processada em formato CSV

In [98]:
# 6.1 Exportação da tabela pivot
# TODO: Gere a tabela pivot mostrando a média de 'rating_imdb' por país e por década
pivoted_imdb = pd.pivot_table(df, values="rating_imdb", index="country_origin",
                              columns="decade", aggfunc="mean", fill_value=0)

# TODO: Verifique se o DataFrame resultante não está vazio
print(len(pivoted_imdb))

# TODO: Exiba algumas linhas da tabela pivot para conferência
print(pivoted_imdb.head())

# TODO: Exporte a tabela pivot para CSV com nome 'rating_pivot_pais_decada.csv'
# Dica: Use to_csv() com index=False e encoding='utf-8'
save_df_path = os.getcwd() + "/rating_pivot_pais_decada_gabriel.csv"
pivoted_imdb.to_csv(save_df_path, index=False, encoding="utf-8")

# TODO: Exiba mensagem de confirmação e amostra dos dados exportados
print(f"\n")

print(f"Arquivo salvo em {save_df_path}")
print(f"\n")
print(pivoted_imdb.head())

2937
decade                                          1960  1970  1980  1990  2000  \
country_origin                                                                 
Afghanistan, Ireland, Japan, Netherlands, Iran   0.0   0.0   0.0   0.0   7.3   
Algeria                                          0.0   7.3   0.0   0.0   0.0   
Algeria, France                                  0.0   0.0   0.0   0.0   5.8   
Algeria, France, Morocco, Belgium                0.0   0.0   0.0   0.0   7.0   
Angola, France                                   0.0   7.0   0.0   0.0   0.0   

decade                                          2010  2020  
country_origin                                              
Afghanistan, Ireland, Japan, Netherlands, Iran   0.0   0.0  
Algeria                                          0.0   0.0  
Algeria, France                                  0.0   0.0  
Algeria, France, Morocco, Belgium                0.0   0.0  
Angola, France                                   0.0   0.0  


Arqui