# Importando Bibliotecas

In [3]:
import gc
import os, requests
import pandas as pd
from numpy import NaN
from dotenv import load_dotenv
from tmdbv3api import TMDb, Movie

# Funções

In [2]:
def get_id_movie(title_movie):
    try:
        res = movie.search(title_movie)
        return int(res[0].id)
    except:
        return NaN

# Extração

In [4]:
tmdb = TMDb()
movie = Movie()

In [6]:
load_dotenv('../content/keys.env')
tmdb.api_key = os.getenv('api_key')

In [5]:
urls = [
    'https://en.wikipedia.org/wiki/List_of_American_films_of_2018', 
    'https://en.wikipedia.org/wiki/List_of_American_films_of_2019', 
    'https://en.wikipedia.org/wiki/List_of_American_films_of_2020',
    'https://en.wikipedia.org/wiki/List_of_American_films_of_2021',
    'https://en.wikipedia.org/wiki/List_of_American_films_of_2022',
    'https://en.wikipedia.org/wiki/List_of_American_films_of_2023'
    ]

df = pd.DataFrame()
for i in range(4):
    for j in range(2,7):
        temp_df = pd.read_html(str(urls[i]), header = 0)[j]
        df = pd.concat([temp_df, df], ignore_index=True)

df = df.filter(['Title'], axis = 1)

In [6]:
df.dropna(inplace=True)
df

Unnamed: 0,Title
1,Venom: Let There Be Carnage
2,The Many Saints of Newark
3,The Addams Family 2
4,Bingo Hell
5,Black as Night
...,...
1155,Ready Player One
1156,Tyler Perry's Acrimony
1157,God's Not Dead: A Light in Darkness
1158,Gemini


In [7]:
df.replace(regex={r'The SpongeBob Movie: Sponge on the Run.*':'The SpongeBob Movie: Sponge on the Run'}, inplace= True)
df.replace(regex={r'Finding(.){7}':'Finding Ohana'}, inplace= True)

df.rename(columns={'Title':'original_title'}, inplace= True)

df

Unnamed: 0,original_title
1,Venom: Let There Be Carnage
2,The Many Saints of Newark
3,The Addams Family 2
4,Bingo Hell
5,Black as Night
...,...
1155,Ready Player One
1156,Tyler Perry's Acrimony
1157,God's Not Dead: A Light in Darkness
1158,Gemini


# Enriquecimento

In [8]:
df['id'] = df['original_title'].map(lambda x: get_id_movie(str(x)))
df.dropna(subset='id', inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,original_title,id
0,Venom: Let There Be Carnage,580489.0
1,The Many Saints of Newark,524369.0
2,The Addams Family 2,639721.0
3,Bingo Hell,802226.0
4,Black as Night,767504.0
...,...,...
1148,Ready Player One,333339.0
1149,Tyler Perry's Acrimony,464502.0
1150,God's Not Dead: A Light in Darkness,454286.0
1151,Gemini,412302.0


In [9]:
ids = df['id'].astype(int).tolist()

In [10]:
features_df = pd.DataFrame()

for movie_id in ids:
    response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
    data_json = response.json()
    try:
        temp_df = pd.json_normalize(data_json)
        features_df = pd.concat([features_df, temp_df], ignore_index=True)
        del temp_df
        gc.collect()
    except:
        features_df = features_df

features_df

Unnamed: 0,adult,backdrop_path,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,belongs_to_collection
0,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,580489,tt7097896,en,Venom: Let There Be Carnage,After finding a host body in investigative rep...,...,,Venom: Let There Be Carnage,False,6.880,8642,558216.0,Venom Collection,/4bXIKqdZIjR8wKgZaGDaLhLj4yF.jpg,/rhLspFB1B8ZCkWEHFYmc3NKagzq.jpg,
1,False,/hrzoy8vvUrxQixOM11pwW9AX7Bu.jpg,0,"[{'id': 80, 'name': 'Crime'}]",https://www.themanysaintsofnewarkmovie.com,524369,tt8110232,en,The Many Saints of Newark,Young Anthony Soprano is growing up in one of ...,...,Who made Tony Soprano?,The Many Saints of Newark,False,6.475,508,,,,,
2,False,/9e6wp707XMouPG939o2fHunXXJR.jpg,0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",https://mgm.com/movies/the-addams-family-2,639721,tt11125620,en,The Addams Family 2,The Addams get tangled up in more wacky advent...,...,Unhappy to see you again.,The Addams Family 2,False,7.073,1079,750822.0,The Addams Family (Animated) Collection,/392omPB0NlpsxEqJkagjdUJeuhb.jpg,/3VfaWlf8wJewGsKSTYhJsY0pnNx.jpg,
3,False,/4lmDTjpZNf5S1DlFzIbuQ7dus7p.jpg,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",,802226,tt13372992,en,Bingo Hell,In the Barrio of Oak Springs live a strong and...,...,Are you feeling lucky?,Bingo Hell,False,4.895,95,,,,,
4,False,/8y556k6ihZeYv2OXcFHTdMJKp1m.jpg,0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,767504,tt13372884,en,Black as Night,A teenage girl with self-esteem issues finds c...,...,Find your own way to slay.,Black as Night,False,5.621,112,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1148,False,/dbrLfmFNFEJWv8rLnjpgCKlXWSy.jpg,175000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://readyplayeronemovie.com,333339,tt1677720,en,Ready Player One,When the creator of a popular video game syste...,...,A better reality awaits.,Ready Player One,False,7.615,13879,,,,,
1149,False,/h9USMzm43BZyUXc45OZAPlPBUjZ.jpg,20000000,"[{'id': 53, 'name': 'Thriller'}]",https://www.acrimony.movie/,464502,tt6063050,en,Acrimony,A faithful wife takes action when it becomes c...,...,Hell Hath No Fury,Acrimony,False,6.800,346,,,,,
1150,False,/gGBE24xD5NdMqvkJJEzP04jtFG9.jpg,0,"[{'id': 18, 'name': 'Drama'}]",,454286,tt6652708,en,God's Not Dead: A Light in Darkness,Pastor Dave responds to the unimaginable trag...,...,,God's Not Dead: A Light in Darkness,False,6.872,179,409343.0,God's Not Dead Collection,/bR20AQt3ycgQ6frHYF5tkomwbFp.jpg,/rX1S0AZpvLNFURBqMFXrXIj4Vmm.jpg,
1151,False,/eReiiUBlDgF2dAtKkpxzzK6H8t2.jpg,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",https://www.geminithefilm.com/,412302,tt5795086,en,Gemini,A heinous crime tests the complex relationship...,...,It’s all there if you know how to look.,Gemini,False,5.500,118,,,,,


In [11]:
credits_df = pd.DataFrame()

for movie_id in ids:
    response = requests.get('https://api.themoviedb.org/3/movie/{}/credits?api_key={}'.format(movie_id, tmdb.api_key))
    data_json = response.json()
    try:
        temp_df = pd.json_normalize(data_json)
        credits_df = pd.concat([credits_df, temp_df], ignore_index=True)
        del temp_df
        gc.collect()
    except:
        credits_df = credits_df

credits_df

Unnamed: 0,id,cast,crew
0,580489,"[{'adult': False, 'gender': 2, 'id': 2524, 'kn...","[{'adult': False, 'gender': 2, 'id': 149, 'kno..."
1,524369,"[{'adult': False, 'gender': 2, 'id': 4941, 'kn...","[{'adult': False, 'gender': 2, 'id': 2100, 'kn..."
2,639721,"[{'adult': False, 'gender': 2, 'id': 25072, 'k...","[{'adult': False, 'gender': 2, 'id': 5359, 'kn..."
3,802226,"[{'adult': False, 'gender': 1, 'id': 270, 'kno...","[{'adult': False, 'gender': 2, 'id': 84348, 'k..."
4,767504,"[{'adult': False, 'gender': 1, 'id': 1317152, ...","[{'adult': False, 'gender': 0, 'id': 24512, 'k..."
...,...,...,...
1148,333339,"[{'adult': False, 'gender': 2, 'id': 1034681, ...","[{'adult': False, 'gender': 2, 'id': 37, 'know..."
1149,464502,"[{'adult': False, 'gender': 1, 'id': 40036, 'k...","[{'adult': False, 'gender': 2, 'id': 49911, 'k..."
1150,454286,"[{'adult': False, 'gender': 2, 'id': 116431, '...","[{'adult': False, 'gender': 2, 'id': 24968, 'k..."
1151,412302,"[{'adult': False, 'gender': 1, 'id': 1345418, ...","[{'adult': False, 'gender': 0, 'id': 222365, '..."


In [12]:
keywords_df = pd.DataFrame()

for movie_id in ids:
    response = requests.get('https://api.themoviedb.org/3/movie/{}/keywords?api_key={}'.format(movie_id, tmdb.api_key))
    data_json = response.json()
    try:
        temp_df = pd.json_normalize(data_json)
        keywords_df = pd.concat([keywords_df, temp_df], ignore_index=True)
        del temp_df
        gc.collect()
    except:
        keywords_df = keywords_df

keywords_df

Unnamed: 0,id,keywords
0,580489,"[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n..."
1,524369,"[{'id': 586, 'name': 'new jersey'}, {'id': 700..."
2,639721,[]
3,802226,"[{'id': 10936, 'name': 'bingo'}]"
4,767504,"[{'id': 2411, 'name': 'new orleans, louisiana'..."
...,...,...
1148,333339,"[{'id': 282, 'name': 'video game'}, {'id': 818..."
1149,464502,"[{'id': 9748, 'name': 'revenge'}]"
1150,454286,"[{'id': 14765, 'name': 'church'}, {'id': 17943..."
1151,412302,"[{'id': 5306, 'name': 'boss'}, {'id': 12396, '..."


In [14]:
df = df.merge(features_df.drop('original_title', axis=1), how='left', on='id')
df = df.merge(credits_df, how='left', on='id')
df = df.merge(keywords_df, how='left', on='id')
df

Unnamed: 0,original_title,id,adult,backdrop_path,budget,genres,homepage,imdb_id,original_language,overview,...,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,belongs_to_collection,cast,crew,keywords
0,Venom: Let There Be Carnage,580489.0,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,tt7097896,en,After finding a host body in investigative rep...,...,6.880,8642,558216.0,Venom Collection,/4bXIKqdZIjR8wKgZaGDaLhLj4yF.jpg,/rhLspFB1B8ZCkWEHFYmc3NKagzq.jpg,,"[{'adult': False, 'gender': 2, 'id': 2524, 'kn...","[{'adult': False, 'gender': 2, 'id': 149, 'kno...","[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n..."
1,The Many Saints of Newark,524369.0,False,/hrzoy8vvUrxQixOM11pwW9AX7Bu.jpg,0,"[{'id': 80, 'name': 'Crime'}]",https://www.themanysaintsofnewarkmovie.com,tt8110232,en,Young Anthony Soprano is growing up in one of ...,...,6.475,508,,,,,,"[{'adult': False, 'gender': 2, 'id': 4941, 'kn...","[{'adult': False, 'gender': 2, 'id': 2100, 'kn...","[{'id': 586, 'name': 'new jersey'}, {'id': 700..."
2,The Addams Family 2,639721.0,False,/9e6wp707XMouPG939o2fHunXXJR.jpg,0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",https://mgm.com/movies/the-addams-family-2,tt11125620,en,The Addams get tangled up in more wacky advent...,...,7.073,1079,750822.0,The Addams Family (Animated) Collection,/392omPB0NlpsxEqJkagjdUJeuhb.jpg,/3VfaWlf8wJewGsKSTYhJsY0pnNx.jpg,,"[{'adult': False, 'gender': 2, 'id': 25072, 'k...","[{'adult': False, 'gender': 2, 'id': 5359, 'kn...",[]
3,Bingo Hell,802226.0,False,/4lmDTjpZNf5S1DlFzIbuQ7dus7p.jpg,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",,tt13372992,en,In the Barrio of Oak Springs live a strong and...,...,4.895,95,,,,,,"[{'adult': False, 'gender': 1, 'id': 270, 'kno...","[{'adult': False, 'gender': 2, 'id': 84348, 'k...","[{'id': 10936, 'name': 'bingo'}]"
4,Black as Night,767504.0,False,/8y556k6ihZeYv2OXcFHTdMJKp1m.jpg,0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,tt13372884,en,A teenage girl with self-esteem issues finds c...,...,5.621,112,,,,,,"[{'adult': False, 'gender': 1, 'id': 1317152, ...","[{'adult': False, 'gender': 0, 'id': 24512, 'k...","[{'id': 2411, 'name': 'new orleans, louisiana'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,Ready Player One,333339.0,False,/dbrLfmFNFEJWv8rLnjpgCKlXWSy.jpg,175000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://readyplayeronemovie.com,tt1677720,en,When the creator of a popular video game syste...,...,7.615,13879,,,,,,"[{'adult': False, 'gender': 2, 'id': 1034681, ...","[{'adult': False, 'gender': 2, 'id': 37, 'know...","[{'id': 282, 'name': 'video game'}, {'id': 818..."
1205,Tyler Perry's Acrimony,464502.0,False,/h9USMzm43BZyUXc45OZAPlPBUjZ.jpg,20000000,"[{'id': 53, 'name': 'Thriller'}]",https://www.acrimony.movie/,tt6063050,en,A faithful wife takes action when it becomes c...,...,6.800,346,,,,,,"[{'adult': False, 'gender': 1, 'id': 40036, 'k...","[{'adult': False, 'gender': 2, 'id': 49911, 'k...","[{'id': 9748, 'name': 'revenge'}]"
1206,God's Not Dead: A Light in Darkness,454286.0,False,/gGBE24xD5NdMqvkJJEzP04jtFG9.jpg,0,"[{'id': 18, 'name': 'Drama'}]",,tt6652708,en,Pastor Dave responds to the unimaginable trag...,...,6.872,179,409343.0,God's Not Dead Collection,/bR20AQt3ycgQ6frHYF5tkomwbFp.jpg,/rX1S0AZpvLNFURBqMFXrXIj4Vmm.jpg,,"[{'adult': False, 'gender': 2, 'id': 116431, '...","[{'adult': False, 'gender': 2, 'id': 24968, 'k...","[{'id': 14765, 'name': 'church'}, {'id': 17943..."
1207,Gemini,412302.0,False,/eReiiUBlDgF2dAtKkpxzzK6H8t2.jpg,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",https://www.geminithefilm.com/,tt5795086,en,A heinous crime tests the complex relationship...,...,5.500,118,,,,,,"[{'adult': False, 'gender': 1, 'id': 1345418, ...","[{'adult': False, 'gender': 0, 'id': 222365, '...","[{'id': 5306, 'name': 'boss'}, {'id': 12396, '..."


In [16]:
belongs_to_collection_columns = ['belongs_to_collection.id','belongs_to_collection.name','belongs_to_collection.poster_path','belongs_to_collection.backdrop_path']

In [17]:
df['belongs_to_collection'] = df[belongs_to_collection_columns].values.tolist()
df

Unnamed: 0,original_title,id,adult,backdrop_path,budget,genres,homepage,imdb_id,original_language,overview,...,vote_average,vote_count,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path,belongs_to_collection,cast,crew,keywords
0,Venom: Let There Be Carnage,580489.0,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,tt7097896,en,After finding a host body in investigative rep...,...,6.880,8642,558216.0,Venom Collection,/4bXIKqdZIjR8wKgZaGDaLhLj4yF.jpg,/rhLspFB1B8ZCkWEHFYmc3NKagzq.jpg,"[558216.0, Venom Collection, /4bXIKqdZIjR8wKgZ...","[{'adult': False, 'gender': 2, 'id': 2524, 'kn...","[{'adult': False, 'gender': 2, 'id': 149, 'kno...","[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n..."
1,The Many Saints of Newark,524369.0,False,/hrzoy8vvUrxQixOM11pwW9AX7Bu.jpg,0,"[{'id': 80, 'name': 'Crime'}]",https://www.themanysaintsofnewarkmovie.com,tt8110232,en,Young Anthony Soprano is growing up in one of ...,...,6.475,508,,,,,"[nan, nan, nan, nan]","[{'adult': False, 'gender': 2, 'id': 4941, 'kn...","[{'adult': False, 'gender': 2, 'id': 2100, 'kn...","[{'id': 586, 'name': 'new jersey'}, {'id': 700..."
2,The Addams Family 2,639721.0,False,/9e6wp707XMouPG939o2fHunXXJR.jpg,0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",https://mgm.com/movies/the-addams-family-2,tt11125620,en,The Addams get tangled up in more wacky advent...,...,7.073,1079,750822.0,The Addams Family (Animated) Collection,/392omPB0NlpsxEqJkagjdUJeuhb.jpg,/3VfaWlf8wJewGsKSTYhJsY0pnNx.jpg,"[750822.0, The Addams Family (Animated) Collec...","[{'adult': False, 'gender': 2, 'id': 25072, 'k...","[{'adult': False, 'gender': 2, 'id': 5359, 'kn...",[]
3,Bingo Hell,802226.0,False,/4lmDTjpZNf5S1DlFzIbuQ7dus7p.jpg,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",,tt13372992,en,In the Barrio of Oak Springs live a strong and...,...,4.895,95,,,,,"[nan, nan, nan, nan]","[{'adult': False, 'gender': 1, 'id': 270, 'kno...","[{'adult': False, 'gender': 2, 'id': 84348, 'k...","[{'id': 10936, 'name': 'bingo'}]"
4,Black as Night,767504.0,False,/8y556k6ihZeYv2OXcFHTdMJKp1m.jpg,0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,tt13372884,en,A teenage girl with self-esteem issues finds c...,...,5.621,112,,,,,"[nan, nan, nan, nan]","[{'adult': False, 'gender': 1, 'id': 1317152, ...","[{'adult': False, 'gender': 0, 'id': 24512, 'k...","[{'id': 2411, 'name': 'new orleans, louisiana'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,Ready Player One,333339.0,False,/dbrLfmFNFEJWv8rLnjpgCKlXWSy.jpg,175000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://readyplayeronemovie.com,tt1677720,en,When the creator of a popular video game syste...,...,7.615,13879,,,,,"[nan, nan, nan, nan]","[{'adult': False, 'gender': 2, 'id': 1034681, ...","[{'adult': False, 'gender': 2, 'id': 37, 'know...","[{'id': 282, 'name': 'video game'}, {'id': 818..."
1205,Tyler Perry's Acrimony,464502.0,False,/h9USMzm43BZyUXc45OZAPlPBUjZ.jpg,20000000,"[{'id': 53, 'name': 'Thriller'}]",https://www.acrimony.movie/,tt6063050,en,A faithful wife takes action when it becomes c...,...,6.800,346,,,,,"[nan, nan, nan, nan]","[{'adult': False, 'gender': 1, 'id': 40036, 'k...","[{'adult': False, 'gender': 2, 'id': 49911, 'k...","[{'id': 9748, 'name': 'revenge'}]"
1206,God's Not Dead: A Light in Darkness,454286.0,False,/gGBE24xD5NdMqvkJJEzP04jtFG9.jpg,0,"[{'id': 18, 'name': 'Drama'}]",,tt6652708,en,Pastor Dave responds to the unimaginable trag...,...,6.872,179,409343.0,God's Not Dead Collection,/bR20AQt3ycgQ6frHYF5tkomwbFp.jpg,/rX1S0AZpvLNFURBqMFXrXIj4Vmm.jpg,"[409343.0, God's Not Dead Collection, /bR20AQt...","[{'adult': False, 'gender': 2, 'id': 116431, '...","[{'adult': False, 'gender': 2, 'id': 24968, 'k...","[{'id': 14765, 'name': 'church'}, {'id': 17943..."
1207,Gemini,412302.0,False,/eReiiUBlDgF2dAtKkpxzzK6H8t2.jpg,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",https://www.geminithefilm.com/,tt5795086,en,A heinous crime tests the complex relationship...,...,5.500,118,,,,,"[nan, nan, nan, nan]","[{'adult': False, 'gender': 1, 'id': 1345418, ...","[{'adult': False, 'gender': 0, 'id': 222365, '...","[{'id': 5306, 'name': 'boss'}, {'id': 12396, '..."


In [24]:
df.drop(belongs_to_collection_columns, axis=1, inplace=True)
df

Unnamed: 0,original_title,id,adult,backdrop_path,budget,genres,homepage,imdb_id,original_language,overview,...,status,tagline,title,video,vote_average,vote_count,belongs_to_collection,cast,crew,keywords
0,Venom: Let There Be Carnage,580489.0,False,/vIgyYkXkg6NC2whRbYjBD7eb3Er.jpg,110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,tt7097896,en,After finding a host body in investigative rep...,...,Released,,Venom: Let There Be Carnage,False,6.880,8642,,"[{'adult': False, 'gender': 2, 'id': 2524, 'kn...","[{'adult': False, 'gender': 2, 'id': 149, 'kno...","[{'id': 1701, 'name': 'hero'}, {'id': 2095, 'n..."
1,The Many Saints of Newark,524369.0,False,/hrzoy8vvUrxQixOM11pwW9AX7Bu.jpg,0,"[{'id': 80, 'name': 'Crime'}]",https://www.themanysaintsofnewarkmovie.com,tt8110232,en,Young Anthony Soprano is growing up in one of ...,...,Released,Who made Tony Soprano?,The Many Saints of Newark,False,6.475,508,,"[{'adult': False, 'gender': 2, 'id': 4941, 'kn...","[{'adult': False, 'gender': 2, 'id': 2100, 'kn...","[{'id': 586, 'name': 'new jersey'}, {'id': 700..."
2,The Addams Family 2,639721.0,False,/9e6wp707XMouPG939o2fHunXXJR.jpg,0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",https://mgm.com/movies/the-addams-family-2,tt11125620,en,The Addams get tangled up in more wacky advent...,...,Released,Unhappy to see you again.,The Addams Family 2,False,7.073,1079,,"[{'adult': False, 'gender': 2, 'id': 25072, 'k...","[{'adult': False, 'gender': 2, 'id': 5359, 'kn...",[]
3,Bingo Hell,802226.0,False,/4lmDTjpZNf5S1DlFzIbuQ7dus7p.jpg,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",,tt13372992,en,In the Barrio of Oak Springs live a strong and...,...,Released,Are you feeling lucky?,Bingo Hell,False,4.895,95,,"[{'adult': False, 'gender': 1, 'id': 270, 'kno...","[{'adult': False, 'gender': 2, 'id': 84348, 'k...","[{'id': 10936, 'name': 'bingo'}]"
4,Black as Night,767504.0,False,/8y556k6ihZeYv2OXcFHTdMJKp1m.jpg,0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,tt13372884,en,A teenage girl with self-esteem issues finds c...,...,Released,Find your own way to slay.,Black as Night,False,5.621,112,,"[{'adult': False, 'gender': 1, 'id': 1317152, ...","[{'adult': False, 'gender': 0, 'id': 24512, 'k...","[{'id': 2411, 'name': 'new orleans, louisiana'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,Ready Player One,333339.0,False,/dbrLfmFNFEJWv8rLnjpgCKlXWSy.jpg,175000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://readyplayeronemovie.com,tt1677720,en,When the creator of a popular video game syste...,...,Released,A better reality awaits.,Ready Player One,False,7.615,13879,,"[{'adult': False, 'gender': 2, 'id': 1034681, ...","[{'adult': False, 'gender': 2, 'id': 37, 'know...","[{'id': 282, 'name': 'video game'}, {'id': 818..."
1205,Tyler Perry's Acrimony,464502.0,False,/h9USMzm43BZyUXc45OZAPlPBUjZ.jpg,20000000,"[{'id': 53, 'name': 'Thriller'}]",https://www.acrimony.movie/,tt6063050,en,A faithful wife takes action when it becomes c...,...,Released,Hell Hath No Fury,Acrimony,False,6.800,346,,"[{'adult': False, 'gender': 1, 'id': 40036, 'k...","[{'adult': False, 'gender': 2, 'id': 49911, 'k...","[{'id': 9748, 'name': 'revenge'}]"
1206,God's Not Dead: A Light in Darkness,454286.0,False,/gGBE24xD5NdMqvkJJEzP04jtFG9.jpg,0,"[{'id': 18, 'name': 'Drama'}]",,tt6652708,en,Pastor Dave responds to the unimaginable trag...,...,Released,,God's Not Dead: A Light in Darkness,False,6.872,179,,"[{'adult': False, 'gender': 2, 'id': 116431, '...","[{'adult': False, 'gender': 2, 'id': 24968, 'k...","[{'id': 14765, 'name': 'church'}, {'id': 17943..."
1207,Gemini,412302.0,False,/eReiiUBlDgF2dAtKkpxzzK6H8t2.jpg,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",https://www.geminithefilm.com/,tt5795086,en,A heinous crime tests the complex relationship...,...,Released,It’s all there if you know how to look.,Gemini,False,5.500,118,,"[{'adult': False, 'gender': 1, 'id': 1345418, ...","[{'adult': False, 'gender': 0, 'id': 222365, '...","[{'id': 5306, 'name': 'boss'}, {'id': 12396, '..."


In [25]:
features = ['id', 'adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'imdb_id', 'original_language', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count']

In [26]:
df = df[features]
df

Unnamed: 0,id,adult,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,580489.0,False,,110000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.venom.movie,tt7097896,en,After finding a host body in investigative rep...,206.904,...,2021-09-30,506863592,97,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,,Venom: Let There Be Carnage,False,6.880,8642
1,524369.0,False,,0,"[{'id': 80, 'name': 'Crime'}]",https://www.themanysaintsofnewarkmovie.com,tt8110232,en,Young Anthony Soprano is growing up in one of ...,35.364,...,2021-09-22,11620603,120,"[{'english_name': 'Latin', 'iso_639_1': 'la', ...",Released,Who made Tony Soprano?,The Many Saints of Newark,False,6.475,508
2,639721.0,False,,0,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",https://mgm.com/movies/the-addams-family-2,tt11125620,en,The Addams get tangled up in more wacky advent...,86.693,...,2021-10-01,119815153,93,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Unhappy to see you again.,The Addams Family 2,False,7.073,1079
3,802226.0,False,,0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",,tt13372992,en,In the Barrio of Oak Springs live a strong and...,11.837,...,2021-10-01,0,85,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Are you feeling lucky?,Bingo Hell,False,4.895,95
4,767504.0,False,,0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,tt13372884,en,A teenage girl with self-esteem issues finds c...,15.525,...,2021-10-01,0,87,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Find your own way to slay.,Black as Night,False,5.621,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,333339.0,False,,175000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",http://readyplayeronemovie.com,tt1677720,en,When the creator of a popular video game syste...,70.681,...,2018-03-28,582890172,140,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,A better reality awaits.,Ready Player One,False,7.615,13879
1205,464502.0,False,,20000000,"[{'id': 53, 'name': 'Thriller'}]",https://www.acrimony.movie/,tt6063050,en,A faithful wife takes action when it becomes c...,16.712,...,2018-03-30,34764055,120,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Hell Hath No Fury,Acrimony,False,6.800,346
1206,454286.0,False,,0,"[{'id': 18, 'name': 'Drama'}]",,tt6652708,en,Pastor Dave responds to the unimaginable trag...,48.581,...,2018-03-30,5728940,105,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,God's Not Dead: A Light in Darkness,False,6.872,179
1207,412302.0,False,,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 80, '...",https://www.geminithefilm.com/,tt5795086,en,A heinous crime tests the complex relationship...,12.494,...,2017-06-14,0,93,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,It’s all there if you know how to look.,Gemini,False,5.500,118


In [27]:
df.to_csv('../content/scrap_movies.csv', sep=';', index=False)

# Unindo as bases de dados (Extraída por WebScraping e usando Dataset do Kaggle)

In [9]:
df = pd.read_csv('../content/scrap_movies.csv', sep=';')

In [10]:
movies_data = pd.read_csv('../content/movies_metadata.csv')

  movies_data = pd.read_csv('../content/movies_metadata.csv')


In [11]:
credits_data = pd.read_csv('../content/credits.csv')

In [12]:
keywords_data = pd.read_csv('../content/keywords.csv')

In [13]:
indexes = []

for movie_id in movies_data['id']:
  try:
    int(movie_id)
  except:
    error_index = movies_data.loc[movies_data['id']==movie_id].index.tolist()
    indexes.append(error_index[0])
    print(movie_id)

1997-08-20
2012-09-29
2014-01-01


In [14]:
movies_data.drop(indexes, axis = 0, inplace = True)
movies_data.reset_index(drop=True, inplace=True)

In [15]:
movies_data['id'] = movies_data['id'].astype('int64')

In [16]:
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45463 entries, 0 to 45462
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45463 non-null  object 
 1   belongs_to_collection  4491 non-null   object 
 2   budget                 45463 non-null  object 
 3   genres                 45463 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45463 non-null  int64  
 6   imdb_id                45446 non-null  object 
 7   original_language      45452 non-null  object 
 8   original_title         45463 non-null  object 
 9   overview               44509 non-null  object 
 10  popularity             45460 non-null  object 
 11  poster_path            45077 non-null  object 
 12  production_companies   45460 non-null  object 
 13  production_countries   45460 non-null  object 
 14  release_date           45376 non-null  object 
 15  re

In [17]:
credits_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [18]:
keywords_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [19]:
df_merged = keywords_data.merge(credits_data, how='left', on='id')

In [20]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46496 entries, 0 to 46495
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46496 non-null  int64 
 1   keywords  46496 non-null  object
 2   cast      46496 non-null  object
 3   crew      46496 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.8+ MB


In [21]:
movies_data = movies_data.merge(df_merged, on='id', how='left')

In [22]:
movies_data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [23]:
df_final = pd.concat([movies_data, df], ignore_index=True)

# Exportando

In [24]:
df_final.to_csv('../content/df_final.csv', index = False, header = True, sep=';')