# Getting Movie Keywords

## Dataset Creation

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Load the two IMDb tables this notebook needs (tab-separated, gzip-compressed).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Columns that contain the '\N' placeholder (e.g.
# startYear) then come back as uniform strings rather than a mix of ints and
# strings, which the string-based year filter further down relies on.
title_basics = pd.read_csv('title.basics.tsv.gz', sep='\t', header=0, low_memory=False)
title_ratings = pd.read_csv('title.ratings.tsv.gz', sep='\t', header=0, low_memory=False)

FileNotFoundError: [Errno 2] No such file or directory: 'title.basics.tsv.gz'

In [3]:
# Keep only the rows of title_basics whose titleType is 'tvSeries'.
# (The variable is called `movies`, but the rows kept here are TV series.)
# .copy() gives an independent frame so later changes do not touch title_basics.
is_tv_series = title_basics['titleType'] == 'tvSeries'
movies = title_basics.loc[is_tv_series].copy()

In [4]:
# Index both tables by tconst, the IMDb key shared by the two tables.
# The guards keep this cell re-runnable: once tconst is the index it is no
# longer a column, so an unconditional set_index('tconst') raises KeyError
# the second time the cell is executed.
if movies.index.name != 'tconst':
    movies = movies.set_index('tconst')
if title_ratings.index.name != 'tconst':
    title_ratings = title_ratings.set_index('tconst')
# Join the two tables on tconst; the inner join keeps only titles that have a rating row
movies_with_rating = movies.join(title_ratings, how='inner')

In [5]:
# Preview the first 15,000 joined rows (the notebook display elides the middle of the slice)
movies_with_rating.iloc[:15000]

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0035803,tvSeries,The German Weekly Review,Die Deutsche Wochenschau,0,1940,1945,12,"Documentary,News",8.1,55
tt0039120,tvSeries,Americana,Americana,0,1947,1949,30,"Family,Game-Show",3.0,15
tt0039123,tvSeries,Kraft Theatre,Kraft Television Theatre,0,1947,1958,60,Drama,8.2,198
tt0039125,tvSeries,Public Prosecutor,Public Prosecutor,0,1947,1951,20,"Crime,Drama,Mystery",5.5,23
tt0040021,tvSeries,Actor's Studio,Actor's Studio,0,1948,1950,30,Drama,7.2,80
...,...,...,...,...,...,...,...,...,...,...
tt0352035,tvSeries,"60 jours, 60 nuits","60 jours, 60 nuits",0,2003,\N,20,Documentary,2.8,10
tt0352036,tvSeries,The All New Harry Hill Show,The All New Harry Hill Show,0,2003,\N,30,Comedy,8.0,12
tt0352037,tvSeries,"The Adventures of Pepero, Son of the Andes",Andesu shônen Pepero no bôken,0,1975,1976,24,"Action,Adventure,Animation",7.2,106
tt0352038,tvSeries,Blue Blink,Aoi Blink,0,1989,\N,25,"Adventure,Animation,Family",7.4,103


In [41]:
# Rank titles by the number of votes they have received (most-voted first)
# and keep the top 10,000.
by_votes_desc = movies_with_rating.sort_values('numVotes', ascending=False)
movies_sorted_by_rank = by_votes_desc.iloc[:10000]

In [42]:
# Number of rows kept after the top-by-votes cut
movies_sorted_by_rank.shape[0]

10000

In [43]:
# Keep only the titles whose startYear falls in 1992-2022 (both ends inclusive).
# startYear is converted to a number first so the filter works whether the
# column holds strings, ints, or a mix of both; values that are not numbers
# (such as the '\N' placeholder) become NaN and are therefore excluded.
start_year = pd.to_numeric(movies_sorted_by_rank['startYear'], errors='coerce')
movies_since_1992 = movies_sorted_by_rank.loc[start_year.between(1992, 2022)]
# tconst values of the selected titles, used later to request keywords
movies_index = movies_since_1992.index

In [47]:
# Number of titles left after the start-year filter
movies_since_1992.shape[0]

8735

In [48]:
# Inspect the last 100 rows, i.e. the least-voted titles that survived the filters
movies_since_1992.iloc[-100:]

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt5458128,tvSeries,The White Slave,La esclava blanca,0,2016,\N,60,History,8.0,617
tt14377480,tvSeries,Kamen Rider Black Sun,Kamen Rider Black Sun,0,2022,\N,86,"Action,Adventure,Drama",7.3,617
tt7867522,tvSeries,Bullets,Bullets,0,2018,2018,60,"Crime,Drama,Thriller",6.5,617
tt1483620,tvSeries,Kamen Rider W,Kamen raidâ Daburu,0,2009,2010,25,"Action,Adventure,Comedy",8.3,617
tt0805978,tvSeries,This American Life,This American Life,0,2007,2012,22,Documentary,8.4,617
...,...,...,...,...,...,...,...,...,...,...
tt0388583,tvSeries,Bruiser,Bruiser,0,2000,2000,29,Comedy,7.1,605
tt14129388,tvSeries,Bust Down,Bust Down,0,2022,\N,28,Comedy,6.8,605
tt10427926,tvSeries,Magical Sempai,Tejina Senpai,0,2019,\N,15,"Animation,Comedy,Drama",6.1,605
tt1559681,tvSeries,Gary: Tank Commander,Gary: Tank Commander,0,2009,2012,\N,"Comedy,War",7.8,605


In [50]:
# Print the (rows, columns) dimensions of the year-filtered frame
print((len(movies_since_1992.index), len(movies_since_1992.columns)))

(8735, 10)


In [51]:
# Inspect the 100 most-voted titles (before the start-year filter)
movies_sorted_by_rank.iloc[:100]

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0944947,tvSeries,Game of Thrones,Game of Thrones,0,2011,2019,57,"Action,Adventure,Drama",9.2,2142235
tt0903747,tvSeries,Breaking Bad,Breaking Bad,0,2008,2013,49,"Crime,Drama,Thriller",9.5,1947109
tt4574334,tvSeries,Stranger Things,Stranger Things,0,2016,2024,51,"Drama,Fantasy,Horror",8.7,1226410
tt1520211,tvSeries,The Walking Dead,The Walking Dead,0,2010,2022,44,"Drama,Horror,Thriller",8.1,1017040
tt0108778,tvSeries,Friends,Friends,0,1994,2004,22,"Comedy,Romance",8.9,1016319
...,...,...,...,...,...,...,...,...,...,...
tt3920596,tvSeries,Big Little Lies,Big Little Lies,0,2017,2019,60,"Crime,Drama,Mystery",8.5,206639
tt0452046,tvSeries,Criminal Minds,Criminal Minds,0,2005,\N,42,"Crime,Drama,Mystery",8.1,205451
tt0098936,tvSeries,Twin Peaks,Twin Peaks,0,1990,1991,47,"Crime,Drama,Mystery",8.8,202240
tt6257970,tvSeries,The End of the F***ing World,The End of the F***ing World,0,2017,2019,25,"Adventure,Comedy,Crime",8.0,199242


In [52]:
# (rows, columns) of the top-by-votes frame
(len(movies_sorted_by_rank.index), len(movies_sorted_by_rank.columns))

(10000, 10)

In [53]:
from tqdm import tqdm
from time import sleep
from imdb import IMDb

# Access object from the imdb package; the loop below calls its
# get_movie_keywords method once per title.
ia = IMDb()

In [54]:
# Fetch the keyword list of every selected title, one request per title.
# keywords_dict maps tconst -> list of keyword strings ('' when nothing could be retrieved).
keywords_dict = {}
failed_ids = []  # tconst values for which no keywords could be retrieved
for movie_index in tqdm(movies_index):
    sleep(1)  # wait one second before each request
    try:
        # tconst looks like 'tt0944947'; the lookup is done without the first two characters
        keywords_dict[movie_index] = ia.get_movie_keywords(movie_index[2:])['data']['keywords']
    except Exception:
        # Best effort: a failed request or a response without a 'keywords' entry
        # is recorded as "no keywords". Catching Exception rather than using a
        # bare except lets Ctrl-C / kernel interrupt still stop this long loop.
        keywords_dict[movie_index] = ''
        failed_ids.append(movie_index)

if failed_ids:
    print(f"No keywords retrieved for {len(failed_ids)} of {len(keywords_dict)} titles")

# One row per title, in request order: the keywords joined into a single
# comma-separated string in a column named 'keywords'. The index (tconst) is
# left unnamed, as before.
keywords = pd.DataFrame(
    {'keywords': [','.join(kw for kw in kws if pd.notna(kw)) for kws in keywords_dict.values()]},
    index=list(keywords_dict.keys()),
)

100%|█████████████████████████████████████| 8735/8735 [4:41:15<00:00,  1.93s/it]


In [57]:
# Save the fetched keywords to a CSV so the slow download loop does not have to be repeated
keywords.to_csv('keywords_test.csv')

In [58]:
# Reload the keywords from the CSV. The first column was written without a
# header, so it is read as the index and then labelled tconst.
keywords = pd.read_csv('keywords_test.csv', index_col=0).rename_axis('tconst')

In [59]:
# Peek at the first five rows of the reloaded keywords
keywords.head(n=5)

Unnamed: 0_level_0,keywords
tconst,Unnamed: 1_level_1
tt0944947,"based-on-novel,dragon,politics,nudity,incest,q..."
tt0903747,"cancer,chemistry,methamphetamine,albuquerque-n..."
tt4574334,"government-conspiracy,1980s,friendship,telekin..."
tt1520211,"zombie,survival,post-apocalypse,based-on-comic..."
tt0108778,"friendship,friend,new-york-city,roommate,divor..."


In [60]:
# Attach the keywords column to the year-filtered titles. Both frames are
# indexed by tconst; the inner join keeps only the keys present in both.
movies = movies_since_1992.join(other=keywords, how='inner')

In [61]:
# Write the joined table to disk so the keyword download loop does not need to be re-run
movies.to_csv('tvSeries_keywords_test.csv')

In [None]:
# get unique 