In [1]:
import os
import requests
import pandas as pd

In [2]:
BASICS_URL = "https://datasets.imdbws.com/title.basics.tsv.gz"
AKAS_URL = "https://datasets.imdbws.com/title.akas.tsv.gz"
RATINGS_URL = "https://datasets.imdbws.com/title.ratings.tsv.gz"

BASICS_FILE = "title.basics.tsv.gz"
AKAS_FILE = "title.akas.tsv.gz"
RATINGS_FILE = "title.ratings.tsv.gz"

In [3]:
def download_if_missing(url, filename):
    if not os.path.exists(filename):
        r = requests.get(url, stream=True)
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)

In [4]:
download_if_missing(BASICS_URL, BASICS_FILE)
download_if_missing(AKAS_URL, AKAS_FILE)
download_if_missing(RATINGS_URL, RATINGS_FILE)

In [5]:
df_basics = pd.read_csv(BASICS_FILE, sep='\t', compression='gzip', low_memory=False)
print(df_basics.shape[0])
df_basics.head()

12011673


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [6]:
df_akas = pd.read_csv(AKAS_FILE, sep='\t', compression='gzip', low_memory=False)
print(df_akas.shape[0])
df_akas.head()

53534450


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,\N,\N,original,\N,1
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita,US,\N,imdbDisplay,\N,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
4,tt0000001,5,Καρμενσίτα,GR,\N,imdbDisplay,\N,0


In [7]:
df_ratings = pd.read_csv(RATINGS_FILE, sep='\t', compression='gzip', low_memory=False)
print(df_ratings.shape[0])
df_ratings.head()

1630728


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2182
1,tt0000002,5.5,302
2,tt0000003,6.4,2260
3,tt0000004,5.2,194
4,tt0000005,6.2,2999


In [8]:
df_merged = pd.merge(df_basics, df_akas, left_on='tconst', right_on='titleId', how='inner')
df_merged = pd.merge(df_merged, df_ratings, on='tconst', how='inner')
df_merged.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,titleId,ordering,title,region,language,types,attributes,isOriginalTitle,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",tt0000001,1,Carmencita,\N,\N,original,\N,1,5.7,2182
1,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",tt0000001,2,Carmencita,DE,\N,\N,literal title,0,5.7,2182
2,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",tt0000001,3,Carmencita,US,\N,imdbDisplay,\N,0,5.7,2182
3,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",tt0000001,4,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0,5.7,2182
4,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",tt0000001,5,Καρμενσίτα,GR,\N,imdbDisplay,\N,0,5.7,2182


In [9]:
df_filtered = df_merged
ALLOWED_TYPES = ['movie', 'tvMovie']
df_filtered = df_merged[df_merged['titleType'].isin(ALLOWED_TYPES)]
df_filtered['startYear'] = pd.to_numeric(df_filtered['startYear'], errors='coerce')
df_filtered = df_filtered[df_filtered['startYear'] >= 1980]
df_filtered = df_filtered[df_filtered['language'] == 'en']
allowed_regions = ['US', 'GB', 'DE', 'FR', 'IT', 'ES', 'PL', 'AU', 'CA']
df_filtered = df_filtered[df_filtered['region'].isin(allowed_regions)]
df_filtered = df_filtered.drop_duplicates(subset='tconst', keep='first')
print(df_filtered.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['startYear'] = pd.to_numeric(df_filtered['startYear'], errors='coerce')


62207


In [10]:
MIN_VOTES = 1000
df_filtered = df_filtered[df_merged['numVotes'] >= MIN_VOTES]
df_filtered = df_filtered.sort_values(by=['numVotes', 'averageRating'], ascending=[False, False])

  df_filtered = df_filtered[df_merged['numVotes'] >= MIN_VOTES]


In [11]:
df_top10k = df_filtered.head(10000)
df_top10k = df_top10k[['tconst', 'title', 'startYear', 'genres', 'averageRating', 'numVotes']]
df_top10k.head(10)

Unnamed: 0,tconst,title,startYear,genres,averageRating,numVotes
966370,tt0111161,The Shawshank Redemption,1994.0,Drama,9.3,3113920
1851898,tt0468569,The Dark Knight,2008.0,"Action,Crime,Drama",9.1,3089698
3175079,tt1375666,Inception,2010.0,"Action,Adventure,Sci-Fi",8.8,2744591
1113038,tt0137523,Fight Club,1999.0,"Crime,Drama,Thriller",8.8,2529183
953845,tt0109830,Forrest Gump,1994.0,"Drama,Romance",8.8,2431823
2202665,tt0816692,Interstellar,2014.0,"Adventure,Drama,Sci-Fi",8.7,2423212
963836,tt0110912,Pulp Fiction,1994.0,"Crime,Drama",8.8,2379487
1100863,tt0133093,The Matrix,1999.0,"Action,Sci-Fi",8.7,2196789
1058273,tt0120737,The Lord of the Rings: The Fellowship of the Ring,2001.0,"Adventure,Drama,Fantasy",8.9,2150633
1197325,tt0167260,The Lord of the Rings: The Return of the King,2003.0,"Adventure,Drama,Fantasy",9.0,2115499


In [13]:
df_top10k.tail(10)

Unnamed: 0,tconst,title,startYear,genres,averageRating,numVotes
1009631,tt0116118,Doctor Who: The Movie,1996.0,"Adventure,Drama,Sci-Fi",6.3,11170
5421820,tt4594834,Chi-Raq,2015.0,"Comedy,Crime,Drama",5.9,11169
905145,tt0104670,Ladybugs,1992.0,"Comedy,Sport",5.5,11168
2730940,tt11541872,Bigbug,2022.0,"Comedy,Sci-Fi",5.5,11167
1303791,tt0206420,Woman on Top,2000.0,"Comedy,Fantasy,Romance",5.3,11163
6023052,tt7399470,Qarib Qarib Singlle,2017.0,"Comedy,Romance",7.2,11156
2943823,tt12616480,Slingshot,2024.0,"Mystery,Sci-Fi,Thriller",5.7,11155
4128136,tt2182256,Premature,2014.0,"Comedy,Drama,Fantasy",5.7,11153
6386895,tt9624766,Jiu Jitsu,2020.0,"Action,Sci-Fi,Thriller",2.9,11153
4360011,tt2474024,The Last Five Years,2014.0,"Comedy,Drama,Musical",5.9,11149


In [14]:
df_top10k.to_csv("top10k_movies.csv", index=False)