In [2]:
import pandas as pd

In [3]:
# Read the two tab-separated IMDb dumps into DataFrames.
# low_memory=False makes pandas parse title.basics in one pass instead of in
# chunks, so each column gets a single inferred dtype.
df1 = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    low_memory=False,
)
df2 = pd.read_csv(
    'title.ratings.tsv',
    sep='\t',
)

In [4]:
# Summarise the freshly loaded title.basics frame: row count, columns, dtypes
# and memory footprint.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11702150 entries, 0 to 11702149
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 803.5+ MB


In [5]:
# Summarise the title.ratings frame: row count, columns, null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575801 entries, 0 to 1575800
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1575801 non-null  object 
 1   averageRating  1575801 non-null  float64
 2   numVotes       1575801 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 36.1+ MB


In [6]:
# List the distinct titleType values so the categories are known before filtering on them.
df1['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvEpisode', 'tvSeries',
       'tvMiniSeries', 'tvSpecial', 'video', 'videoGame', 'tvPilot'],
      dtype=object)

In [7]:
# Both columns are loaded as text. Cast them to numbers so they can be compared
# numerically; any entry that does not parse as a number becomes NaN.
for numeric_col in ('runtimeMinutes', 'startYear'):
    df1[numeric_col] = pd.to_numeric(df1[numeric_col], errors='coerce')

In [8]:
# Keep only rows whose titleType is 'movie' or 'tvMovie'; every other
# title type (shorts, episodes, series, games, ...) is discarded.
is_movie = df1['titleType'].isin(['movie', 'tvMovie'])
df1 = df1[is_movie]

In [9]:
# Keep titles whose runtime lies in the closed interval [45, 300] minutes.
# Rows with a missing (NaN) runtime fail the comparison and are dropped too.
has_feature_runtime = df1['runtimeMinutes'].between(45, 300)
df1 = df1[has_feature_runtime]

In [None]:
# Keep only rows whose isAdult flag is exactly the string "0".
# The column is still text at this point, hence the string comparison;
# every other value (not just "1") is dropped.
not_adult = df1['isAdult'].eq("0")
df1 = df1.loc[not_adult]

In [None]:
# Re-inspect title.basics after the title-type, runtime and adult filters
# to see how many rows remain and which columns are now numeric.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 482187 entries, 15479 to 11702100
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          482187 non-null  object 
 1   titleType       482187 non-null  object 
 2   primaryTitle    482186 non-null  object 
 3   originalTitle   482186 non-null  object 
 4   isAdult         482187 non-null  object 
 5   startYear       482187 non-null  float64
 6   endYear         482187 non-null  object 
 7   runtimeMinutes  482187 non-null  float64
 8   genres          482187 non-null  object 
dtypes: float64(2), object(7)
memory usage: 36.8+ MB


In [12]:
# Attach ratings to the filtered titles. The inner join on tconst keeps only
# titles that appear in both frames, so unrated titles are dropped.
df = df1.merge(df2, how='inner', on='tconst')

In [None]:
# Keep every title released in 1950 or later, plus any title that is both
# rated at least 7.0 and has at least 5000 votes (this is how well-known
# pre-1950 films survive).
# `&` binds tighter than `|`, so the grouping is spelled out with named masks.
released_1950_or_later = df['startYear'] >= 1950
well_rated_and_popular = (df['averageRating'] >= 7.0) & (df['numVotes'] >= 5000)
df = df[released_1950_or_later | well_rated_and_popular]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
378,tt0006864,movie,Intolerance,Intolerance: Love's Struggle Throughout the Ages,0,1916.0,\N,163.0,"Drama,History",7.7,17445
902,tt0009968,movie,Broken Blossoms,Broken Blossoms or The Yellow Man and the Girl,0,1919.0,\N,90.0,"Drama,Romance",7.2,11487
982,tt0010323,movie,The Cabinet of Dr. Caligari,Das Cabinet des Dr. Caligari,0,1920.0,\N,67.0,"Horror,Mystery,Thriller",8.0,73629
1166,tt0011237,movie,The Golem,"Der Golem, wie er in die Welt kam",0,1920.0,\N,91.0,"Fantasy,Horror",7.2,9086
1292,tt0011841,movie,Way Down East,Way Down East,0,1920.0,\N,145.0,"Drama,Romance",7.3,6127
...,...,...,...,...,...,...,...,...,...,...,...
17977,tt0042040,movie,Whisky Galore!,Whisky Galore!,0,1949.0,\N,82.0,"Comedy,Crime",7.1,7018
17978,tt0042041,movie,White Heat,White Heat,0,1949.0,\N,114.0,"Action,Crime,Drama",8.1,37280
17980,tt0042046,movie,The Window,The Window,0,1949.0,\N,73.0,"Drama,Film-Noir,Thriller",7.4,5201
18157,tt0042369,movie,D.O.A.,D.O.A.,0,1949.0,\N,83.0,"Crime,Drama,Film-Noir",7.2,13681


In [19]:
# Drop obscure titles: keep only rows with at least 25 votes.
enough_votes = df['numVotes'].ge(25)
df = df[enough_votes]

In [20]:
# Drop titles that have few votes (500 or fewer) AND a rating below 7.
# A niche title with few votes is kept as long as it is rated 7 or higher.
few_votes = df['numVotes'] <= 500
low_rating = df['averageRating'] < 7
df = df[~(few_votes & low_rating)]

In [21]:
# Drop the columns that are not needed downstream.
# Reassign instead of using inplace=True: `df` is the result of boolean
# filtering, and in-place edits on such a frame can trigger pandas'
# SettingWithCopyWarning. Reassignment yields the same frame without it.
unused_columns = ['isAdult', 'endYear', 'titleType', 'originalTitle']
df = df.drop(columns=unused_columns)

In [22]:
# Inspect the final frame (row count, remaining columns, null counts, dtypes)
# before exporting it.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108642 entries, 378 to 338661
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          108642 non-null  object 
 1   primaryTitle    108642 non-null  object 
 2   startYear       108642 non-null  float64
 3   runtimeMinutes  108642 non-null  float64
 4   genres          108642 non-null  object 
 5   averageRating   108642 non-null  float64
 6   numVotes        108642 non-null  int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 6.6+ MB


In [23]:
# Write the cleaned frame to cleaned_imdb_movies.csv.
# index=False leaves the DataFrame index out of the file.
df.to_csv('cleaned_imdb_movies.csv', index=False)