In [1]:
import pandas as pd

# Load the full TMDB movie table.
tmdb = pd.read_csv("tmdb_movie_data_full.csv")

# Load the scraped themes; only the join key and the themes text are read.
themes = pd.read_csv("movie_themes.csv", usecols=["title", "themes"])

# The join key is the bare title. If a title occurs more than once in the
# themes file, a plain merge would emit one output row per (movie, themes-row)
# pair and multiply TMDB rows. Keep a single themes row per title.
# NOTE(review): distinct movies sharing a title still receive the same themes;
# joining on an id would be more precise if the themes file carries one.
themes = themes.drop_duplicates(subset="title", keep="first")

# Left join keeps every TMDB row. validate="m:1" makes pandas raise a
# MergeError if the themes side ever contains repeated titles again.
df = tmdb.merge(themes, on="title", how="left", validate="m:1")

# Movies without a themes match get an empty string instead of NaN.
df["themes"] = df["themes"].fillna("")

# Save the merged table.
df.to_csv("final_tmdb.csv", index=False)


In [2]:
# Remove rows that are identical across all columns, keeping the first
# occurrence of each. The frame is modified in place, so the surviving rows
# keep their original index labels.
df.drop_duplicates(inplace=True)

In [3]:
# Print a summary of the de-duplicated frame: index range, column names,
# non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104763 entries, 0 to 137276
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   imdb_id               104763 non-null  object 
 1   tmdb_id               104763 non-null  int64  
 2   title                 104763 non-null  object 
 3   release_date          104193 non-null  object 
 4   genres                100547 non-null  object 
 5   revenue               104763 non-null  int64  
 6   budget                104763 non-null  int64  
 7   runtime               104763 non-null  int64  
 8   vote_average          104763 non-null  float64
 9   vote_count            104763 non-null  int64  
 10  top_cast              97547 non-null   object 
 11  director              102285 non-null  object 
 12  keywords              62804 non-null   object 
 13  spoken_languages      94149 non-null   object 
 14  collection_name       9343 non-null    object 
 15  watch

In [4]:
# Drop the 'error' column, which the original author marked as not useful.
# errors="ignore" keeps the cell safe to re-run after the column is gone
# (without it a second run raises KeyError).
df.drop(columns=["error"], inplace=True, errors="ignore")

In [5]:
# Text-like columns whose missing values are replaced with empty strings for
# modeling purposes.
text_cols = [
    "genres",
    "top_cast",
    "director",
    "keywords",
    "spoken_languages",
    "watch_providers",
    "production_companies",
    "certification",
    "overview",
    "themes",
    "collection_name",
    "poster_url",
]

df[text_cols] = df[text_cols].fillna("")

# release_date is deliberately left untouched here: filling its missing
# entries with pd.NaT only swaps one missing-value marker for another and does
# not make the column datetime. Parsing with pd.to_datetime(errors="coerce")
# is what turns missing or unparseable dates into NaT.


In [6]:
# Parse release_date (unparseable or missing values become NaT), expose its
# calendar components as separate columns, then discard the original column.
parsed_release = pd.to_datetime(df["release_date"], errors="coerce")
release_parts = parsed_release.dt

df["day"] = release_parts.day
df["month"] = release_parts.month
df["year"] = release_parts.year

df.drop(columns=["release_date"], inplace=True)

In [7]:
# Rows without a usable release date have NaN in the derived columns; store
# those as 0 and cast each derived column that is present to int.
for col in ("day", "month", "year"):
    if col not in df.columns:
        continue
    filled = df[col].fillna(0)
    df[col] = filled.astype(int)

In [8]:
# Print the frame summary again to check non-null counts and dtypes after the
# fill and date-splitting steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104763 entries, 0 to 137276
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   imdb_id               104763 non-null  object 
 1   tmdb_id               104763 non-null  int64  
 2   title                 104763 non-null  object 
 3   genres                104763 non-null  object 
 4   revenue               104763 non-null  int64  
 5   budget                104763 non-null  int64  
 6   runtime               104763 non-null  int64  
 7   vote_average          104763 non-null  float64
 8   vote_count            104763 non-null  int64  
 9   top_cast              104763 non-null  object 
 10  director              104763 non-null  object 
 11  keywords              104763 non-null  object 
 12  spoken_languages      104763 non-null  object 
 13  collection_name       104763 non-null  object 
 14  watch_providers       104763 non-null  object 
 15  produ

In [9]:
# Write the cleaned frame to disk without the index column.
cleaned_csv_path = "final_cleaned_tmdb.csv"
df.to_csv(cleaned_csv_path, index=False)