In [1]:
# https://www.kaggle.com/c/tmdb-box-office-prediction/data?select=train.csv

In [None]:
import pandas as pd
import numpy as np
import json

In [3]:
# Read the train and test dataframe

In [4]:
# Load the train and test CSV files (train first, then test) from the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
train_df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

# Analyse the data

In [5]:
# (rows, columns) of the train frame followed by the test frame
tuple(frame.shape for frame in (train_df, test_df))

((3000, 23), (4398, 22))

In [6]:
# train_df.info(), test_df.info()

In [7]:
# Columns removed before any feature engineering.
# Revisit this list to bring more columns in later (status stays out; others need imputing).
drop_cols = [
    "id",
    "belongs_to_collection",
    "homepage",
    "imdb_id",
    "original_title",
    "overview",
    "poster_path",
    "production_companies",
    "spoken_languages",
    "status",
    "Keywords",
    "cast",
    "crew",
    "tagline",
    "title",
]
# Raw columns to drop once the features derived from them have been created
cols_to_drop_after_preprocessing = list()
len(drop_cols)

15

In [8]:
# Names of the numerical and categorical feature columns, filled in feature by feature
numerical_categories, categorical_categories = [], []

In [9]:
# Remove the unused columns from both frames in place, then report what is left
for frame in (train_df, test_df):
    frame.drop(columns=drop_cols, inplace=True)
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                3000 non-null   int64  
 1   genres                2993 non-null   object 
 2   original_language     3000 non-null   object 
 3   popularity            3000 non-null   float64
 4   production_countries  2945 non-null   object 
 5   release_date          3000 non-null   object 
 6   runtime               2998 non-null   float64
 7   revenue               3000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4398 non-null   int64  
 1   genres                4382 non-null   object 
 2   origi

(None, None)

# Feature 1. Budget : Put mean of budget if budget is empty

In [10]:
# Add budget to numerical category (guarded so re-running the cell does not add a duplicate)
if "budget" not in numerical_categories:
    numerical_categories.append("budget")

In [11]:
# A budget of 0 is treated as missing: replace it with the mean of the non-zero budgets
has_budget_train = train_df.budget != 0
budget_mean_train = train_df.loc[has_budget_train, "budget"].mean()
train_df.budget = train_df.budget.where(has_budget_train, budget_mean_train)

In [12]:
# Impute missing (zero) test budgets with the *training* mean, so a missing budget
# maps to the same value in both frames and no test-set statistic is used.
test_df.budget = test_df.budget.where(test_df.budget != 0, budget_mean_train)

# Feature 2. Genres

In [13]:
# Add genres_list to categorical_category (guarded so re-running the cell does not add a duplicate)
if "genres_list" not in categorical_categories:
    categorical_categories.append("genres_list")

In [14]:
# Create a list of genres instead of the string

def get_genres(genres_string):
    """Return the genre names encoded in ``genres_string`` as a tuple.

    Args:
        genres_string: String holding a list of genre dicts, e.g.
            ``"[{'id': 18, 'name': 'Drama'}]"``, or a missing value (NaN/None).

    Returns:
        Tuple with the ``"name"`` entry of every dict, in the original order;
        an empty tuple when ``genres_string`` is missing.
    """
    # Local import: at this point of the notebook only json/pandas/numpy are imported.
    from ast import literal_eval

    if pd.isna(genres_string):
        # Same type as the normal path, so callers always receive a tuple.
        return ()
    try:
        # Parses Python-literal text (single-quoted keys) directly, including
        # names that themselves contain an apostrophe.
        genres = literal_eval(genres_string)
    except (ValueError, SyntaxError):
        # Fall back to the quote-swapping JSON parse for JSON-style input.
        genres = json.loads(genres_string.replace("'", "\""))
    return tuple(genre.get("name") for genre in genres)

# Parse the raw genres string of every training row
train_df["genres_list"] = train_df["genres"].apply(get_genres)

In [15]:
# Parse the raw genres string of every test row as well
test_df["genres_list"] = test_df["genres"].apply(get_genres)

In [16]:
# Collect every genre name that occurs in either the train or the test frame
all_genres_set = {
    genre
    for frame in (train_df, test_df)
    for genres_list in frame.genres_list
    for genre in genres_list
}
        
# all_genres_set

In [17]:
# Guarded so re-running this cell does not queue the column twice
if "genres" not in cols_to_drop_after_preprocessing:
    cols_to_drop_after_preprocessing.append("genres")

# Feature 3. Original_language

In [18]:
# Add original_language to categorical_category (guarded so re-running the cell does not add a duplicate)
if "original_language" not in categorical_categories:
    categorical_categories.append("original_language")

In [19]:
# Check both frames for missing original_language values; none found, so proceed
tuple(frame.original_language.isna().any() for frame in (train_df, test_df))

(False, False)

# Feature 4. Popularity

In [20]:
# Add popularity to numerical_category (guarded so re-running the cell does not add a duplicate)
if "popularity" not in numerical_categories:
    numerical_categories.append("popularity")

In [21]:
# Check both frames for missing popularity values; none found, so proceed
tuple(frame.popularity.isna().any() for frame in (train_df, test_df))

(False, False)

# Feature 5. production_countries

In [22]:
# Add production_countries_list to categorical_category (guarded so re-running the cell does not add a duplicate)
if "production_countries_list" not in categorical_categories:
    categorical_categories.append("production_countries_list")

In [23]:
import json
from ast import literal_eval
# Extract the country codes (iso_3166_1) from each production_countries string
def get_production_countries(production_string):
    """Return the country codes encoded in ``production_string`` as a tuple.

    Args:
        production_string: String holding a Python-literal list of country
            dicts, e.g. ``"[{'iso_3166_1': 'US', 'name': '...'}]"``, or a
            missing value (NaN/None).

    Returns:
        Tuple with the ``"iso_3166_1"`` entry of every dict, in the original
        order; an empty tuple when ``production_string`` is missing.
    """
    if pd.isna(production_string):
        # Same type as the normal path, so callers always receive a tuple.
        return ()
    return tuple(country.get("iso_3166_1") for country in literal_eval(production_string))

# Parse the raw production_countries string of every training row
train_df["production_countries_list"] = train_df["production_countries"].apply(get_production_countries)

In [24]:
# Parse the raw production_countries string of every test row
test_df["production_countries_list"] = test_df["production_countries"].apply(get_production_countries)

In [25]:
# Collect every production country code that occurs in either the train or the test frame
all_production_countries_set = {
    country
    for frame in (train_df, test_df)
    for production_countries_list in frame.production_countries_list
    for country in production_countries_list
}
# all_production_countries_set

In [26]:
# Guarded so re-running this cell does not queue the column twice
if "production_countries" not in cols_to_drop_after_preprocessing:
    cols_to_drop_after_preprocessing.append("production_countries")

# Feature 5. Release date needs to be converted to age feature

In [27]:
# Add age to numerical category (guarded so re-running the cell does not add a duplicate)
if "age" not in numerical_categories:
    numerical_categories.append("age")

In [28]:
from datetime import datetime

# strptime's %y maps 00-68 to 2000-2068, so a release from e.g. 1950 would parse
# as 2050 and produce a negative age. Any parsed year later than the current year
# is therefore moved back one century.
# NOTE(review): if the latest release year in the dataset is known, use it as the
# cutoff instead of the current year.
current_year = datetime.now().year  # evaluated once instead of once per row
train_release_year = train_df.release_date.apply(lambda x : datetime.strptime(x, '%m/%d/%y').year)
train_release_year = train_release_year.mask(train_release_year > current_year, train_release_year - 100)
train_df["age"] = current_year - train_release_year

In [29]:
# Compute the age feature for the test dataframe.
# strptime's %y maps 00-68 to 2000-2068, so any parsed year later than the current
# year is moved back one century (otherwise old releases get a negative age).
current_year = datetime.now().year  # evaluated once instead of once per row
test_release_year = test_df.release_date.apply(
    lambda x : datetime.strptime(x, '%m/%d/%y').year if not pd.isna(x) else np.nan
)
test_release_year = test_release_year.mask(test_release_year > current_year, test_release_year - 100)
# Rows without a release date fall back to the mean training age.
test_df["age"] = (current_year - test_release_year).fillna(train_df.age.mean())

In [30]:
# Guarded so re-running this cell does not queue the column twice
if "release_date" not in cols_to_drop_after_preprocessing:
    cols_to_drop_after_preprocessing.append("release_date")

# Feature 6. Add runtime to numerical categories

In [31]:
# Guarded so re-running this cell does not register runtime twice
if "runtime" not in numerical_categories:
    numerical_categories.append("runtime")

# Drop the columns not required

In [32]:
# Remove the raw source columns from both frames in place
for frame in (train_df, test_df):
    frame.drop(columns=cols_to_drop_after_preprocessing, inplace=True)

In [33]:
# info() prints its report and returns None, hence the (None, None) cell value
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     3000 non-null   float64
 1   original_language          3000 non-null   object 
 2   popularity                 3000 non-null   float64
 3   runtime                    2998 non-null   float64
 4   revenue                    3000 non-null   int64  
 5   genres_list                3000 non-null   object 
 6   production_countries_list  3000 non-null   object 
 7   age                        3000 non-null   int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     4398 non-null   floa

(None, None)

In [34]:
# Show the feature names collected so far: categorical first, then numerical
categorical_categories, numerical_categories

(['genres_list', 'original_language', 'production_countries_list'],
 ['budget', 'popularity', 'age', 'runtime'])

# Create a pipeline to scale data

In [35]:
from sklearn.preprocessing import StandardScaler
# Scaler instance applied to the numerical feature columns
scaler = StandardScaler()

In [36]:
# Scale the numerical categories.
# Fit the scaler on the training data only and reuse those statistics for the
# test data, so both frames are transformed identically.
train_numerical_data = scaler.fit_transform(train_df[numerical_categories])
test_numerical_data = scaler.transform(test_df[numerical_categories])

In [42]:
# One Hot Encode the categorical categories
# NOTE(review): MultiLabelBinarizer treats each sample as an iterable of labels, so the
# plain-string original_language column would be split into single characters.
# TODO confirm it is encoded separately from the tuple-valued *_list columns.
from sklearn.preprocessing import MultiLabelBinarizer
multilabelbinarizer = MultiLabelBinarizer()

In [46]:
# Display the names of the categorical feature columns
categorical_categories

['genres_list', 'original_language', 'production_countries_list']

array([7, 7, 7, ..., 7, 7, 7])