In [1]:
# https://www.kaggle.com/c/tmdb-box-office-prediction/data?select=train.csv

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read the train and test dataframe

In [3]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

# Analyse the data

In [4]:
train_df.shape, test_df.shape

((3000, 23), (4398, 22))

In [5]:
# train_df.info(), test_df.info()

In [6]:
# Analyse and add other columns accordingly 
# Remove status and impute others
drop_cols = ["id", "belongs_to_collection", "homepage", "imdb_id", "original_title", "overview", "poster_path", "production_companies", "spoken_languages", "status", "Keywords", "cast", "crew", "tagline", "title"]
cols_to_drop_after_preprocessing = []
len(drop_cols)

15

In [7]:
numerical_categories = []
categorical_categories = []

In [8]:
train_df.drop(drop_cols, axis = 1, inplace=True)
test_df.drop(drop_cols, axis = 1, inplace=True)
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                3000 non-null   int64  
 1   genres                2993 non-null   object 
 2   original_language     3000 non-null   object 
 3   popularity            3000 non-null   float64
 4   production_countries  2945 non-null   object 
 5   release_date          3000 non-null   object 
 6   runtime               2998 non-null   float64
 7   revenue               3000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4398 non-null   int64  
 1   genres                4382 non-null   object 
 2   origi

(None, None)

# Feature 1. Budget : Put mean of budget if budget is empty

In [9]:
# Add budget to numerical category
numerical_categories.append("budget")

In [10]:
# Use mean budget if budget is empty/0
budget_mean_train = train_df[train_df.budget != 0].budget.mean()
train_df.budget = train_df.budget.apply(lambda x : budget_mean_train if x == 0 else x)

In [11]:
# Do the same for test data
budget_mean_test = test_df[test_df.budget != 0].budget.mean()
test_df.budget = test_df.budget.apply(lambda x : budget_mean_test if x == 0 else x)

# Feature 2. Genres

In [12]:
# Add genres_list to categorical_category
categorical_categories.append("genres_list")

In [13]:
# Create a list of genres instead of the string

def get_genres(genres_string):
    """Returns tuple of genres generated from the passed genres string"""
    all_genres = []
    if pd.isna(genres_string):
        return []
#     print(f"processing {type(genres_string)}")
    for genre in json.loads(genres_string.replace("'", "\"")):
        all_genres.append(genre.get("name"))
    return tuple(all_genres)

train_df["genres_list"] = train_df.genres.apply(lambda x : get_genres(x))

In [14]:
# Perform the same operation on test dataframe
test_df["genres_list"] = test_df.genres.apply(lambda x : get_genres(x))

In [15]:
# Create a complete list of all the possible genres
all_genres_set = set()
for genres_list in train_df.genres_list:
    for genre in genres_list:
        all_genres_set.add(genre)

for genres_list in test_df.genres_list:
    for genre in genres_list:
        all_genres_set.add(genre)
        
# all_genres_set

In [16]:
cols_to_drop_after_preprocessing.append("genres")

# Feature 3. Original_language

In [17]:
# Add original_language to categorical_category
categorical_categories.append("original_language")

In [18]:
# No empty language, so proceed
train_df.original_language.isna().any(), test_df.original_language.isna().any()

(False, False)

# Feature 4. Popularity

In [19]:
# Add popularity to numerical_category
numerical_categories.append("popularity")

In [20]:
# No empty popularity, so proceed
train_df.popularity.isna().any(), test_df.popularity.isna().any()

(False, False)

# Feature 5. production_countries

In [21]:
# Add production_countries_list to categorical_category
categorical_categories.append("production_countries_list")

In [22]:
import json
from ast import literal_eval
# Creat complete list of the possible genres and then one hot encode it
def get_production_countries(production_string):
    all_production_countries = []
    if pd.isna(production_string):
        return []
    for country in literal_eval(production_string):
        all_production_countries.append(country.get("iso_3166_1"))
    return tuple(all_production_countries)

train_df["production_countries_list"] = train_df.production_countries.apply(lambda x : get_production_countries(x))

In [23]:
test_df["production_countries_list"] = test_df.production_countries.apply(lambda x : get_production_countries(x))

In [24]:
# Create a complete list of all the possible production countries
all_production_countries_set = set()
for production_countries_list in train_df.production_countries_list:
    for country in production_countries_list:
        all_production_countries_set.add(country)
        
for production_countries_list in test_df.production_countries_list:
    for country in production_countries_list:
        all_production_countries_set.add(country)
# all_production_countries_set

In [25]:
cols_to_drop_after_preprocessing.append("production_countries")

# Feature 5. Release date needs to be converted to age feature

In [26]:
# Add age to numerical category
numerical_categories.append("age")

In [27]:
from datetime import datetime
train_df["age"] = train_df.release_date.apply(lambda x : datetime.now().year - datetime.strptime(x, '%m/%d/%y').year)

In [28]:
# Perform the same on test dataframe
test_df["age"] = test_df.release_date.apply(lambda x : datetime.now().year - datetime.strptime(x, '%m/%d/%y').year if pd.isna(x) is False else train_df.age.mean())

In [29]:
cols_to_drop_after_preprocessing.append("release_date")

# Feature 6. Add runtime to numerical categories

In [30]:
numerical_categories.append("runtime")

# Drop the columns not required

In [31]:
train_df.drop(cols_to_drop_after_preprocessing, axis=1, inplace=True)
test_df.drop(cols_to_drop_after_preprocessing, axis=1, inplace=True)

In [32]:
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     3000 non-null   float64
 1   original_language          3000 non-null   object 
 2   popularity                 3000 non-null   float64
 3   runtime                    2998 non-null   float64
 4   revenue                    3000 non-null   int64  
 5   genres_list                3000 non-null   object 
 6   production_countries_list  3000 non-null   object 
 7   age                        3000 non-null   int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     4398 non-null   floa

(None, None)

In [33]:
categorical_categories, numerical_categories

(['genres_list', 'original_language', 'production_countries_list'],
 ['budget', 'popularity', 'age', 'runtime'])

# Create a pipeline to scale data

In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [67]:
# Scale the numerical categories
train_numerical_df = pd.DataFrame(data = scaler.fit_transform(train_df[numerical_categories]), columns=numerical_categories)
test_numerical_df = pd.DataFrame(data = scaler.fit_transform(test_df[numerical_categories]), columns=numerical_categories)

In [36]:
# One Hot Encode the categorical categories
from sklearn.preprocessing import MultiLabelBinarizer

In [62]:
def get_categorical_df(df):
    combined_categorical_df = pd.DataFrame()
    for category in categorical_categories:
        multilabelbinarizer = MultiLabelBinarizer()
        data = multilabelbinarizer.fit_transform(df[category])
        columns = multilabelbinarizer.classes_
        new_df = pd.DataFrame(data, columns=columns)
        combined_categorical_df = pd.concat([combined_categorical_df, new_df], axis=1)
    return combined_categorical_df

In [63]:
train_categorical_df = get_categorical_df(train_df)
test_categorical_df = get_categorical_df(test_df)

In [74]:
# Combine Numerical and Categorical Dataframes to form a single unique dataframe
train_final_df = pd.concat([train_numerical_df, train_categorical_df, train_df["revenue"]], axis=1)
test_final_df = pd.concat([test_numerical_df, test_categorical_df], axis=1)

In [80]:
# Split the dataframe
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_final_df.drop("revenue", axis=1, inplace=False), train_final_df["revenue"], test_size=0.2, shuffle=True, random_state=42)

1801      9247881
1190            7
1817        33700
251      10017322
2505      8910819
          ...    
104     149521495
2087     44030246
599      42930462
1756     88658655
1323      3337685
Name: revenue, Length: 600, dtype: int64

# Create a keras model for predictions

In [86]:
from keras.layers import Dense, Dropout
from keras.models import Sequential

In [178]:
X_train

Unnamed: 0,budget,popularity,age,runtime,Action,Adventure,Animation,Comedy,Crime,Documentary,...,SE,SG,SI,TN,TR,TW,UA,US,UY,ZA
642,0.235795,-0.052321,1.071162,0.368769,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
700,-0.171403,-0.087101,1.200376,-0.627484,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
226,0.555737,-0.065646,-0.414798,-0.129358,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1697,0.000000,-0.215085,0.812734,1.093316,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1010,0.119453,0.731068,-0.673226,0.776327,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,0.000000,-0.015873,-0.220977,0.957464,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1095,0.788421,0.514194,0.618913,1.002748,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1130,0.000000,-0.064415,1.781838,-0.536915,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1294,-0.200488,-0.349545,1.717231,1.183885,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [187]:
# Define a keras model
model = Sequential()

model.add(Dense(128, input_dim=len(X_train.columns), activation="relu"))
# model.add(Dense(128, activation="relu"))
# model.add(Dropout(0.2))

model.add(Dense(64, activation="relu"))
# model.add(Dense(256, activation="relu"))
# model.add(Dropout(0.2))

# model.add(Dense(128, activation="relu"))
# model.add(Dense(128, activation="relu"))
# model.add(Dropout(0.2))

# model.add(Dense(64, activation="relu"))
# model.add(Dense(64, activation="relu"))
# model.add(Dropout(0.2))

model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="linear"))

In [201]:
from keras.optimizers import SGD
opt = SGD(lr=0.01, momentum=0.9)
model.compile(loss='mean_squared_logarithmic_error', optimizer=opt)

In [202]:
X_train.values

array([[ 0.23579525, -0.05232095,  1.07116176, ...,  1.        ,
         0.        ,  0.        ],
       [-0.17140292, -0.08710103,  1.20037566, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.55573667, -0.06564587, -0.41479818, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.06441541,  1.78183825, ...,  1.        ,
         0.        ,  0.        ],
       [-0.2004885 , -0.34954547,  1.71723129, ...,  1.        ,
         0.        ,  0.        ],
       [-0.2004885 , -0.35248193,  0.23127136, ...,  1.        ,
         0.        ,  0.        ]])

In [203]:
# data = X_train[:, 0:len(X_train.columns) - 1]

In [204]:
model.fit(X_train.values, y_train.values, validation_data=(X_test.values, y_test.values), epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe9c2db6290>

In [200]:
int(y_train.values[0])

288752301