In [1]:
# https://www.kaggle.com/c/tmdb-box-office-prediction/data?select=train.csv

In [40]:
import pandas as pd
import numpy as np
import json

In [41]:
# Read the train and test dataframe

In [42]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

# Analyse the data

In [43]:
train_df.shape, test_df.shape

((3000, 23), (4398, 22))

In [44]:
# train_df.info(), test_df.info()

In [45]:
# Analyse and add other columns accordingly 
# Remove status and impute others
drop_cols = ["id", "belongs_to_collection", "homepage", "imdb_id", "original_title", "overview", "poster_path", "production_companies", "spoken_languages", "status", "Keywords", "cast", "crew", "tagline", "title"]
cols_to_drop_after_preprocessing = []
len(drop_cols)

15

In [46]:
numerical_categories = []
categorical_categories = []

In [47]:
train_df.drop(drop_cols, axis = 1, inplace=True)
test_df.drop(drop_cols, axis = 1, inplace=True)
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                3000 non-null   int64  
 1   genres                2993 non-null   object 
 2   original_language     3000 non-null   object 
 3   popularity            3000 non-null   float64
 4   production_countries  2945 non-null   object 
 5   release_date          3000 non-null   object 
 6   runtime               2998 non-null   float64
 7   revenue               3000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4398 non-null   int64  
 1   genres                4382 non-null   object 
 2   origi

(None, None)

# Feature 1. Budget : Put mean of budget if budget is empty

In [48]:
# Add budget to numerical category
numerical_categories.append("budget")

In [49]:
# Use mean budget if budget is empty/0
budget_mean_train = train_df[train_df.budget != 0].budget.mean()
train_df.budget = train_df.budget.apply(lambda x : budget_mean_train if x == 0 else x)

In [50]:
# Do the same for test data
budget_mean_test = test_df[test_df.budget != 0].budget.mean()
test_df.budget = test_df.budget.apply(lambda x : budget_mean_test if x == 0 else x)

# Feature 2. Genres

In [51]:
# Add genres_list to categorical_category
categorical_categories.append("genres_list")

In [52]:
# Create a list of genres instead of the string

def get_genres(genres_string):
    """Returns tuple of genres generated from the passed genres string"""
    all_genres = []
    if pd.isna(genres_string):
        return []
#     print(f"processing {type(genres_string)}")
    for genre in json.loads(genres_string.replace("'", "\"")):
        all_genres.append(genre.get("name"))
    return tuple(all_genres)

train_df["genres_list"] = train_df.genres.apply(lambda x : get_genres(x))

In [53]:
# Perform the same operation on test dataframe
test_df["genres_list"] = test_df.genres.apply(lambda x : get_genres(x))

In [54]:
# Create a complete list of all the possible genres
all_genres_set = set()
for genres_list in train_df.genres_list:
    for genre in genres_list:
        all_genres_set.add(genre)

for genres_list in test_df.genres_list:
    for genre in genres_list:
        all_genres_set.add(genre)
        
# all_genres_set

In [55]:
cols_to_drop_after_preprocessing.append("genres")

# Feature 3. Original_language

In [56]:
# Add original_language to categorical_category
categorical_categories.append("original_language")

In [57]:
# No empty language, so proceed
train_df.original_language.isna().any(), test_df.original_language.isna().any()

(False, False)

# Feature 4. Popularity

In [58]:
# Add popularity to numerical_category
numerical_categories.append("popularity")

In [59]:
# No empty popularity, so proceed
train_df.popularity.isna().any(), test_df.popularity.isna().any()

(False, False)

# Feature 5. production_countries

In [60]:
# Add production_countries_list to categorical_category
categorical_categories.append("production_countries_list")

In [61]:
import json
from ast import literal_eval
# Creat complete list of the possible genres and then one hot encode it
def get_production_countries(production_string):
    all_production_countries = []
    if pd.isna(production_string):
        return []
    for country in literal_eval(production_string):
        all_production_countries.append(country.get("iso_3166_1"))
    return tuple(all_production_countries)

train_df["production_countries_list"] = train_df.production_countries.apply(lambda x : get_production_countries(x))

In [62]:
test_df["production_countries_list"] = test_df.production_countries.apply(lambda x : get_production_countries(x))

In [63]:
# Create a complete list of all the possible production countries
all_production_countries_set = set()
for production_countries_list in train_df.production_countries_list:
    for country in production_countries_list:
        all_production_countries_set.add(country)
        
for production_countries_list in test_df.production_countries_list:
    for country in production_countries_list:
        all_production_countries_set.add(country)
all_production_countries_set

{'AE',
 'AF',
 'AO',
 'AR',
 'AT',
 'AU',
 'BA',
 'BE',
 'BF',
 'BG',
 'BO',
 'BR',
 'BS',
 'BW',
 'CA',
 'CD',
 'CH',
 'CI',
 'CL',
 'CM',
 'CN',
 'CO',
 'CR',
 'CS',
 'CY',
 'CZ',
 'DE',
 'DK',
 'DO',
 'DZ',
 'EC',
 'ES',
 'ET',
 'FI',
 'FR',
 'GB',
 'GE',
 'GH',
 'GR',
 'HK',
 'HR',
 'HU',
 'ID',
 'IE',
 'IL',
 'IN',
 'IR',
 'IS',
 'IT',
 'JM',
 'JO',
 'JP',
 'KH',
 'KR',
 'KZ',
 'LI',
 'LK',
 'LT',
 'LU',
 'MA',
 'MC',
 'MK',
 'ML',
 'MN',
 'MR',
 'MT',
 'MX',
 'MY',
 'NA',
 'NL',
 'NO',
 'NZ',
 'PE',
 'PH',
 'PK',
 'PL',
 'PR',
 'PS',
 'PT',
 'PY',
 'QA',
 'RO',
 'RS',
 'RU',
 'SA',
 'SE',
 'SG',
 'SI',
 'SN',
 'TH',
 'TN',
 'TR',
 'TW',
 'UA',
 'US',
 'UY',
 'VE',
 'ZA'}

In [64]:
cols_to_drop_after_preprocessing.append("production_countries")

# Feature 5. Release date needs to be converted to age feature

In [66]:
# Add age to numerical category
numerical_categories.append("age")

In [67]:
from datetime import datetime
train_df["age"] = train_df.release_date.apply(lambda x : datetime.now().year - datetime.strptime(x, '%m/%d/%y').year)

In [68]:
# Perform the same on test dataframe
test_df["age"] = test_df.release_date.apply(lambda x : datetime.now().year - datetime.strptime(x, '%m/%d/%y').year if pd.isna(x) is False else train_df.age.mean())

In [69]:
cols_to_drop_after_preprocessing.append("release_date")

# Feature 6. Add runtime to numerical categories

In [70]:
numerical_categories.append("runtime")
train_df_runtime_mean = train_df.runtime.mean()
train_df.runtime.fillna(value=train_df_runtime_mean, inplace=True)

# Drop the columns not required

In [71]:
train_df.drop(cols_to_drop_after_preprocessing, axis=1, inplace=True)
test_df.drop(cols_to_drop_after_preprocessing, axis=1, inplace=True)

In [72]:
train_df.info(), test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     3000 non-null   float64
 1   original_language          3000 non-null   object 
 2   popularity                 3000 non-null   float64
 3   runtime                    3000 non-null   float64
 4   revenue                    3000 non-null   int64  
 5   genres_list                3000 non-null   object 
 6   production_countries_list  3000 non-null   object 
 7   age                        3000 non-null   int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   budget                     4398 non-null   floa

(None, None)

In [73]:
categorical_categories, numerical_categories

(['genres_list', 'original_language', 'production_countries_list'],
 ['budget', 'popularity', 'age', 'runtime'])

# Create a pipeline to scale data

In [74]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [75]:
# Scale the numerical categories
train_numerical_df = pd.DataFrame(data = scaler.fit_transform(train_df[numerical_categories]), columns=numerical_categories)
test_numerical_df = pd.DataFrame(data = scaler.fit_transform(test_df[numerical_categories]), columns=numerical_categories)

In [76]:
# One Hot Encode the categorical categories
from sklearn.preprocessing import MultiLabelBinarizer

In [77]:
def get_categorical_df(df):
    combined_categorical_df = pd.DataFrame()
    for category in categorical_categories:
        multilabelbinarizer = MultiLabelBinarizer()
        data = multilabelbinarizer.fit_transform(df[category])
        columns = multilabelbinarizer.classes_
        new_df = pd.DataFrame(data, columns=columns)
        combined_categorical_df = pd.concat([combined_categorical_df, new_df], axis=1)
    return combined_categorical_df

In [78]:
train_categorical_df = get_categorical_df(train_df)
test_categorical_df = get_categorical_df(test_df)

In [83]:
# Combine Numerical and Categorical Dataframes to form a single unique dataframe
train_final_df = pd.concat([train_numerical_df, train_categorical_df, train_df["revenue"]], axis=1)
test_final_df = pd.concat([test_numerical_df, test_categorical_df], axis=1)

In [84]:
# Remove cols not present in both
drop_from_train = list(set(train_final_df.columns) - set(test_final_df.columns))
drop_from_test = list(set(test_final_df.columns) - set(train_final_df.columns))

drop_from_train.remove("revenue")

train_final_df.drop(drop_from_train, axis=1, inplace=True)
test_final_df.drop(drop_from_test, axis=1, inplace=True)

print(train_final_df.shape, test_final_df.shape)

(3000, 104) (4398, 103)


In [85]:
# Split the dataframe
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_final_df.drop("revenue", axis=1, inplace=False), train_final_df["revenue"], test_size=0.2, shuffle=True, random_state=42)

In [86]:
train_final_df.shape

(3000, 104)

# Create a keras model for predictions

In [87]:
from keras.layers import Dense, Dropout
from keras.models import Sequential

In [88]:
train_final_df.isna().any()

budget        False
popularity    False
age           False
runtime       False
Action        False
              ...  
TW            False
UA            False
US            False
ZA            False
revenue       False
Length: 104, dtype: bool

In [89]:
# Define a keras model
model = Sequential()

model.add(Dense(512, input_dim=len(X_train.columns), activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(256, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(128, activation="relu"))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(64, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(32, activation="relu"))
model.add(Dense(1, activation="linear"))

In [92]:
from keras.optimizers import RMSprop
# opt = SGD(lr=0.01, momentum=0.9)
model.compile(loss='mean_squared_logarithmic_error', optimizer=RMSprop(learning_rate=0.001), metrics=["mse"])

In [93]:
history = model.fit(X_train.values, y_train.values, validation_data=(X_test.values, y_test.values), epochs=100, batch_size=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [94]:
model.evaluate(X_test, y_test)



[6.666798114776611, 5893703910555648.0]

In [103]:
predictions = model.predict(test_final_df.values)
predictions = [prediction[0] for prediction in predictions]
predictions

[99652940.0,
 2645883.8,
 40856212.0,
 48840.242,
 9353804.0,
 47456350.0,
 4797987.5,
 29444856.0,
 58760828.0,
 400364160.0,
 2826854.2,
 232048.02,
 8505950.0,
 134316.52,
 45891044.0,
 38061.58,
 20903032.0,
 268950000.0,
 63756640.0,
 875164400.0,
 8360789.0,
 72633990.0,
 17261.773,
 7915370.0,
 134608.27,
 43141920.0,
 1409905.0,
 122215810.0,
 84215.195,
 59455976.0,
 13491682.0,
 26317124.0,
 813158.3,
 319442.9,
 1927291.2,
 26366844.0,
 144406290.0,
 103547440.0,
 374553.44,
 6830001.5,
 4906611.0,
 8409473.0,
 7705404.0,
 3364841.0,
 467028100.0,
 594450.6,
 16902408.0,
 3513949.0,
 79686960.0,
 81175520.0,
 86237.18,
 33829430.0,
 397231.12,
 1166287.2,
 48495884.0,
 31735.691,
 108041680.0,
 586551200.0,
 27.196936,
 58879640.0,
 79417640.0,
 34580656.0,
 11928641.0,
 135287340.0,
 53300990.0,
 149345180.0,
 11769642.0,
 23619522.0,
 369546560.0,
 2020619.0,
 1388324.0,
 694024300.0,
 5594496.0,
 19527.666,
 45857108.0,
 125108830.0,
 401419.62,
 39404384.0,
 84818620.0,


In [105]:
original_test_df = test_df = pd.read_csv("test.csv")
submission_df = pd.DataFrame({"id" : original_test_df.id, "revenue" : predictions})
submission_df

Unnamed: 0,id,revenue
0,3001,9.965294e+07
1,3002,2.645884e+06
2,3003,4.085621e+07
3,3004,4.884024e+04
4,3005,9.353804e+06
...,...,...
4393,7394,6.719358e+07
4394,7395,1.321240e+07
4395,7396,2.312267e+07
4396,7397,6.522742e+07


In [106]:
submission_df.to_csv("submission.csv", index=False)