In [1]:
# https://www.kaggle.com/c/tmdb-box-office-prediction/data?select=train.csv

In [114]:
import ast

import numpy as np
import pandas as pd

In [115]:
# Read the train and test dataframes

In [116]:
# Load both splits from the working directory, then list the training columns.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv")
)
train_df.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

# Analyse the data

In [117]:
# (rows, columns) of the train and test splits, in that order.
tuple(split.shape for split in (train_df, test_df))

((3000, 23), (4398, 22))

In [118]:
# train_df.info(), test_df.info()

In [126]:
# Columns dropped before feature engineering; extend this list as further
# analysis warrants. Original note: remove `status`, impute the others.
drop_cols = [
    "id",
    "belongs_to_collection",
    "homepage",
    "imdb_id",
    "original_title",
    "overview",
    "poster_path",
    "production_companies",
    "spoken_languages",
    "status",
    "Keywords",
    "cast",
    "crew",
    "tagline",
    "title",
]

# Raw columns that become redundant once their derived feature exists;
# filled in by later cells.
cols_to_drop_after_preprocessing = []

len(drop_cols)

15

In [129]:
# Feature-name registries, one per feature type; later cells append to them
# as each column is reviewed.
numerical_categories, categorical_categories = [], []

In [120]:
# Drop the unused columns from both splits.
# Re-binding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: a second execution no longer raises KeyError
# for columns that are already gone.
train_df = train_df.drop(columns=drop_cols, errors="ignore")
test_df = test_df.drop(columns=drop_cols, errors="ignore")

# info() prints its report and returns None, so call it as a statement rather
# than building a (None, None) tuple that the cell would echo as output.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                3000 non-null   int64  
 1   genres                2993 non-null   object 
 2   original_language     3000 non-null   object 
 3   popularity            3000 non-null   float64
 4   production_countries  2945 non-null   object 
 5   release_date          3000 non-null   object 
 6   runtime               2998 non-null   float64
 7   revenue               3000 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 187.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4398 non-null   int64  
 1   genres                4382 non-null   object 
 2   origi

(None, None)

In [121]:
# Numeric columns used as features without any parsing or encoding step.
useful_no_preprocessing_cols = [
    "budget",
    "popularity",
    "runtime",
]

In [131]:
# Feature 1: budget — an empty budget is replaced by the mean budget.
# Guarded so that re-running this cell does not register the column twice.
if "budget" not in numerical_categories:
    numerical_categories.append("budget")

In [123]:
# Mean over the rows that report a non-zero budget; a budget of 0 is treated
# as missing and replaced by that mean.
budget_mean_train = train_df.loc[train_df["budget"] != 0, "budget"].mean()
train_df["budget"] = train_df["budget"].apply(
    lambda budget: budget if budget != 0 else budget_mean_train
)

In [124]:
# Impute zero (missing) test budgets with the mean learned from the TRAINING
# split, so both splits receive the same transformation and no statistic
# computed on the test set leaks into its own features.
budget_mean_test = budget_mean_train  # name kept so later cells that read it still work
test_df["budget"] = test_df["budget"].apply(
    lambda budget: budget if budget != 0 else budget_mean_test
)

In [125]:
# 2. Genres
import json
# Create the complete list of possible genres and then one-hot encode it
def get_genres(genres_string):
    """Extract the genre names from one raw ``genres`` cell.

    The raw cell is the string form of a list of dicts, e.g.
    ``"[{'id': 35, 'name': 'Comedy'}]"``.

    Parameters
    ----------
    genres_string : str or NaN
        Raw value from the ``genres`` column; missing values are accepted.

    Returns
    -------
    tuple
        The ``name`` entry of each dict, in the original order. An empty
        tuple is returned for a missing value, so every result has the same
        hashable type.
    """
    if pd.isna(genres_string):
        return ()
    try:
        # The cell is written as a Python literal (single-quoted strings), so
        # parse it as one; this also copes with names containing an apostrophe,
        # which the quote-swapping JSON route cannot handle.
        records = ast.literal_eval(genres_string)
    except (ValueError, SyntaxError):
        # Fall back to the JSON route for inputs that are not valid Python
        # literals (e.g. ones containing null/true/false).
        records = json.loads(genres_string.replace("'", "\""))
    return tuple(record.get("name") for record in records)
# Derive, for every training row, the collection of genre names parsed from
# the raw `genres` string.
train_df["genres_list"] = train_df["genres"].apply(get_genres)

train_df

Unnamed: 0,budget,genres,original_language,popularity,production_countries,release_date,runtime,revenue,genres_list
0,1.400000e+07,"[{'id': 35, 'name': 'Comedy'}]",en,6.575393,"[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,12314651,"(Comedy,)"
1,4.000000e+07,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",en,8.248895,"[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,95149435,"(Comedy, Drama, Family, Romance)"
2,3.300000e+06,"[{'id': 18, 'name': 'Drama'}]",en,64.299990,"[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,13092000,"(Drama,)"
3,1.200000e+06,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",hi,3.174936,"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,16000000,"(Thriller, Drama)"
4,3.089305e+07,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",ko,1.148070,"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,3923970,"(Action, Thriller)"
...,...,...,...,...,...,...,...,...,...
2995,3.089305e+07,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",en,9.853270,"[{'iso_3166_1': 'US', 'name': 'United States o...",4/22/94,102.0,1596687,"(Comedy, Romance)"
2996,3.089305e+07,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",sv,3.727996,"[{'iso_3166_1': 'DK', 'name': 'Denmark'}, {'is...",3/28/13,102.0,180590,"(Drama, Music)"
2997,6.500000e+07,"[{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...",en,14.482345,"[{'iso_3166_1': 'US', 'name': 'United States o...",10/11/96,120.0,89456761,"(Crime, Action, Mystery, Thriller)"
2998,4.200000e+07,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",en,15.725542,"[{'iso_3166_1': 'US', 'name': 'United States o...",1/16/04,90.0,171963386,"(Comedy, Romance)"


In [130]:
# Feature 2: genres (categorical), represented by the derived `genres_list` column.
# Guarded so that re-running this cell does not register the column twice.
if "genres_list" not in categorical_categories:
    categorical_categories.append("genres_list")

In [108]:
# Collect every distinct genre name that occurs in the training split.
all_genres_set = {
    genre_name
    for movie_genres in train_df["genres_list"]
    for genre_name in movie_genres
}

all_genres_set

{'Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western'}

In [127]:
# Mark the raw `genres` column for removal once preprocessing is finished.
# Guarded so that re-running this cell does not add the column twice.
if "genres" not in cols_to_drop_after_preprocessing:
    cols_to_drop_after_preprocessing.append("genres")

In [128]:
# # One hot encode the genres
# from sklearn.preprocessing import OneHotEncoder
# one_hot_encoder = OneHotEncoder()
# one_hot_encoder.fit_transform(train_df["genres_list"])
# # X = pd.get_dummies(train_df)
# # X

In [133]:
# Feature 3: original_language (categorical).
# Guarded so that re-running this cell does not register the column twice.
if "original_language" not in categorical_categories:
    categorical_categories.append("original_language")

In [136]:
# Sanity check: is any original_language value missing in the training split?
train_df["original_language"].isna().any()

False

In [143]:
# Feature 4: popularity (numerical).
# Guarded so that re-running this cell does not register the column twice.
if "popularity" not in numerical_categories:
    numerical_categories.append("popularity")

In [147]:
# Sanity check: is any popularity value missing in the training split?
train_df["popularity"].isna().any()

False

In [148]:
# Feature 5: production_countries — inspect the raw values before parsing.
train_df.loc[:, "production_countries"]

0       [{'iso_3166_1': 'US', 'name': 'United States o...
1       [{'iso_3166_1': 'US', 'name': 'United States o...
2       [{'iso_3166_1': 'US', 'name': 'United States o...
3                 [{'iso_3166_1': 'IN', 'name': 'India'}]
4           [{'iso_3166_1': 'KR', 'name': 'South Korea'}]
                              ...                        
2995    [{'iso_3166_1': 'US', 'name': 'United States o...
2996    [{'iso_3166_1': 'DK', 'name': 'Denmark'}, {'is...
2997    [{'iso_3166_1': 'US', 'name': 'United States o...
2998    [{'iso_3166_1': 'US', 'name': 'United States o...
2999    [{'iso_3166_1': 'US', 'name': 'United States o...
Name: production_countries, Length: 3000, dtype: object