In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

## Standard Statsmodels import
import statsmodels.api as sm

## fixing random for lesson generation
np.random.seed(321)

## load data
df = pd.read_csv('Data/tmdb_results_combined.csv.gz')
df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,,,,,,,,,,,...,,,,,,,,,,
1,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,"After falling prey to underworld, four friends...",...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.wkw-inthemoodforlove.com/,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.112,2009.0,PG


In [2]:
df.shape

(53019, 25)

In [3]:
# Delete unnecessary columns

## Other cleaning steps we could perform.
#     pull out genres
#     pull out collection name from or treat them as booleans
#     pull out date column release_data 
drop_cols = ['backdrop_path','original_title','overview',
                 'poster_path','tagline','id','homepage', 'status',
                 'production_countries','video','spoken_languages',
            'original_language', 'genres', 'title', 'production_companies', 'release_date']
df = df.drop(columns=drop_cols)
df.dropna(subset=['revenue'], inplace=True)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,popularity,revenue,runtime,vote_average,vote_count,certification
1,0.0,,10000000.0,3.466,0.0,86.0,5.5,22.0,
2,0.0,,0.0,2.578,0.0,100.0,5.1,8.0,
3,0.0,,0.0,1.749,0.0,152.0,4.0,1.0,
4,0.0,,150000.0,22.355,12854953.0,99.0,8.112,2009.0,PG
5,0.0,,0.0,3.732,0.0,99.0,6.9,47.0,R


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53018 entries, 1 to 53018
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  53018 non-null  float64
 1   belongs_to_collection  3539 non-null   object 
 2   budget                 53018 non-null  float64
 3   popularity             53018 non-null  float64
 4   revenue                53018 non-null  float64
 5   runtime                53018 non-null  float64
 6   vote_average           53018 non-null  float64
 7   vote_count             53018 non-null  float64
 8   certification          13343 non-null  object 
dtypes: float64(7), object(2)
memory usage: 4.0+ MB


# Perform train/test split


In [5]:
y = df['revenue']
X = df.drop(columns=['revenue'])
X_train,X_test, y_train, y_test = train_test_split(X,y, random_state=321)

# Preprocess out data before Modeling

In [6]:
# Make a column selectors.
cat_selector = make_column_selector(dtype_include='object')
cat_cols = cat_selector(X_train)

num_selector = make_column_selector(dtype_include='number')
num_cols = num_selector(X_train)

In [7]:
# Instantiate transformations

# Scaler
scaler = StandardScaler()

# Imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')

# One Hot Encoding
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [8]:
# Make pipelines 
num_pipe = make_pipeline(num_imputer, scaler)
cat_pipe = make_pipeline(cat_imputer, ohe)

# Tuple the pipelines and column selectors
num_tuple = (num_pipe, num_selector)
cat_tuple = (cat_pipe, cat_selector)

In [9]:
# Put everything in a column transformer
preprocessor = make_column_transformer(cat_tuple, num_tuple, remainder='passthrough', verbose_feature_names_out=False)

In [10]:
preprocessor.fit(X_train, y_train)

# Preprocessing for statsmodels Linear Regression

In [11]:
# # Lets first extract the column names.
# final_features = []

# # Slice the pipeline to extract the One Hot Encoder
# ohe_step = preprocessor.named_transformers_['pipeline-1'][1]

# # Now, get One Hot Encoder feature names
# cat_features = ohe_step.get_feature_names_out(cat_cols)

# # Add the One Hot Enocoded column names to the list of feature names
# final_features.extend(cat_features)
# # Add the numeric column names to the list of feature names
# final_features.extend(num_cols)


# In the new sklearn we dont need to follow the steps above
final_features = preprocessor.get_feature_names_out()

In [12]:
# Create our dataframes with the column names that we just extracted

X_train_df = pd.DataFrame(preprocessor.transform(X_train), columns=final_features, index=X_train.index)

X_test_df = pd.DataFrame(preprocessor.transform(X_test), columns=final_features, index=X_test.index)


In [13]:
# Add constant column
X_train_df = sm.add_constant(X_train_df,has_constant='add', prepend=False)
X_test_df = sm.add_constant(X_test_df,has_constant='add', prepend=False)
display("Train data ", X_train_df.head(2), 'Test data ', X_test_df.head(2))

'Train data '

Unnamed: 0,belongs_to_collection_MISSING,"belongs_to_collection_{'id': 10, 'name': 'Star Wars Collection', 'poster_path': '/gq5Wi7i4SF3lo4HHkJasDV95xI9.jpg', 'backdrop_path': '/d8duYyyC9J5T825Hg7grmaabfxQ.jpg'}","belongs_to_collection_{'id': 1003508, 'name': 'XConfessions Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1004041, 'name': 'Boys Briefs Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1004047, 'name': 'Boys on Film Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 100415, 'name': 'Thumbs! Collection', 'poster_path': '/6ro7ZB5AXX2AhMwR5csuF1Wunb4.jpg', 'backdrop_path': '/94mtvEkxIXYFLcJwFATbx0cbAFs.jpg'}","belongs_to_collection_{'id': 1004686, 'name': 'The Wicked One Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1004692, 'name': 'Inhuman kiss', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1006, 'name': 'Austin Powers Collection', 'poster_path': '/1PkGnyFwRyapmbuILIOXXxiSh7Y.jpg', 'backdrop_path': '/3QJMI8nqQcwU5JBPy26m8TUXtTN.jpg'}","belongs_to_collection_{'id': 1007288, 'name': 'A Bread Factory Collection', 'poster_path': None, 'backdrop_path': None}",...,certification_PG-13,certification_R,certification_UR,adult,budget,popularity,runtime,vote_average,vote_count,const
31801,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.04318,-0.172949,-0.197079,-0.048942,2.07614,-0.19222,1.0
39305,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,23.158969,-0.20139,-0.197079,-2.878404,-1.785893,-0.193056,1.0


'Test data '

Unnamed: 0,belongs_to_collection_MISSING,"belongs_to_collection_{'id': 10, 'name': 'Star Wars Collection', 'poster_path': '/gq5Wi7i4SF3lo4HHkJasDV95xI9.jpg', 'backdrop_path': '/d8duYyyC9J5T825Hg7grmaabfxQ.jpg'}","belongs_to_collection_{'id': 1003508, 'name': 'XConfessions Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1004041, 'name': 'Boys Briefs Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1004047, 'name': 'Boys on Film Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 100415, 'name': 'Thumbs! Collection', 'poster_path': '/6ro7ZB5AXX2AhMwR5csuF1Wunb4.jpg', 'backdrop_path': '/94mtvEkxIXYFLcJwFATbx0cbAFs.jpg'}","belongs_to_collection_{'id': 1004686, 'name': 'The Wicked One Collection', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1004692, 'name': 'Inhuman kiss', 'poster_path': None, 'backdrop_path': None}","belongs_to_collection_{'id': 1006, 'name': 'Austin Powers Collection', 'poster_path': '/1PkGnyFwRyapmbuILIOXXxiSh7Y.jpg', 'backdrop_path': '/3QJMI8nqQcwU5JBPy26m8TUXtTN.jpg'}","belongs_to_collection_{'id': 1007288, 'name': 'A Bread Factory Collection', 'poster_path': None, 'backdrop_path': None}",...,certification_PG-13,certification_R,certification_UR,adult,budget,popularity,runtime,vote_average,vote_count,const
5876,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.04318,-0.202269,-0.132055,0.076812,0.376846,-0.188879,1.0
30913,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.04318,-0.202269,-0.197079,-0.520519,-1.785893,-0.193056,1.0


In [14]:
# sklearn model

# Instantiate model
model = LinearRegression(fit_intercept=False)

# fit the model
model.fit(X_train_df, y_train)

# Define predictions
train_preds = model.predict(X_train_df)
test_preds = model.predict(X_test_df)

# find r-square
print('Training r2:', r2_score(y_train, train_preds))
print('Testing r2:', r2_score(y_test, test_preds))
# find mse
print('\nTraining MSE:', mean_squared_error(y_train, train_preds))
print('Testing MSE:', mean_squared_error(y_test, test_preds))

Training r2: 0.9089960143929974
Testing r2: -189753903159.78073
Training MSE: 410717543641828.1
Testing MSE: 5.2192592135059664e+26


In [15]:
# statsmodels

# instantiate model
model = sm.OLS(y_train, X_train_df, hasconst=True)

# We need to save the output of our .fit as a new variable
result = model.fit()

# Evaluate - get model performance metrics
result.summary()

# store test predictions 
test_preds = result.predict(X_test_df)

# We can then use any of the regression metrics from sklearn.metric's module
test_r2 = r2_score(y_test, test_preds)
test_mse = mean_squared_error(y_test, test_preds)

print(f'The testing r-square value is {test_r2} and the testing mean squared error is {test_mse}.')

The testing r-square value is -12729225763491.494 and the testing mean squared error is 3.5012259426681075e+28.
