In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

## Standard Statsmodels import
import statsmodels.api as sm

## fixing random for lesson generation
np.random.seed(321)

## load data
df = pd.read_csv('Data/tmdb_results_combined.csv.gz')
df.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,,,,,,,,,,,...,,,,,,,,,,
1,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,Two rural teens sing and dance their way throu...,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,Earth is in a state of constant war and two co...,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,"After falling prey to underworld, four friends...",...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.wkw-inthemoodforlove.com/,843.0,cn,花樣年華,"Hong Kong, 1962: Chow Mo-Wan and Su Li-Zhen mo...",...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.112,2009.0,PG


In [2]:
# Delete unnecessary columns
drop_cols = ['backdrop_path','original_title','overview',
                 'poster_path','tagline','id','homepage', 'status',
                 'production_countries','video','spoken_languages',
            'original_language']
df = df.drop(columns=drop_cols)
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,popularity,production_companies,release_date,revenue,runtime,title,vote_average,vote_count,certification
0,,,,,,,,,,,,,
1,0.0,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",3.466,"[{'id': 60, 'logo_path': '/1SEj4nyG3JPBSKBbFht...",2000-09-22,0.0,86.0,The Fantasticks,5.5,22.0,
2,0.0,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",2.578,"[{'id': 925, 'logo_path': '/dIb9hjXNOkgxu4kBWd...",2000-11-15,0.0,100.0,For the Cause,5.1,8.0,
3,0.0,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",1.749,[],2000-04-14,0.0,152.0,Gang,4.0,1.0,
4,0.0,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",22.355,"[{'id': 539, 'logo_path': '/iPLtePguIzOPNtAWfT...",2000-09-29,12854953.0,99.0,In the Mood for Love,8.112,2009.0,PG


# Perform train/test split


In [3]:
y = df['revenue']
X = df.drop(columns=['revenue'])
X_train,X_test, y_train, y_test = train_test_split(X,y, random_state=321)

# Preprocess out data before Modeling

In [4]:
# Make a column selectors.
cat_selector = make_column_selector(dtype_include='object')
cat_cols = cat_selector(X_train)

num_selector = make_column_selector(dtype_include='number')
num_cols = num_selector(X_train)

In [5]:
# Instantiate transformations

# Scaler
scaler = StandardScaler()

# Imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='constant', fill_value='MISSING')

# One Hot Encoding
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [6]:
# Make pipelines
num_pipe = make_pipeline(num_imputer, scaler)
cat_pipe = make_pipeline(ohe, cat_imputer)

# Tuple the pipelines and column selectors
num_tuple = (num_selector, num_pipe)
cat_tuple = (cat_selector, cat_pipe)

In [7]:
# Put everything in a column transformer
preprocessor = make_column_transformer(cat_tuple, num_tuple, remainder='passthrough')

In [8]:
preprocessor.fit(X_train, y_train)

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '<sklearn.compose._column_transformer.make_column_selector object at 0x00000293C412B3D0>' (type <class 'sklearn.compose._column_transformer.make_column_selector'>) doesn't.

# Preprocessing for statsmodels Linear Regression

In [None]:
# Lets first extract the column names.
final_features = []

# Slice the pipeline to extract the One Hot Encoder
ohe_step = preprocessor.named_transformers_['pipeline-1'][1]

# Now, get One Hot Encoder feature names
cat_features = ohe_step.get_feature_names_out(cat_cols)

# Add the One Hot Enocoded column names to the list of feature names
final_features.extend(cat_features)
# Add the numeric column names to the list of feature names
final_features.extend(num_cols)

In [None]:
# Create our dataframes with the column names that we just extracted

X_train_df = pd.DataFrame(preprocessor.transform(X_train), columns=final_features, index=X_train.index)

X_test_df = pd.DataFrame(preprocessor.transform(X_test), columns=final_features, index=X_test.index)


In [None]:
# Add constant column
X_train_df = sm.add_constant(X_train_df,has_constant='add', prepend=False)
X_test_df = sm.add_constant(X_test_df,has_constant='add', prepend=False)
display("Train data ", X_train_df.head(2), 'Test data ', X_test_df.head(2))

In [None]:
# sklearn model

# Instantiate model
model = LinearRegression(fit_intercept=False)

# fit the model
model.fit(X_train_df, y_train)

# Define predictions
train_preds = model.predict(X_train_df)
test_preds = model.predict(X_test_df)

# find r-square
print('Training r2:', r2_score(y_train, train_preds))
print('Testing r2:', r2_score(y_test, test_preds))
# find mse
print('Training MSE:', mean_squared_error(y_train, train_preds))
print('Testing MSE:', mean_squared_error(y_test, test_preds))

In [None]:
# statsmodels

# instantiate model
model = sm.OLS(y_train, X_train_df, hasconst=True)

# We need to save the output of our .fit as a new variable
result = model.fit()

# Evaluate - get model performance metrics
result.summary()

# store test predictions 
test_preds = result.predict(X_test_df)

# We can then use any of the regression metrics from sklearn.metric's module
test_r2 = r2_score(y_test, test_preds)
test_mse = mean_squared_error(y_test, test_preds)

print(f'The testing r-square value is {test_r2} and the testing mean squared error is {test_mse}.')