# Table of contents
1. [Libraries](#libraries)
2. [Question](#quesion)
3. [Data Preparation](#preparation)
4. [Modelling](#modelling)
5. [Evaluation](#evaluation)

<h1 id ="libraries">1. Libraries </h1>

In [102]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

<h1 id = "question">2. Question </h1>

**A question:** Predict the gross of movies

**Answering this question will help us:**

<h1 id = 'preparation'>3. Data Preparation </h1>

In [91]:
data = pd.read_csv('/content/preprocessed_data.csv')

In [92]:
data['Total Gross'] = data['Worldwide Gross'] + data['Domestic Gross']

In [93]:
# Read the inflation data into a DataFrame
inflation_data = pd.read_csv('inflation_rate.csv')

# Create DataFrame for inflation data
inflation_df = pd.DataFrame(inflation_data)

data['Year'] = pd.to_datetime(data['Release Date']).dt.year

# Merge inflation data with your movie data on 'Year'
data = data.merge(inflation_df, on='Year', how='left')

# Adjust the economic columns by dividing them by the 'Unit' (inflation adjustment factor)
economic_features = ['Production Budget', 'Domestic Gross', 'Worldwide Gross', 'Total Gross']

for feature in economic_features:
    data[feature] = data[feature] * (data['Unit'] / data['Unit'].iloc[-1])

# The adjusted columns will now reflect the true values after inflation adjustment

- Apply log transformation

In [94]:
data['Production Budget'] = (data['Production Budget'] + 1).apply(np.log)
data['Worldwide Gross'] = (data['Worldwide Gross'] + 1).apply(np.log)
data['Domestic Gross'] = (data['Domestic Gross'] + 1).apply(np.log)
data['Total Gross'] = (data['Total Gross'] + 1).apply(np.log)

columns


In [95]:
data = data.drop(columns=['Cast', 'Director'])

In [96]:
# data['Mean UserScore'] = (data['Tomatoes UserScore']+data['Meta UserScore'])/2
# data['Mean CriticScore'] = (data['Tomatoes CriticScore']+data['Metascore'])/2

data = data.drop(columns = 'Year')

data['Year'] = pd.to_datetime(data['Release Date']).dt.year
data['Month'] = pd.to_datetime(data['Release Date']).dt.month

In [97]:
data = data.drop(columns = 'Release Date')
data = data.drop(columns = 'Year')

In [98]:
data.columns

Index(['Title', 'Tomatoes CriticScore', 'Tomatoes UserScore', 'Genre',
       'Rating', 'Studio', 'Production Budget', 'Domestic Gross',
       'Worldwide Gross', 'Metascore', 'Meta UserScore', 'Total Gross', 'Rate',
       'Unit', 'Month'],
      dtype='object')

In [99]:
data.head(5)

Unnamed: 0,Title,Tomatoes CriticScore,Tomatoes UserScore,Genre,Rating,Studio,Production Budget,Domestic Gross,Worldwide Gross,Metascore,Meta UserScore,Total Gross,Rate,Unit,Month
0,L.A. Confidential,99.0,94.0,"['Crime', 'Drama']",R,Warner Home Vídeo,17.98085,18.593793,19.263409,91.0,86.0,19.67678,1.7,1.949,9
1,The Godfather,97.0,98.0,"['Crime', 'Drama']",R,Paramount Pictures,17.702056,20.672324,21.354595,100.0,93.0,21.763699,3.4,7.374,3
2,Casablanca,99.0,95.0,['Drama'],PG,Warner Bros. Pictures,16.686983,18.996521,19.006477,100.0,87.0,19.694659,3.0,18.003,1
3,Parasite,99.0,90.0,"['Comedy', 'Mystery & Thriller', 'Drama']",R,Neon,16.428409,17.937554,19.494757,97.0,89.0,19.685976,2.3,1.224,5
4,Top Gun: Maverick,96.0,99.0,"['Action', 'Adventure']",PG-13,Paramount Pictures,18.951309,20.393,21.104712,78.0,84.0,21.504027,6.5,1.059,5


In [115]:
numerical_features = ['Tomatoes CriticScore', 'Tomatoes UserScore',
                      'Production Budget', 'Metascore', 'Meta UserScore']
categorical_features = ['Genre', 'Rating', 'Studio', 'Month']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

feature_selector = SelectKBest(score_func=f_regression, k=15)

xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3,
                   random_state=42, objective='reg:squarederror')

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('model', xgb)
])

X = data[numerical_features+categorical_features]
y = data['Total Gross']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# 8. Đánh giá kết quả
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MAE:', mae)
print('MSE:', mse)
print('R2:', r2)

MAE: 1.9210003880601692
MSE: 11.822507957482562
R2: 0.5252887964233888


In [118]:
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = {
    'model__n_estimators': [50, 100, 200],         # Số lượng cây
    'model__max_depth': [3, 6, 9],                 # Độ sâu tối đa của cây
    'model__learning_rate': [0.01, 0.1, 0.2],     # Tốc độ học
    'model__subsample': [0.8, 1.0],                # Tỷ lệ mẫu
    'model__colsample_bytree': [0.8, 1.0]          # Tỷ lệ cột
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = best_model.score(X_test, y_test)
print(f'Mean Squared Error: {mse}')
print(f'R²: {r2}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Mean Squared Error: 11.877197690462554
R²: 0.5230928301309927


In [120]:
from sklearn.ensemble import GradientBoostingRegressor

numerical_features = ['Tomatoes CriticScore', 'Tomatoes UserScore',
                      'Production Budget', 'Metascore', 'Meta UserScore']
categorical_features = ['Genre', 'Rating', 'Studio', 'Month']

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

feature_selector = SelectKBest(score_func=f_regression, k=15)

model = GradientBoostingRegressor(random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', feature_selector),
    ('model', model)
])

X = data[numerical_features+categorical_features]
y = data['Total Gross']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# 8. Đánh giá kết quả
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('MAE:', mae)
print('MSE:', mse)
print('R2:', r2)

MAE: 1.9395402531587906
MSE: 11.920507402381471
R2: 0.5213538077894113


In [124]:
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = {
    'model__n_estimators': [50, 100, 200],          # Số lượng cây trong mô hình
    'model__max_depth': [3, 5, 7],                  # Độ sâu của mỗi cây
    'model__learning_rate': [0.01, 0.05, 0.1],      # Tốc độ học
    'model__subsample': [0.8, 1.0],                 # Tỷ lệ mẫu
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = best_model.score(X_test, y_test)
print(f'Mean Squared Error: {mse}')
print(f'R²: {r2}')

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Mean Squared Error: 12.114641458467931
R²: 0.5135587095114961
