# Movie Revenue Prediction

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

from datetime import datetime as dt

# preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

# tuning
from sklearn.compose import make_column_transformer
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# evaluation
from sklearn import metrics


### Configurations

In [None]:
# Wide default canvas so side-by-side subplots stay readable.
plt.rcParams.update({"figure.figsize": (20, 6)})
plt.style.use("dark_background")

# Separator passed as `end=` to print(): newline, tab, a dashed rule, newline.
ENDDEL = "".join(("\n\t", "---" * 15, "\n"))

# Render floats with thousands separators and no decimals.
pd.options.display.float_format = "{:,.0f}".format


### Data Loading

In [None]:
def load_data(train):
    """Load and merge the director, voice-actor and revenue CSV files.

    Args:
        train: Truthy to read from ``Training_data/``, falsy to read from
            ``Testing_data/``.

    Returns:
        A DataFrame built by outer-joining the three tables on
        ``Movie_Title``. ``Revenue`` is numeric (every non-digit character is
        stripped before conversion) and ``Release_Date`` is a datetime whose
        year is moved back one century whenever it was parsed into a year
        later than the current one.
    """
    directory = "Training_data/" if train else "Testing_data/"
    data_dir = directory + "Prediction_data/"

    directors = pd.read_csv(data_dir + "movie-director.csv", header=0, names=["Movie_Title", "Director"])
    actors = pd.read_csv(
        data_dir + "movie-voice-actors.csv", header=0, names=["Character", "Actor", "Movie_Title"]
    )
    revenues = pd.read_csv(
        data_dir + "movies-revenue.csv",
        header=0,
        names=["Movie_Title", "Release_Date", "Genre", "MPAA_Rating", "Revenue"],
    )

    # NOTE(review): r"\D" also removes a decimal point, so this presumably
    # expects whole-number amounts such as "$1,234" -- confirm against the data.
    revenues["Revenue"] = pd.to_numeric(revenues["Revenue"].str.replace(r"\D", "", regex=True))
    revenues["Release_Date"] = pd.to_datetime(revenues["Release_Date"], format="%d-%b-%y")

    # Two-digit years can be resolved into the future; any release dated after
    # the current year is assumed to belong to the previous century.
    current_year = dt.today().year  # evaluated once instead of once per row
    revenues["Release_Date"] = revenues["Release_Date"].apply(
        # Missing dates compare False and are returned untouched.
        lambda date: date.replace(year=date.year - 100) if date.year > current_year else date
    )

    cast_and_revenue = pd.merge(actors, revenues, on="Movie_Title", how="outer")
    return pd.merge(directors, cast_and_revenue, on="Movie_Title", how="outer")


# Build the merged training table and preview its first rows.
original_data = load_data(train=True)
original_data.head()


### Data Exploration

In [None]:
# First rows of the merged training table.
original_data.head()


In [None]:
# Number of missing values in every column.
original_data.isna().sum()


In [None]:
# Summary statistics as produced by DataFrame.describe().
original_data.describe()


In [None]:
# Print the distinct values of each categorical column, one block per column.
for column_name in ("Genre", "MPAA_Rating", "Director"):
    print(original_data[column_name].unique(), end=ENDDEL)


In [None]:
# Range check on the parsed release dates (load_data already moves years that
# were parsed into the future back by one century).
# Explicit aggregations are used instead of describe(datetime_is_numeric=True)
# because that keyword was removed in pandas 2.0.
original_data.Release_Date.agg(["count", "min", "mean", "max"])


In [None]:
# Left panel: one point per row. Right panel: histogram of the same values.
_, ax = plt.subplots(1, 2)
strip_axis, hist_axis = ax
revenue = original_data.Revenue
sb.stripplot(x=revenue, ax=strip_axis)
sb.histplot(x=revenue, ax=hist_axis)
# HUGE variance!
revenue.var()


### Data Cleaning

In [None]:
# Rows without a revenue have no target value, so they are removed.
original_data = original_data.dropna(subset=["Revenue"])

# Replace missing categories with explicit placeholder labels. The frame is
# filled and re-assigned as a whole: calling fillna(inplace=True) on a column
# reached through attribute access is chained assignment, which emits warnings
# on recent pandas and does not modify the frame under Copy-on-Write.
original_data = original_data.fillna(
    {
        "MPAA_Rating": "Not Rated",
        "Director": "Other",
        "Genre": "Other",
    }
)


In [None]:
# Re-count missing values after cleaning. Only Revenue, MPAA_Rating, Director
# and Genre were handled, so other columns may still report NaNs.
original_data.isna().sum()
# Uncomment to inspect the rows that still lack a character or an actor:
# original_data[original_data.Character.isna()]
# original_data[original_data.Actor.isna()]


In [None]:
# The century correction for two-digit years is applied inside load_data, so
# the release dates only need a range check here.
# Explicit aggregations are used instead of describe(datetime_is_numeric=True)
# because that keyword was removed in pandas 2.0.
original_data.Release_Date.agg(["count", "min", "mean", "max"])


#### Remove outliers

In [None]:
# describe() is not the last expression of this cell, so its result would be
# discarded silently; print it so the summary is actually shown.
print(original_data.describe())

# Revenue distribution before outlier removal (strip plot and histogram).
_, ax = plt.subplots(1, 2)
sb.stripplot(x=original_data.Revenue, ax=ax[0])
sb.histplot(x=original_data.Revenue, ax=ax[1])


In [None]:
# Remove outliers: keep only rows strictly below the 90th revenue percentile.
revenue_cutoff = original_data.Revenue.quantile(0.9)
original_data = original_data[original_data.Revenue < revenue_cutoff]

# describe() is not the last expression of this cell, so its result would be
# discarded silently; print it so the summary is actually shown.
print(original_data.describe())

# Revenue distribution after outlier removal (strip plot and histogram).
_, ax = plt.subplots(1, 2)
sb.stripplot(x=original_data.Revenue, ax=ax[0])
sb.histplot(x=original_data.Revenue, ax=ax[1])


### Feature Engineering

In [None]:
# Shuffle the rows and rebuild a clean 0..n-1 index. The seed (the same value
# used for the other random steps in this file) keeps the row order, and with
# it every order-dependent score computed later, reproducible between runs.
original_data = original_data.sample(frac=1, random_state=7).reset_index(drop=True)


In [None]:
# Split the release date into its calendar parts.
release_dates = original_data.Release_Date
release_year = release_dates.dt.year
release_month = release_dates.dt.month
release_day = release_dates.dt.day

# Age in calendar years, measured against the current year.
movie_age = dt.today().year - release_year


#### Correlations

In [None]:
# Correlation of every date-derived feature with revenue.
revenue = original_data.Revenue
labelled_features = (
    ("movie_age correlation with revenue:", movie_age),
    ("release_day correlation with revenue:", release_day),
    ("release_month correlation with revenue:", release_month),
    ("release_year correlation with revenue:", release_year),
)
for message, feature in labelled_features:
    print(message, feature.corr(revenue), end=ENDDEL)


In [None]:
# For age thresholds 5 years apart, correlate revenue with the indicator
# "movie is at least `threshold` years old".
# NOTE(review): the first threshold equals the minimum age, so its indicator is
# constant and the correlation is presumably undefined (NaN) -- confirm.
age_corr = {
    threshold: (movie_age >= threshold).corr(original_data.Revenue)
    for threshold in range(movie_age.min(), movie_age.max() + 5, 5)
}

# The bars are correlation coefficients per threshold, not revenues, so the
# axes are labelled accordingly.
sb.barplot(x=list(age_corr.keys()), y=list(age_corr.values())).set(
    xlabel="Minimum Movie Age (years)", ylabel="Correlation with Revenue"
)
del age_corr


### Visualization

In [None]:
# Keep only the first row of every title so each movie is counted once
# FOR VISUALIZATION; original_data itself is left untouched.
first_occurrence = ~original_data.duplicated(subset=["Movie_Title"])
unique_records = original_data[first_occurrence]
unique_records.shape


#### [Genre, Director, MPAA_Rating] Frequencies

In [None]:
# One bar chart per categorical column: how many unique movies fall in each value.
_, ax = plt.subplots(1, 3)
genre_axis, director_axis, rating_axis = ax

genre_counts = unique_records.Genre.value_counts()
director_counts = unique_records.Director.value_counts()
rating_counts = unique_records.MPAA_Rating.value_counts()

genre_counts.plot(kind="bar", title="Genre", ax=genre_axis)
director_counts.plot(kind="bar", title="Director", ax=director_axis)
rating_counts.plot(kind="bar", title="MPAA_Rating", ax=rating_axis)


#### [Genre, Director, MPAA_Rating] VS Revenue

In [None]:
_, ax = plt.subplots(1, 3)

# Each panel hides the placeholder label of the column it plots. x and y are
# taken from the same filtered frame so they always describe the same rows.

# MPAA rating vs revenue, without the "Not Rated" placeholder.
rated = unique_records[unique_records.MPAA_Rating != "Not Rated"]
sb.scatterplot(ax=ax[0], x=rated.MPAA_Rating, y=rated.Revenue)

# Genre vs revenue, without the "Other" placeholder.
known_genre = unique_records[unique_records.Genre != "Other"]
sb.scatterplot(ax=ax[1], x=known_genre.Genre, y=known_genre.Revenue)

# Director vs revenue, without the "Other" placeholder.
known_director = unique_records[unique_records.Director != "Other"]
sb.scatterplot(ax=ax[2], x=known_director.Director, y=known_director.Revenue)


#### Total contribution of each feature to revenue

In [None]:
# Total revenue per category value, one bar chart per categorical column.
_, ax = plt.subplots(1, 3)

# dont consider 'other' genres
pd.pivot_table(
    unique_records[unique_records.Genre != "Other"],
    index="Genre",
    values="Revenue",
    aggfunc="sum",
).plot(kind="bar", ax=ax[0], title="Genres")

# dont consider 'other' directors
pd.pivot_table(
    unique_records[unique_records.Director != "Other"],
    index="Director",
    values="Revenue",
    aggfunc="sum",
).plot(kind="bar", ax=ax[1], title="Directors")

# dont consider 'Not Rated' movies: the placeholder lives in MPAA_Rating, so
# that is the column the filter has to test.
pd.pivot_table(
    unique_records[unique_records.MPAA_Rating != "Not Rated"],
    index="MPAA_Rating",
    values="Revenue",
    aggfunc="sum",
).plot(kind="bar", ax=ax[2], title="MPAA_Rating")


#### Release Year VS Revenue

In [None]:
# One point per unique title: release year on x, revenue on y.
sb.scatterplot(x=unique_records.Release_Date.dt.year, y=unique_records.Revenue)


#### Movie Age Distribution

In [None]:
# Two age histograms drawn on the same axes: every row of original_data first,
# then one row per unique title.
sb.histplot(movie_age)
unique_movie_age = dt.today().year - unique_records.Release_Date.dt.year
sb.histplot(unique_movie_age)


#### Movie Age VS Revenue

In [None]:
# One bar per movie age, built from all rows of original_data.
# NOTE(review): bar height is presumably seaborn's default aggregate (the mean)
# of the revenues sharing an age -- confirm.
sb.barplot(x=movie_age, y=original_data.Revenue)


#### Cleanup

In [None]:
# The de-duplicated frame was created for the visualization cells only; drop it.
del unique_records


### Data Preprocessing

#### Preprocess Train Data

In [None]:
# Preview the cleaned, shuffled training rows before the derived columns are added.
original_data.head()


In [None]:
# Attach the date-derived features as columns. assign() overwrites columns that
# already exist, so re-running this cell does not append a second, duplicated
# set of Release_*/Movie_Age columns the way an axis=1 concat does.
original_data = original_data.assign(
    Release_Day=release_day,
    Release_Month=release_month,
    Release_Year=release_year,
    Movie_Age=movie_age,
)


In [None]:
# Columns selected as model inputs: three categorical columns plus the four
# date-derived columns.
# NOTE(review): Movie_Age is selected here but is not named in the column
# transformer defined later in this file -- confirm it is meant to be unused.
feature_cols = ["Genre", "Director", "MPAA_Rating", "Release_Day", "Release_Month", "Release_Year", "Movie_Age"]


In [None]:
# Separate the model inputs from the target column.
x_train = original_data.loc[:, feature_cols]
y_train = original_data["Revenue"]

x_train.describe()


In [None]:
# Preview the selected feature columns.
x_train.head()


In [None]:
# Shuffle the features and carry the target through the same permutation.
# Shuffling and re-indexing x_train alone would pair every feature row with
# another row's revenue, so the models would be fitted on mismatched labels.
x_train = x_train.sample(frac=1, random_state=7)
y_train = y_train.loc[x_train.index].reset_index(drop=True)
x_train = x_train.reset_index(drop=True)


#### Preprocess Test Data

In [None]:
# Build the merged table from the testing directory.
test_data = load_data(train=False)


In [None]:
# Number of missing values in every column of the test table.
test_data.isna().sum()


In [None]:
# Mirror the cleaning applied to the training data so both sets are prepared
# the same way: rows without a revenue cannot be scored, and missing
# categories get the same placeholder labels (Genre included).
test_data = test_data.dropna(subset=["Revenue"])
test_data = test_data.fillna(
    {
        "MPAA_Rating": "Not Rated",
        "Director": "Other",
        "Genre": "Other",
    }
)

# Date features are derived after the row filter so they share test_data's index.
release_day = test_data.Release_Date.dt.day
release_month = test_data.Release_Date.dt.month
release_year = test_data.Release_Date.dt.year
movie_age = dt.today().year - release_year

test_data = test_data.assign(
    Release_Day=release_day,
    Release_Month=release_month,
    Release_Year=release_year,
    Movie_Age=movie_age,
)

# Use exactly the training feature columns, in the same order.
x_test = test_data[x_train.columns]
y_test = test_data.Revenue


In [None]:
def _log10_unless_zero(revenue):
    """Return log10 of ``revenue``; an exact zero is passed through unchanged."""
    if revenue == 0:
        return revenue
    return np.log10(revenue)


# Model the targets on a log10 scale.
y_train = y_train.apply(_log10_unless_zero)
y_test = y_test.apply(_log10_unless_zero)


#### Column Transformer

In [None]:
# One-hot encode the categorical columns and min-max scale the date parts.
# handle_unknown="ignore" encodes a category that was not seen during fit
# (e.g. a director that only appears in the test data) as an all-zero group
# instead of raising an error in transform().
# NOTE(review): columns not listed here (Movie_Age) are not passed to either
# transformer; presumably they are dropped by the default remainder setting --
# confirm this is intended.
ohe_column_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), ["Genre", "Director", "MPAA_Rating"]),
    (MinMaxScaler(), ["Release_Day", "Release_Month", "Release_Year"]),
    n_jobs=-1,
    sparse_threshold=0,
    verbose_feature_names_out=False,
)


In [None]:
# Fit the encoder/scaler on the training features only, then apply the fitted
# transformer unchanged to the test features.
prep_x_train = ohe_column_transformer.fit_transform(x_train)
prep_x_test = ohe_column_transformer.transform(x_test)


In [None]:
# Compare shapes; the column counts of both matrices are expected to match.
print(prep_x_train.shape, prep_x_test.shape)


### Models

#### Basic Linear Regression

In [None]:
# Baseline: ordinary least squares fitted on the preprocessed training matrix,
# then used to predict the (log-scaled) test targets.
lr_model = LinearRegression(n_jobs=-1).fit(prep_x_train, y_train)
lr_model_y_predict = lr_model.predict(prep_x_test)


In [None]:
# Spot-check the first test row (actual vs predicted), then report the mean
# squared error over the whole test set.
first_actual, first_predicted = y_test.iloc[0], lr_model_y_predict[0]
print(first_actual, first_predicted)
metrics.mean_squared_error(y_true=y_test, y_pred=lr_model_y_predict)


#### Basic Ridge Regression

In [None]:
# Ridge regression with a small penalty, fitted on the same training matrix
# and used to predict the (log-scaled) test targets.
ridge_model = Ridge(alpha=0.001, max_iter=1000, random_state=7).fit(prep_x_train, y_train)
ridge_model_y_predict = ridge_model.predict(prep_x_test)


In [None]:
# Spot-check the first test row (actual vs predicted), then report the mean
# squared error over the whole test set.
first_actual, first_predicted = y_test.iloc[0], ridge_model_y_predict[0]
print(first_actual, first_predicted)
metrics.mean_squared_error(y_true=y_test, y_pred=ridge_model_y_predict)


#### Cross Validation

In [None]:
# 10-fold cross-validated R^2 of a fresh linear regression on the training data.
lr_cv = cross_val_score(
    estimator=LinearRegression(n_jobs=-1),
    X=prep_x_train,
    y=y_train,
    scoring="r2",
    cv=10,
)
# Per-fold scores first, then their mean.
print(lr_cv, lr_cv.mean(), sep="\n", end=ENDDEL)


In [None]:
# 10-fold cross-validated R^2 of the ridge model on the training data.
ridge_cv = cross_val_score(
    estimator=ridge_model,
    X=prep_x_train,
    y=y_train,
    scoring="r2",
    cv=10,
)
# Per-fold scores first, then their mean.
print(ridge_cv, ridge_cv.mean(), sep="\n", end=ENDDEL)


#### RFE

In [None]:
# Recursive feature elimination wrapped around a linear regression.
lr_rfe = RFE(estimator=LinearRegression())
lr_rfe.fit(X=prep_x_train, y=y_train)


In [None]:
# Recursive feature elimination wrapped around the same ridge configuration.
ridge_estimator = Ridge(alpha=0.001, max_iter=1000, random_state=7)
ridge_rfe = RFE(estimator=ridge_estimator)
ridge_rfe.fit(X=prep_x_train, y=y_train)


#### Grid Search

##### Linear Regression

In [None]:
# Candidate feature counts for RFE: 1 up to (number of encoded columns - 1).
lr_feature_counts = list(range(1, prep_x_train.shape[1]))
# Seeded, shuffled 5-fold split so the search is repeatable.
lr_cv_folds = KFold(n_splits=5, shuffle=True, random_state=7)

lr_gs_model = GridSearchCV(
    lr_rfe,
    {"n_features_to_select": lr_feature_counts},
    scoring="r2",
    n_jobs=-1,
    cv=lr_cv_folds,
    verbose=1,
    return_train_score=True,
)


In [None]:
lr_gs_model.fit(prep_x_train, y_train)
lr_gs_y_pred = lr_gs_model.predict(prep_x_test)


In [None]:
metrics.mean_squared_error(y_test, lr_gs_y_pred)


##### Ridge Regression

In [None]:
# Grid search over the number of features RFE keeps for the ridge model,
# scored by R^2 on a seeded, shuffled 5-fold split.
# n_jobs=-1 matches the linear-regression grid search so both searches run
# their candidate fits in parallel; it does not change the scores.
ridge_gs_model = GridSearchCV(
    estimator=ridge_rfe,
    param_grid={"n_features_to_select": list(range(1, prep_x_train.shape[1]))},
    scoring="r2",
    cv=KFold(n_splits=5, shuffle=True, random_state=7),
    verbose=1,
    return_train_score=True,
    n_jobs=-1,
)


In [None]:
ridge_gs_model.fit(prep_x_train, y_train)
ridge_gs_y_pred = ridge_gs_model.predict(prep_x_test)


In [None]:
metrics.mean_squared_error(y_test, ridge_gs_y_pred)
