# Project 2 IMDB Classification

## Init 

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training data; the 'id' column is discarded straight away
dataset = pd.read_csv('data/train_dataset.csv').drop(columns=['id'])

In [None]:
# Preview Data
# Only the last expression of a cell is rendered, so the shape is printed
# explicitly; the head() preview is still shown as the cell output.
print(dataset.shape)
dataset.head()

In [None]:
# Separate the dataset into class labels (y) and features (X).
# pop() returns the label column and removes it from dataset in place.
y = dataset.pop('imdb_score_binned')
X = dataset

## Class Distribution

In [None]:
# Mean of the class labels (y holds the 'imdb_score_binned' column)
mean_imdb_score = y.mean()
print(mean_imdb_score)

In [None]:
# Bar chart of how many movies fall into each IMDB score bin
plt.rcParams.update({'font.size': 22})
plt.figure(figsize=(12, 8))
print(y.name)

ax = sns.countplot(x=y.name, data=y.to_frame())
ax.set_title('Count of Movie Ratings')
ax.set_xlabel('IMDB Rating Binned')
ax.set_ylabel('Count')

# Label every bar with its share of all non-null labels
total = y.count()
ax.bar_label(ax.containers[0], fmt=lambda x: f'{(x/total)*100:0.1f}%')
# Leave headroom above the tallest bar for its label
ax.margins(y=0.1)
plt.show()

## Data Preprocessing

### Normalisation

In [None]:
# We will standardise some of the features. We will use z-score standardisation as it is better for zero mean models.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Numeric feature columns to standardise; title_year is excluded
numeric = X.select_dtypes(include='number').drop(columns=['title_year'])
numeric.columns

In [None]:
plt.rcParams.update({'font.size': 12})
# Plot the distribution of every numeric feature before normalisation
for column_name in numeric.columns:
    plt.figure(figsize=(10, 6))
    hist_ax = sns.histplot(X[column_name])
    hist_ax.set_title(f'Distribution of {column_name}')
    hist_ax.set_xlabel(column_name)
    hist_ax.set_ylabel('Number of Movies')
    plt.show()

In [None]:
# Normalise the fields
# StandardScaler standardises each column independently, so one fit over all
# numeric columns yields the same values as fitting column by column, while
# the scaler keeps the mean/scale of every column instead of only the last
# one it was fitted on.
numeric_columns = list(numeric.columns)
scaled_values = scaler.fit_transform(X[numeric_columns])
for position, column_name in enumerate(numeric_columns):
    X[column_name] = scaled_values[:, position]

In [None]:
# Check fields are normalised
for feature in numeric.columns:
    # Histogram of the standardised values of this feature
    plt.figure(figsize=(10, 6))
    sns.histplot(X[feature])
    plt.gca().set(
        title=f'Distribution of {feature}',
        xlabel=feature,
        ylabel='Number of Movies',
    )
    plt.show()

### Correlation

In [None]:
# Correlation between every numeric feature and the class label
print(y.to_frame())
numeric_features = X.select_dtypes(include='number')
data = pd.concat([numeric_features, y.to_frame()], axis=1)
correlation = data.corr()

# Render the correlation table as an annotated heatmap
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(correlation, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Table')
plt.show()
data.head()

### Removing Attributes

#### Removing Highly Correlated Attributes

In [None]:
# Drop features that are highly correlated with a feature that is kept:
#   - actor_1/2/3_facebook_likes (correlated with cast_total_facebook_likes)
#   - num_user_for_reviews (correlated with num_voted_users)
correlated_columns = [
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'num_user_for_reviews',
]
X.drop(columns=correlated_columns, inplace=True)

#### Removing Text Attributes

In [None]:
# Drop the free-text attributes (director and actor names, movie title,
# plot keywords) together with the title embedding.
text_columns = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'movie_title',
    'plot_keywords',
    'title_embedding',
]
X.drop(columns=text_columns, inplace=True)

#### Removing Attributes without much variation
These attributes have a lot of unique values, with one value being predominant

In [None]:
X.columns

In [None]:
""" for c in X.columns:
    plt.figure(figsize=(30,8))
    sns.countplot(x=c, data=X)
    plt.title(f'Count of Movies {c}')
    plt.xlabel(c)
    plt.ylabel('Count')
    plt.show() """

In [None]:
# Drop country, language and content_rating (the low-variation attributes
# this section is about), then list the columns that remain.
X.drop(columns=['country', 'language', 'content_rating'], inplace=True)
X.columns

### Get doc2vec data for plot.

This is a vectorized representation of the plot, essentially a better bag-of-words. If we want to use a movie's plot as a feature, we should use these vectors instead of the raw words.

In [None]:
doc2vec_plot_data = np.load('data/features_doc2vec/train_doc2vec_features_plot_keywords.npy')

We are assuming that if a movie's plot is highly similar to that of a film in the training dataset, then the two movies will have similar scores.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Use a dedicated scaler for the doc2vec vectors: re-fitting the shared
# `scaler` here would overwrite whatever statistics it was fitted with on the
# movie features.
doc2vec_scaler = StandardScaler()
doc2vec_scaled = doc2vec_scaler.fit_transform(doc2vec_plot_data)
# Now we create a similarity matrix using cosine similarity
similarity_matrix = cosine_similarity(doc2vec_scaled)

In [None]:
# Pairwise rating differences: entry (i, j) is |score_i - score_j|.
# Broadcasting a column vector against a row vector yields the full matrix.
scores = np.array(y)
imdb_score_matrix = np.abs(scores[:, np.newaxis] - scores[np.newaxis, :])


In [None]:
# Scatter every pair of movies: cosine similarity against score difference
plt.figure(figsize=(12, 8))
scatter_ax = sns.scatterplot(
    x=similarity_matrix.flatten(),
    y=imdb_score_matrix.flatten(),
)
scatter_ax.set_title('Cosine Similarity vs IMDB Score Difference')
scatter_ax.set_xlabel('Cosine Similarity')
scatter_ax.set_ylabel('IMDB Score Difference')
plt.show()


### One Hot Encoding Genres

In [None]:
# The 'genres' column holds several genres separated by '|'. Expand it into
# one 0/1 indicator column per genre.
# str.get_dummies splits on the separator and matches whole genre names, so a
# genre whose name is contained in another one (e.g. 'Music' inside 'Musical')
# is not flagged by mistake, and a missing value yields an all-zero row
# instead of raising.
genre_indicators = X['genres'].str.get_dummies(sep='|')
unique_genres = list(genre_indicators.columns)
print(unique_genres)
# Add the indicator columns to X, then drop the raw string column
for genre in unique_genres:
    X[genre] = genre_indicators[genre]
X.drop(columns=['genres'], inplace=True)
X.head()


In [None]:
# Keep the genre indicators in their own frame and take them out of X, so
# that stepwise feature selection can be run on the remaining features.
genres = X.loc[:, unique_genres]
X.drop(columns=unique_genres, inplace=True)
X.head()

## Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score

# Hold out 20% of the samples as a test set (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Full feature matrix with the genre indicators re-attached
X_genres = pd.concat([X, genres], axis=1)
# X_train / X_test carry the original row labels after the shuffled split, so
# the matching genre rows are looked up by label (.loc). Positional lookup
# (.iloc) only gives the same rows while the index is a default 0..n-1 range.
# Adding genres to training set
X_train_genres = pd.concat([X_train, genres.loc[X_train.index]], axis=1)
# Adding genres to test set
X_test_genres = pd.concat([X_test, genres.loc[X_test.index]], axis=1)

In [None]:
class Model:
    """Bundle an estimator with its hyper-parameter grid, tuning and diagnostics.

    Intended order of use: ``grid_search()`` -> ``get_best_model()`` ->
    ``predict()``; optionally ``sfs()`` -> ``get_best_features()`` ->
    ``fit_with_features()``. Steps called too early raise an
    ``AttributeError`` naming the method that has to run first.
    """

    def __init__(self, estimator, param_grid):
        """Store the base estimator and the grid handed to ``GridSearchCV``."""
        self.estimator = estimator
        self.param_grid = param_grid

    def _require(self, attribute, producer):
        """Return ``self.<attribute>`` or raise a descriptive AttributeError."""
        try:
            return getattr(self, attribute)
        except AttributeError:
            raise AttributeError(
                f"'{attribute}' is not available yet; call {producer}() first."
            ) from None

    def grid_search(self, cv=5, n_jobs=-1, verbose=False, refit=True):
        """Configure (but do not run) the grid search; arguments go to ``GridSearchCV``."""
        self.gs = GridSearchCV(self.estimator, param_grid=self.param_grid, cv=cv, n_jobs=n_jobs, verbose=verbose, refit=refit)

    def get_best_model(self, X_train, y_train):
        """Run the grid search and return ``(best_estimator, best_params, best_cv_score)``."""
        # Fall back to the default search settings if grid_search() was skipped.
        if not hasattr(self, 'gs'):
            self.grid_search()
        self.gs.fit(X_train, y_train)
        self.best_estimator = self.gs.best_estimator_
        self.best_params = self.gs.best_params_
        self.best_cv = self.gs.best_score_
        return self.best_estimator, self.best_params, self.best_cv

    def predict(self, X_test):
        """Predict with the tuned estimator (as last fitted)."""
        return self._require('best_estimator', 'get_best_model').predict(X_test)

    def sfs(self):
        """Configure forward sequential feature selection around the tuned estimator."""
        estimator = self._require('best_estimator', 'get_best_model')
        # NOTE(review): cv=None appears to switch off the selector's internal
        # cross-validation - confirm against the mlxtend documentation.
        self.sffs = SFS(estimator, k_features='best', forward=True, floating=False, cv=None, verbose=False)

    def get_best_features(self, X_train, y_train):
        """Fit the selector and return the feature names of its best-scoring subset."""
        self.sffs = self._require('sffs', 'sfs').fit(X_train, y_train)
        subsets = self.sffs.subsets_
        best_key = max(subsets, key=lambda k: subsets[k]['avg_score'])
        return list(subsets[best_key]['feature_names'])

    def fit_with_features(self, X_train, y_train, features=None):
        """Refit the tuned estimator, on ``X_train[features]`` when features are given."""
        estimator = self._require('best_estimator', 'get_best_model')
        if features is None:
            estimator.fit(X_train, y_train)
        else:
            estimator.fit(X_train[features], y_train)

    def get_learning_curve(self, X_train, y_train):
        """Compute 5-fold learning curves for the tuned estimator and plot them.

        The raw results are kept on ``train_sizes``, ``train_scores`` and
        ``test_scores``.
        """
        estimator = self._require('best_estimator', 'get_best_model')
        self.train_sizes, self.train_scores, self.test_scores = learning_curve(estimator, X_train, y_train, cv=5)

        # Mean and spread across the CV folds for every training-set size
        train_mean = np.mean(self.train_scores, axis=1)
        train_std = np.std(self.train_scores, axis=1)
        test_mean = np.mean(self.test_scores, axis=1)
        test_std = np.std(self.test_scores, axis=1)

        # Plot learning curves with a +/- one standard deviation band
        plt.plot(self.train_sizes, train_mean, label="Training score")
        plt.plot(self.train_sizes, test_mean, label="Cross-validation score")
        plt.fill_between(self.train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
        plt.fill_between(self.train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)
        plt.xlabel("Training Set Size")
        plt.ylabel("Accuracy Score")
        plt.title("Learning Curves")
        plt.legend()
        plt.show()


## Model 1: Support Vector Machine

In [None]:
from sklearn.svm import SVC

### Tuning Hyperparameters

In [None]:
# SVM: hyper-parameter grid and grid search on the genre-augmented features
param_grid = dict(
    C=[0.1, 1, 10],  # Regularization parameter
    gamma=[0.1, 0.01, 0.001],  # Kernel coefficient for RBF, polynomial and sigmoid
    kernel=['poly', 'linear', 'rbf'],  # Type of kernel
)

svc_base = SVC(class_weight='balanced', max_iter=10000)
svm = Model(svc_base, param_grid)
svm.grid_search()
svm_best, svm_params, svm_cv = svm.get_best_model(X_train_genres, y_train)

In [None]:
# Print the best hyper-parameters, then evaluate on the held-out test split
print(svm_params)
y_pred_svm = svm.predict(X_test_genres)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f'SVM Accuracy: {svm_accuracy}')
print(f'SVM Cross Validation Score Best: {svm_cv}')
# The SVM was tuned and tested on the genre-augmented features, so it is
# cross-validated on that same feature set (X alone lacks the genre columns).
svm_cv_scores = cross_val_score(svm_best, X_genres, y, cv=5)
print(f'SVM Cross Validation Score Average: {svm_cv_scores.mean()}')
print(classification_report(y_test, y_pred_svm))

In [None]:
# Use the genre-augmented features, i.e. the feature set the SVM was tuned on
svm.get_learning_curve(X_genres, y)

### Feature Selection

In [None]:
# Forward feature selection with the tuned SVM on the non-genre features;
# all genre indicators are then appended before the model is refitted.
svm.sfs()
svm_best_features = svm.get_best_features(X_train, y_train)
print(svm_best_features)
svm_best_features.extend(unique_genres)
svm.fit_with_features(X_train_genres, y_train, features=svm_best_features)

In [None]:
# Now checking accuracy with the best features
y_pred_svm_ft = svm.predict(X_test_genres[svm_best_features])
svm_accuracy_ft = accuracy_score(y_test, y_pred_svm_ft)
# Cross-validate on the selected feature subset so that the reported score
# matches its "with Best Features" label.
svm_cv_scores = cross_val_score(svm_best, X_genres[svm_best_features], y, cv=5)
print(f'SVM Accuracy with selected features: {svm_accuracy_ft}')
print(f'SVM Cross Validation Score with Best Features: {svm_cv_scores.mean()}')
print(classification_report(y_test, y_pred_svm_ft))

In [None]:
svm.get_learning_curve(X_genres[svm_best_features], y)

## Model 2: Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

### Tuning Hyperparameters

In [None]:
# Logistic Regression Model
# The grid is split by penalty type because not every combination is valid:
# 'elasticnet' requires an l1_ratio, and the unpenalised model is requested
# with None (the string 'none' is no longer accepted by recent scikit-learn)
# and is left at the default C.
logr_common = {
    'solver': ['saga'],  # Solvers
    'max_iter': [100, 200, 1000],  # Number of iterations
}
logr_strengths = [0.1, 1, 10, 100]  # Regularization strength
param_grid = [
    {**logr_common, 'penalty': ['l1', 'l2'], 'C': logr_strengths},
    {**logr_common, 'penalty': ['elasticnet'], 'C': logr_strengths, 'l1_ratio': [0.25, 0.5, 0.75]},
    {**logr_common, 'penalty': [None]},
]
logr = Model(LogisticRegression(), param_grid)
logr.grid_search()
logr_best, logr_params, logr_cv = logr.get_best_model(X_train, y_train)

In [None]:
# Print the best hyper-parameters, then evaluate on the held-out test split
print(logr_params)
y_pred_logr = logr.predict(X_test)
logr_accuracy = accuracy_score(y_test, y_pred_logr)
print(f'Logistic Regression Accuracy: {logr_accuracy}')
print(f'Logistic Regression Cross Validation Score Best: {logr_cv}')
# 5-fold cross-validation of the tuned estimator on the full (non-genre) feature set
logr_cv_scores = cross_val_score(logr_best, X, y, cv=5)
print(f'Logistic Regression Cross Validation Score Average: {logr_cv_scores.mean()}')
print(classification_report(y_test, y_pred_logr))

In [None]:
logr.get_learning_curve(X, y)

### Feature Selection

In [None]:
# Forward feature selection with the tuned logistic regression on the
# non-genre features; all genre indicators are appended before refitting.
logr.sfs()
logr_best_features = logr.get_best_features(X_train, y_train)
print(logr_best_features)
logr_best_features.extend(unique_genres)
logr.fit_with_features(X_train_genres, y_train, features=logr_best_features)

In [None]:
# Now checking accuracy with the best features
y_pred_logr_ft = logr.predict(X_test_genres[logr_best_features])
logr_accuracy_ft = accuracy_score(y_test, y_pred_logr_ft)
# Cross-validate on the selected feature subset so that the reported score
# matches its "with Best Features" label.
logr_cv_scores = cross_val_score(logr_best, X_genres[logr_best_features], y, cv=5)
print(f'Logistic Regression Accuracy with selected features: {logr_accuracy_ft}')
print(f'Logistic Regression Cross Validation Score with Best Features: {logr_cv_scores.mean()}')
print(classification_report(y_test, y_pred_logr_ft))

In [None]:
logr.get_learning_curve(X_genres[logr_best_features], y)

## Model 3: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Tuning Hyperparameters

In [None]:
# Random Forest Model
# 'auto' is not listed for max_features: for classifiers it meant the same as
# 'sqrt', and recent scikit-learn releases reject it, which would make a third
# of the grid fail to fit.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'max_features': ['sqrt', 'log2'],
}
rf = Model(RandomForestClassifier(random_state=42), param_grid)
rf.grid_search()
rf_best, rf_params, rf_cv = rf.get_best_model(X_train, y_train)

In [None]:
# Print the best hyper-parameters, then evaluate on the held-out test split
print(rf_params)
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {rf_accuracy}')
print(f'Random Forest Cross Validation Score Best: {rf_cv}')
# 5-fold cross-validation of the tuned estimator on the full (non-genre) feature set
rf_cv_scores = cross_val_score(rf_best, X, y, cv=5)
print(f'Random Forest Cross Validation Score Average: {rf_cv_scores.mean()}')
print(classification_report(y_test, y_pred_rf))

In [None]:
# Plotting learning curve for random forest
rf.get_learning_curve(X, y)

### Feature Selection

In [None]:
# Forward feature selection with the tuned random forest on the non-genre
# features; all genre indicators are appended before refitting.
rf.sfs()
rf_best_features = rf.get_best_features(X_train, y_train)
print(rf_best_features)
rf_best_features.extend(unique_genres)
rf.fit_with_features(X_train_genres, y_train, features=rf_best_features)

In [None]:
# Now checking accuracy with the best features
y_pred_rf_ft = rf.predict(X_test_genres[rf_best_features])
rf_accuracy_ft = accuracy_score(y_test, y_pred_rf_ft)
# Cross-validate on the selected feature subset (genres included) so that the
# reported score matches its "with Best Features" label.
rf_cv_scores = cross_val_score(rf_best, X_genres[rf_best_features], y, cv=5)
print(f'Random Forest Accuracy with selected features: {rf_accuracy_ft}')
print(f'Random Forest Cross Validation Score with Best Features: {rf_cv_scores.mean()}')
print(classification_report(y_test, y_pred_rf_ft))

In [None]:
# Plotting learning curve for random forest with best features
rf.get_learning_curve(X_genres[rf_best_features], y)

## Model 4: Ensemble Model by Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
# Stack the tuned random forest and logistic regression; the tuned logistic
# regression is also used as the final (meta) estimator.
estimators = [('rf', rf_best), ('logr', logr_best)]
stack = StackingClassifier(estimators=estimators, final_estimator=logr_best)
stack.fit(X_train_genres, y_train)
y_pred_stack = stack.predict(X_test_genres)
# Report the hold-out accuracy as for the other models, so the predictions
# computed above are actually evaluated.
stack_accuracy = accuracy_score(y_test, y_pred_stack)
print(f'Stacking Classifier Accuracy: {stack_accuracy}')
stack_cv_scores = cross_val_score(stack, X_genres, y, cv=5)
print(f'Stacking Classifier Cross Validation Score: {stack_cv_scores.mean()}')

In [None]:
# Learning curves for the stacking classifier (5-fold CV)
train_sizes, train_scores, test_scores = learning_curve(stack, X_genres, y, cv=5)

# Mean and spread across the CV folds for every training-set size
train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

# Draw both curves first, then their +/- one standard deviation bands
curves = [
    (train_mean, train_std, "Training score"),
    (test_mean, test_std, "Cross-validation score"),
]
for curve_mean, _, curve_label in curves:
    plt.plot(train_sizes, curve_mean, label=curve_label)
for curve_mean, curve_std, _ in curves:
    plt.fill_between(train_sizes, curve_mean - curve_std, curve_mean + curve_std, alpha=0.1)
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.title("Learning Curves")
plt.legend()
plt.show()

# Prediction on test dataset.

In [None]:
# Loading test dataset
test_dataset = pd.read_csv('data/test_dataset.csv')

In [None]:
# Numeric attributes to standardise. To mirror the training preprocessing,
# 'title_year' is left unscaled and 'id' (an identifier that is dropped
# below) is excluded as well.
numeric = test_dataset.select_dtypes(include='number').drop(
    columns=['id', 'title_year'], errors='ignore'
)
numeric.columns

In [None]:
# Normalise the fields
# NOTE(review): fit_transform re-fits the scaler on the test data, so each test
# column is standardised with test-set statistics rather than the statistics
# of the training data - consider reusing scalers fitted on the training set.
for c in numeric.columns:
    test_dataset[c] = scaler.fit_transform(test_dataset[[c]])

In [None]:
# Drop the same columns that were removed from the training features
# (correlated, free-text, low-variation attributes) plus the 'id' column.
columns_to_drop = [
    'actor_1_facebook_likes', 'actor_2_facebook_likes', 'actor_3_facebook_likes',
    'num_user_for_reviews',
    'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'movie_title', 'plot_keywords',
    'title_embedding',
    'country', 'language', 'content_rating',
    'id',
]
test_dataset.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Expand 'genres' into one 0/1 indicator column per training genre.
# str.get_dummies matches whole genre names (so 'Music' is not flagged for a
# film that is only a 'Musical'); reindex keeps exactly the genres listed in
# unique_genres and fills genres absent from the test data with 0.
test_genre_indicators = (
    test_dataset['genres']
    .str.get_dummies(sep='|')
    .reindex(columns=unique_genres, fill_value=0)
)
for genre in unique_genres:
    test_dataset[genre] = test_genre_indicators[genre]
test_dataset.drop(columns=['genres'], inplace=True)

In [None]:
test_dataset.head()

In [None]:
# Predictions for each model
# svm: predict on the selected features and write an (id, prediction) CSV.
# The id is the row's index label plus one; test_dataset itself is unchanged.
y_pred_svm = svm.predict(test_dataset[svm_best_features])
svm_submission = pd.DataFrame(
    {'id': test_dataset.index + 1, 'imdb_score_binned': y_pred_svm}
)
svm_submission.to_csv('svm_predictions.csv', index=False)


In [None]:

# logr: predict on the selected features and write an (id, prediction) CSV.
# The id is the row's index label plus one; test_dataset itself is unchanged.
y_pred_logr = logr.predict(test_dataset[logr_best_features])
logr_submission = pd.DataFrame(
    {'id': test_dataset.index + 1, 'imdb_score_binned': y_pred_logr}
)
logr_submission.to_csv('logr_predictions.csv', index=False)


In [None]:

# rf: predict on the selected features and write an (id, prediction) CSV.
# The id is the row's index label plus one; test_dataset itself is unchanged.
y_pred_rf = rf.predict(test_dataset[rf_best_features])
rf_submission = pd.DataFrame(
    {'id': test_dataset.index + 1, 'imdb_score_binned': y_pred_rf}
)
rf_submission.to_csv('rf_predictions.csv', index=False)



In [None]:
# stack: predict on every remaining column and write an (id, prediction) CSV.
# The id is the row's index label plus one; test_dataset itself is unchanged.
y_pred_stack = stack.predict(test_dataset)
stack_submission = pd.DataFrame(
    {'id': test_dataset.index + 1, 'imdb_score_binned': y_pred_stack}
)
stack_submission.to_csv('stack_predictions.csv', index=False)
