# Data Loading

In [119]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score


In [120]:
import pickle

def loadDataPickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [121]:
pickleLE = loadDataPickle('WikipediaMovieDatasetLE.pickle')
moviesLE = pd.DataFrame(pickleLE)
moviesLE.head()

Unnamed: 0,Cast Success Score,Director Success Score,Producer Success Score,Cinematographer Success Score,Runtime,Distributor Success Score,Writer Success Score,Editor Success Score,Production Company Success Score,Release Day,Release Month,Release Year,Age Rating,SentimentScore,Budget,Box Office Status
0,0.531868,1.0,1.0,1.0,98,0.37037,1.0,0.666667,0.633333,11,9,2009,5,0.642857,20000000,1
1,-0.039835,0.0,-0.044444,0.090909,100,-0.1,0.0,-0.666667,0.214286,6,1,2010,3,0.83,19000000,0
2,-0.206944,0.0,-0.333333,0.0,89,0.428571,0.0,-0.375,0.0,19,10,2009,5,0.78,18000000,0
3,0.148918,0.0,-0.318182,0.333333,118,-0.154163,0.0,0.0,-0.380952,15,1,2010,5,0.88,80000000,0
4,-0.139456,0.0,-0.6,-0.333333,94,-0.264474,0.0,-0.333333,0.142857,22,12,2009,3,0.58,28000000,0


In [122]:
pickleOHE = loadDataPickle('WikipediaMovieDatasetOHE.pickle')
moviesOHE = pd.DataFrame(pickleOHE)
moviesOHE.head()

Unnamed: 0,Cast Success Score,Director Success Score,Producer Success Score,Cinematographer Success Score,Runtime,Distributor Success Score,Writer Success Score,Editor Success Score,Production Company Success Score,Release Day,...,Release Year,SentimentScore,Budget,Age Rating_G,Age Rating_NC-17,Age Rating_Not Rated,Age Rating_PG,Age Rating_PG-13,Age Rating_R,Box Office Status
0,0.531868,1.0,1.0,1.0,98,0.37037,1.0,0.666667,0.633333,11,...,2009,0.642857,20000000,0,0,0,0,0,1,1
1,-0.039835,0.0,-0.044444,0.090909,100,-0.1,0.0,-0.666667,0.214286,6,...,2010,0.83,19000000,0,0,0,1,0,0,0
2,-0.206944,0.0,-0.333333,0.0,89,0.428571,0.0,-0.375,0.0,19,...,2009,0.78,18000000,0,0,0,0,0,1,0
3,0.148918,0.0,-0.318182,0.333333,118,-0.154163,0.0,0.0,-0.380952,15,...,2010,0.88,80000000,0,0,0,0,0,1,0
4,-0.139456,0.0,-0.6,-0.333333,94,-0.264474,0.0,-0.333333,0.142857,22,...,2009,0.58,28000000,0,0,0,1,0,0,0


# Data Prediction

## Label Encoded Data

In [123]:
moviesLE.head()

Unnamed: 0,Cast Success Score,Director Success Score,Producer Success Score,Cinematographer Success Score,Runtime,Distributor Success Score,Writer Success Score,Editor Success Score,Production Company Success Score,Release Day,Release Month,Release Year,Age Rating,SentimentScore,Budget,Box Office Status
0,0.531868,1.0,1.0,1.0,98,0.37037,1.0,0.666667,0.633333,11,9,2009,5,0.642857,20000000,1
1,-0.039835,0.0,-0.044444,0.090909,100,-0.1,0.0,-0.666667,0.214286,6,1,2010,3,0.83,19000000,0
2,-0.206944,0.0,-0.333333,0.0,89,0.428571,0.0,-0.375,0.0,19,10,2009,5,0.78,18000000,0
3,0.148918,0.0,-0.318182,0.333333,118,-0.154163,0.0,0.0,-0.380952,15,1,2010,5,0.88,80000000,0
4,-0.139456,0.0,-0.6,-0.333333,94,-0.264474,0.0,-0.333333,0.142857,22,12,2009,3,0.58,28000000,0


Splitting The Dataset Into Features And Target Variable

In [124]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Step 1: Split the dataset into features (X) and target variable (Y)
X = moviesLE.drop('Box Office Status', axis=1)
Y = moviesLE['Box Office Status']

# Step 2: Normalize the feature values
scaler = MinMaxScaler()
XNormalized = scaler.fit_transform(X)

# Step 3: Split the dataset into training and testing sets
XTrain, XTemp, YTrain, YTemp = train_test_split(XNormalized, Y, test_size=0.3, random_state=42)

XTest, XVal, YTest, YVal = train_test_split(XTemp, YTemp, test_size=0.2, random_state=42)

In [125]:
print(XTrain.shape, XVal.shape, XTest.shape)

(1232, 15) (106, 15) (423, 15)


### Machine Learning Models

#### Random Forest Classifier

In [126]:
from sklearn.ensemble import RandomForestClassifier

trees = [10, 25, 50, 100, 200]

for tree in trees:
    # Create a Random Forest classifier
    randomForestClassifier = RandomForestClassifier(n_estimators=tree, random_state=42)

    # Train the Random Forest on the training data
    randomForestClassifier.fit(XTrain, YTrain)

    # Predict on the validation set
    YValPredRF = randomForestClassifier.predict(XVal)

    # Evaluate the model on the validation set
    print(f"Random Forest Classifier Accuracy Measures With {tree} No. Of Trees:")
    print(f"F1 Score: {f1_score(YVal, YValPredRF, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredRF)}")
    print(f"Precision: {precision_score(YVal, YValPredRF, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredRF, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredRF)}")
    print('\n')

Random Forest Classifier Accuracy Measures With 10 No. Of Trees:
F1 Score: 0.9144350739670211
Accuracy: 0.9150943396226415
Precision: 0.9180073562443846
Recall: 0.9150943396226415
ROC AUC Score: 0.908582762351244


Random Forest Classifier Accuracy Measures With 25 No. Of Trees:
F1 Score: 0.9621684440798469
Accuracy: 0.9622641509433962
Precision: 0.9627727944461628
Recall: 0.9622641509433962
ROC AUC Score: 0.9596105301117923


Random Forest Classifier Accuracy Measures With 50 No. Of Trees:
F1 Score: 0.9621684440798469
Accuracy: 0.9622641509433962
Precision: 0.9627727944461628
Recall: 0.9622641509433962
ROC AUC Score: 0.9596105301117923


Random Forest Classifier Accuracy Measures With 100 No. Of Trees:
F1 Score: 0.952637414043932
Accuracy: 0.9528301886792453
Precision: 0.9540128921595751
Recall: 0.9528301886792453
ROC AUC Score: 0.9489722322394518


Random Forest Classifier Accuracy Measures With 200 No. Of Trees:
F1 Score: 0.952637414043932
Accuracy: 0.9528301886792453
Precision: 0.9

#### Decison Tree Classifier

In [127]:
from sklearn.tree import DecisionTreeClassifier

criterions = ['gini', 'entropy']

for criterion in criterions:
    decisionTreeClassifier = DecisionTreeClassifier(criterion=criterion)

    decisionTreeClassifier.fit(XTrain, YTrain)

    YValPredDT = decisionTreeClassifier.predict(XVal)

    print(f"Decision Tree Classifier Accuracy Measures With {criterion} Criterion: ")
    print(f"F1 Score: {f1_score(YVal, YValPredDT, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredDT)}")
    print(f"Precision: {precision_score(YVal, YValPredDT, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredDT, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredDT)}")
    print('\n')

Decision Tree Classifier Accuracy Measures With gini Criterion: 
F1 Score: 0.9430662954943808
Accuracy: 0.9433962264150944
Precision: 0.9455135571853431
Recall: 0.9433962264150944
ROC AUC Score: 0.9383339343671114


Decision Tree Classifier Accuracy Measures With entropy Criterion: 
F1 Score: 0.924088393992508
Accuracy: 0.9245283018867925
Precision: 0.9263670365030611
Recall: 0.9245283018867925
ROC AUC Score: 0.9192210602235845




#### K-Nearest Neighbors Classifier

In [128]:
from sklearn.neighbors import KNeighborsClassifier

neighbors = [5, 10, 15, 20, 30]

for neighbor in neighbors:
    knnClassifier = KNeighborsClassifier(n_neighbors=neighbor)

    knnClassifier.fit(XTrain, YTrain)

    YValPredKNN = knnClassifier.predict(XVal)

    # Evaluate the model on the test set
    print(f'KNN Accuracy Measures For {neighbor} Neighbors')
    print(f"F1 Score: {f1_score(YVal, YValPredKNN, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredKNN)}")
    print(f"Precision: {precision_score(YVal, YValPredKNN, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredKNN, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredKNN)}")
    print('\n')

KNN Accuracy Measures For 5 Neighbors
F1 Score: 0.9234139172708721
Accuracy: 0.9245283018867925
Precision: 0.9335398479301605
Recall: 0.9245283018867925
ROC AUC Score: 0.9148936170212766


KNN Accuracy Measures For 10 Neighbors
F1 Score: 0.8937743074095417
Accuracy: 0.8962264150943396
Precision: 0.912533692722372
Recall: 0.8962264150943396
ROC AUC Score: 0.8829787234042553


KNN Accuracy Measures For 15 Neighbors
F1 Score: 0.9037300651392633
Accuracy: 0.9056603773584906
Precision: 0.9193327864369703
Recall: 0.9056603773584906
ROC AUC Score: 0.8936170212765957


KNN Accuracy Measures For 20 Neighbors
F1 Score: 0.9037300651392633
Accuracy: 0.9056603773584906
Precision: 0.9193327864369703
Recall: 0.9056603773584906
ROC AUC Score: 0.8936170212765957


KNN Accuracy Measures For 30 Neighbors
F1 Score: 0.8837339374845126
Accuracy: 0.8867924528301887
Precision: 0.9059261227743822
Recall: 0.8867924528301887
ROC AUC Score: 0.8723404255319149




#### Linear Regression

In [129]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(XTrain, YTrain)

YValPredLR = lr.predict(XVal)

print('Logistic Regression Accuracy Measures:')
print(f"F1 Score: {f1_score(YVal, YValPredLR, average='weighted')}")
print(f"Accuracy: {accuracy_score(YVal, YValPredLR)}")
print(f"Precision: {precision_score(YVal, YValPredLR, average='weighted')}")
print(f"Recall: {recall_score(YVal, YValPredLR, average='weighted')}")
print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredLR)}")
print('\n')

Logistic Regression Accuracy Measures:
F1 Score: 0.9140559531554977
Accuracy: 0.9150943396226415
Precision: 0.9214479702687249
Recall: 0.9150943396226415
ROC AUC Score: 0.9064190407500902




#### Support Vector Classifier

In [130]:
from sklearn.svm import SVC

kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svmClassifier = SVC(kernel=kernel)

    svmClassifier.fit(XTrain, YTrain)

    YValPredSVM = svmClassifier.predict(XVal)

    # Evaluate the model on the validation set
    print(f'SVM Accuracy Measures with {kernel} Kernel:')
    print(f"F1 Score: {f1_score(YVal, YValPredSVM, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredSVM)}")
    print(f"Precision: {precision_score(YVal, YValPredSVM, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredSVM, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredSVM)}")
    print('\n')

SVM Accuracy Measures with linear Kernel:
F1 Score: 0.923781331267637
Accuracy: 0.9245283018867925
Precision: 0.9292435130447095
Recall: 0.9245283018867925
ROC AUC Score: 0.9170573386224306


SVM Accuracy Measures with poly Kernel:
F1 Score: 0.9428359984507276
Accuracy: 0.9433962264150944
Precision: 0.9486211901306241
Recall: 0.9433962264150944
ROC AUC Score: 0.9361702127659575


SVM Accuracy Measures with rbf Kernel:
F1 Score: 0.9430662954943808
Accuracy: 0.9433962264150944
Precision: 0.9455135571853431
Recall: 0.9433962264150944
ROC AUC Score: 0.9383339343671114


SVM Accuracy Measures with sigmoid Kernel:
F1 Score: 0.21779262728844295
Accuracy: 0.2358490566037736
Precision: 0.20315214135968854
Recall: 0.2358490566037736
ROC AUC Score: 0.214028128380815




## One Hot Encoded Data

In [131]:
## Label Encoded Data
moviesOHE.head()

Unnamed: 0,Cast Success Score,Director Success Score,Producer Success Score,Cinematographer Success Score,Runtime,Distributor Success Score,Writer Success Score,Editor Success Score,Production Company Success Score,Release Day,...,Release Year,SentimentScore,Budget,Age Rating_G,Age Rating_NC-17,Age Rating_Not Rated,Age Rating_PG,Age Rating_PG-13,Age Rating_R,Box Office Status
0,0.531868,1.0,1.0,1.0,98,0.37037,1.0,0.666667,0.633333,11,...,2009,0.642857,20000000,0,0,0,0,0,1,1
1,-0.039835,0.0,-0.044444,0.090909,100,-0.1,0.0,-0.666667,0.214286,6,...,2010,0.83,19000000,0,0,0,1,0,0,0
2,-0.206944,0.0,-0.333333,0.0,89,0.428571,0.0,-0.375,0.0,19,...,2009,0.78,18000000,0,0,0,0,0,1,0
3,0.148918,0.0,-0.318182,0.333333,118,-0.154163,0.0,0.0,-0.380952,15,...,2010,0.88,80000000,0,0,0,0,0,1,0
4,-0.139456,0.0,-0.6,-0.333333,94,-0.264474,0.0,-0.333333,0.142857,22,...,2009,0.58,28000000,0,0,0,1,0,0,0


Splitting The Dataset Into Features And Target Variable

In [132]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Step 1: Split the dataset into features (X) and target variable (Y)
X = moviesOHE.drop('Box Office Status', axis=1)
Y = moviesOHE['Box Office Status']

# Step 2: Normalize the feature values
scaler = MinMaxScaler()
XNormalized = scaler.fit_transform(X)

# Step 3: Split the dataset into training and testing sets
XTrain, XTemp, YTrain, YTemp = train_test_split(XNormalized, Y, test_size=0.3, random_state=42)

XTest, XVal, YTest, YVal = train_test_split(XTemp, YTemp, test_size=0.2, random_state=42)
print(XTrain.shape, XVal.shape, XTest.shape)

(1232, 20) (106, 20) (423, 20)


### Machine Learning Models


#### Random Forest Classifier

In [133]:
from sklearn.ensemble import RandomForestClassifier

trees = [10, 25, 50, 100, 200]

for tree in trees:
    # Create a Random Forest classifier
    randomForestClassifier = RandomForestClassifier(n_estimators=tree, random_state=42)

    # Train the Random Forest on the training data
    randomForestClassifier.fit(XTrain, YTrain)

    # Predict on the validation set
    YValPredRF = randomForestClassifier.predict(XVal)

    # Evaluate the model on the validation set
    print(f"Random Forest Classifier Accuracy Measures With {tree} No. Of Trees:")
    print(f"F1 Score: {f1_score(YVal, YValPredRF, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredRF)}")
    print(f"Precision: {precision_score(YVal, YValPredRF, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredRF, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredRF)}")
    print('\n')

Random Forest Classifier Accuracy Measures With 10 No. Of Trees:
F1 Score: 0.9715824484263593
Accuracy: 0.9716981132075472
Precision: 0.973067559342666
Recall: 0.9716981132075472
ROC AUC Score: 0.9680851063829787


Random Forest Classifier Accuracy Measures With 25 No. Of Trees:
F1 Score: 0.9715824484263593
Accuracy: 0.9716981132075472
Precision: 0.973067559342666
Recall: 0.9716981132075472
ROC AUC Score: 0.9680851063829787


Random Forest Classifier Accuracy Measures With 50 No. Of Trees:
F1 Score: 0.9621684440798469
Accuracy: 0.9622641509433962
Precision: 0.9627727944461628
Recall: 0.9622641509433962
ROC AUC Score: 0.9596105301117923


Random Forest Classifier Accuracy Measures With 100 No. Of Trees:
F1 Score: 0.9432526661197703
Accuracy: 0.9433962264150944
Precision: 0.9437948929442898
Recall: 0.9433962264150944
ROC AUC Score: 0.9404976559682653


Random Forest Classifier Accuracy Measures With 200 No. Of Trees:
F1 Score: 0.9621684440798469
Accuracy: 0.9622641509433962
Precision: 0.

#### Decison Tree Classifier

In [134]:
from sklearn.tree import DecisionTreeClassifier

criterions = ['gini', 'entropy']

for criterion in criterions:
    decisionTreeClassifier = DecisionTreeClassifier(criterion=criterion)

    decisionTreeClassifier.fit(XTrain, YTrain)

    YValPredDT = decisionTreeClassifier.predict(XVal)

    print(f"Decision Tree Classifier Accuracy Measures With {criterion} Criterion: ")
    print(f"F1 Score: {f1_score(YVal, YValPredDT, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredDT)}")
    print(f"Precision: {precision_score(YVal, YValPredDT, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredDT, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredDT)}")
    print('\n')

Decision Tree Classifier Accuracy Measures With gini Criterion: 
F1 Score: 0.9334495019743498
Accuracy: 0.9339622641509434
Precision: 0.9372613432165319
Recall: 0.9339622641509434
ROC AUC Score: 0.9276956364947712


Decision Tree Classifier Accuracy Measures With entropy Criterion: 
F1 Score: 0.9336923796615049
Accuracy: 0.9339622641509434
Precision: 0.9349582249764843
Recall: 0.9339622641509434
ROC AUC Score: 0.9298593580959249




#### K-Nearest Neighbors Classifier

In [135]:
from sklearn.neighbors import KNeighborsClassifier

neighbors = [5, 10, 15, 20, 30]

for neighbor in neighbors:
    knnClassifier = KNeighborsClassifier(n_neighbors=neighbor)

    knnClassifier.fit(XTrain, YTrain)

    YValPredKNN = knnClassifier.predict(XVal)

    # Evaluate the model on the test set
    print(f'KNN Accuracy Measures For {neighbor} Neighbors')
    print(f"F1 Score: {f1_score(YVal, YValPredKNN, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredKNN)}")
    print(f"Precision: {precision_score(YVal, YValPredKNN, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredKNN, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredKNN)}")
    print('\n')

KNN Accuracy Measures For 5 Neighbors
F1 Score: 0.9037300651392633
Accuracy: 0.9056603773584906
Precision: 0.9193327864369703
Recall: 0.9056603773584906
ROC AUC Score: 0.8936170212765957


KNN Accuracy Measures For 10 Neighbors
F1 Score: 0.8937743074095417
Accuracy: 0.8962264150943396
Precision: 0.912533692722372
Recall: 0.8962264150943396
ROC AUC Score: 0.8829787234042553


KNN Accuracy Measures For 15 Neighbors
F1 Score: 0.9136078023543394
Accuracy: 0.9150943396226415
Precision: 0.9263318534961154
Recall: 0.9150943396226415
ROC AUC Score: 0.9042553191489362


KNN Accuracy Measures For 20 Neighbors
F1 Score: 0.8837339374845126
Accuracy: 0.8867924528301887
Precision: 0.9059261227743822
Recall: 0.8867924528301887
ROC AUC Score: 0.8723404255319149


KNN Accuracy Measures For 30 Neighbors
F1 Score: 0.8736021536924125
Accuracy: 0.8773584905660378
Precision: 0.8995020964360587
Recall: 0.8773584905660378
ROC AUC Score: 0.8617021276595744




#### Linear Regression

In [136]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(XTrain, YTrain)

YValPredLR = lr.predict(XVal)

print('Linear Regression Accuracy Measures:')
print(f"F1 Score: {f1_score(YVal, YValPredLR, average='weighted')}")
print(f"Accuracy: {accuracy_score(YVal, YValPredLR)}")
print(f"Precision: {precision_score(YVal, YValPredLR, average='weighted')}")
print(f"Recall: {recall_score(YVal, YValPredLR, average='weighted')}")
print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredLR)}")
print('\n')

Linear Regression Accuracy Measures:
F1 Score: 0.9140559531554977
Accuracy: 0.9150943396226415
Precision: 0.9214479702687249
Recall: 0.9150943396226415
ROC AUC Score: 0.9064190407500902




#### Support Vector Classifier

In [137]:
from sklearn.svm import SVC

kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    svmClassifier = SVC(kernel=kernel)

    svmClassifier.fit(XTrain, YTrain)

    YValPredSVM = svmClassifier.predict(XVal)

    # Evaluate the model on the validation set
    print(f'SVM Accuracy Measures with {kernel} Kernel:')
    print(f"F1 Score: {f1_score(YVal, YValPredSVM, average='weighted')}")
    print(f"Accuracy: {accuracy_score(YVal, YValPredSVM)}")
    print(f"Precision: {precision_score(YVal, YValPredSVM, average='weighted')}")
    print(f"Recall: {recall_score(YVal, YValPredSVM, average='weighted')}")
    print(f"ROC AUC Score: {roc_auc_score(YVal, YValPredSVM)}")
    print('\n')

SVM Accuracy Measures with {kernel} Kernel:
F1 Score: 0.923781331267637
Accuracy: 0.9245283018867925
Precision: 0.9292435130447095
Recall: 0.9245283018867925
ROC AUC Score: 0.9170573386224306


SVM Accuracy Measures with {kernel} Kernel:
F1 Score: 0.9430662954943808
Accuracy: 0.9433962264150944
Precision: 0.9455135571853431
Recall: 0.9433962264150944
ROC AUC Score: 0.9383339343671114


SVM Accuracy Measures with {kernel} Kernel:
F1 Score: 0.9334495019743498
Accuracy: 0.9339622641509434
Precision: 0.9372613432165319
Recall: 0.9339622641509434
ROC AUC Score: 0.9276956364947712


SVM Accuracy Measures with {kernel} Kernel:
F1 Score: 0.20754716981132076
Accuracy: 0.20754716981132076
Precision: 0.20754716981132076
Recall: 0.20754716981132076
ROC AUC Score: 0.19725928597187162


