# Import Libraries and Data

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

%run -i 'functions/model_eval.py'

In [2]:
modeling_df = pd.read_csv('/Users/will4856/Downloads/modeling_df.csv', index_col='Unnamed: 0')
modeling_df.head()

Unnamed: 0,Title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,script,Metascore,imdbRating,imdbID,ROI_scaled,Metascore_scaled,imdbRating_scaled,success_metric,cleaned_scripts_lemmatize,success_failure,word_cloud_scripts
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...,45.0,6.6,tt1298650,0.005394,0.408602,0.649351,1.38,sub edit npdv indoheroes gmail com advertise p...,0,"['sub', 'edit', 'npdv', 'indoheroes', 'gmail',..."
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...,66.0,7.3,tt2395427,0.00752,0.634409,0.74026,1.56,distant explosion strucker pa report station i...,1,"['distant', 'explosion', 'strucker', 'pa', 're..."
2,Justice League,2017,300000000,229024295,655945209,355945209,1.19,"There he is! Oh, sorry. Superman, Superman, ca...",45.0,6.4,tt0974015,0.004251,0.408602,0.623377,1.34,oh sorry superman superman ask question podcas...,0,"['oh', 'sorry', 'superman', 'superman', 'ask',..."
3,Spectre,2015,300000000,200074175,879620923,579620923,1.93,"Where are you going? I won't be long. Welcome,...",60.0,6.8,tt2379713,0.005735,0.569892,0.675325,1.45,going long welcome signor soiarra trust pleasa...,1,"['going', 'long', 'welcome', 'signor', 'soiarr..."
4,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,There was a time above. A time before. There w...,44.0,6.5,tt2975590,0.006517,0.397849,0.636364,1.36,time time perfect thing diamond absolute thing...,0,"['time', 'time', 'perfect', 'thing', 'diamond'..."


# Data Pre-Processing

In [4]:
modeling_df.head()

Unnamed: 0,Title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,script,Metascore,imdbRating,imdbID,ROI_scaled,Metascore_scaled,imdbRating_scaled,success_metric,cleaned_scripts,success_failure
0,The Amazing Spider-Man,2012,220000000,262030663,757890267,537890267,2.44,"<font color=""#D900D9"">(♪♪♪)</font> <font color...",66.0,6.9,tt0948470,0.006758,0.634409,0.688312,1.48,♪♪♪ peter five four three two one ready or not...,1
1,Battleship,2012,220000000,65233400,313477717,93477717,0.42,"<font color=""#808080"">NOGRADY:</font> Today re...",41.0,5.8,tt1440129,0.002707,0.365591,0.545455,1.22,nogrady today really mark the first stage of a...,0
2,Maleficent,2014,180000000,241407328,758536735,578536735,3.21,"<font color=""#808080"">NARRATOR:</font> Let us ...",56.0,7.0,tt1587310,0.008302,0.526882,0.701299,1.48,narrator let u tell an old story anew and we w...,1
3,Thor: The Dark World,2013,150000000,206362140,644602516,494602516,3.3,"<font color=""#808080"">ODIN:</font> Long before...",54.0,6.9,tt1981115,0.008482,0.505376,0.688312,1.46,odin long before the birth of light there wa d...,1
4,Pan,2015,150000000,35088320,151525973,1525973,0.01,PAN Translation and review by Angel. I am goin...,36.0,5.7,tt3332064,0.001885,0.311828,0.532468,1.19,pan translation and review by angel i am going...,0


In [20]:
modeling_df = modeling_df.loc[modeling_df['cleaned_scripts'].notna()]

## Vectorizing

In [28]:
vectorizer = TfidfVectorizer()

In [29]:
X = modeling_df['cleaned_scripts']
y = modeling_df['success_failure']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

In [31]:
X_vect = vectorizer.fit_transform(X)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Dimensionality Reduction with TruncatedSVD

In [44]:
tsvd = TruncatedSVD()

In [45]:
tsvd.fit(X_vect)
X_train_tsvd = tsvd.transform(X_train_tfidf)
X_test_tsvd = tsvd.transform(X_test_tfidf)

# Modeling

## XGBoosted Classifier

In [74]:
xgbc_clf = XGBClassifier()

In [75]:
model_eval(xgbc_clf)

              precision    recall  f1-score   support

           0       0.55      0.65      0.60       283
           1       0.62      0.53      0.57       311

    accuracy                           0.58       594
   macro avg       0.59      0.59      0.58       594
weighted avg       0.59      0.58      0.58       594

[[183 100]
 [147 164]]
Training Accuracy Score ->  64.92411467116358
Test Accuracy Score ->  58.417508417508415


## Support Vector Machine Classifier

In [76]:
svc_clf = SVC()

In [77]:
model_eval(svc_clf)



              precision    recall  f1-score   support

           0       0.52      0.82      0.64       283
           1       0.66      0.32      0.43       311

    accuracy                           0.56       594
   macro avg       0.59      0.57      0.53       594
weighted avg       0.60      0.56      0.53       594

[[233  50]
 [213  98]]
Training Accuracy Score ->  56.155143338954474
Test Accuracy Score ->  55.72390572390572
