# Import Libraries and Data

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load

%run -i 'functions/model_eval.py'

In [2]:
modeling_df = pd.read_csv('/Users/will4856/Downloads/modeling_df.csv', index_col='Unnamed: 0')
modeling_df.head()

Unnamed: 0,Title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,script,Metascore,imdbRating,imdbID,ROI_scaled,Metascore_scaled,imdbRating_scaled,success_metric,cleaned_scripts_lemmatize,success_failure,word_cloud_scripts
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...,45.0,6.6,tt1298650,0.005394,0.408602,0.649351,1.38,sub edit npdv indoheroes gmail com advertise p...,0,"['sub', 'edit', 'npdv', 'indoheroes', 'gmail',..."
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...,66.0,7.3,tt2395427,0.00752,0.634409,0.74026,1.56,distant explosion strucker pa report station i...,1,"['distant', 'explosion', 'strucker', 'pa', 're..."
2,Justice League,2017,300000000,229024295,655945209,355945209,1.19,"There he is! Oh, sorry. Superman, Superman, ca...",45.0,6.4,tt0974015,0.004251,0.408602,0.623377,1.34,oh sorry superman superman ask question podcas...,0,"['oh', 'sorry', 'superman', 'superman', 'ask',..."
3,Spectre,2015,300000000,200074175,879620923,579620923,1.93,"Where are you going? I won't be long. Welcome,...",60.0,6.8,tt2379713,0.005735,0.569892,0.675325,1.45,going long welcome signor soiarra trust pleasa...,1,"['going', 'long', 'welcome', 'signor', 'soiarr..."
4,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,There was a time above. A time before. There w...,44.0,6.5,tt2975590,0.006517,0.397849,0.636364,1.36,time time perfect thing diamond absolute thing...,0,"['time', 'time', 'perfect', 'thing', 'diamond'..."


# Data Pre-Processing

In [3]:
modeling_df = modeling_df.loc[modeling_df['cleaned_scripts_lemmatize'].notna()]

## Vectorizing

In [4]:
vectorizer = TfidfVectorizer()

In [5]:
X = modeling_df['cleaned_scripts_lemmatize']
y = modeling_df['success_failure']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

In [7]:
X_vect = vectorizer.fit_transform(X)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Dimensionality Reduction with TruncatedSVD

In [8]:
tsvd = TruncatedSVD()

In [9]:
tsvd.fit(X_vect)
X_train_tsvd = tsvd.transform(X_train_tfidf)
X_test_tsvd = tsvd.transform(X_test_tfidf)

# Classification Modeling

## XGBoosted Classifier

In [10]:
xgbc_clf = XGBClassifier()

In [11]:
model_eval(xgbc_clf)

              precision    recall  f1-score   support

           0       0.51      0.59      0.55       279
           1       0.58      0.50      0.54       314

    accuracy                           0.54       593
   macro avg       0.55      0.55      0.54       593
weighted avg       0.55      0.54      0.54       593

[[166 113]
 [158 156]]
Training Accuracy Score ->  66.94772344013491
Test Accuracy Score ->  54.300168634064086


## Support Vector Machine Classifier

In [12]:
svc_clf = SVC()

In [13]:
model_eval(svc_clf)

              precision    recall  f1-score   support

           0       0.54      0.61      0.57       279
           1       0.61      0.55      0.58       314

    accuracy                           0.58       593
   macro avg       0.58      0.58      0.58       593
weighted avg       0.58      0.58      0.58       593

[[169 110]
 [142 172]]
Training Accuracy Score ->  60.286677908937605
Test Accuracy Score ->  57.50421585160203


## Multinomial Naive Bayes Classifier

In [14]:
nbc = MultinomialNB()

In [17]:
c_vectorizer = CountVectorizer()

In [18]:
Xc_vect = c_vectorizer.fit_transform(X)

In [22]:
X_train_cvect, X_test_cvect, y_train, y_test = train_test_split(Xc_vect, y)

In [23]:
nbc.fit(X_train_cvect, y_train)

MultinomialNB()

In [24]:
y_test_preds = nbc.predict(X_test_cvect)
y_train_preds = nbc.predict(X_train_cvect)
print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
print("Training Accuracy Score -> ",accuracy_score(y_train, y_train_preds)*100)
print("Test Accuracy Score -> ",accuracy_score(y_test, y_test_preds)*100)

              precision    recall  f1-score   support

           0       0.57      0.87      0.69       365
           1       0.74      0.36      0.48       377

    accuracy                           0.61       742
   macro avg       0.65      0.61      0.59       742
weighted avg       0.65      0.61      0.58       742

[[317  48]
 [241 136]]
Training Accuracy Score ->  82.99595141700405
Test Accuracy Score ->  61.05121293800539


## Dump Model to PKL File to Deploy

In [29]:
dump(nbc, 'NB_script_model.pkl')

['NB_script_model.pkl']