# Import Libraries and Data

In [63]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from joblib import dump, load

%run -i 'functions/model_eval.py'

In [42]:
modeling_df = pd.read_csv('/Users/will4856/Downloads/modeling_df.csv', index_col='Unnamed: 0')
modeling_df.head()

Unnamed: 0,Title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,script,Metascore,imdbRating,imdbID,ROI_scaled,Metascore_scaled,imdbRating_scaled,success_metric,cleaned_scripts,success_failure,word_cloud_scripts,word_count
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...,45.0,6.6,tt1298650,0.005394,0.408602,0.649351,1.38,sub edit npdv indoheroes gmail com advertise p...,0,"['sub', 'edit', 'npdv', 'indoheroes', 'gmail',...",3992
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...,66.0,7.3,tt2395427,0.00752,0.634409,0.74026,1.56,distant explosion strucker pa report stations ...,1,"['distant', 'explosion', 'strucker', 'pa', 're...",5690
2,Justice League,2017,300000000,229024295,655945209,355945209,1.19,"There he is! Oh, sorry. Superman, Superman, ca...",45.0,6.4,tt0974015,0.004251,0.408602,0.623377,1.34,oh sorry superman superman ask questions podca...,0,"['oh', 'sorry', 'superman', 'superman', 'ask',...",3255
3,Spectre,2015,300000000,200074175,879620923,579620923,1.93,"Where are you going? I won't be long. Welcome,...",60.0,6.8,tt2379713,0.005735,0.569892,0.675325,1.45,going long welcome signor soiarra trust pleasa...,1,"['going', 'long', 'welcome', 'signor', 'soiarr...",2937
4,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,There was a time above. A time before. There w...,44.0,6.5,tt2975590,0.006517,0.397849,0.636364,1.36,time time perfect things diamond absolutes thi...,0,"['time', 'time', 'perfect', 'things', 'diamond...",4451


In [31]:
modeling_df.to_csv('/Users/will4856/Desktop/script_model/modeling_df.csv')

# Data Pre-Processing

In [43]:
modeling_df = modeling_df.loc[modeling_df['cleaned_scripts'].notna()]

In [44]:
modeling_df['cleaned_scripts'] = modeling_df['cleaned_scripts'].map(lambda x: ''.join(x))

In [45]:
modeling_df

Unnamed: 0,Title,year,production_budget,domestic_gross,worldwide_gross,profit,ROI,script,Metascore,imdbRating,imdbID,ROI_scaled,Metascore_scaled,imdbRating_scaled,success_metric,cleaned_scripts,success_failure,word_cloud_scripts,word_count
0,Pirates of the Caribbean: On Stranger Tides,2011,379000000,241063875,1045663875,666663875,1.76,Sub.Re-Edit.by. :: npdv.indoheroes[at]gmail.co...,45.0,6.6,tt1298650,0.005394,0.408602,0.649351,1.38,sub edit npdv indoheroes gmail com advertise p...,0,"['sub', 'edit', 'npdv', 'indoheroes', 'gmail',...",3992
1,Avengers: Age of Ultron,2015,365000000,459005868,1396099202,1031099202,2.82,(DISTANT EXPLOSION) STRUCKER ON PA: Report to ...,66.0,7.3,tt2395427,0.007520,0.634409,0.740260,1.56,distant explosion strucker pa report stations ...,1,"['distant', 'explosion', 'strucker', 'pa', 're...",5690
2,Justice League,2017,300000000,229024295,655945209,355945209,1.19,"There he is! Oh, sorry. Superman, Superman, ca...",45.0,6.4,tt0974015,0.004251,0.408602,0.623377,1.34,oh sorry superman superman ask questions podca...,0,"['oh', 'sorry', 'superman', 'superman', 'ask',...",3255
3,Spectre,2015,300000000,200074175,879620923,579620923,1.93,"Where are you going? I won't be long. Welcome,...",60.0,6.8,tt2379713,0.005735,0.569892,0.675325,1.45,going long welcome signor soiarra trust pleasa...,1,"['going', 'long', 'welcome', 'signor', 'soiarr...",2937
4,Batman v Superman: Dawn of Justice,2016,263000000,330360194,872395091,609395091,2.32,There was a time above. A time before. There w...,44.0,6.5,tt2975590,0.006517,0.397849,0.636364,1.36,time time perfect things diamond absolutes thi...,0,"['time', 'time', 'perfect', 'things', 'diamond...",4451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3004,Princess Mononoke,2005,20000000,2374107,150345863,130345863,6.52,"[In ancient times, the land, lay covered i...",76.0,8.4,tt0119698,0.014939,0.741935,0.883117,1.80,ancient times land lay covered forests ages lo...,1,"['ancient', 'times', 'land', 'lay', 'covered',...",4583
3005,Bad Grandpa,2006,15000000,102003019,160903019,145903019,9.73,"[- Is it going?, - Yes., Okay., Leave a messa...",54.0,6.5,tt3063516,0.021376,0.505376,0.636364,1.39,going yes okay leave message irving zisman fuc...,0,"['going', 'yes', 'okay', 'leave', 'message', '...",5659
3006,High School Musical 3,2000,11000000,90559416,274392880,263392880,23.94,"[Let's go!, - I'm open, I'm open., - I don't s...",57.0,4.8,tt0962726,0.049872,0.537634,0.415584,1.07,let go open open see take time run run go go l...,0,"['let', 'go', 'open', 'open', 'see', 'take', '...",5185
3007,A Nightmare on Elm Street 5: The Dream Child,2014,6000000,22168359,22168359,16168359,2.69,"[Sixty-eight... sixty-nine..., seventy... seve...",54.0,5.2,tt0097981,0.007259,0.505376,0.467532,1.12,sixty eight sixty nine seventy seventy one com...,0,"['sixty', 'eight', 'sixty', 'nine', 'seventy',...",2095


## Vectorizing

In [145]:
vectorizer = TfidfVectorizer()

In [146]:
X = modeling_df['cleaned_scripts']
y = modeling_df['success_failure']

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.33)

In [148]:
X_vect = vectorizer.fit_transform(X)
X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Dimensionality Reduction with TruncatedSVD

In [149]:
tsvd = TruncatedSVD()

In [150]:
tsvd.fit(X_vect)
X_train_tsvd = tsvd.transform(X_train_tfidf)
X_test_tsvd = tsvd.transform(X_test_tfidf)

# Classification Modeling

## XGBoosted Classifier

In [151]:
xgbc_clf = XGBClassifier()

In [152]:
model_eval(xgbc_clf)

              precision    recall  f1-score   support

           0       0.54      0.61      0.57       466
           1       0.60      0.52      0.56       513

    accuracy                           0.56       979
   macro avg       0.57      0.57      0.56       979
weighted avg       0.57      0.56      0.56       979

[[285 181]
 [245 268]]
Training Accuracy Score ->  66.71701913393755
Test Accuracy Score ->  56.48621041879469


In [153]:
params_xgboost = {'max_depth': range(2,10,1),
         'n_estimators': range(60,220,40),
         'learning_rate': [0.1, 0.01, 0.05]}
estimator_xgboost = XGBClassifier(
    objective= 'binary:logistic',
    n_jobs=-1,
    seed=42, 
)
grid_search_xgboost = GridSearchCV(
    estimator=estimator_xgboost,
    param_grid=params_xgboost,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 10,
    verbose=True)

In [154]:
grid_search_xgboost.fit(X_train_tsvd, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:   37.2s finished


GridSearchCV(cv=10, estimator=XGBClassifier(n_jobs=-1, seed=42), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
                         'max_depth': range(2, 10),
                         'n_estimators': range(60, 220, 40)},
             scoring='accuracy', verbose=True)

In [155]:
grid_search_xgboost.best_params_

{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 100}

In [156]:
xgbc_clf = XGBClassifier(objective = 'binary:logistic', learning_rate = 0.01, max_depth = 2, n_estimators = 100)

In [157]:
model_eval(xgbc_clf)

              precision    recall  f1-score   support

           0       0.55      0.60      0.57       466
           1       0.60      0.56      0.58       513

    accuracy                           0.58       979
   macro avg       0.58      0.58      0.58       979
weighted avg       0.58      0.58      0.58       979

[[278 188]
 [228 285]]
Training Accuracy Score ->  61.22860020140987
Test Accuracy Score ->  57.507660878447396


## Support Vector Machine Classifier

In [158]:
svc_clf = SVC()

In [159]:
model_eval(svc_clf)

              precision    recall  f1-score   support

           0       0.56      0.62      0.59       466
           1       0.61      0.55      0.58       513

    accuracy                           0.58       979
   macro avg       0.59      0.58      0.58       979
weighted avg       0.59      0.58      0.58       979

[[289 177]
 [231 282]]
Training Accuracy Score ->  60.87613293051359
Test Accuracy Score ->  58.324821246169556


In [160]:
params_svc = {'C': [0.1, 1, 10, 100], 'gamma':[1,0.1,0.01,0.001]}
estimator_svc = SVC(kernel='linear',
                    class_weight='balanced')
grid_search_svc = GridSearchCV(estimator=estimator_svc, 
                              param_grid=params_svc,
                              scoring = 'accuracy',
                              n_jobs = -1,
                              cv = 10,
                              verbose = True)

In [161]:
grid_search_svc.fit(X_train_tsvd, y_train)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:    2.7s finished


GridSearchCV(cv=10, estimator=SVC(class_weight='balanced', kernel='linear'),
             n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [1, 0.1, 0.01, 0.001]},
             scoring='accuracy', verbose=True)

In [162]:
grid_search_svc.best_params_

{'C': 1, 'gamma': 1}

In [163]:
svc_clf = SVC(kernel = 'linear', C = 1, gamma = 1, class_weight='balanced')

In [164]:
model_eval(svc_clf)

              precision    recall  f1-score   support

           0       0.56      0.49      0.52       466
           1       0.58      0.65      0.62       513

    accuracy                           0.58       979
   macro avg       0.57      0.57      0.57       979
weighted avg       0.57      0.58      0.57       979

[[227 239]
 [177 336]]
Training Accuracy Score ->  57.95568982880162
Test Accuracy Score ->  57.507660878447396


## Multinomial Naive Bayes Classifier

In [137]:
nbc = MultinomialNB()

In [138]:
c_vectorizer = CountVectorizer()

In [139]:
Xc_vect = c_vectorizer.fit_transform(X)

In [140]:
X_train_cvect, X_test_cvect, y_train, y_test = train_test_split(Xc_vect, y, test_size = .5)

In [141]:
nbc.fit(X_train_cvect, y_train)

MultinomialNB()

In [142]:
y_test_preds = nbc.predict(X_test_cvect)
y_train_preds = nbc.predict(X_train_cvect)
print(classification_report(y_test, y_test_preds))
print(confusion_matrix(y_test, y_test_preds))
print("Training Accuracy Score -> ",accuracy_score(y_train, y_train_preds)*100)
print("Test Accuracy Score -> ",accuracy_score(y_test, y_test_preds)*100)

              precision    recall  f1-score   support

           0       0.55      0.88      0.68       731
           1       0.72      0.29      0.41       752

    accuracy                           0.58      1483
   macro avg       0.63      0.59      0.55      1483
weighted avg       0.64      0.58      0.54      1483

[[646  85]
 [533 219]]
Training Accuracy Score ->  82.05128205128204
Test Accuracy Score ->  58.32771409305462


## Dump Model to PKL File to Deploy

In [29]:
dump(nbc, 'NB_script_model.pkl')

['NB_script_model.pkl']