# Project 3: Web APIs & NLP
# Notebook 4: Modeling

https://github.com/pushshift/api<br>
https://api.pushshift.io/reddit/search/comment/

## Contents
- [Import Libraries and Data](#Import-Libraries-and-Data)
- [Run Classification Models with Hyperparameter Tuning](#Run-Classification-Models-with-Hyperparameter-Tuning)
- [Save Best Model](#Save-Best-Model)

## Import Libraries and Data

In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC

import pickle

pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)

In [5]:
X_train = pd.read_csv("../datasets/X_train", squeeze = True)
X_test = pd.read_csv("../datasets/X_test", squeeze = True)
y_train = pd.read_csv("../datasets/y_train", squeeze = True)
y_test = pd.read_csv("../datasets/y_test", squeeze = True)

## Run Classification Models with Hyperparameter Tuning

**Non-optimal hyperparameters are removed after each run in order to speed up re-runs.**

### Models with Count Vectorizer

In [6]:
pipe_cvec_lr = Pipeline([("cvec", CountVectorizer()),
                         ("lr", LogisticRegression(max_iter = 1000))])
pipe_cvec_lr_params = {"cvec__max_features": [4500],
                       "cvec__ngram_range": [(1,1)],
                       "cvec__stop_words": ["english", None]}
gs_cvec_lr = GridSearchCV(pipe_cvec_lr,
                          param_grid = pipe_cvec_lr_params,
                          cv = 5, n_jobs = -1)
gs_cvec_lr.fit(X_train, y_train)
gs_cvec_lr_model = gs_cvec_lr.best_estimator_
gs_cvec_lr.best_params_

{'cvec__max_features': 4500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

In [7]:
# not likely the best Naive Bayes model for Count Vectorizer or Tfidf Vectorizer
pipe_cvec_bnb = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("bnb", BernoulliNB())])
pipe_cvec_bnb_params = {"cvec__max_features": [6000],
                        "cvec__ngram_range": [(1,2)],
                        "cvec__stop_words": ["english"]}
gs_cvec_bnb = GridSearchCV(pipe_cvec_bnb,
                           param_grid = pipe_cvec_bnb_params,
                           cv = 5, n_jobs = -1)
gs_cvec_bnb.fit(X_train, y_train)
gs_cvec_bnb_model = gs_cvec_bnb.best_estimator_
gs_cvec_bnb.best_params_

{'cvec__max_features': 6000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [8]:
# likely the best Naive Bayes model for Count Vectorizer
pipe_cvec_mnb = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("mnb", MultinomialNB())])
pipe_cvec_mnb_params = {"cvec__max_features": [7500],
                        "cvec__ngram_range": [(1,1)],
                        "cvec__stop_words": [None]}
gs_cvec_mnb = GridSearchCV(pipe_cvec_mnb,
                           param_grid = pipe_cvec_mnb_params,
                           cv = 5, n_jobs = -1)
gs_cvec_mnb.fit(X_train, y_train)
gs_cvec_mnb_model = gs_cvec_mnb.best_estimator_
gs_cvec_mnb.best_params_

{'cvec__max_features': 7500,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

In [9]:
# not likely the best Naive Bayes model for Count Vectorizer
pipe_cvec_gnb = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("gnb", GaussianNB())])
pipe_cvec_gnb_params = {"cvec__max_features": [3000],
                        "cvec__ngram_range": [(1,2)],
                        "cvec__stop_words": ["english"]}
gs_cvec_gnb = GridSearchCV(pipe_cvec_gnb,
                           param_grid = pipe_cvec_gnb_params,
                           cv = 5, n_jobs = -1)
gs_cvec_gnb.fit(X_train, y_train)
gs_cvec_gnb_model = gs_cvec_gnb.best_estimator_
gs_cvec_gnb.best_params_

{'cvec__max_features': 3000,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': 'english'}

In [10]:
pipe_cvec_knn = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("knn", KNeighborsClassifier())])
pipe_cvec_knn_params = {"cvec__max_features": [500],
                        "cvec__ngram_range": [(1,2)],
                        "cvec__stop_words": [None],
                        "knn__n_neighbors": [15]}
gs_cvec_knn = GridSearchCV(pipe_cvec_knn,
                           param_grid = pipe_cvec_knn_params,
                           cv = 5, n_jobs = -1)
gs_cvec_knn.fit(X_train, y_train)
gs_cvec_knn_model = gs_cvec_knn.best_estimator_
gs_cvec_knn.best_params_

{'cvec__max_features': 500,
 'cvec__ngram_range': (1, 2),
 'cvec__stop_words': None,
 'knn__n_neighbors': 15}

In [11]:
# not likely optimal without bagging or boosting
pipe_cvec_dt = Pipeline([("cvec", CountVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("dt", DecisionTreeClassifier())])
pipe_cvec_dt_params = {"cvec__max_features": [8000],
                       "cvec__ngram_range": [(1,1)],
                       "cvec__stop_words": ["english"],
                       "dt__max_depth": [29],
                       "dt__min_samples_split": [10],
                       "dt__min_samples_leaf": [1]}
gs_cvec_dt = GridSearchCV(pipe_cvec_dt,
                          param_grid = pipe_cvec_dt_params,
                          cv = 5, n_jobs = -1)
gs_cvec_dt.fit(X_train, y_train)
gs_cvec_dt_model = gs_cvec_dt.best_estimator_
gs_cvec_dt.best_params_

{'cvec__max_features': 8000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'dt__max_depth': 29,
 'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 10}

In [12]:
pipe_cvec_dt_bag = Pipeline([("cvec", CountVectorizer()),
                             ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                             ("bag", BaggingClassifier(base_estimator = DecisionTreeClassifier()))])
pipe_cvec_dt_bag_params = {"bag__base_estimator__max_depth": [29],
                           "bag__base_estimator__min_samples_leaf": [1],
                           "bag__base_estimator__min_samples_split": [10],
                           'cvec__max_features': [8000],
                           'cvec__ngram_range': [(1, 1)],
                           'cvec__stop_words': ['english'],
                           "bag__n_estimators": [15],
                           "bag__max_samples": [.5],
                           "bag__max_features": [.3]}
gs_cvec_dt_bag = GridSearchCV(pipe_cvec_dt_bag,
                              param_grid = pipe_cvec_dt_bag_params,
                              cv = 5, n_jobs = -1)
gs_cvec_dt_bag.fit(X_train, y_train)
gs_cvec_dt_bag_model = gs_cvec_dt_bag.best_estimator_
gs_cvec_dt_bag.best_params_

{'bag__base_estimator__max_depth': 29,
 'bag__base_estimator__min_samples_leaf': 1,
 'bag__base_estimator__min_samples_split': 10,
 'bag__max_features': 0.3,
 'bag__max_samples': 0.5,
 'bag__n_estimators': 15,
 'cvec__max_features': 8000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [13]:
pipe_cvec_rf = Pipeline([("cvec", CountVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("rf", RandomForestClassifier())])
pipe_cvec_rf_params = {'cvec__max_features': [7000],
                       'cvec__ngram_range': [(1, 1)],
                       'cvec__stop_words': ['english'],
                       "rf__n_estimators": [100],
                       "rf__max_depth": [None]}
gs_cvec_rf = GridSearchCV(pipe_cvec_rf,
                          param_grid = pipe_cvec_rf_params,
                          cv = 5, n_jobs = -1)
gs_cvec_rf.fit(X_train, y_train)
gs_cvec_rf_model = gs_cvec_rf.best_estimator_
gs_cvec_rf.best_params_

{'cvec__max_features': 7000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': None,
 'rf__n_estimators': 100}

In [14]:
pipe_cvec_et = Pipeline([("cvec", CountVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("et", ExtraTreesClassifier())])
pipe_cvec_et_params = {'cvec__max_features': [6000],
                       'cvec__ngram_range': [(1, 1)],
                       'cvec__stop_words': ['english'],
                       "et__n_estimators": [150],
                       "et__max_depth": [None]}
gs_cvec_et = GridSearchCV(pipe_cvec_et,
                          param_grid = pipe_cvec_et_params,
                          cv = 5, n_jobs = -1)
gs_cvec_et.fit(X_train, y_train)
gs_cvec_et_model = gs_cvec_et.best_estimator_
gs_cvec_et.best_params_

{'cvec__max_features': 6000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'et__max_depth': None,
 'et__n_estimators': 150}

In [15]:
pipe_cvec_ada = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("ada", AdaBoostClassifier(base_estimator = DecisionTreeClassifier()))])
pipe_cvec_ada_params = {"ada__base_estimator__max_depth": [2],
                        'cvec__max_features': [8000],
                        'cvec__ngram_range': [(1, 1)],
                        'cvec__stop_words': ['english'],
                        "ada__n_estimators": [150]}
gs_cvec_ada = GridSearchCV(pipe_cvec_ada,
                           param_grid = pipe_cvec_ada_params,
                           cv = 5, n_jobs = -1)
gs_cvec_ada.fit(X_train, y_train)
gs_cvec_ada_model = gs_cvec_ada.best_estimator_
gs_cvec_ada.best_params_

{'ada__base_estimator__max_depth': 2,
 'ada__n_estimators': 150,
 'cvec__max_features': 8000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english'}

In [16]:
pipe_cvec_svc = Pipeline([("cvec", CountVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("svc", SVC(probability = True))])
pipe_cvec_svc_params = {'cvec__max_features': [7000],
                        'cvec__ngram_range': [(1, 1)],
                        'cvec__stop_words': ['english'],
                        "svc__C": [10]}
gs_cvec_svc = GridSearchCV(pipe_cvec_svc,
                           param_grid = pipe_cvec_svc_params,
                           cv = 5, n_jobs = -1)
gs_cvec_svc.fit(X_train, y_train)
gs_cvec_svc_model = gs_cvec_svc.best_estimator_
gs_cvec_svc.best_params_

{'cvec__max_features': 7000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'svc__C': 10}

### Models with Tfidf Vectorizer

In [17]:
pipe_tfidf_lr = Pipeline([("tfidf", TfidfVectorizer()),
                          ("lr", LogisticRegression(max_iter = 1000))])
pipe_tfidf_lr_params = {"tfidf__max_features": [5500],
                        "tfidf__ngram_range": [(1,1)],
                        "tfidf__stop_words": [None]}
gs_tfidf_lr = GridSearchCV(pipe_tfidf_lr,
                           param_grid = pipe_tfidf_lr_params,
                           cv = 5, n_jobs = -1)
gs_tfidf_lr.fit(X_train, y_train)
gs_tfidf_lr_model = gs_tfidf_lr.best_estimator_
gs_tfidf_lr.best_params_

{'tfidf__max_features': 5500,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

In [18]:
# not likely the best Naive Bayes model for Count Vectorizer or Tfidf Vectorizer
pipe_tfidf_bnb = Pipeline([("tfidf", TfidfVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("bnb", BernoulliNB())])
pipe_tfidf_bnb_params = {"tfidf__max_features": [3000],
                         "tfidf__ngram_range": [(1,2)],
                         "tfidf__stop_words": ["english"]}
gs_tfidf_bnb = GridSearchCV(pipe_tfidf_bnb,
                            param_grid = pipe_tfidf_bnb_params,
                            cv = 5, n_jobs = -1)
gs_tfidf_bnb.fit(X_train, y_train)
gs_tfidf_bnb_model = gs_tfidf_bnb.best_estimator_
gs_tfidf_bnb.best_params_

{'tfidf__max_features': 3000,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [19]:
# not likely the best Naive Bayes model for Tfidf Vectorizer
pipe_tfidf_mnb = Pipeline([("tfidf", TfidfVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("mnb", MultinomialNB())])
pipe_tfidf_mnb_params = {"tfidf__max_features": [3000],
                         "tfidf__ngram_range": [(1,2)],
                         "tfidf__stop_words": ["english"]}
gs_tfidf_mnb = GridSearchCV(pipe_tfidf_mnb,
                            param_grid = pipe_tfidf_mnb_params,
                            cv = 5, n_jobs = -1)
gs_tfidf_mnb.fit(X_train, y_train)
gs_tfidf_mnb_model = gs_tfidf_mnb.best_estimator_
gs_tfidf_mnb.best_params_

{'tfidf__max_features': 3000,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [20]:
# likely the best Naive Bayes model for Tfidf Vectorizer
pipe_tfidf_gnb = Pipeline([("tfidf", TfidfVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("gnb", GaussianNB())])
pipe_tfidf_gnb_params = {"tfidf__max_features": [3000],
                         "tfidf__ngram_range": [(1,2)],
                         "tfidf__stop_words": ["english"]}
gs_tfidf_gnb = GridSearchCV(pipe_tfidf_gnb,
                            param_grid = pipe_tfidf_gnb_params,
                            cv = 5, n_jobs = -1)
gs_tfidf_gnb.fit(X_train, y_train)
gs_tfidf_gnb_model = gs_tfidf_gnb.best_estimator_
gs_tfidf_gnb.best_params_

{'tfidf__max_features': 3000,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': 'english'}

In [21]:
pipe_tfidf_knn = Pipeline([("tfidf", TfidfVectorizer()),
                         ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                         ("knn", KNeighborsClassifier())])
pipe_tfidf_knn_params = {"tfidf__max_features": [100],
                         "tfidf__ngram_range": [(1,1)],
                         "tfidf__stop_words": [None],
                         "knn__n_neighbors": [15]}
gs_tfidf_knn = GridSearchCV(pipe_tfidf_knn,
                            param_grid = pipe_tfidf_knn_params,
                            cv = 5, n_jobs = -1)
gs_tfidf_knn.fit(X_train, y_train)
gs_tfidf_knn_model = gs_tfidf_knn.best_estimator_
gs_tfidf_knn.best_params_

{'knn__n_neighbors': 15,
 'tfidf__max_features': 100,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

In [22]:
# not likely optimal without bagging or boosting
pipe_tfidf_dt = Pipeline([("tfidf", TfidfVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("dt", DecisionTreeClassifier())])
pipe_tfidf_dt_params = {"tfidf__max_features": [500],
                        "tfidf__ngram_range": [(1,1)],
                        "tfidf__stop_words": [None],
                        "dt__max_depth": [10],
                        "dt__min_samples_split": [10],
                        "dt__min_samples_leaf": [3]}
gs_tfidf_dt = GridSearchCV(pipe_tfidf_dt,
                          param_grid = pipe_tfidf_dt_params,
                          cv = 5, n_jobs = -1)
gs_tfidf_dt.fit(X_train, y_train)
gs_tfidf_dt_model = gs_tfidf_dt.best_estimator_
gs_tfidf_dt.best_params_

{'dt__max_depth': 10,
 'dt__min_samples_leaf': 3,
 'dt__min_samples_split': 10,
 'tfidf__max_features': 500,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

In [23]:
pipe_tfidf_dt_bag = Pipeline([("tfidf", TfidfVectorizer()),
                              ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                              ("bag", BaggingClassifier(base_estimator = DecisionTreeClassifier()))])
pipe_tfidf_dt_bag_params = {"bag__base_estimator__max_depth": [10],
                           "bag__base_estimator__min_samples_leaf": [3],
                           "bag__base_estimator__min_samples_split": [10],
                           'tfidf__max_features': [500],
                           'tfidf__ngram_range': [(1, 1)],
                           'tfidf__stop_words': [None],
                           "bag__n_estimators": [20],
                           "bag__max_samples": [.5],
                           "bag__max_features": [.9]}
gs_tfidf_dt_bag = GridSearchCV(pipe_tfidf_dt_bag,
                              param_grid = pipe_tfidf_dt_bag_params,
                              cv = 5, n_jobs = -1)
gs_tfidf_dt_bag.fit(X_train, y_train)
gs_tfidf_dt_bag_model = gs_tfidf_dt_bag.best_estimator_
gs_tfidf_dt_bag.best_params_

{'bag__base_estimator__max_depth': 10,
 'bag__base_estimator__min_samples_leaf': 3,
 'bag__base_estimator__min_samples_split': 10,
 'bag__max_features': 0.9,
 'bag__max_samples': 0.5,
 'bag__n_estimators': 20,
 'tfidf__max_features': 500,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

In [24]:
pipe_tfidf_rf = Pipeline([("tfidf", TfidfVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("rf", RandomForestClassifier())])
pipe_tfidf_rf_params = {'tfidf__max_features': [500],
                        'tfidf__ngram_range': [(1, 1)],
                        'tfidf__stop_words': [None],
                        "rf__n_estimators": [100],
                        "rf__max_depth": [None]}
gs_tfidf_rf = GridSearchCV(pipe_cvec_rf,
                           param_grid = pipe_cvec_rf_params,
                           cv = 5, n_jobs = -1)
gs_tfidf_rf.fit(X_train, y_train)
gs_tfidf_rf_model = gs_tfidf_rf.best_estimator_
gs_tfidf_rf.best_params_

{'cvec__max_features': 7000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'rf__max_depth': None,
 'rf__n_estimators': 100}

In [25]:
pipe_tfidf_et = Pipeline([("tfidf", TfidfVectorizer()),
                          ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                          ("et", ExtraTreesClassifier())])
pipe_tfidf_et_params = {'tfidf__max_features': [500],
                        'tfidf__ngram_range': [(1, 1)],
                        'tfidf__stop_words': [None],
                        "et__n_estimators": [150],
                        "et__max_depth": [None]}
gs_tfidf_et = GridSearchCV(pipe_cvec_et,
                           param_grid = pipe_cvec_et_params,
                           cv = 5, n_jobs = -1)
gs_tfidf_et.fit(X_train, y_train)
gs_tfidf_et_model = gs_tfidf_et.best_estimator_
gs_tfidf_et.best_params_

{'cvec__max_features': 6000,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': 'english',
 'et__max_depth': None,
 'et__n_estimators': 150}

In [26]:
pipe_tfidf_ada = Pipeline([("tfidf", TfidfVectorizer()),
                           ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                           ("ada", AdaBoostClassifier(base_estimator = DecisionTreeClassifier()))])
pipe_tfidf_ada_params = {"ada__base_estimator__max_depth": [2],
                         'tfidf__max_features': [500],
                         'tfidf__ngram_range': [(1, 1)],
                         'tfidf__stop_words': [None],
                         "ada__n_estimators": [150]}
gs_tfidf_ada = GridSearchCV(pipe_tfidf_ada,
                           param_grid = pipe_tfidf_ada_params,
                           cv = 5, n_jobs = -1)
gs_tfidf_ada.fit(X_train, y_train)
gs_tfidf_ada_model = gs_tfidf_ada.best_estimator_
gs_tfidf_ada.best_params_

{'ada__base_estimator__max_depth': 2,
 'ada__n_estimators': 150,
 'tfidf__max_features': 500,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

In [27]:
pipe_tfidf_svc = Pipeline([("tfidf", TfidfVectorizer()),
                           ("ft", FunctionTransformer(lambda x: x.todense(), accept_sparse = True)),
                           ("svc", SVC(probability = True))])
pipe_tfidf_svc_params = {'tfidf__max_features': [500],
                         'tfidf__ngram_range': [(1, 1)],
                         'tfidf__stop_words': [None],
                         "svc__C": [10]}
gs_tfidf_svc = GridSearchCV(pipe_tfidf_svc,
                            param_grid = pipe_tfidf_svc_params,
                            cv = 5, n_jobs = -1)
gs_tfidf_svc.fit(X_train, y_train)
gs_tfidf_svc_model = gs_tfidf_svc.best_estimator_
gs_tfidf_svc.best_params_

{'svc__C': 10,
 'tfidf__max_features': 500,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

### Comparison of Cross-Validation Scores, Training Scores, and Test Scores

In [42]:
results_df = pd.DataFrame(index = range(0,22), columns = ["Feature Extraction", "Model", "CV Accuracy", "Training Accuracy", "Test Accuracy"])
results_df["Feature Extraction"] = ["CVEC", "CVEC", "CVEC", "CVEC", "CVEC", "CVEC", "CVEC", "CVEC", "CVEC", "CVEC", "CVEC",
                                    "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF", "TFIDF"]
results_df["Model"] = ["LR", "BNB", "MNB", "GNB", "KNN", "DT", "DT_BAG", "RF", "ET", "DT_ADA", "SVC",
                       "LR", "BNB", "MNB", "GNB", "KNN", "DT", "DT_BAG", "RF", "ET", "DT_ADA", "SVC"]
results_df["CV Accuracy"] = [gs_cvec_lr.best_score_, 
                             gs_cvec_bnb.best_score_, 
                             gs_cvec_mnb.best_score_, 
                             gs_cvec_gnb.best_score_, 
                             gs_cvec_knn.best_score_,
                             gs_cvec_dt.best_score_,
                             gs_cvec_dt_bag.best_score_,
                             gs_cvec_rf.best_score_,
                             gs_cvec_et.best_score_,
                             gs_cvec_ada.best_score_,
                             gs_cvec_svc.best_score_,
                             gs_tfidf_lr.best_score_,
                             gs_tfidf_bnb.best_score_,
                             gs_tfidf_mnb.best_score_,
                             gs_tfidf_gnb.best_score_,
                             gs_tfidf_knn.best_score_,
                             gs_tfidf_dt.best_score_,
                             gs_tfidf_dt_bag.best_score_,
                             gs_tfidf_rf.best_score_,
                             gs_tfidf_et.best_score_,
                             gs_tfidf_ada.best_score_,
                             gs_tfidf_svc.best_score_]
results_df["Training Accuracy"] = [gs_cvec_lr_model.score(X_train, y_train),
                                   gs_cvec_bnb_model.score(X_train, y_train),
                                   gs_cvec_mnb_model.score(X_train, y_train),
                                   gs_cvec_gnb_model.score(X_train, y_train),
                                   gs_cvec_knn_model.score(X_train, y_train),
                                   gs_cvec_dt_model.score(X_train, y_train),
                                   gs_cvec_dt_bag_model.score(X_train, y_train),
                                   gs_cvec_rf_model.score(X_train, y_train),
                                   gs_cvec_et_model.score(X_train, y_train),
                                   gs_cvec_ada_model.score(X_train, y_train),
                                   gs_cvec_svc_model.score(X_train, y_train),
                                   gs_tfidf_lr_model.score(X_train, y_train),
                                   gs_tfidf_bnb_model.score(X_train, y_train),
                                   gs_tfidf_mnb_model.score(X_train, y_train),
                                   gs_tfidf_gnb_model.score(X_train, y_train),
                                   gs_tfidf_knn_model.score(X_train, y_train),
                                   gs_tfidf_dt_model.score(X_train, y_train),
                                   gs_tfidf_dt_bag_model.score(X_train, y_train),
                                   gs_tfidf_rf_model.score(X_train, y_train),
                                   gs_tfidf_et_model.score(X_train, y_train),
                                   gs_tfidf_ada_model.score(X_train, y_train),
                                   gs_tfidf_svc_model.score(X_train, y_train)]
results_df["Test Accuracy"] = [gs_cvec_lr_model.score(X_test, y_test),
                               gs_cvec_bnb_model.score(X_test, y_test),
                               gs_cvec_mnb_model.score(X_test, y_test),
                               gs_cvec_gnb_model.score(X_test, y_test),
                               gs_cvec_knn_model.score(X_test, y_test),
                               gs_cvec_dt_model.score(X_test, y_test),
                               gs_cvec_dt_bag_model.score(X_test, y_test),
                               gs_cvec_rf_model.score(X_test, y_test),
                               gs_cvec_et_model.score(X_test, y_test),
                               gs_cvec_ada_model.score(X_test, y_test),
                               gs_cvec_svc_model.score(X_test, y_test),
                               gs_tfidf_lr_model.score(X_test, y_test),
                               gs_tfidf_bnb_model.score(X_test, y_test),
                               gs_tfidf_mnb_model.score(X_test, y_test),
                               gs_tfidf_gnb_model.score(X_test, y_test),
                               gs_tfidf_knn_model.score(X_test, y_test),
                               gs_tfidf_dt_model.score(X_test, y_test),
                               gs_tfidf_dt_bag_model.score(X_test, y_test),
                               gs_tfidf_rf_model.score(X_test, y_test),
                               gs_tfidf_et_model.score(X_test, y_test),
                               gs_tfidf_ada_model.score(X_test, y_test),
                               gs_tfidf_svc_model.score(X_test, y_test)]
results_df



Unnamed: 0,Feature Extraction,Model,CV Accuracy,Training Accuracy,Test Accuracy
0,CVEC,LR,0.674434,0.879619,0.681078
1,CVEC,BNB,0.650384,0.726419,0.649034
2,CVEC,MNB,0.694854,0.808092,0.70117
3,CVEC,GNB,0.641991,0.718777,0.641404
4,CVEC,KNN,0.561315,0.635475,0.552899
5,CVEC,DT,0.599275,0.68007,0.592828
6,CVEC,DT_BAG,0.637105,0.727797,0.63708
7,CVEC,RF,0.657273,0.985093,0.67116
8,CVEC,ET,0.664788,0.983465,0.665819
9,CVEC,DT_ADA,0.644871,0.759739,0.640641


## Save Best Model

**All models were overfitted (higher training score than test score), so I selected the model with the highest test score.**

**The function transformer can't be saved by pickling, so it will have to be re-fitted in the next notebook.**

In [29]:
cvec = CountVectorizer(max_features = 7500, ngram_range = (1,1), stop_words = None)
vec_fitted = cvec.fit(X_train)
X_train = cvec.transform(X_train)
X_test = cvec.transform(X_test)

ft = FunctionTransformer(lambda x: x.todense(), accept_sparse = True)
ft_fitted = ft.fit(X_train)
X_train = ft.transform(X_train)
X_test = ft.transform(X_test)

mnb = MultinomialNB()
best_model = mnb.fit(X_train, y_train)

In [30]:
print(best_model.score(X_train, y_train))
print(best_model.score(X_test, y_test))
# same as the Pipeline above, as expected

0.8080921959163222
0.7011698880976602


In [33]:
pickle.dump(vec_fitted, open('../models/vec_fitted.joblib', 'wb'))

pickle.dump(best_model, open('../models/best_model.joblib', 'wb'))

## See Notebook 5 for Testing