# Modeling

In [12]:
# Data manip.
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# scikit-learn
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
                             StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

# Helpers
import pickle
import os
import sys
# Make the project's `src` package importable: resolve the parent of the
# current working directory and add it to the import path once (the membership
# check keeps re-running this cell from appending duplicates).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Project-local helpers, pulled in with wildcard imports.
# NOTE(review): `ModelForScoring`, used throughout this notebook, is presumably
# provided by one of these modules (likely `src.modeling`) - confirm, and
# prefer explicit imports so the origin of each name is visible.
from src.parse_it import *
from src.modeling import *
from src.EDA import *

In [2]:
# Score registries shared by every model below: one for cross-validated
# accuracy, one for cross-validated F1. Each starts empty and they are two
# distinct dictionaries.
cv_acc_scores, cv_f1_scores = {}, {}

In [2]:
# Load the combined, lemmatized corpus; the first CSV column becomes the index.
# NOTE(review): the preview below still shows an `Unnamed: 0` column, so the
# file appears to carry a second saved index - confirm and drop it if unused.
lemmed_df = pd.read_csv(
    '../../data/lemmed_combined.csv',
    index_col=0,
)

In [3]:
# Peek at the first three rows to check which columns were loaded
lemmed_df.head(3)

Unnamed: 0,text,label,text_lem
0,"Ryan Steven Lochte lkti LOK tee born August 3,...",0,ryan steven lochte lkti lok tee bear august am...
1,CAM ships were World War II era British mercha...,0,cam ship world war ii era british merchant shi...
2,The politics of Vietnam are defined by a singl...,0,politics vietnam define single party socialist...


In [4]:
# Features are the lemmatized text column; the target is the `label` column
X, y = lemmed_df['text_lem'], lemmed_df['label']

In [5]:
# Preview the feature series (the `text_lem` column)
X

0        ryan steven lochte lkti lok tee bear august am...
1        cam ship world war ii era british merchant shi...
2        politics vietnam define single party socialist...
3        pennsylvania route pa state highway locate mon...
4        clubland tv british free air dance music chann...
                               ...                        
54111    guatemala send delegation compete summer paral...
54112    charles augustus ollivierre july march vincent...
54113    dhanushka jayakody bear july colombo sri lanka...
54114    elmer harrison flick january january american ...
54115    safdarjung tomb sandstone marble mausoleum del...
Name: text_lem, Length: 54116, dtype: object

In [6]:
# Preview the target series (the `label` column)
y

0        0
1        0
2        0
3        0
4        1
        ..
54111    0
54112    0
54113    1
54114    0
54115    0
Name: label, Length: 54116, dtype: int64

In [7]:
# Display features and target together as a tuple
# NOTE(review): this repeats the two previews directly above - consider removing
(X, y)

(0        ryan steven lochte lkti lok tee bear august am...
 1        cam ship world war ii era british merchant shi...
 2        politics vietnam define single party socialist...
 3        pennsylvania route pa state highway locate mon...
 4        clubland tv british free air dance music chann...
                                ...                        
 54111    guatemala send delegation compete summer paral...
 54112    charles augustus ollivierre july march vincent...
 54113    dhanushka jayakody bear july colombo sri lanka...
 54114    elmer harrison flick january january american ...
 54115    safdarjung tomb sandstone marble mausoleum del...
 Name: text_lem, Length: 54116, dtype: object,
 0        0
 1        0
 2        0
 3        0
 4        1
         ..
 54111    0
 54112    0
 54113    1
 54114    0
 54115    0
 Name: label, Length: 54116, dtype: int64)

In [8]:
# Hold out a test set. The split is stratified on the label and seeded for
# reproducibility; no test_size is given, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [9]:
# Split the training data again into a fitting portion (X_tr_val / y_tr_val)
# and a validation portion (X_val / y_val), stratified and seeded as before.
X_tr_val, X_val, y_tr_val, y_val = train_test_split(
    X_train, y_train, stratify=y_train, random_state=42
)

In [10]:
# Class proportions in the fitting portion of the train/validation split
y_tr_val.value_counts(normalize=True)

0    0.559527
1    0.440473
Name: label, dtype: float64

In [11]:
# Class proportions in the validation portion; the split was stratified, so
# these should be close to the proportions of the fitting portion
y_val.value_counts(normalize=True)

0    0.559476
1    0.440524
Name: label, dtype: float64

## `DummyClassifier`

### Vectorizer: `CountVectorizer`

In [13]:
# Baseline model: token counts feeding a classifier that always predicts the
# most frequent class. Pipeline.fit returns the pipeline itself, so the
# pipeline is built and fitted on the training data in one expression.
dum_count_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('dum', DummyClassifier(strategy='most_frequent')),
]).fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
dum_count_model = ModelForScoring(dum_count_pipe, 'dum_count', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.559
Test accuracy:     0.559
F-1 Score
--------------------------------
Training F1 score: 0.359
Test F1 score:     0.359


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.4min finished


In [14]:
# Record this model's accuracy and F1 in the shared score dictionaries for
# later comparison/plotting (the call prints both dictionaries, see output)
dum_count_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899}
{'dum_count': 0.3587588337846893}


Results from cross-validation with `DummyClassifier`, count vectorized:
- Validation accuracy = `0.56` — equal to the proportion of the majority class, and therefore the **baseline performance**
- Validation F1 = `0.36`
- **Execution time:** 1m, 30s

### Vectorizer: `TfidfVectorizer`

In [15]:
# Same most-frequent-class baseline, this time on TF-IDF features.
# Pipeline.fit returns the pipeline itself, so build-and-fit is chained.
dum_tfidf_pipe = Pipeline([
    ('tfvec', TfidfVectorizer()),
    ('dum', DummyClassifier(strategy='most_frequent')),
]).fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
dum_tfidf_model = ModelForScoring(dum_tfidf_pipe, 'dum_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.559
Test accuracy:     0.559
F-1 Score
--------------------------------
Training F1 score: 0.359
Test F1 score:     0.359


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished


In [16]:
# Record this model's accuracy and F1 in the shared score dictionaries for
# later comparison/plotting (the call prints both dictionaries, see output)
dum_tfidf_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893}


Results from cross-validation with `DummyClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.56`
- Validation F1 = `0.36`
- **Execution time:** 2m, 38s

## `DecisionTreeClassifier`

### Vectorizer: `CountVectorizer`

In [17]:
# Decision tree on raw token counts. The tree is seeded so that a fresh
# Restart & Run All reproduces the same model and scores; 42 matches the seed
# already used for the train/test splits.
dtc_count_pipe = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('dtc', DecisionTreeClassifier(random_state=42))
])

# Fit on the fitting portion of the train/validation split
dtc_count_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper (the recorded output shows it printing
# cross-validated accuracy and F1)
dtc_count_model = ModelForScoring(dtc_count_pipe, 'dtc_count', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.853
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.850


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.8min finished


In [18]:
# Record this model's accuracy and F1 in the shared score dictionaries
dtc_count_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133}


Results from cross-validation with `DecisionTreeClassifier`, count vectorized:
- Validation accuracy = `0.85`
- Validation F1 = `0.85`
- **Execution time:** 3m, 57s

### Vectorizer: `TfidfVectorizer`

In [19]:
# Decision tree on TF-IDF features. The tree is seeded so that a fresh
# Restart & Run All reproduces the same model and scores; 42 matches the seed
# already used for the train/test splits.
dtc_tfidf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier(random_state=42))
])

# Fit on the fitting portion of the train/validation split
dtc_tfidf_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
dtc_tfidf_model = ModelForScoring(dtc_tfidf_pipe, 'dtc_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.848
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.845


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.5min finished


In [20]:
# Record this model's accuracy and F1 in the shared score dictionaries
dtc_tfidf_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537}


Results from cross-validation with `DecisionTreeClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.85`
- Validation F1 = `0.85`
- **Execution time:** 8m, 55s

## `MultinomialNB`

### Vectorizer: `CountVectorizer`

In [21]:
# Multinomial naive Bayes on raw token counts. Pipeline.fit returns the
# pipeline itself, so build-and-fit is chained.
mnb_count_pipe = Pipeline([
    ('count', CountVectorizer()),
    ('mnb', MultinomialNB()),
]).fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
mnb_count_model = ModelForScoring(mnb_count_pipe, 'mnb_count', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.952
Test accuracy:     0.885
F-1 Score
--------------------------------
Training F1 score: 0.951
Test F1 score:     0.882


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.9min finished


In [22]:
# Record this model's accuracy and F1 in the shared score dictionaries
mnb_count_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997}


Results from cross-validation with `MultinomialNB`, count vectorized:
- Validation accuracy = `0.88`
- Validation F1 = `0.88`
- **Execution time:** 2m, 25s

### Vectorizer: `TfidfVectorizer`

In [23]:
# Multinomial naive Bayes on TF-IDF features. Pipeline.fit returns the
# pipeline itself, so build-and-fit is chained.
mnb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
]).fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
mnb_tfidf_model = ModelForScoring(mnb_tfidf_pipe, 'mnb_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.825
Test accuracy:     0.768
F-1 Score
--------------------------------
Training F1 score: 0.809
Test F1 score:     0.737


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.2min finished


In [24]:
# Record this model's accuracy and F1 in the shared score dictionaries
mnb_tfidf_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367}


Results from cross-validation with `MultinomialNB`, TF-IDF vectorized:
- Validation accuracy = `0.77`
- Validation F1 = `0.74`
- **Execution time:** 2m, 32s

## `RandomForestClassifier`

### Vectorizer: `CountVectorizer`

In [26]:
# Random forest on raw token counts. The forest is seeded so that a fresh
# Restart & Run All reproduces the same model and scores; 42 matches the seed
# already used for the train/test splits.
rfc_count_pipe = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('rfc', RandomForestClassifier(random_state=42))
])

# Fit on the fitting portion of the train/validation split
rfc_count_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
rfc_count_model = ModelForScoring(rfc_count_pipe, 'rfc_count', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.917
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.915


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.7min finished


In [27]:
# Record this model's accuracy and F1 in the shared score dictionaries
rfc_count_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868, 'rfc_count': 0.9167238587282434}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367, 'rfc_count': 0.9146474229221176}


Results from cross-validation with `RandomForestClassifier`, count vectorized:
- Validation accuracy = `0.92`
- Validation F1 = `0.91`
- **Execution time:** 9m, 44s

### Vectorizer: `TfidfVectorizer`

In [28]:
# Random forest on TF-IDF features. The forest is seeded so that a fresh
# Restart & Run All reproduces the same model and scores; 42 matches the seed
# already used for the train/test splits.
rfc_tfidf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('rfc', RandomForestClassifier(random_state=42))
])

# Fit on the fitting portion of the train/validation split
rfc_tfidf_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
rfc_tfidf_model = ModelForScoring(rfc_tfidf_pipe, 'rfc_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.926
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.924


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.4min finished


In [29]:
# Record this model's accuracy and F1 in the shared score dictionaries
rfc_tfidf_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868, 'rfc_count': 0.9167238587282434, 'rfc_tfidf': 0.9260862809459877}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367, 'rfc_count': 0.9146474229221176, 'rfc_tfidf': 0.9244487834430511}


Results from cross-validation with `RandomForestClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.93`
- Validation F1 = `0.92`
- **Execution time:** 6m, 37s

## `GradientBoostingClassifier`

### Vectorizer: `CountVectorizer`

In [30]:
# Gradient boosting on raw token counts. The estimator is seeded so that a
# fresh Restart & Run All reproduces the same model and scores; 42 matches the
# seed already used for the train/test splits.
gbc_count_pipe = Pipeline(steps=[
    ('count', CountVectorizer()),
    ('gbc', GradientBoostingClassifier(random_state=42))
])

# Fit on the fitting portion of the train/validation split
gbc_count_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
gbc_count_model = ModelForScoring(gbc_count_pipe, 'gbc_count', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.950
Test accuracy:     0.930
F-1 Score
--------------------------------
Training F1 score: 0.949
Test F1 score:     0.929


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 16.5min finished


In [31]:
# Record this model's accuracy and F1 in the shared score dictionaries
gbc_count_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868, 'rfc_count': 0.9167238587282434, 'rfc_tfidf': 0.9260862809459877, 'gbc_count': 0.9300277988865879}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367, 'rfc_count': 0.9146474229221176, 'rfc_tfidf': 0.9244487834430511, 'gbc_count': 0.9287915369932321}


Results from cross-validation with `GradientBoostingClassifier`, count vectorized:
- Validation accuracy = `0.93`
- Validation F1 = `0.93`
- **Execution time:** 12m, 25s

### Vectorizer: `TfidfVectorizer`

In [32]:
# Gradient boosting on TF-IDF features. The estimator is seeded so that a
# fresh Restart & Run All reproduces the same model and scores; 42 matches the
# seed already used for the train/test splits.
gbc_tfidf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('gbc', GradientBoostingClassifier(random_state=42))
])

# Fit on the fitting portion of the train/validation split
gbc_tfidf_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
gbc_tfidf_model = ModelForScoring(gbc_tfidf_pipe, 'gbc_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.957
Test accuracy:     0.929
F-1 Score
--------------------------------
Training F1 score: 0.956
Test F1 score:     0.928


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 37.5min finished


In [33]:
# Record this model's accuracy and F1 in the shared score dictionaries
gbc_tfidf_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868, 'rfc_count': 0.9167238587282434, 'rfc_tfidf': 0.9260862809459877, 'gbc_count': 0.9300277988865879, 'gbc_tfidf': 0.9294371028947259}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367, 'rfc_count': 0.9146474229221176, 'rfc_tfidf': 0.9244487834430511, 'gbc_count': 0.9287915369932321, 'gbc_tfidf': 0.928154578464653}


Results from cross-validation with `GradientBoostingClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.93`
- Validation F1 = `0.93`
- **Execution time:** 25m, 5s

## `XGBClassifier`

### Vectorizer: `CountVectorizer`

In [36]:
# XGBoost classifier (default settings) on raw token counts. Pipeline.fit
# returns the pipeline itself, so build-and-fit is chained.
xgb_count_pipe = Pipeline([
    ('count', CountVectorizer()),
    ('xgb', XGBClassifier()),
]).fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
xgb_count_model = ModelForScoring(xgb_count_pipe, 'xgb_count', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.940
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.939


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.2min finished


In [37]:
# Record this model's accuracy and F1 in the shared score dictionaries
xgb_count_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868, 'rfc_count': 0.9167238587282434, 'rfc_tfidf': 0.9260862809459877, 'gbc_count': 0.9300277988865879, 'gbc_tfidf': 0.9294371028947259, 'xgb_count': 0.9399817425653152}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367, 'rfc_count': 0.9146474229221176, 'rfc_tfidf': 0.9244487834430511, 'gbc_count': 0.9287915369932321, 'gbc_tfidf': 0.928154578464653, 'xgb_count': 0.9388099916941115}


Results from cross-validation with `XGBClassifier`, count vectorized:
- Validation accuracy = `0.94`
- Validation F1 = `0.94`
- **Execution time:** 8m, 47s

### Vectorizer: `TfidfVectorizer`

In [13]:
# XGBoost classifier (default settings) on TF-IDF features. Pipeline.fit
# returns the pipeline itself, so build-and-fit is chained.
# NOTE(review): the vocabulary is capped at 100000 terms here, whereas the
# other single-model TF-IDF pipelines in this notebook use an uncapped
# vectorizer, so this score is not a like-for-like comparison - confirm the
# cap is intentional.
xgb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100000)),
    ('xgb', XGBClassifier()),
]).fit(X_tr_val, y_tr_val)

# Hand the fitted pipeline, a label for the score tables, and the validation
# data to the project's scoring helper
xgb_tfidf_model = ModelForScoring(xgb_tfidf_pipe, 'xgb_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.942
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.941


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  6.0min finished


In [39]:
# Record this model's accuracy and F1 in the shared score dictionaries
# NOTE(review): in the recorded output the stored xgb_tfidf values are
# identical to xgb_count to every digit, while the CV printout for this model
# reports 0.942 / 0.941. The stored values look stale (cells run out of
# order) - re-run the notebook from a fresh kernel before trusting them.
xgb_tfidf_model.update_scores(cv_acc_scores, cv_f1_scores)

{'dum_count': 0.559475681436899, 'dum_tfidf': 0.559475681436899, 'dtc_count': 0.8526654155144493, 'dtc_tfidf': 0.8475409032088898, 'mnb_count': 0.8849904464088452, 'mnb_tfidf': 0.7680087499726868, 'rfc_count': 0.9167238587282434, 'rfc_tfidf': 0.9260862809459877, 'gbc_count': 0.9300277988865879, 'gbc_tfidf': 0.9294371028947259, 'xgb_count': 0.9399817425653152, 'xgb_tfidf': 0.9399817425653152}
{'dum_count': 0.3587588337846893, 'dum_tfidf': 0.3587588337846893, 'dtc_count': 0.8503901995883133, 'dtc_tfidf': 0.8452663394024537, 'mnb_count': 0.8815240791990997, 'mnb_tfidf': 0.7374698110271367, 'rfc_count': 0.9146474229221176, 'rfc_tfidf': 0.9244487834430511, 'gbc_count': 0.9287915369932321, 'gbc_tfidf': 0.928154578464653, 'xgb_count': 0.9388099916941115, 'xgb_tfidf': 0.9388099916941115}


In [44]:
# Show the accuracy collected for every model so far
cv_acc_scores

{'dum_count': 0.559475681436899,
 'dum_tfidf': 0.559475681436899,
 'dtc_count': 0.8526654155144493,
 'dtc_tfidf': 0.8475409032088898,
 'mnb_count': 0.8849904464088452,
 'mnb_tfidf': 0.7680087499726868,
 'rfc_count': 0.9167238587282434,
 'rfc_tfidf': 0.9260862809459877,
 'gbc_count': 0.9300277988865879,
 'gbc_tfidf': 0.9294371028947259,
 'xgb_count': 0.9399817425653152,
 'xgb_tfidf': 0.9399817425653152}

In [45]:
# Show the F1 score collected for every model so far
cv_f1_scores

{'dum_count': 0.3587588337846893,
 'dum_tfidf': 0.3587588337846893,
 'dtc_count': 0.8503901995883133,
 'dtc_tfidf': 0.8452663394024537,
 'mnb_count': 0.8815240791990997,
 'mnb_tfidf': 0.7374698110271367,
 'rfc_count': 0.9146474229221176,
 'rfc_tfidf': 0.9244487834430511,
 'gbc_count': 0.9287915369932321,
 'gbc_tfidf': 0.928154578464653,
 'xgb_count': 0.9388099916941115,
 'xgb_tfidf': 0.9388099916941115}

In [46]:
# Persist the accuracy dictionary so other notebooks can reload it without
# refitting the models
with open('../models/acc_scores.pkl', 'wb') as acc_file:
    pickle.dump(cv_acc_scores, acc_file)

In [47]:
# Persist the F1 dictionary so other notebooks can reload it without
# refitting the models
with open('../models/f1_scores.pkl', 'wb') as f1_file:
    pickle.dump(cv_f1_scores, f1_file)

## `StackingClassifier`

In [14]:
# Base learners for the stacked model: naive Bayes and XGBoost, each behind
# its own TF-IDF vectorizer capped at 100000 terms.
# NOTE(review): these assignments rebind `mnb_tfidf_pipe` and `xgb_tfidf_pipe`,
# replacing the pipelines fitted earlier in the notebook with fresh, unfitted
# ones - consider distinct names.
mnb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100000)),
    ('mnb', MultinomialNB()),
])
xgb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=100000)),
    ('xgb', XGBClassifier()),
])

# (name, estimator) pairs handed to StackingClassifier. Despite the variable
# name, these are the stack's base estimators, not its final estimator.
final_estimators = [('mnb_model', mnb_tfidf_pipe), ('xgb_model', xgb_tfidf_pipe)]

In [15]:
# Stack the base learners. No final_estimator is passed, so scikit-learn's
# default meta-learner combines their predictions.
sc_tfidf_pipe = StackingClassifier(estimators=final_estimators)
sc_tfidf_pipe.fit(X_tr_val, y_tr_val)

# Hand the fitted stack, a label for the score tables, and the validation
# data to the project's scoring helper
sc_tfidf_model = ModelForScoring(sc_tfidf_pipe, 'sc_tfidf', X_val, y_val)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.944
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.943


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 29.4min finished
