# Modeling

In [3]:
# Data manip.
import pandas as pd
import numpy as np

# Vizz
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# scikit-learn
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords')
sw = stopwords.words('english')

# Keras

# etc.
import sys
# Put ../src on the module search path before the project-local imports below
# (parse_it and pretty_results presumably live there — verify).
sys.path.append( '../src' )
from xgboost import XGBClassifier, XGBRFClassifier
from parse_it import get_wordnet_pos, parse_doc
# NOTE(review): the star import hides which names come from pretty_results;
# `pretty_cv` is the one called in the cells visible here.
from pretty_results import *

In [4]:
# Load the combined corpus; the first CSV column becomes the row index.
lemmed_df = pd.read_csv(
    '../../data/lemmed_combined.csv',
    index_col=0,
)

In [5]:
# Preview the first three rows to confirm the expected columns loaded.
lemmed_df.head(3)

Unnamed: 0,text,label,text_lem
0,"Ryan Steven Lochte lkti LOK tee born August 3,...",0,ryan steven lochte lkti lok tee bear august am...
1,CAM ships were World War II era British mercha...,0,cam ship world war ii era british merchant shi...
2,The politics of Vietnam are defined by a singl...,0,politics vietnam define single party socialist...


In [6]:
# NOTE(review): disabled draft of extra month-name stop words. Nothing in the
# cells visible here reads `additional_sw`, so it currently has no effect;
# either wire it into the vectorizers or remove this cell.
# additional_sw = ['january',
#                  'february',
#                  'april', # 'march' and 'may' are English verbs and
#                           #  are thus excluded
#                  'june',
#                  'july',
#                  'august',
#                  'september',
#                  'october',
#                  'november',
#                  'december']

In [7]:
# Model input is the `text_lem` column; the target is the `label` column.
X, y = lemmed_df['text_lem'], lemmed_df['label']

In [8]:
# Show the feature Series (pandas truncates the repr and reports its length).
X

0        ryan steven lochte lkti lok tee bear august am...
1        cam ship world war ii era british merchant shi...
2        politics vietnam define single party socialist...
3        pennsylvania route pa state highway locate mon...
4        clubland tv british free air dance music chann...
                               ...                        
54111    guatemala send delegation compete summer paral...
54112    charles augustus ollivierre july march vincent...
54113    dhanushka jayakody bear july colombo sri lanka...
54114    elmer harrison flick january january american ...
54115    safdarjung tomb sandstone marble mausoleum del...
Name: text_lem, Length: 54116, dtype: object

In [9]:
# Show the target Series (pandas truncates the repr and reports its length).
y

0        0
1        0
2        0
3        0
4        1
        ..
54111    0
54112    0
54113    1
54114    0
54115    0
Name: label, Length: 54116, dtype: int64

In [10]:
# Hold out 33% of the rows. Stratifying on y keeps the class proportions the
# same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.33,
    random_state=42,
)

In [11]:
# Class proportions in the training labels (fractions, not raw counts).
y_train.value_counts(normalize=True)

0    0.559533
1    0.440467
Name: label, dtype: float64

In [13]:
# Class proportions in the held-out labels; should match the training split
# because the split was stratified on y.
y_test.value_counts(normalize=True)

0    0.559494
1    0.440506
Name: label, dtype: float64

## `DummyClassifier`

### Vectorizer: `CountVectorizer`

In [14]:
# Baseline: token counts feeding a classifier that always predicts the most
# frequent training label.
dum_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('dum', DummyClassifier(strategy='most_frequent')),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
dum_cv_res = cross_validate(
    estimator=dum_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(dum_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.560
Test accuracy:     0.560
F-1 Score
--------------------------------
Training F1 score: 0.359
Test F1 score:     0.359


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  2.4min finished


Results from cross-validation with `DummyClassifier`, count vectorized:
- Validation accuracy = `0.56` - proportion of majority class, and **baseline performance**
- Validation F1 = `0.36`
- **Execution time:** 1m, 30s

### Vectorizer: `TfidfVectorizer`

In [15]:
# Baseline again, this time on TF-IDF features: the classifier still always
# predicts the most frequent training label.
dum_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dum', DummyClassifier(strategy='most_frequent')),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
dum_tfidf_cv_res = cross_validate(
    estimator=dum_tfidf_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(dum_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.560
Test accuracy:     0.560
F-1 Score
--------------------------------
Training F1 score: 0.359
Test F1 score:     0.359


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  2.7min finished


Results from cross-validation with `DummyClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.56`
- Validation F1 = `0.36`
- **Execution time:** 2m, 38s

## `DecisionTreeClassifier`

### Vectorizer: `CountVectorizer`

In [16]:
# Single decision tree on raw token counts.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes features at each split; unseeded, ties between equally
# good splits can resolve differently and the scores drift between runs.
dtc_pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('dtc', DecisionTreeClassifier(random_state=42))
])

# 5-fold CV on the training split only. The vectorizer sits inside the
# pipeline, so it is refit on each fold's training portion.
dtc_cv_res = cross_validate(dtc_pipe,
                            X_train,
                            y_train,
                            scoring=('accuracy', 'f1_macro'),
                            cv=5,
                            verbose=1,
                            n_jobs=-2,  # all CPUs but one
                            return_train_score=True)  # exposes the train/validation gap

pretty_cv(dtc_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.872
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.870


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  6.3min finished


Results from cross-validation with `DecisionTreeClassifier`, count vectorized:
- Validation accuracy = `0.87`
- Validation F1 = `0.87`
- **Execution time:** 3m, 57s

### Vectorizer: `TfidfVectorizer`

In [17]:
# Single decision tree on TF-IDF features.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes features at each split; unseeded, ties between equally
# good splits can resolve differently and the scores drift between runs.
dtc_tfidf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('dtc', DecisionTreeClassifier(random_state=42))
])

# 5-fold CV on the training split only. The vectorizer sits inside the
# pipeline, so it is refit on each fold's training portion.
dtc_tfidf_cv_res = cross_validate(dtc_tfidf_pipe,
                                  X_train,
                                  y_train,
                                  scoring=('accuracy', 'f1_macro'),
                                  cv=5,
                                  verbose=1,
                                  n_jobs=-2,  # all CPUs but one
                                  return_train_score=True)  # exposes the train/validation gap

pretty_cv(dtc_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.864
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.861


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  7.9min finished


Results from cross-validation with `DecisionTreeClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.86`
- Validation F1 = `0.86`
- **Execution time:** 8m, 55s

## `MultinomialNB`

### Vectorizer: `CountVectorizer`

In [18]:
# Multinomial naive Bayes on raw token counts, default hyperparameters.
mnb_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
mnb_cv_res = cross_validate(
    estimator=mnb_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(mnb_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.940
Test accuracy:     0.898
F-1 Score
--------------------------------
Training F1 score: 0.939
Test F1 score:     0.896


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  2.5min finished


Results from cross-validation with `MultinomialNB`, count vectorized:
- Validation accuracy = `0.90`
- Validation F1 = `0.90`
- **Execution time:** 2m, 25s

### Vectorizer: `TfidfVectorizer`

In [19]:
# Multinomial naive Bayes on TF-IDF features, default hyperparameters.
mnb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB()),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
mnb_tfidf_cv_res = cross_validate(
    estimator=mnb_tfidf_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(mnb_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.833
Test accuracy:     0.793
F-1 Score
--------------------------------
Training F1 score: 0.819
Test F1 score:     0.770


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  2.4min finished


Results from cross-validation with `MultinomialNB`, TF-IDF vectorized:
- Validation accuracy = `0.79`
- Validation F1 = `0.77`
- **Execution time:** 2m, 32s

## `RandomForestClassifier`

### Vectorizer: `CountVectorizer`

In [20]:
# Random forest on raw token counts.
# random_state is pinned (same seed as the train/test split): the forest draws
# bootstrap samples and random feature subsets, so an unseeded run reports
# different scores every time the cell is executed.
rfc_cvec_pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('rfc', RandomForestClassifier(random_state=42))
])

# 5-fold CV on the training split only. The vectorizer sits inside the
# pipeline, so it is refit on each fold's training portion.
rfc_cvec_cv_res = cross_validate(rfc_cvec_pipe,
                                 X_train,
                                 y_train,
                                 scoring=('accuracy', 'f1_macro'),
                                 cv=5,
                                 verbose=1,
                                 n_jobs=-2,  # all CPUs but one
                                 return_train_score=True)  # exposes the train/validation gap

pretty_cv(rfc_cvec_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.924
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.923


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  6.9min finished


Results from cross-validation with `RandomForestClassifier`, count vectorized:
- Validation accuracy = `0.92`
- Validation F1 = `0.92`
- **Execution time:** 9m, 44s

### Vectorizer: `TfidfVectorizer`

In [21]:
# Random forest on TF-IDF features.
# random_state is pinned (same seed as the train/test split): the forest draws
# bootstrap samples and random feature subsets, so an unseeded run reports
# different scores every time the cell is executed.
rfc_tfidf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('rfc', RandomForestClassifier(random_state=42))
])

# 5-fold CV on the training split only. The vectorizer sits inside the
# pipeline, so it is refit on each fold's training portion.
rfc_tfidf_cv_res = cross_validate(rfc_tfidf_pipe,
                                  X_train,
                                  y_train,
                                  scoring=('accuracy', 'f1_macro'),
                                  cv=5,
                                  verbose=1,
                                  n_jobs=-2,  # all CPUs but one
                                  return_train_score=True)  # exposes the train/validation gap

pretty_cv(rfc_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 1.000
Test accuracy:     0.930
F-1 Score
--------------------------------
Training F1 score: 1.000
Test F1 score:     0.929


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  8.8min finished


Results from cross-validation with `RandomForestClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.93`
- Validation F1 = `0.93`
- **Execution time:** 6m, 37s

## `GradientBoostingClassifier`

### Vectorizer: `CountVectorizer`

In [22]:
# Gradient-boosted trees (scikit-learn) on raw token counts.
# random_state is pinned (same seed as the train/test split): it seeds every
# tree in the ensemble and the per-split feature permutation, so without it
# repeated runs are not guaranteed to reproduce the same scores.
gbc_cvec_pipe = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('gbc', GradientBoostingClassifier(random_state=42))
])

# 5-fold CV on the training split only. The vectorizer sits inside the
# pipeline, so it is refit on each fold's training portion.
gbc_cvec_cv_res = cross_validate(gbc_cvec_pipe,
                                 X_train,
                                 y_train,
                                 scoring=('accuracy', 'f1_macro'),
                                 cv=5,
                                 verbose=1,
                                 n_jobs=-2,  # all CPUs but one
                                 return_train_score=True)  # exposes the train/validation gap

pretty_cv(gbc_cvec_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.942
Test accuracy:     0.935
F-1 Score
--------------------------------
Training F1 score: 0.941
Test F1 score:     0.934


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed: 14.0min finished


Results from cross-validation with `GradientBoostingClassifier`, count vectorized:
- Validation accuracy = `0.94`
- Validation F1 = `0.93`
- **Execution time:** 12m, 25s

In [23]:
# Inspect the raw cross_validate result dict: per-fold fit/score times and
# per-fold train/test values for each metric.
gbc_cvec_cv_res

{'fit_time': array([755.08771443, 761.89255166, 758.2399776 , 756.41465449,
        756.84861135]),
 'score_time': array([14.88831711, 15.63303924, 16.02415776, 15.69638395, 15.86716509]),
 'test_accuracy': array([0.93629344, 0.93408715, 0.9368363 , 0.93462971, 0.93435388]),
 'train_accuracy': array([0.94152732, 0.94256163, 0.94249466, 0.94121906, 0.9421499 ]),
 'test_f1_macro': array([0.93520951, 0.9328965 , 0.9357355 , 0.93356722, 0.93329635]),
 'train_f1_macro': array([0.94055742, 0.94161097, 0.94153405, 0.94023338, 0.94117187])}

### Vectorizer: `TfidfVectorizer`

In [24]:
# Gradient-boosted trees (scikit-learn) on TF-IDF features.
# random_state is pinned (same seed as the train/test split): it seeds every
# tree in the ensemble and the per-split feature permutation, so without it
# repeated runs are not guaranteed to reproduce the same scores.
gbc_tfidf_pipe = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('gbc', GradientBoostingClassifier(random_state=42))
])

# 5-fold CV on the training split only. The vectorizer sits inside the
# pipeline, so it is refit on each fold's training portion.
gbc_tfidf_cv_res = cross_validate(gbc_tfidf_pipe,
                                  X_train,
                                  y_train,
                                  scoring=('accuracy', 'f1_macro'),
                                  cv=5,
                                  verbose=1,
                                  n_jobs=-2,  # all CPUs but one
                                  return_train_score=True)  # exposes the train/validation gap

pretty_cv(gbc_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.946
Test accuracy:     0.937
F-1 Score
--------------------------------
Training F1 score: 0.945
Test F1 score:     0.936


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed: 48.2min finished


Results from cross-validation with `GradientBoostingClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.94`
- Validation F1 = `0.94`
- **Execution time:** 25m, 5s

## `XGBClassifier`

### Vectorizer: `CountVectorizer`

In [25]:
# XGBoost classifier with default hyperparameters on raw token counts.
xgb_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('xgb', XGBClassifier()),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
xgb_cvec_cv_res = cross_validate(
    estimator=xgb_cvec_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(xgb_cvec_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.998
Test accuracy:     0.952
F-1 Score
--------------------------------
Training F1 score: 0.998
Test F1 score:     0.952


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed: 43.9min finished


Results from cross-validation with `XGBClassifier`, count vectorized:
- Validation accuracy = `0.95`
- Validation F1 = `0.95`
- **Execution time:** 8m, 47s

### Vectorizer: `TfidfVectorizer`

In [26]:
# XGBoost classifier with default hyperparameters on TF-IDF features.
xgb_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgb', XGBClassifier()),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
xgb_tfidf_cv_res = cross_validate(
    estimator=xgb_tfidf_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(xgb_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.998
Test accuracy:     0.954
F-1 Score
--------------------------------
Training F1 score: 0.998
Test F1 score:     0.953


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed: 23.5min finished


Results from cross-validation with `XGBClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.95`
- Validation F1 = `0.95`
- **Execution time:** 18m, 22s

## `XGBRFClassifier`

### Vectorizer: `CountVectorizer`

In [27]:
# XGBoost random-forest classifier with default hyperparameters on raw token
# counts.
xgbrf_cvec_pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('xgbrf', XGBRFClassifier()),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
xgbrf_cvec_cv_res = cross_validate(
    estimator=xgbrf_cvec_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(xgbrf_cvec_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.894
Test accuracy:     0.882
F-1 Score
--------------------------------
Training F1 score: 0.892
Test F1 score:     0.879


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed:  8.5min finished


Results from cross-validation with `XGBRFClassifier`, count vectorized:
- Validation accuracy = `0.88`
- Validation F1 = `0.88`
- **Execution time:** 8m, 24s

### Vectorizer: `TfidfVectorizer`

In [28]:
# XGBoost random-forest classifier with default hyperparameters on TF-IDF
# features.
xgbrf_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgbrf', XGBRFClassifier()),
])

# 5-fold CV on the training split only; train scores are kept so they can be
# compared with the held-out-fold scores.
xgbrf_tfidf_cv_res = cross_validate(
    estimator=xgbrf_tfidf_pipe,
    X=X_train,
    y=y_train,
    cv=5,
    scoring=['accuracy', 'f1_macro'],
    return_train_score=True,
    n_jobs=-2,  # all CPUs but one
    verbose=1,
)

pretty_cv(xgbrf_tfidf_cv_res)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


CV Results
Accuracy
--------------------------------
Training accuracy: 0.902
Test accuracy:     0.888
F-1 Score
--------------------------------
Training F1 score: 0.900
Test F1 score:     0.886


[Parallel(n_jobs=-2)]: Done   5 out of   5 | elapsed: 15.7min finished


Results from cross-validation with `XGBRFClassifier`, TF-IDF vectorized:
- Validation accuracy = `0.89`
- Validation F1 = `0.89`
- **Execution time:** 14m, 0s