In [1]:
import pandas as pd
import numpy as np

import re
from text_helper import word_splitter, sentence_count, stop_word_counter, punc_counter
from nltk.corpus import stopwords


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import make_column_selector, ColumnTransformer

import pickle

rs = 91923  # Shared random seed: reused for the train/test split and the Extra Trees models below

In [2]:
# Load the labelled responses (path is relative to the notebook's working directory).
df = pd.read_csv('../data/data_final.csv')


def _clean_response(text):
    """Collapse runs of 2+ newlines into one, then replace u/<name> mentions with 'they'."""
    text = re.sub('[\\n]{2,}', '\n', text)
    return re.sub(r"\/*u\/[\S]+", 'they', text)


df['response_cleaned'] = df['response'].apply(_clean_response)
cleaned = df['response_cleaned']

# Meta-features from the text_helper functions. word_splitter and sentence_count
# return indexable results; element 0 feeds the count column and element 1 the
# length column (presumably a count and an average length; confirm in text_helper).
df['num_words'] = cleaned.apply(lambda text: word_splitter(text)[0])
df['stop_words'] = cleaned.apply(stop_word_counter)
df['num_sentences'] = cleaned.apply(lambda text: sentence_count(text)[0])
df['sentence_length'] = cleaned.apply(lambda text: sentence_count(text)[1])
df['word_length'] = cleaned.apply(lambda text: word_splitter(text)[1])

# Punctuation count divided by the word count.
punc_count = cleaned.apply(punc_counter)
df['punc_ratio'] = punc_count / df['num_words']

# Feature matrix (subreddit label, cleaned text, numeric meta-features) and target.
feature_columns = [
    'subreddit',
    'response_cleaned',
    'num_words',
    'stop_words',
    'num_sentences',
    'sentence_length',
    'word_length',
    'punc_ratio',
]
X = df[feature_columns]
y = df['fake']

# 75/25 split, stratified on the target and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = rs,
    stratify = y,
)

In [5]:
# Rebuild the preprocessing pipelines and record pre-tuning baselines for the three models.

# Numeric meta-features: the original note says Naive Bayes is no longer being tested,
# so the scaler's default parameters are fine here.
meta_pipe = Pipeline(steps = [('ss', StandardScaler())])

# Response text: raw token counts.
text_pipe = Pipeline(steps = [('vect', CountVectorizer())])

# Subreddit label: one-hot encoded.
subreddit_pipe = Pipeline(steps = [('ohe', OneHotEncoder())])

# Route each column group to its own pipeline.
col_trans = ColumnTransformer(
    transformers = [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit']),
    ],
    n_jobs = 6,
)

# One pipeline per candidate classifier, all sharing the same column transformer.
et_pipe = Pipeline(steps = [('ct', col_trans), ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))])
rsvc_pipe = Pipeline(steps = [('ct', col_trans), ('rsvc', SVC(kernel = 'rbf'))])
adbc_pipe = Pipeline(steps = [('ct', col_trans), ('adbc', AdaBoostClassifier())])

for baseline_pipe in (et_pipe, rsvc_pipe, adbc_pipe):
    baseline_pipe.fit(X_train, y_train)

# Accuracy on the training and held-out splits for each baseline.
et_train_acc, et_test_acc = et_pipe.score(X_train, y_train), et_pipe.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

rsvc_train_acc, rsvc_test_acc = rsvc_pipe.score(X_train, y_train), rsvc_pipe.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

adb_train_acc, adb_test_acc = adbc_pipe.score(X_train, y_train), adbc_pipe.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

ET Train: 1.0
ET Test: 0.9043221278167713
----
RSVC Train: 0.9125508067495997
RSVC Test: 0.8880679719246398
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [15]:
# Redefining our text_pipe from before to include a standard scaler
text_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('ss', StandardScaler(with_mean = False)) # scaling with mean doesn't work on sparse arrays so we'll have to do without
    ]
)

# Rebuild the column transformer so it references the NEW text_pipe. Rebinding the
# name text_pipe does not change an already-built ColumnTransformer: it still holds
# the previous pipeline object, so without this step a fresh Restart & Run All
# would leave the scaler out of every pipeline below.
col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

# Fresh (unfitted) model pipelines built on the rebuilt column transformer.
et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

In [16]:
# Swap the text branch's vectorizer for TF-IDF and score each pipeline with 5-fold CV.
# NOTE(review): TfidfVectorizer() is the only candidate in this grid, so the search
# cannot compare it against CountVectorizer - confirm whether both were intended.
vect_params = {'ct__text__vect' : [TfidfVectorizer()]}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# For each search: the selected vectorizer, then train/test accuracy of the refit best estimator.
et_vect = et_grid.best_estimator_['ct'].transformers[1][1]['vect']
print(et_vect)
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

rsvc_vect = rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect']
print(rsvc_vect)
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

adb_vect = adb_grid.best_estimator_['ct'].transformers[1][1]['vect']
print(adb_vect)
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer()
ET Train: 1.0
ET Test: 0.9017362393793867
----
TfidfVectorizer()
RSVC Train: 0.970070205690356
RSVC Test: 0.6893239748799409
----
TfidfVectorizer()
AdaBoost Train: 0.9022047050129326
AdaBoost Test: 0.8766161802733653
----


We see a slight improvement going to a standard scaler with TF-IDF vectorizing for Extra Trees but we see that Adaptive Boosting's performance has dropped actually and Radial SVC has dropped dramatically as well. For Radial SVC it seems likely that the standard scaler is the issue so we can retry without that and for Adaptive Boosting let's try both TF-IDF and count vectorizer without a scaler before moving on.

In [17]:
# Text branch WITHOUT the scaler (the grid below swaps the vectorizer for TF-IDF).
text_pipe = Pipeline(steps = [('vect', CountVectorizer())])

# Rebuild the column transformer around the scaler-free text branch.
col_trans = ColumnTransformer(
    transformers = [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit']),
    ],
    n_jobs = 6,
)

rsvc_pipe = Pipeline(steps = [('ct', col_trans), ('rsvc', SVC(kernel = 'rbf'))])
adbc_pipe = Pipeline(steps = [('ct', col_trans), ('adbc', AdaBoostClassifier())])

# NOTE(review): TfidfVectorizer() is the only candidate here, so CountVectorizer is
# never scored by this search - confirm whether both were intended.
vect_params = {'ct__text__vect' : [TfidfVectorizer()]}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer()
RSVC Train: 0.9338588496120211
RSVC Test: 0.902844477281123
----
TfidfVectorizer()
AdaBoost Train: 0.9022047050129326
AdaBoost Test: 0.8766161802733653
----


We see both models prefer the TF-IDF vectorizer over the count vectorizer. Despite the slightly lower test performance for Adaptive Boosting, let's trust that cross-validation is a better approximation of our model's true performance and proceed with the TF-IDF vectorizer. Our slightly increased overfit should give us some pause, but let's set that aside for now. We also see that the standard scaler had no effect for Adaptive Boosting, which is interesting. Radial SVC does have a notable performance bump, bringing it barely to first place at the moment.

In [18]:
# Numeric meta-features: default StandardScaler (per the earlier note, negative
# values are no longer a concern).
meta_pipe = Pipeline(steps = [('ss', StandardScaler())])

# Extra Trees text branch: TF-IDF followed by variance-only scaling.
# Only et_pipe uses this branch; RSVC and AdaBoost use the scaler-free text_pipe below.
et_text_pipe = Pipeline(
    steps = [
        ('vect', TfidfVectorizer()),
        ('ss', StandardScaler(with_mean = False)),
    ]
)

# Text branch for RSVC and AdaBoost: TF-IDF only.
text_pipe = Pipeline(steps = [('vect', TfidfVectorizer())])

# Subreddit label: one-hot encoded.
subreddit_pipe = Pipeline(steps = [('ohe', OneHotEncoder())])

# Column transformer for Extra Trees (scaled TF-IDF text branch).
et_col_trans = ColumnTransformer(
    transformers = [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', et_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit']),
    ],
    n_jobs = 6,
)

# Column transformer shared by RSVC and AdaBoost (unscaled TF-IDF text branch).
col_trans = ColumnTransformer(
    transformers = [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit']),
    ],
    n_jobs = 6,
)

et_pipe = Pipeline(steps = [('ct', et_col_trans), ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))])
rsvc_pipe = Pipeline(steps = [('ct', col_trans), ('rsvc', SVC(kernel = 'rbf'))])
adbc_pipe = Pipeline(steps = [('ct', col_trans), ('adbc', AdaBoostClassifier())])

In [19]:
# Search n-gram range and stop-word handling for the TF-IDF vectorizer:
# no stop-word list, scikit-learn's built-in 'english' list, or NLTK's English list.
vect_params = {
    'ct__text__vect__ngram_range' : [(1,1), (1,2), (1,3)],
    'ct__text__vect__stop_words' : [None, 'english', stopwords.words('english')],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer(ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9050609530845954
----
TfidfVectorizer()
RSVC Train: 0.9338588496120211
RSVC Test: 0.902844477281123
----
TfidfVectorizer(ngram_range=(1, 2))
AdaBoost Train: 0.9033132159132898
AdaBoost Test: 0.8832656076837828
----


All models prefer having the stopwords included, which was expected. We also see that RSVC prefers 1-grams, while Extra Trees and Adaptive Boosting prefer having bigrams included.

In [20]:
# Vocabulary-size and document-frequency search.
# Extra Trees and AdaBoost keep unigrams + bigrams fixed; RSVC uses the default n-gram range.
vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [1000, 2000, 3000],
    'ct__text__vect__max_df' : [1.0, 0.9, 0.8],
    'ct__text__vect__min_df' : [1, 10, 100],
}

rsvc_vect_params = {
    'ct__text__vect__max_features' : [1000, 2000, 3000],
    'ct__text__vect__max_df' : [1.0, 0.9, 0.8],
    'ct__text__vect__min_df' : [1, 10, 100],
}

# error_score = 'raise' makes a failing fit stop this search instead of being scored and skipped.
et_grid = GridSearchCV(
    estimator = et_pipe,
    param_grid = vect_params,
    cv = 5,
    n_jobs = 6,
    error_score = 'raise',
)
et_grid.fit(X_train, y_train)

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9298115995567049
----
TfidfVectorizer(max_features=1000, min_df=100)
RSVC Train: 0.9358295356571006
RSVC Test: 0.9142962689323975
----
TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
AdaBoost Train: 0.906269244980909
AdaBoost Test: 0.8810491318803103
----


Each model has its own preferences for hyperparameters so we'll need to tune them separately moving forward. Let's start with Extra Trees.

### Tuning TF-IDF Vectorizer for Extra Trees

In [21]:
# Extra Trees: widen the max_features range and probe max_df / min_df with bigrams fixed on.
# Integer min_df is an absolute document count; float min_df is a proportion of documents.
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [2000, 2500, 3000, 4000, 5000],
    'ct__text__vect__max_df' : [1.0, 0.5],
    'ct__text__vect__min_df' : [1, 0.1, 0.2],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

TfidfVectorizer(max_df=0.5, max_features=3000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9264868858514961
----


This best model has very slightly worse test performance but it also has a very extreme value for max_df so let's try and narrow down the true ideal value.

In [23]:
# Extra Trees: narrow max_features around 3000 and try max_df values between 0.6 and 1.0.
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [2750, 3000, 3250, 3500],
    'ct__text__vect__max_df' : [1.0, 0.75, 0.7, 0.6],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

TfidfVectorizer(max_df=0.7, max_features=2750, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9301810121906169
----


In [26]:
# Extra Trees: hold n-grams, max_features and max_df fixed and vary only min_df.
# NOTE(review): max_df is fixed at 0.75 here, while the preceding search printed
# max_df=0.7 as its best value - confirm which one is intended.
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [2750],
    'ct__text__vect__max_df' : [0.75],
    'ct__text__vect__min_df' : [1, 10, 100, 0.15],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

TfidfVectorizer(max_df=0.75, max_features=2750, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.925748060583672
----


This looks as close to optimal as we can get so let's set aside Extra Trees and move on to Radial SVC.

### Tuning TF-IDF Vectorizer for Radial SVC

In [27]:
# Radial SVC: search around the previously selected max_features=1000 / min_df=100.
vect_params = {
    'ct__text__vect__max_features' : [1000, 500, 1500, 750],
    'ct__text__vect__min_df' : [100, 50, 75, 150, 200],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

TfidfVectorizer(max_features=1000, min_df=100)
RSVC Train: 0.9358295356571006
RSVC Test: 0.9142962689323975
----


In [28]:
# Radial SVC: finer max_features steps around 1000, plus a much larger min_df for comparison.
vect_params = {
    'ct__text__vect__max_features' : [800, 900, 1000, 1100, 1250],
    'ct__text__vect__min_df' : [100, 500],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

TfidfVectorizer(max_features=800, min_df=100)
RSVC Train: 0.9358295356571006
RSVC Test: 0.9142962689323975
----


This seems like a good stopping point for RSVC, so let's move on to Adaptive Boosting before getting into tuning the hyperparameters of our models.

### Tuning TF-IDF Vectorizer for Adaptive Boosting

In [29]:
# AdaBoost: search max_features around 2000 together with max_df / min_df, bigrams fixed on.
vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [1750, 2000, 2250, 2500],
    'ct__text__vect__max_df' : [1.0, 0.7, 0.6, 0.5],
    'ct__text__vect__min_df' : [1, 10, 100],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
AdaBoost Train: 0.906269244980909
AdaBoost Test: 0.8810491318803103
----


In [30]:
# AdaBoost: fix max_features at 2000 and compare proportional min_df cut-offs with the default.
vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [2000],
    'ct__text__vect__min_df' : [1, 0.1, 0.2, 0.05],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
AdaBoost Train: 0.906269244980909
AdaBoost Test: 0.8810491318803103
----


Our model seems mostly happy with this combination but this is relatively high overfitting compared to how this model was doing earlier so let's try ignoring this set of max_features and trying to find a comparable one with less overfit.

In [32]:
# AdaBoost: restrict the search to smaller vocabularies (500-1250 features), bigrams fixed on.
vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [500, 600, 700, 800, 900, 1000, 1250],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected vectorizer, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

TfidfVectorizer(max_features=1250, ngram_range=(1, 2))
AdaBoost Train: 0.9004803547234881
AdaBoost Test: 0.8865903213889915
----


This actually seems to have worked: slightly lower train accuracy but slightly higher test accuracy, and less overfit even if only slightly. Moreover, this model uses fewer features, which is preferable where possible for time, memory, and variance reasons.

Now that we have our TF-IDF vectorizer parameters, let's redefine our pipes with the new hyperparameters and get ready to tune hyperparameters for our models.

In [4]:
# Numeric meta-features: default StandardScaler (per the earlier note, negative
# values are no longer a concern).
meta_pipe = Pipeline(steps = [('ss', StandardScaler())])

# Per-model text branches carrying the vectorizer settings chosen above.
# Only the Extra Trees branch keeps the variance-only scaler; the RSVC and
# AdaBoost branches are TF-IDF only.
et_text_pipe = Pipeline(
    steps = [
        ('vect', TfidfVectorizer(max_df = 0.75, max_features = 2750, ngram_range = (1, 2))),
        ('ss', StandardScaler(with_mean = False)),
    ]
)
rsvc_text_pipe = Pipeline(steps = [('vect', TfidfVectorizer(max_features = 800, min_df = 100))])
adb_text_pipe = Pipeline(steps = [('vect', TfidfVectorizer(max_features = 1250, ngram_range = (1, 2)))])

# Subreddit label: one-hot encoded.
subreddit_pipe = Pipeline(steps = [('ohe', OneHotEncoder())])


def _make_col_trans(text_branch):
    """Build the meta / text / subreddit ColumnTransformer around the given text pipeline."""
    return ColumnTransformer(
        transformers = [
            ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
            ('text', text_branch, 'response_cleaned'),
            ('ohe', subreddit_pipe, ['subreddit']),
        ],
        n_jobs = 6,
    )


et_col_trans = _make_col_trans(et_text_pipe)
rsvc_col_trans = _make_col_trans(rsvc_text_pipe)
adb_col_trans = _make_col_trans(adb_text_pipe)

# Model pipelines with default classifier hyperparameters, ready for tuning.
et_pipe = Pipeline(steps = [('ct', et_col_trans), ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))])
rsvc_pipe = Pipeline(steps = [('ct', rsvc_col_trans), ('rsvc', SVC(kernel = 'rbf'))])
adbc_pipe = Pipeline(steps = [('ct', adb_col_trans), ('adbc', AdaBoostClassifier())])

### Tuning hyperparameters for RSVC

Let's start with our SVC model because it has the fewest hyperparameters and the worst performance.

In [5]:
# Used https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/ along with the documentation to understand these parameters
rsvc_params = {
    # C is the INVERSE of the regularization strength (squared-l2 penalty):
    # larger C means weaker regularization, the opposite direction to Ridge's alpha.
    'rsvc__C' : [1.0, 0.1, 10, 0.5, 5],
    # gamma controls how far the influence of a single training point reaches
    # (lower values reach further).
    'rsvc__gamma' : ['scale', 'auto', 1.0, 0.1, 0.01, 0.001],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=5)
RSVC Train: 0.9798004680379357
RSVC Test: 0.9283339490210565
----


Tuning the regularization parameter `C` has improved this model by quite a bit (note that a larger `C` means *weaker* regularization), and we see that the default gamma is currently preferred. The default setting of 'scale' uses $\frac{1}{n_{\text{features}} \cdot \mathrm{Var}(X)}$, which means that it's picking an appropriate and already specific value that is scaled to this model. Let's continue this approach a bit longer and see if we get anything noticeably better.

In [6]:
# Radial SVC: finer C values around 5, with explicit numeric gamma candidates only.
rsvc_params = {
    'rsvc__C' : [2.5, 3, 5, 6, 7.5, 9],
    'rsvc__gamma' : [0.1, 0.2, 0.5, 0.05],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=5, gamma=0.2)
RSVC Train: 0.9938416061091268
RSVC Test: 0.930550424824529
----


While this model does perform slightly better than our previous one, it seems that changing gamma has actually led to a harsher overfit. Instead, let's stop tuning gamma and use the default while we tune only C.

In [7]:
# Radial SVC: tune C alone, leaving gamma at its default.
rsvc_params = {'rsvc__C' : [4, 4.5, 5, 5.5, 8]}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=8)
RSVC Train: 0.9884222194851583
RSVC Test: 0.9272257111193203
----


Again, we see a bit of the overfit issue here, but it's actually less than we had before, so let's try a bit of gamma tuning at this level of C.

In [8]:
# Radial SVC: hold C at 8 and sweep gamma across the two presets and a wide numeric range.
rsvc_params = {
    'rsvc__C' : [8],
    'rsvc__gamma' : ['scale', 'auto', 0.1, 0.5, 1.0, 2.0, 5.0, 50.0],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
rsvc_train_acc, rsvc_test_acc = rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)
print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=8)
RSVC Train: 0.9884222194851583
RSVC Test: 0.9272257111193203
----


This seems like a good stopping point. We could find a slightly better value of C and we have the overfit issue still but this is much better than where we started and not the most overfit we've had so far.

### Tuning hyper parameters for Adaptive Boosting

In [9]:
adb_params = {
    # Number of boosting rounds: more estimators means more simple models trained in sequence.
    'adbc__n_estimators' : [50, 25, 75, 100, 150, 200],
    # Weight applied to each estimator: a higher rate makes each classifier's vote count for more.
    'adbc__learning_rate' : [1.0, 2.0, 5.0, 10.0, 50.0],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9497475058504742
AdaBoost Test: 0.9080162541558922
----


We see our model prefers the original learning rate but let's investigate that further before we move on to further tuning the number of estimators.

In [10]:
# AdaBoost: same estimator counts, with learning rates clustered just above the default of 1.0.
adb_params = {
    'adbc__n_estimators' : [50, 25, 75, 100, 150, 200],
    'adbc__learning_rate' : [1.0, 1.1, 1.5, 1.75],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9497475058504742
AdaBoost Test: 0.9080162541558922
----


It looks like slight changes on learning rate are still not changing much so let's set it aside.

In [11]:
# AdaBoost: push the number of estimators well past 200, default learning rate.
adb_params = {'adbc__n_estimators' : [250, 300, 400, 500, 1000]}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.980416307427023
AdaBoost Test: 0.9069080162541558
----


In [12]:
# AdaBoost: finer steps around 400 estimators.
adb_params = {'adbc__n_estimators' : [400, 450, 350, 300]}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.980416307427023
AdaBoost Test: 0.9069080162541558
----


We're not really getting anywhere here so let's step back and adjust learning rate to see if we can find any improvements there.

In [14]:
# AdaBoost: hold 400 estimators fixed and sweep the learning rate from 1.0 up to 500.
adb_params = {
    'adbc__n_estimators' : [400],
    'adbc__learning_rate' : [1.0, 5, 2.5, 1.1, 100, 500],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.980416307427023
AdaBoost Test: 0.9069080162541558
----


At this point we can say that this line of reasoning simply isn't working. Let's try some more values for the number of estimators because this model is currently lagging behind our other models' performance. We could be at a soft maximum for performance and be missing a sizable performance gain.

In [15]:
# AdaBoost: a wide sweep of estimator counts, from 75 up to 5000.
adb_params = {'adbc__n_estimators' : [75, 150, 400, 1250, 1500, 1750, 2500, 5000]}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
adb_train_acc, adb_test_acc = adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)
print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.980416307427023
AdaBoost Test: 0.9069080162541558
----


We spent a bit longer tuning just to get a better feel for how the number of estimators and learning rate change our accuracy over time. We hadn't covered this in particular in class, so spending the extra time to build that familiarity seemed warranted. One thing that stands out here is that while our test score went up, so did the gap between our training and test splits. This is notable but not surprising, since more estimators intuitively should lead to more overfit. Since our performance went up a fair bit and these models are doing better through cross-validation, we can comfortably accept these are better models. We will proceed with the 200 estimator model solely for the reason that it performed about the same with less gap between training and test split. Let's move on to Extra Trees.

### Tuning hyper parameters for Extra Trees

In [16]:
# There are a LOT of parameters for Extra Trees so tuning many values at once is difficult
et_vect_params = {
    'et__n_estimators' : (50, 100, 200), # Number of trees
    'et__max_depth' : (None, 5, 10), # How deep each tree can be (higher is better at predicting but can lead to overfit)
    'et__bootstrap' : (False, True), # Whether or not each tree is trained on bootstrapped data or the original data
    'et__max_features' : (None, 'sqrt', 'log2'), # The maximum number of features to use in a given tree
    'et__min_samples_split' : (2, 10, 50) # The minimum number of elements in a leaf node
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(n_estimators=200, n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9287033616549686
----


Our best model seems to be using the highest number of estimators, so let's keep increasing there and try a few caps on maximum depth while we're at it to try and combat overfitting. The test accuracy is actually a bit lower than the default, so we should keep in mind that the default is at least comparable to this model despite being simpler with fewer estimators.

In [17]:
# Extra Trees: more trees, with a couple of depth caps compared against unlimited depth.
et_vect_params = {
    'et__n_estimators' : [150, 200, 300, 500],
    'et__max_depth' : [None, 15, 50],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(n_estimators=500, n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9301810121906169
----


Again, we see the highest number of estimators is best and the model does not like having a maximum depth. Let's try more estimators again.

In [18]:
# Extra Trees: extend the estimator range to 1000, again with and without depth caps.
et_vect_params = {
    'et__n_estimators' : [450, 500, 750, 1000],
    'et__max_depth' : [None, 15, 25],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(n_estimators=750, n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9290727742888807
----


We've homed in a bit on a preferred number of estimators, so let's try the other hyperparameters some more.

In [19]:
# Extra Trees: estimator counts around 750 with capped depth and split/leaf size limits.
# Unlimited depth (None) is not among the candidates in this search.
et_vect_params = {
    'et__n_estimators' : [600, 700, 750, 800, 900],
    'et__max_depth' : [5, 15, 25, 50],
    'et__min_samples_split' : [2, 4, 6],
    'et__min_samples_leaf' : [1, 2, 3],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(max_depth=50, min_samples_split=4, n_estimators=900,
                     n_jobs=6, random_state=91923)
ET Train: 0.9956891242763887
ET Test: 0.9194680458071666
----


It doesn't seem like this experiment has yielded any good results, so let's make one more attempt before we move on. This time we'll limit max depth again but provide more estimators.

In [20]:
# Extra Trees: larger ensembles (900-1500 trees) with depth capped between 40 and 75.
et_vect_params = {
    'et__n_estimators' : [900, 1000, 1250, 1500],
    'et__max_depth' : [40, 50, 60, 75],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(max_depth=75, n_estimators=1250, n_jobs=6,
                     random_state=91923)
ET Train: 0.9995073284887301
ET Test: 0.9287033616549686
----


This is an interesting result and does bear some further consideration. Since it seems like we can get comparable results this way let's try and improve this further before going back to our best performing model overall.

In [21]:
# Extra Trees: hold 1250 trees fixed and vary depth cap, split size and leaf size.
et_vect_params = {
    'et__n_estimators' : [1250],
    'et__max_depth' : [75, 80, 100],
    'et__min_samples_split' : [4, 8, 12],
    'et__min_samples_leaf' : [2, 5, 7],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# Selected classifier, then train/test accuracy of the refit best estimator.
et_train_acc, et_test_acc = et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)
print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(max_depth=100, min_samples_leaf=2, min_samples_split=8,
                     n_estimators=1250, n_jobs=6, random_state=91923)
ET Train: 0.9954427885207537
ET Test: 0.9253786479497599
----


At this point it's become evident that this approach is unlikely to show better performance than our earlier model with fewer non-default parameters. Between the 500 and the 750 estimator models, we will err on the side that fewer estimators is better within the same tier of performance, since they have near identical accuracies. This is in part for the theoretical benefit of avoiding overfit, but mainly we want the simpler model that will run a bit faster, if only marginally.

## Training Models and Pickling

In [22]:
# Fit the final pipelines on the training split with the hyperparameters chosen above.

# Extra Trees: 500 estimators, seeded for reproducibility.
et_pipe = Pipeline(
    [
        ('ct', et_col_trans),
        ('et', ExtraTreesClassifier(n_estimators = 500, random_state = rs, n_jobs = 6))
    ]
).fit(X_train, y_train)

# Radial SVC: C = 8 with the default gamma.
rsvc_pipe = Pipeline(
    [
        ('ct', rsvc_col_trans),
        ('rsvc', SVC(kernel = 'rbf', C = 8))
    ]
).fit(X_train, y_train)

# AdaBoost: 200 estimators. random_state seeds the base estimators so the model
# that gets pickled can be rebuilt identically, matching the seeded Extra Trees model.
adbc_pipe = Pipeline(
    [
        ('ct', adb_col_trans),
        ('adbc', AdaBoostClassifier(n_estimators = 200, random_state = rs))
    ]
).fit(X_train, y_train)

In [23]:
# Serialise each fitted pipeline into ./pickle_jar/ (the directory must already exist,
# since open() does not create missing folders).
for pkl_path, fitted_pipe in (
    ('./pickle_jar/ExtraTrees.pkl', et_pipe),
    ('./pickle_jar/RSVC.pkl', rsvc_pipe),
    ('./pickle_jar/AdaBoost.pkl', adbc_pipe),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_pipe, f)