In [1]:
import pandas as pd
import numpy as np

import re
from text_helper import word_splitter, sentence_count, stop_word_counter, punc_counter
from nltk.corpus import stopwords


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import make_column_selector, ColumnTransformer

import pickle

rs = 91923 

In [2]:
df = pd.read_csv('../data/data_final.csv')

df['response_cleaned'] = df['response'].apply(lambda x: re.sub('[\\n]{2,}', '\n', x))
df['response_cleaned'] = df['response_cleaned'].apply(lambda x: re.sub(r"\/*u\/[\S]+", 'they', x))

df['num_words'] = df['response_cleaned'].apply(lambda x: word_splitter(x)[0])
df['stop_words'] = df['response_cleaned'].apply(stop_word_counter)
df['num_sentences'] = df['response_cleaned'].apply(lambda x: sentence_count(x)[0]) 
df['sentence_length'] = df['response_cleaned'].apply(lambda x: sentence_count(x)[1])
df['word_length'] = df['response_cleaned'].apply(lambda x: word_splitter(x)[1])

punc_count = df['response_cleaned'].apply(punc_counter)
df['punc_ratio'] = punc_count / df['num_words']

X = df[['subreddit', 'response_cleaned', 'num_words', 'stop_words', 'num_sentences', 'sentence_length', 'word_length', 'punc_ratio']]
y = df['fake']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = rs, stratify = y)

In [5]:
# Recreating our pipelines and our pre-tuning baselines for our three models
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler()) # Since we're done testing with Naive Bayes we can use the default parameters here
    ]
)

text_pipe = Pipeline(
    [
        ('vect', CountVectorizer())
    ]
)

subreddit_pipe = Pipeline(
    [
        ('ohe', OneHotEncoder())
    ]
)

col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)


et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_pipe.fit(X_train, y_train)
rsvc_pipe.fit(X_train, y_train)
adbc_pipe.fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_pipe.score(X_train, y_train), et_pipe.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_pipe.score(X_train, y_train), rsvc_pipe.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adbc_pipe.score(X_train, y_train), adbc_pipe.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9043221278167713
----
RSVC Train: 0.9125508067495997
RSVC Test: 0.8880679719246398
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


For the purposes of our tuning, the above performance levels are our new baseline now that we know for certain we can easily beat the 50% baseline.

### Deciding between count and TF-IDF vectorizers

In [6]:
# Redefining our text_pipe from before to include a standard scaler
text_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('ss', StandardScaler(with_mean = False)) # scaling with mean doesn't work on sparse arrays so we'll have to do without
    ]
)

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

In [16]:
vect_params = {
    'ct__text__vect' : [TfidfVectorizer()]
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

TfidfVectorizer()
ET Train: 1.0
ET Test: 0.9017362393793867
----
TfidfVectorizer()
RSVC Train: 0.970070205690356
RSVC Test: 0.6893239748799409
----
TfidfVectorizer()
AdaBoost Train: 0.9022047050129326
AdaBoost Test: 0.8766161802733653
----


We see a slight drop in perfomance going to a standard scaler with TF-IDF vectorizing for Extra Trees and Adaptive Boosting and Radial SVC has dropped dramatically as well. For Radial SVC it seems likely that the standard scaler is the issue so we can retry without that and for the other two let's try both TF-IDF and count vectorizer without a scaler before moving on.

In [7]:
text_pipe = Pipeline(
    [
        ('vect', CountVectorizer()) # No SS
    ]
)

col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

vect_params = {
    'ct__text__vect' : [TfidfVectorizer()]
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

TfidfVectorizer()
ET Train: 1.0
ET Test: 0.9009974141115626
----
TfidfVectorizer()
RSVC Train: 0.9338588496120211
RSVC Test: 0.902844477281123
----
TfidfVectorizer()
AdaBoost Train: 0.9022047050129326
AdaBoost Test: 0.8766161802733653
----


Removing the standardization does bring our SVC model back to its original performance and the TF-IDF vectorizer performs better than our count vectorizer was before. Now with Adaptive Boosting we see TF-IDF is performing better on the training set but slightly worse on the test set. There is also a bit more overfitting going on so let's continue with the count vectorizer here despite the cross-validation prefering TF-IDF vectorizer. For Extra Trees we see something similar but less extreme, TF-IDF performs slightly worse on test data but is prefered by cross-validation over the count vectorizer. Because the performance is so close between the two we will proceed with TF-IDF here.

In [8]:
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler())
    ]
)

et_text_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('ss', StandardScaler(with_mean = False))
    ]
)

rsvc_text_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer())
    ]
)

adb_text_pipe = Pipeline(
    [
        ('vect', CountVectorizer())
    ]
)

subreddit_pipe = Pipeline(
    [
        ('ohe', OneHotEncoder())
    ]
)

et_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', et_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

rsvc_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', rsvc_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

adb_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', adb_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

et_pipe = Pipeline(
    [
        ('ct', et_col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

### Checking different ngrams and stopwords

In [9]:
vect_params = {
    'ct__text__vect__ngram_range' : ((1,1), (1,2), (1,3)),
    'ct__text__vect__stop_words' : (None, 'english', stopwords.words('english'))
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

TfidfVectorizer(ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9050609530845954
----
CountVectorizer()
RSVC Train: 0.9125508067495997
RSVC Test: 0.8880679719246398
----
CountVectorizer()
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


All models prefer having the stopwords included which was expected, we also see that Extra Trees prefers bigrams included but RSVC and Adaptive Boosting don't get any increased performance out of them.

In [10]:
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : (1000, 2000, 3000),
    'ct__text__vect__max_df' : (1.0, 0.9, 0.8),
    'ct__text__vect__min_df' : (1, 10, 100)
}

vect_params = {
    'ct__text__vect__max_features' : (1000, 2000, 3000),
    'ct__text__vect__max_df' : (1.0, 0.9, 0.8),
    'ct__text__vect__min_df' : (1, 10, 100)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5, 
    error_score = 'raise'
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

TfidfVectorizer(max_features=3000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9298115995567049
----
CountVectorizer(max_features=3000, min_df=10)
RSVC Train: 0.9099642813154329
RSVC Test: 0.8902844477281123
----
CountVectorizer(max_features=1000)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


Each model has its own preferences for hyperparameters so we'll need to tune them separately moving forward. Let's start with Extra Trees.

### Tuning TF-IDF Vectorizer for Extra Trees

In [21]:
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : (2000, 2500, 3000, 4000, 5000),
    'ct__text__vect__max_df' : (1.0, 0.5),
    'ct__text__vect__min_df' : (1, 0.1, 0.2)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

TfidfVectorizer(max_df=0.5, max_features=3000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9264868858514961
----


This best model has very slightly worse test performance but it also has a very extreme value for max_df so let's try and narrow down the true ideal value.

In [23]:
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : (2750, 3000, 3250, 3500),
    'ct__text__vect__max_df' : (1.0, 0.75, 0.7, 0.6)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

TfidfVectorizer(max_df=0.7, max_features=2750, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9301810121906169
----


In [26]:
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [(2750)],
    'ct__text__vect__max_df' : [(0.75)],
    'ct__text__vect__min_df' : (1, 10, 100, 0.15)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

TfidfVectorizer(max_df=0.75, max_features=2750, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.925748060583672
----


This looks as close to optimal as we can get so let's set aside Extra Trees and move on to Radial SVC.

### Tuning Count Vectorizer for Radial SVC

In [11]:
vect_params = {
    'ct__text__vect__max_features' : (3000, 5000, 4000, 2500),
    'ct__text__vect__min_df' : (10, 50, 75, 25)
}

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

CountVectorizer(max_features=3000, min_df=25)
RSVC Train: 0.9103337849488853
RSVC Test: 0.8910232729959364
----


In [12]:
vect_params = {
    'ct__text__vect__max_features' : (2750, 3000, 3250, 3500),
    'ct__text__vect__min_df' : (15, 20, 25, 30, 40)
}

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

CountVectorizer(max_features=2750, min_df=30)
RSVC Train: 0.9095947776819805
RSVC Test: 0.8913926856298485
----


This seems like a good stopping point for RSVC let's move on to Adaptive Boosting before getting into tuning the hyperparameters of our model.s

### Tuning Count Vectorizer for Adaptive Boosting

In [13]:
vect_params = {
    'ct__text__vect__max_features' : (500, 750, 1000, 1250, 1500),
    'ct__text__vect__min_df' : (0.1, 0.05, 0.01, 0.2)
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

CountVectorizer(max_features=750, min_df=0.01)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [14]:
vect_params = {
    'ct__text__vect__max_features' : (650, 700, 750, 800),
    'ct__text__vect__min_df' : (0.01, 0.02, 0.005)
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

CountVectorizer(max_features=700, min_df=0.01)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


Our model performance on the test set is about the same but limit features here can help us combat overfitting and keep our model computationally simple which will help while tuning the parameters for Adaptive Boosting later on.

Now that we have our parameters for our vectorizers, let's redefine our pipes with the new hyper parameters and get ready to tune hyper parameters for our models.

In [15]:
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler()) # No need to worry about negative values anymore
    ]
)

et_text_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer(max_df = 0.75, max_features = 2750, ngram_range = (1, 2))),
        ('ss', StandardScaler(with_mean = False))
    ]
)

rsvc_text_pipe = Pipeline(
    [
        ('vect', CountVectorizer(max_features = 2750, min_df = 30))
    ]
)

adb_text_pipe = Pipeline(
    [
        ('vect', CountVectorizer(max_features = 700, min_df = 0.01))
    ]
)

subreddit_pipe = Pipeline(
    [
        ('ohe', OneHotEncoder())
    ]
)

et_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', et_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

rsvc_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', rsvc_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

adb_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', adb_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

et_pipe = Pipeline(
    [
        ('ct', et_col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', rsvc_col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', adb_col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

### Tuning hyperparameters for RSVC


In [16]:
# Used https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/ along with the documentation to understand these parameters
rsvc_params = {
    'rsvc__C' : (1.0, 0.1, 10, 0.5, 5), # This parameter is functionally the same as it is for Ridge regression since this is the l2 regularization coefficient
    'rsvc__gamma' : ('scale', 'auto', 1.0, 0.1, 0.01, 0.001) # This parameter controls how far away the support vectors we consider can be (lower allows for further vectors)
}

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = rsvc_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

SVC(C=5, gamma=0.01)
RSVC Train: 0.9945806133760315
RSVC Test: 0.9246398226819357
----


Regularization has improved this model by quite a bit but we see a lot of overfitting compared to earlier. The default setting for gamma is 'scale' which uses $\frac{1}{n_features * X.var()}$ which means that its picking an appropriate and already specific value that is scaled to this model. While we are performing better with a gamma of 0.01, this is likely also causing our overfit so let's try continuing without it.

In [17]:
rsvc_params = {
    'rsvc__C' : (2.5, 3, 5, 6, 7.5, 9),
}

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = rsvc_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

SVC(C=7.5)
RSVC Train: 0.9620642936322207
RSVC Test: 0.9227927595123753
----


We see much less overfitting here compared to with gamma of 0.01 but we also see slightly lower test accuracy with our new C. Let's test what C = 5 gives us with the default gamma before proceeding.

In [18]:
rsvc_params = {
    'rsvc__C' : [(5)]
}

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = rsvc_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

SVC(C=5)
RSVC Train: 0.9533193743071807
RSVC Test: 0.9172515700036942
----


With this result we can comfortably go with C = 7.5 for the final model. We could continue tuning for a slightly better value but there isn't likely to be much gain. Before we move on we should check the other kernel options just to be safe.

In [19]:
rsvc_params = {
    'rsvc__kernel' : ('poly', 'rbf', 'sigmoid'),
}

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = rsvc_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(rsvc_grid.best_estimator_['rsvc'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))

SVC()
RSVC Train: 0.9095947776819805
RSVC Test: 0.8913926856298485
----


Our lesson on SVM suggested that radial was generally the best option but it doesn't hurt to check. here we see that by default radial performs best so we can proceed with radial and C = 7.5.

### Tuning hyper parameters for Adaptive Boosting

In [20]:
adb_params = {
    'adbc__n_estimators' : (50, 25, 75, 100, 150, 200), # The number of rounds of boosting, more estimators means more simple models trained in sequence
    'adbc__learning_rate' : (1.0, 2.0, 5.0, 10.0, 50.0) # Changes the weight applied to each estimator, a higher rate means each classifier is individually a greater vote
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = adb_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9370612144352752
AdaBoost Test: 0.907277428888068
----


We see our model prefers the original learning rate but let's investigate that further before we move on to further tuning the number of estimators.

In [21]:
adb_params = {
    'adbc__n_estimators' : (50, 25, 75, 100, 150, 200), 
    'adbc__learning_rate' : (1.0, 1.1, 1.5, 1.75) 
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = adb_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9370612144352752
AdaBoost Test: 0.907277428888068
----


It looks like slight changes on learning rate are still not changing much so let's set it aside.

In [22]:
adb_params = {
    'adbc__n_estimators' : (200, 250, 300, 400, 500, 1000)
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = adb_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.9552900603522602
AdaBoost Test: 0.9157739194680458
----


In [23]:
adb_params = {
    'adbc__n_estimators' : (350, 400, 450, 350, 300)
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = adb_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.9552900603522602
AdaBoost Test: 0.9157739194680458
----


At this point we've mostly converged on 400 estimators but let's look a bit deeper before moving on. Let's try some more extreme values for number of estimators because its currently lagging behind our other model's performance. We could be at a soft maximum for performance and be missing a sizable performance gain.

In [24]:
adb_params = {
    'adbc__n_estimators' : (75, 150, 400, 1250, 1500, 1750, 2500, 5000),
}

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = adb_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)


print(adb_grid.best_estimator_['adbc'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

AdaBoostClassifier(n_estimators=400)
AdaBoost Train: 0.9552900603522602
AdaBoost Test: 0.9157739194680458
----


We spent a bit longer tuning just to get a better feel for how the number of estimators and learning rate change our accuracy over time. We hadn't covered this in particular in class so spending the extra time to build that familiarity seemed warranted. One thing that stands out her is that while our test score went up, so did the gap between our training and test splits. This is notable but not surprising since more estimators intuitively should lead to more overfit. Since our perfomance went up a fair bit and these models are doing better through cross-validation we can comfortably accept these are better models. Our best performing model seems to be the one with 400 estimators so we will proceed with that model.

### Tuning hyper parameters for Extra Trees

In [16]:
# There are a LOT of parameters for Extra Trees so tuning many values at once is difficult
et_vect_params = {
    'et__n_estimators' : (50, 100, 200), # Number of trees
    'et__max_depth' : (None, 5, 10), # How deep each tree can be (higher is better at predicting but can lead to overfit)
    'et__bootstrap' : (False, True), # Whether or not each tree is trained on bootstrapped data or the original data
    'et__max_features' : (None, 'sqrt', 'log2'), # The maximum number of features to use in a given tree
    'et__min_samples_split' : (2, 10, 50) # The minimum number of elements in a leaf node
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(n_estimators=200, n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9287033616549686
----


Our best model seems to be using the highest number of estimators so let's keep increasing there and try a few caps on maximum depth while we're at it to try and combat overfitting. The test accurary is actually a bit lower than the default so we should keep in mind that the default is at least comparable to this model despite being simpler with fewer estimators.

In [17]:
et_vect_params = {
    'et__n_estimators' : (150, 200, 300, 500),
    'et__max_depth' : (None, 15, 50)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(n_estimators=500, n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9301810121906169
----


Again, we see the highest number of estimators is best and the model does not like having a maximum depth. Let's try more estimators again.

In [18]:
et_vect_params = {
    'et__n_estimators' : (450, 500, 750, 1000),
    'et__max_depth' : (None, 15, 25)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(n_estimators=750, n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9290727742888807
----


We've honed in a bit on a prefered number of estimators so let's try the other hyper parameters some more.

In [19]:
et_vect_params = {
    'et__n_estimators' : (600, 700, 750, 800, 900),
    'et__max_depth' : (5, 15, 25, 50),
    'et__min_samples_split' : (2, 4, 6),
    'et__min_samples_leaf' : (1, 2, 3)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(max_depth=50, min_samples_split=4, n_estimators=900,
                     n_jobs=6, random_state=91923)
ET Train: 0.9956891242763887
ET Test: 0.9194680458071666
----


It doesn't seem like this experiment has yielded any good results so let's make one more attempt before we move on. This time we'll limit max depth again but provided more estimators.

In [20]:
et_vect_params = {
    'et__n_estimators' : (900, 1000, 1250, 1500),
    'et__max_depth' : (40, 50, 60, 75)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(max_depth=75, n_estimators=1250, n_jobs=6,
                     random_state=91923)
ET Train: 0.9995073284887301
ET Test: 0.9287033616549686
----


This is an interesting result and does bear some further consideration. Since it seems like we can get comparable results this way let's try and improve this further before going back to our best performing model overall.

In [21]:
et_vect_params = {
    'et__n_estimators' : [(1250)],
    'et__max_depth' : (75, 80, 100),
    'et__min_samples_split' : (4, 8, 12),
    'et__min_samples_leaf' : (2, 5, 7)
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = et_vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['et'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))

ExtraTreesClassifier(max_depth=100, min_samples_leaf=2, min_samples_split=8,
                     n_estimators=1250, n_jobs=6, random_state=91923)
ET Train: 0.9954427885207537
ET Test: 0.9253786479497599
----


At this point its become evident that this approach is unlikely to show better performance that our earlier model with fewer non-default parameters. Between the 500 and the 750 estimator models we will err on the side that fewer estimators is better within the same tier of performance since they have near identical accuracies. This is in part for the theoretical benefit of avoiding overfit but mainly we want the simpler model that will run a bit faster if only marginally.

## Training and Pickling Tuned Models

In [26]:
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler())
    ]
)

et_text_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer(max_df = 0.75, max_features = 2750, ngram_range = (1, 2))),
        ('ss', StandardScaler(with_mean = False))
    ]
)

rsvc_text_pipe = Pipeline(
    [
        ('vect', CountVectorizer(max_features = 2750, min_df = 30))
    ]
)

adb_text_pipe = Pipeline(
    [
        ('vect', CountVectorizer(max_features = 700, min_df = 0.01))
    ]
)

subreddit_pipe = Pipeline(
    [
        ('ohe', OneHotEncoder())
    ]
)

et_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', et_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

rsvc_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', rsvc_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

adb_col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', adb_text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

et_pipe = Pipeline(
    [
        ('ct', et_col_trans),
        ('et', ExtraTreesClassifier(n_estimators = 500, random_state = rs, n_jobs = 6))
    ]
).fit(X_train, y_train)

rsvc_pipe = Pipeline(
    [
        ('ct', rsvc_col_trans),
        ('rsvc', SVC(kernel = 'rbf', C = 7.5))
    ]
).fit(X_train, y_train)

adbc_pipe = Pipeline(
    [
        ('ct', adb_col_trans),
        ('adbc', AdaBoostClassifier(n_estimators = 400))
    ]
).fit(X_train, y_train)

In [27]:
with open('./pickle_jar/ExtraTrees.pkl', 'wb') as f:
    pickle.dump(et_pipe, f)

with open('./pickle_jar/RSVC.pkl', 'wb') as f:
    pickle.dump(rsvc_pipe, f)

with open('./pickle_jar/AdaBoost.pkl', 'wb') as f:
    pickle.dump(adbc_pipe, f)