In [57]:
import pandas as pd
import numpy as np

import re
from text_helper import word_splitter, sentence_count, stop_word_counter, punc_counter
from nltk.corpus import stopwords


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import make_column_selector, ColumnTransformer

import pickle

rs = 91923 # Random seed reused for the train/test split and the Extra Trees classifiers

In [2]:
# Load the labelled responses and derive the text-statistics features used by the models.
df = pd.read_csv('../data/data_final.csv')


def _clean_response(text):
    """Collapse runs of two or more newlines into one, then replace `u/...` tokens
    (optionally preceded by slashes) with the word 'they'."""
    collapsed = re.sub('[\\n]{2,}', '\n', text)
    return re.sub(r"\/*u\/[\S]+", 'they', collapsed)


df['response_cleaned'] = df['response'].apply(_clean_response)

# word_splitter and sentence_count each return an indexable pair; element 0 feeds the
# count column and element 1 feeds the length column.
df['num_words'] = df['response_cleaned'].apply(lambda text: word_splitter(text)[0])
df['stop_words'] = df['response_cleaned'].apply(stop_word_counter)
df['num_sentences'] = df['response_cleaned'].apply(lambda text: sentence_count(text)[0])
df['sentence_length'] = df['response_cleaned'].apply(lambda text: sentence_count(text)[1])
df['word_length'] = df['response_cleaned'].apply(lambda text: word_splitter(text)[1])

# Punctuation count divided by the word count of the same response
punc_count = df['response_cleaned'].apply(punc_counter)
df['punc_ratio'] = punc_count / df['num_words']

feature_columns = [
    'subreddit',
    'response_cleaned',
    'num_words',
    'stop_words',
    'num_sentences',
    'sentence_length',
    'word_length',
    'punc_ratio',
]
X = df[feature_columns]
y = df['fake']

# 75/25 split, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = rs,
    stratify = y
)

In [3]:
# Recreating our pipelines and our pre-tuning baselines for our three models

# Numeric meta features: since we're done testing with Naive Bayes we can use the default scaler
meta_pipe = Pipeline(steps = [('ss', StandardScaler())])

# Response text: plain token counts
text_pipe = Pipeline(steps = [('vect', CountVectorizer())])

# Subreddit: one-hot encoded
subreddit_pipe = Pipeline(steps = [('ohe', OneHotEncoder())])

# Route every numeric column, the cleaned text and the subreddit to their own sub-pipeline
col_trans = ColumnTransformer(
    transformers = [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit']),
    ],
    n_jobs = 6
)

# One pipeline per candidate model, all sharing the same preprocessing
et_pipe = Pipeline(
    steps = [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6)),
    ]
)
rsvc_pipe = Pipeline(
    steps = [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf')),
    ]
)
adbc_pipe = Pipeline(
    steps = [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier()),
    ]
)

for baseline_pipe in (et_pipe, rsvc_pipe, adbc_pipe):
    baseline_pipe.fit(X_train, y_train)

# Train / test accuracy for each baseline
et_train_acc = et_pipe.score(X_train, y_train)
et_test_acc = et_pipe.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

rsvc_train_acc = rsvc_pipe.score(X_train, y_train)
rsvc_test_acc = rsvc_pipe.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

adb_train_acc = adbc_pipe.score(X_train, y_train)
adb_test_acc = adbc_pipe.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

ET Train: 1.0
ET Test: 0.9043221278167713
----
RSVC Train: 0.9125508067495997
RSVC Test: 0.8880679719246398
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [4]:
# Redefining our text_pipe from before so that vectorisation is followed by a standard scaler.
# Scaling with mean doesn't work on sparse arrays, so with_mean is switched off.
# NOTE(review): re-binding this name does not alter the col_trans built earlier, which still
# holds the previous text_pipe object; this pipeline has to be handed to the estimators explicitly.
text_pipe = Pipeline(
    steps = [
        ('vect', CountVectorizer()),
        ('ss', StandardScaler(with_mean = False)),
    ]
)

In [6]:
# Compare raw counts against TF-IDF weights, each followed by the sparse-safe scaler.
#
# Two things are handled here:
#   * Re-binding `text_pipe` does not change `col_trans`, which still references the original
#     un-scaled pipeline object. The scaled pipeline is therefore injected through the grid by
#     replacing the whole 'text' transformer ('ct__text').
#   * Both vectorizers are listed as candidates; with only TfidfVectorizer in the list the search
#     had a single candidate and could not express a preference.
#
# GridSearchCV clones the estimator for every candidate, so et_pipe / rsvc_pipe / adbc_pipe
# themselves are left untouched for the cells that follow.
vect_params = {
    'ct__text' : [text_pipe],
    'ct__text__vect' : [CountVectorizer(), TfidfVectorizer()]
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

# Accuracy of each refit best estimator on the train and test splits
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9009974141115626
----
RSVC Train: 0.9338588496120211
RSVC Test: 0.902844477281123
----
AdaBoost Train: 0.9022047050129326
AdaBoost Test: 0.8766161802733653
----


In [None]:
# Show which vectorizer each search kept (ET, RSVC, AdaBoost, in that order)
for fitted_grid in (et_grid, rsvc_grid, adb_grid):
    text_pipeline = fitted_grid.best_estimator_.named_steps['ct'].transformers[1][1]
    print(text_pipeline.named_steps['vect'])

CountVectorizer()
TfidfVectorizer()
CountVectorizer()


We see that our tree-based models prefer a count vectorizer, but our support vector classifier performs better with the TF-IDF vectorizer. However, we see a sharp drop in performance for the support vector model now that our data is scaled, which is disconcerting. The other two models perform identically with or without scaling, so let's not scale, since it doesn't seem to give us anything.

In [None]:
# Reset the preprocessing to its un-scaled form and rebuild everything that depends on it.
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler()) # No need to worry about negative values anymore
    ]
)

text_pipe = Pipeline(
    [
        ('vect', CountVectorizer()) # Removing scaling
    ]
)

subreddit_pipe = Pipeline(
    [
        # handle_unknown = 'ignore' encodes a subreddit that was not present during fit as all
        # zeros instead of raising, so a CV fold or later prediction on an unseen subreddit
        # does not crash. Results are unchanged whenever every subreddit was seen during fit.
        ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

# The model pipelines keep a reference to the ColumnTransformer object they were built with,
# so re-binding `col_trans` alone never reaches them. Rebuild them on the fresh transformer
# so the grid searches below really use the preprocessing defined in this cell.
et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

In [None]:
# Search the n-gram range and stop-word handling of the vectorizer for all three models
vect_params = {
    'ct__text__vect__ngram_range' : [(1,1), (1,2), (1,3)],
    'ct__text__vect__stop_words' : [None, 'english', stopwords.words('english')],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

ET Train: 1.0
ET Test: 0.9128186183967492
----
RSVC Train: 0.9100874491932504
RSVC Test: 0.8862209087550794
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [None]:
# Show the vectorizer settings each search selected (ET, RSVC, AdaBoost, in that order)
for fitted_grid in (et_grid, rsvc_grid, adb_grid):
    text_pipeline = fitted_grid.best_estimator_.named_steps['ct'].transformers[1][1]
    print(text_pipeline.named_steps['vect'])

CountVectorizer(ngram_range=(1, 2))
CountVectorizer()
CountVectorizer()


Interestingly, only our Extra Trees model prefers non-default parameters. Both Radial SVC and Adaptive Boosting prefer keeping stop words and using unigrams only (single words rather than word pairs). Extra Trees prefers including bigrams but likewise prefers keeping stop words. This is to be expected, since we saw a large difference in stop-word usage between real comments and AI-generated ones.

In [9]:
# Vocabulary-size and document-frequency search. Extra Trees gets its own grid because it
# keeps the (1, 2) n-gram range; the other two models search the same values without it.
vect_params = {
    'ct__text__vect__max_features' : [1000, 2000, 3000],
    'ct__text__vect__max_df' : [1.0, 0.9, 0.8],
    'ct__text__vect__min_df' : [1, 10, 100],
}

et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [1000, 2000, 3000],
    'ct__text__vect__max_df' : [1.0, 0.9, 0.8],
    'ct__text__vect__min_df' : [1, 10, 100],
}

# error_score = 'raise' makes a failing candidate stop the Extra Trees search with an exception
et_grid = GridSearchCV(
    estimator = et_pipe,
    param_grid = et_vect_params,
    cv = 5,
    n_jobs = 6,
    error_score = 'raise'
)
et_grid.fit(X_train, y_train)

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

# For each model: the selected vectorizer, then train / test accuracy
print(et_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

print(rsvc_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

print(adb_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

CountVectorizer(max_features=3000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9275951237532324
----
CountVectorizer(max_features=3000, min_df=10)
RSVC Train: 0.9099642813154329
RSVC Test: 0.8902844477281123
----
CountVectorizer(max_features=1000)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


Each model has its own preferences for hyperparameters, so we'll need to tune them separately moving forward. Let's start with Extra Trees since it's currently doing the best.

### Tuning Count Vectorizer for Extra Trees

In [10]:
# Extra Trees: wider max_features range, with fractional document-frequency cut-offs
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [2000, 2500, 3000, 4000, 5000],
    'ct__text__vect__max_df' : [1.0, 0.5],
    'ct__text__vect__min_df' : [1, 0.1, 0.2],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

CountVectorizer(max_features=4000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9268562984854082
----


It seems like no amount of fiddling with min and max document frequency is helping here so let's narrow in on what the best value of maximum features is. As a note, we see that while 4000 features is better by cross-validation, 3000 is slightly better on the actual test set. Since the two values are close, let's err on the side of trusting cross-validation to have selected the more robust hyper parameter value.

In [12]:
# Extra Trees: max_features in steps of 250 around 4000
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [3500, 3750, 4000, 4250, 4500],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

CountVectorizer(max_features=4000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9268562984854082
----


In [13]:
# Extra Trees: finer max_features steps between 3750 and 4250
et_vect_params = {
    'ct__text__vect__ngram_range' : [(1,2)],
    'ct__text__vect__max_features' : [3750, 3800, 3900, 4000, 4100, 4200, 4250],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

CountVectorizer(max_features=4000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9268562984854082
----


This looks as close to optimal as we can get so let's set aside Extra Trees and move on to Radial SVC.

### Tuning Count Vectorizer for Radial SVC

In [14]:
# Radial SVC: vocabulary cap together with the minimum document frequency
vect_params = {
    'ct__text__vect__max_features' : [2500, 3000, 4000, 5000],
    'ct__text__vect__min_df' : [5, 10, 20, 30],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

CountVectorizer(max_features=2500, min_df=30)
RSVC Train: 0.9095947776819805
RSVC Test: 0.8913926856298485
----


In [15]:
# Radial SVC: narrower ranges around max_features = 2500 and min_df = 30
vect_params = {
    'ct__text__vect__max_features' : [2250, 2500, 2750],
    'ct__text__vect__min_df' : [25, 30, 35, 40],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

CountVectorizer(max_features=2250, min_df=30)
RSVC Train: 0.9094716098041631
RSVC Test: 0.8913926856298485
----


This seems like a good stopping point for RSVC. Let's move on to Adaptive Boosting before we get into tuning the hyperparameters of the models themselves.

### Tuning Count Vectorizer for Adaptive Boosting

In [17]:
# Adaptive Boosting: vocabulary cap plus both document-frequency bounds
vect_params = {
    'ct__text__vect__max_features' : [500, 750, 1000, 1250, 1500],
    'ct__text__vect__max_df' : [1.0, 0.7, 0.6, 0.5],
    'ct__text__vect__min_df' : [1, 10, 100],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

CountVectorizer(max_features=750)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [19]:
# Adaptive Boosting: max_features in steps of 50 around 750
vect_params = {
    'ct__text__vect__max_features' : [700, 750, 800],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

CountVectorizer(max_features=700)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [20]:
# Adaptive Boosting: check whether fewer than 700 features does as well
vect_params = {
    'ct__text__vect__max_features' : [600, 650, 700],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

CountVectorizer(max_features=700)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


Adaptive Boosting seems stable with as few as 700 features and performs best with the default values for min and max document frequency. Intuitively, this makes sense: the model isn't taking a huge number of features, so it's not trending towards overfitting and doesn't need to be reined in by limiting which features it can use. Out of pure curiosity, does leaving max features uncapped perform better or worse?

In [21]:
# Adaptive Boosting: single candidate with no cap on the vocabulary size
vect_params = {
    'ct__text__vect__max_features' : [None],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = vect_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['ct'].transformers[1][1].named_steps['vect'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

CountVectorizer()
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


With this we can definitively see that Adaptive Boosting does not need a huge number of features in order to reach its maximum predictive power, even leaving it uncapped we end up with the same performance as before.

Now that we have our Count Vectorizer parameters, let's redefine our pipes with the new hyper parameters and get ready to tune hyperparameters for our models.

In [34]:
def _make_col_trans(model_text_pipe):
    """Build the shared column routing around a model-specific text pipeline.

    Numeric columns go through `meta_pipe`, 'response_cleaned' through `model_text_pipe`
    and 'subreddit' through `subreddit_pipe`.
    """
    return ColumnTransformer(
        transformers = [
            ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
            ('text', model_text_pipe, 'response_cleaned'),
            ('ohe', subreddit_pipe, ['subreddit']),
        ],
        n_jobs = 6
    )


# Text pipelines carrying the vectorizer settings chosen for each model
et_text_pipe = Pipeline(
    steps = [('vect', CountVectorizer(max_features = 4000, ngram_range = (1, 2)))]
)
rsvc_text_pipe = Pipeline(
    steps = [('vect', CountVectorizer(max_features = 2250, min_df = 30))]
)
adb_text_pipe = Pipeline(
    steps = [('vect', CountVectorizer(max_features = 700))]
)

# One column transformer per model so each keeps its own vectorizer
et_col_trans = _make_col_trans(et_text_pipe)
rsvc_col_trans = _make_col_trans(rsvc_text_pipe)
adb_col_trans = _make_col_trans(adb_text_pipe)

# Model pipelines with default classifier settings, ready for hyperparameter tuning
et_pipe = Pipeline(
    steps = [
        ('ct', et_col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6)),
    ]
)
rsvc_pipe = Pipeline(
    steps = [
        ('ct', rsvc_col_trans),
        ('rsvc', SVC(kernel = 'rbf')),
    ]
)
adbc_pipe = Pipeline(
    steps = [
        ('ct', adb_col_trans),
        ('adbc', AdaBoostClassifier()),
    ]
)

### Tuning hyperparameters for RSVC

Let's start with our SVC model because it has the fewest hyperparameters and the worst performance.

In [35]:
# Used https://www.vebuso.com/2020/03/svm-hyperparameter-tuning-using-gridsearchcv/ along with the documentation to understand these parameters
rsvc_params = {
    # C is the INVERSE of the regularization strength (squared l2 penalty): smaller C means
    # stronger regularization, i.e. the opposite direction to alpha in Ridge regression
    'rsvc__C' : [1.0, 0.1, 10, 0.5, 5],
    # gamma is the RBF kernel coefficient: it controls how far the influence of a single
    # training example reaches (lower values reach further)
    'rsvc__gamma' : ['scale', 'auto', 1.0, 0.1, 0.01, 0.001],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['rsvc'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=5, gamma=0.01)
RSVC Train: 0.994457445498214
RSVC Test: 0.9224233468784633
----


Tuning the regularization has improved this model by quite a bit, and we see that a gamma of 0.01 is currently preferred. The default setting of 'scale' uses $\frac{1}{n_{\text{features}} \cdot \mathrm{Var}(X)}$, which means that it picks an appropriate value that is already specific to this data. Let's continue this approach a bit longer and see if we get anything noticeably better.

In [36]:
# Radial SVC: C around 5 combined with gamma values between 0.05 and 0.5
rsvc_params = {
    'rsvc__C' : [2.5, 5, 7.5],
    'rsvc__gamma' : [0.1, 0.2, 0.5, 0.05],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['rsvc'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=5, gamma=0.05)
RSVC Train: 0.9996304963665475
RSVC Test: 0.8282231252308829
----


Our best model here actually performs much worse on the test set than our prior best model. This is quite interesting and will need to be kept in mind moving forward. Let's try another round of tuning but if we don't see a marked improvement over our first round we will take that model.

In [38]:
# Radial SVC: integer C values from 4 to 6 with gamma between 0.025 and 0.075
rsvc_params = {
    'rsvc__C' : [4, 5, 6],
    'rsvc__gamma' : [0.075, 0.05, 0.025],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['rsvc'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=6, gamma=0.025)
RSVC Train: 0.9988914890996428
RSVC Test: 0.8799408939785741
----


Again, we see the overfit issue here. My current intuition is that shrinking gamma is leading to this overfit so let's move back to the default gamma and find the best C for that value.

In [40]:
# Radial SVC: search C only, leaving gamma at the estimator's default
rsvc_params = {
    'rsvc__C' : [2.5, 5, 10, 15],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['rsvc'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=10)
RSVC Train: 0.9676068481340067
RSVC Test: 0.9231621721462874
----


In [41]:
# Radial SVC: C in steps of 2.5 around 10, gamma still at its default
rsvc_params = {
    'rsvc__C' : [7.5, 10, 12.5],
}

rsvc_grid = GridSearchCV(estimator = rsvc_pipe, param_grid = rsvc_params, cv = 5, n_jobs = 6)
rsvc_grid.fit(X_train, y_train)

print(rsvc_grid.best_estimator_.named_steps['rsvc'])
rsvc_train_acc = rsvc_grid.score(X_train, y_train)
rsvc_test_acc = rsvc_grid.score(X_test, y_test)
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_train_acc, rsvc_test_acc))

SVC(C=10)
RSVC Train: 0.9676068481340067
RSVC Test: 0.9231621721462874
----


It looks like with default scale value our model performs best with C = 10. We do see less overfit for the same predictive power as our best gamma-tuned model so let's proceed with this as the final RSVC model.

### Tuning hyper parameters for Adaptive Boosting

In [42]:
adb_params = {
    # The number of rounds of boosting, more estimators means more simple models trained in sequence
    'adbc__n_estimators' : [50, 25, 75, 100, 150, 200],
    # The weight applied to each estimator, a higher rate means each classifier contributes a greater vote
    'adbc__learning_rate' : [1.0, 2.0, 5.0, 10.0, 50.0],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['adbc'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9370612144352752
AdaBoost Test: 0.907277428888068
----


We see our model prefers the original learning rate but let's investigate that further before we move on to further tuning the number of estimators.

In [43]:
# Adaptive Boosting: same n_estimators values, learning rates between 1.0 and 1.75
adb_params = {
    'adbc__n_estimators' : [50, 25, 75, 100, 150, 200],
    'adbc__learning_rate' : [1.0, 1.1, 1.5, 1.75],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['adbc'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9370612144352752
AdaBoost Test: 0.907277428888068
----


At this point we can feel comfortable setting learning rate aside and instead tuning n_estimators further. 

In [44]:
# Adaptive Boosting: n_estimators around 200, learning rate left at its default
adb_params = {
    'adbc__n_estimators' : [175, 190, 200, 225, 250],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['adbc'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=200)
AdaBoost Train: 0.9370612144352752
AdaBoost Test: 0.907277428888068
----


In [45]:
# Adaptive Boosting: much larger ensembles, from 200 up to 1000 estimators
adb_params = {
    'adbc__n_estimators' : [200, 250, 300, 400, 500, 1000],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['adbc'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=500)
AdaBoost Train: 0.960832614854046
AdaBoost Test: 0.9183598079054304
----


In [46]:
# Adaptive Boosting: n_estimators between 450 and 700
adb_params = {
    'adbc__n_estimators' : [450, 500, 600, 700],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['adbc'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=500)
AdaBoost Train: 0.960832614854046
AdaBoost Test: 0.9183598079054304
----


In [47]:
# Adaptive Boosting: n_estimators in steps of 25 around 500
adb_params = {
    'adbc__n_estimators' : [475, 500, 525, 550],
}

adb_grid = GridSearchCV(estimator = adbc_pipe, param_grid = adb_params, cv = 5, n_jobs = 6)
adb_grid.fit(X_train, y_train)

print(adb_grid.best_estimator_.named_steps['adbc'])
adb_train_acc = adb_grid.score(X_train, y_train)
adb_test_acc = adb_grid.score(X_test, y_test)
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_train_acc, adb_test_acc))

AdaBoostClassifier(n_estimators=475)
AdaBoost Train: 0.9602167754649588
AdaBoost Test: 0.9216845216106391
----


We spent a bit longer tuning just to get a better feel for how the n_estimators parameter changes our accuracy. We hadn't covered this in particular in class, so spending the extra time to build that familiarity seemed warranted. One thing that stands out here is that while our test score went up, so did the gap between our training and test scores. This is notable but not surprising, since more estimators should intuitively lead to more overfit. Since our performance went up a lot and these models do better under cross-validation, we can comfortably accept this as the best model. Let's move on to Extra Trees and see if we can get the performance any better.

### Tuning hyper parameters for Extra Trees

In [48]:
# There are a LOT of parameters for Extra Trees so tuning many values at once is difficult
et_vect_params = {
    # Number of trees in the ensemble
    'et__n_estimators' : [50, 100, 200],
    # How deep each tree can be (deeper fits the training data better but can lead to overfit)
    'et__max_depth' : [None, 5, 10],
    # Whether each tree is trained on bootstrapped data or the original data
    'et__bootstrap' : [False, True],
    # How many features are considered when looking for the best split
    'et__max_features' : [None, 'sqrt', 'log2'],
    # The minimum number of samples a node needs before it may be split
    'et__min_samples_split' : [2, 10, 50],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

# NOTE: an earlier version of this cell printed the vectorizer
# (et_grid.best_estimator_['ct'].transformers[1][1]['vect']) by mistake. The search took about
# 35 minutes to run, so the saved output may still show that old print.
print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

CountVectorizer(max_features=4000, ngram_range=(1, 2))
ET Train: 1.0
ET Test: 0.9268562984854082
----


In [50]:
# Show the classifier settings kept by the Extra Trees search
best_et = et_grid.best_estimator_.named_steps['et']
print(best_et)

ExtraTreesClassifier(n_jobs=6, random_state=91923)


So far our best model is the original one. Let's narrow down only on a couple of parameters and try again. We'll focus on n_estimators and max_depth.

In [51]:
# Extra Trees: only the number of trees and the depth limit
et_vect_params = {
    'et__n_estimators' : [75, 100, 125],
    'et__max_depth' : [None, 15, 50],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9268562984854082
----


Again we don't see any gains so let's change gears and look into the hyper parameters that control when and if we create a further split, min_samples_split and min_samples_leaf. These control the number of samples required for a split to happen and the minimum number of observations in each leaf node.

In [52]:
# Extra Trees: coarse search over the split and leaf size thresholds
et_vect_params = {
    'et__min_samples_split' : [2, 10, 50],
    'et__min_samples_leaf' : [1, 5, 25],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9268562984854082
----


In [53]:
# Extra Trees: finer search over small split and leaf size thresholds
et_vect_params = {
    'et__min_samples_split' : [2, 4, 6],
    'et__min_samples_leaf' : [1, 2, 3],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(n_jobs=6, random_state=91923)
ET Train: 1.0
ET Test: 0.9268562984854082
----


At this point we can concede that no tuning is particularly helping our model. That being said, very likely this is caused by our training accuracy being perfect and if that is consistent across a number of other models then many different ones can be equally effective but technically more robust to overfitting issues.

In [54]:
# Extra Trees: the default values (100 trees, unlimited depth) are left out of this grid
et_vect_params = {
    'et__n_estimators' : [75, 125, 200],
    'et__max_depth' : [5, 10, 15, 50],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(max_depth=50, n_estimators=200, n_jobs=6,
                     random_state=91923)
ET Train: 0.9953196206429363
ET Test: 0.9272257111193203
----


This looks like it actually worked. Lesson learned: default values can lead to overfit and cause better models to be 'hidden'.

In [55]:
# Extra Trees: larger ensembles (150-300 trees) with depth limits from 25 to 75
et_vect_params = {
    'et__n_estimators' : [150, 200, 250, 300],
    'et__max_depth' : [25, 50, 75],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(max_depth=75, n_estimators=300, n_jobs=6,
                     random_state=91923)
ET Train: 0.9993841606109126
ET Test: 0.9335057258958256
----


In [56]:
# Extra Trees: 300-500 trees with depth limits from 70 to 80
et_vect_params = {
    'et__n_estimators' : [300, 400, 500],
    'et__max_depth' : [70, 75, 80],
}

et_grid = GridSearchCV(estimator = et_pipe, param_grid = et_vect_params, cv = 5, n_jobs = 6)
et_grid.fit(X_train, y_train)

print(et_grid.best_estimator_.named_steps['et'])
et_train_acc = et_grid.score(X_train, y_train)
et_test_acc = et_grid.score(X_test, y_test)
print("ET Train: {}\nET Test: {}\n----".format(et_train_acc, et_test_acc))

ExtraTreesClassifier(max_depth=80, n_estimators=500, n_jobs=6,
                     random_state=91923)
ET Train: 0.9995073284887301
ET Test: 0.9331363132619136
----


We're not really seeing any gains here, and it's quite hard to decide which model to use. For this reason, let's choose the model with the lowest overfit, even if only slightly, because we cannot guarantee that further gains (or even the gains so far) are truly better than what we had before. We're going to proceed with the 75 max depth and 300 estimators model for the above reasons. The model is still very overfit, but that comes with the territory, as we saw near-perfect training accuracy across the board.

## Training Models and Pickling

In [58]:
# Fit the three final pipelines on the training split with the hyperparameters chosen above.

# Extra Trees: 300 trees with a depth limit of 75
et_pipe = Pipeline(
    [
        ('ct', et_col_trans),
        ('et', ExtraTreesClassifier(max_depth = 75, n_estimators = 300, random_state = rs, n_jobs = 6))
    ]
).fit(X_train, y_train)

# Radial SVC: C = 10 with the default gamma.
# Uses rsvc_col_trans; the previous name `rsvc_col_transcol_trans` was never defined and
# raised a NameError, so this pipeline could not be built.
rsvc_pipe = Pipeline(
    [
        ('ct', rsvc_col_trans),
        ('rsvc', SVC(kernel = 'rbf', C = 10))
    ]
).fit(X_train, y_train)

# Adaptive Boosting: 475 boosting rounds
adbc_pipe = Pipeline(
    [
        ('ct', adb_col_trans),
        ('adbc', AdaBoostClassifier(n_estimators = 475))
    ]
).fit(X_train, y_train)

In [60]:
# Serialise each model pipeline to its own file under ./pickle_jar
pickle_targets = (
    ('./pickle_jar/ExtraTrees.pkl', et_pipe),
    ('./pickle_jar/RSVC.pkl', rsvc_pipe),
    ('./pickle_jar/AdaBoost.pkl', adbc_pipe),
)

for pickle_path, model_pipe in pickle_targets:
    with open(pickle_path, 'wb') as f:
        pickle.dump(model_pipe, f)