In [1]:
import pandas as pd
import numpy as np

import re
from text_helper import word_splitter, sentence_count, stop_word_counter, punc_counter
from nltk.corpus import stopwords


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.compose import make_column_selector, ColumnTransformer

rs = 91923 

In [3]:
df = pd.read_csv('../data/data_final.csv')

df['response_cleaned'] = df['response'].apply(lambda x: re.sub('[\\n]{2,}', '\n', x))
df['response_cleaned'] = df['response_cleaned'].apply(lambda x: re.sub(r"\/*u\/[\S]+", 'they', x))

df['num_words'] = df['response_cleaned'].apply(lambda x: word_splitter(x)[0])
df['stop_words'] = df['response_cleaned'].apply(stop_word_counter)
df['num_sentences'] = df['response_cleaned'].apply(lambda x: sentence_count(x)[0]) 
df['sentence_length'] = df['response_cleaned'].apply(lambda x: sentence_count(x)[1])
df['word_length'] = df['response_cleaned'].apply(lambda x: word_splitter(x)[1])

punc_count = df['response_cleaned'].apply(punc_counter)
df['punc_ratio'] = punc_count / df['num_words']

X = df[['subreddit', 'response_cleaned', 'num_words', 'stop_words', 'num_sentences', 'sentence_length', 'word_length', 'punc_ratio']]
y = df['fake']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = rs, stratify = y)

In [4]:
# Recreating out pipelines and our pre-tuning baselines for our three models
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler()) # Since we're done testing with Naive Bayes we can use the default parameters here
    ]
)

text_pipe = Pipeline(
    [
        ('vect', CountVectorizer())
    ]
)

subreddit_pipe = Pipeline(
    [
        ('ohe', OneHotEncoder())
    ]
)

col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)


et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = 6))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_pipe.fit(X_train, y_train)
rsvc_pipe.fit(X_train, y_train)
adbc_pipe.fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_pipe.score(X_train, y_train), et_pipe.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_pipe.score(X_train, y_train), rsvc_pipe.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adbc_pipe.score(X_train, y_train), adbc_pipe.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9043221278167713
----
RSVC Train: 0.9125508067495997
RSVC Test: 0.8880679719246398
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [5]:
# Redefining our text_pipe from before to include a standard scaler
text_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('ss', StandardScaler(with_mean = False)) # scaling with mean doesn't work on sparse arrays so we'll have to do without
    ]
)

In [6]:
vect_params = {
    'ct__text__vect' : [TfidfVectorizer()]
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 2,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 2,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 2,
    cv = 5
).fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9009974141115626
----
RSVC Train: 0.9338588496120211
RSVC Test: 0.902844477281123
----
AdaBoost Train: 0.9022047050129326
AdaBoost Test: 0.8766161802733653
----


In [None]:
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])

CountVectorizer()
TfidfVectorizer()
CountVectorizer()


We see that our tree-based models prefer a count vectorizer but our support vector classifier performs better with the TF-IDF vectorizer. However, we see a sharp performance drop off for the support vector model now that our data is scaled which is disconcerting. Our actual performance for the other two models is identical with or without scaling so let's not scale since it doesn't give us anything it seems.

In [None]:
meta_pipe = Pipeline(
    [
        ('ss', StandardScaler()) # No need to worry about negative values anymore
    ]
)

text_pipe = Pipeline(
    [
        ('vect', CountVectorizer()) # Removing scaling
    ]
)

subreddit_pipe = Pipeline(
    [
        ('ohe', OneHotEncoder())
    ]
)

col_trans = ColumnTransformer(
    [
        ('meta', meta_pipe, make_column_selector(dtype_include = np.number)),
        ('text', text_pipe, 'response_cleaned'),
        ('ohe', subreddit_pipe, ['subreddit'])
    ],
    n_jobs = 6
)

In [None]:
vect_params = {
    'ct__text__vect' : [TfidfVectorizer(), CountVectorizer()]
}

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

In [None]:
vect_params = {
    'ct__text__vect__ngram_range' : ((1,1), (1,2), (1,3)),
    'ct__text__vect__stop_words' : (None, 'english', stopwords.words('english'))
}

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9128186183967492
----
RSVC Train: 0.9100874491932504
RSVC Test: 0.8862209087550794
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [None]:
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])

CountVectorizer(ngram_range=(1, 2))
CountVectorizer()
CountVectorizer()


Interestingly, only our Extra Trees models prefers the non-default parameters. Both Radial SVC and Adaptive Boosting prefer no stop words excluded and no ngrams (single sets of words instead of pairs). Extra Trees prefers bigrams included but otherwise still prefers not removing stop words. This is to be expected since we saw a large difference in stop word usage for real comments vs AI generated ones.

In [None]:
vect_params = {
    'ct__text__vect__max_features' : (None, 1000, 2000, 4000, 5000)
}

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9179903952715183
----
RSVC Train: 0.9061460771030915
RSVC Test: 0.8899150350942002
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [None]:
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])

CountVectorizer(max_features=2000)
CountVectorizer(max_features=2000)
CountVectorizer(max_features=1000)


Let's try to narrow this down a bit further, maybe they'll end up wanting the same number of max features once we've gotten more specific.

In [None]:
vect_params = {
    'ct__text__vect__max_features' : (1000, 1500, 2000, 2500, 3000)
}

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9157739194680458
----
RSVC Train: 0.9038058874245597
RSVC Test: 0.8902844477281123
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [None]:
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])

CountVectorizer(max_features=2500)
CountVectorizer(max_features=1500)
CountVectorizer(max_features=1000)


We actually see more divergence in prefered number of features here. Let's do one last set of tests before moving on.

In [None]:
vect_params = {
    'ct__text__vect__max_features' : (1000, 1250, 1500, 2250, 2500, 2750)
}

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

ET Train: 1.0
ET Test: 0.9157739194680458
----
RSVC Train: 0.9038058874245597
RSVC Test: 0.8902844477281123
----
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [None]:
print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])

CountVectorizer(max_features=2500)
CountVectorizer(max_features=1500)
CountVectorizer(max_features=1000)


We have the same parameters here as before so let's move on to the next few parameters.

In [None]:
vect_params = {
    'ct__text__vect__max_df' : (1.0, .9, .8, .7),
    'ct__text__vect__min_df' : (0, 0.05, 0.1, 0.2)
}

et_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('et', ExtraTreesClassifier(random_state = rs))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

adbc_pipe = Pipeline(
    [
        ('ct', col_trans),
        ('adbc', AdaBoostClassifier())
    ]
)

et_grid = GridSearchCV(
    et_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

rsvc_grid = GridSearchCV(
    rsvc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

adb_grid = GridSearchCV(
    adbc_pipe,
    param_grid = vect_params,
    n_jobs = 6,
    cv = 5
).fit(X_train, y_train)

print(et_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
print(rsvc_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
print(adb_grid.best_estimator_['ct'].transformers[1][1]['vect'])
print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

CountVectorizer(min_df=0.05)
ET Train: 1.0
ET Test: 0.9146656815663096
----
CountVectorizer(max_df=0.7, min_df=0)
RSVC Train: 0.9300406453996798
RSVC Test: 0.9009974141115626
----
CountVectorizer(min_df=0)
AdaBoost Train: 0.8949378002217022
AdaBoost Test: 0.888806797192464
----


In [None]:
# vect_params = [
#     {
#         'ct__text__vect__max_features' : (None, 1000, 2000, 4000, 5000),
#         'ct__text__vect__ngram_range' : ((1,1), (1,2), (1,3)),
#         'ct__text__vect__stop_words' : (None, 'english', stopwords.words('english')),
#         'ct__text__vect__max_df' : (1.0, .9, .8, .7),
#         'ct__text__vect__min_df' : (0, 0.05, 0.1, 0.2)
#     },
#     {
#         'vect' : [TfidfVectorizer()],
#         'ct__text__vect__max_features' : (None, 1000, 2000, 4000, 5000),
#         'ct__text__vect__ngram_range' : ((1,1), (1,2), (1,3)),
#         'ct__text__vect__stop_words' : (None, 'english', stopwords.words('english')),
#         'ct__text__vect__max_df' : (1.0, .9, .8, .7),
#         'ct__text__vect__min_df' : (0, 0.05, 0.1, 0.2)
#     }
# ]

# et_pipe = Pipeline(
#     [
#         ('ct', col_trans),
#         ('et', ExtraTreesClassifier(random_state = rs, n_jobs = -1))
#     ]
# )

# rsvc_pipe = Pipeline(
#     [
#         ('ct', col_trans),
#         ('rsvc', SVC(kernel = 'rbf'))
#     ]
# )

# adbc_pipe = Pipeline(
#     [
#         ('ct', col_trans),
#         ('adbc', AdaBoostClassifier())
#     ]
# )

# et_grid = GridSearchCV(
#     et_pipe,
#     param_grid = vect_params,
#     n_jobs = -1,
#     cv = 5
# ).fit(X_train, y_train)

# rsvc_grid = GridSearchCV(
#     rsvc_pipe,
#     param_grid = vect_params,
#     n_jobs = -1,
#     cv = 5
# ).fit(X_train, y_train)

# adb_grid = GridSearchCV(
#     adbc_pipe,
#     param_grid = vect_params,
#     n_jobs = -1,
#     cv = 5
# ).fit(X_train, y_train)

# print("ET Train: {}\nET Test: {}\n----".format(et_grid.score(X_train, y_train), et_grid.score(X_test, y_test)))
# print("RSVC Train: {}\nRSVC Test: {}\n----".format(rsvc_grid.score(X_train, y_train), rsvc_grid.score(X_test, y_test)))
# print("AdaBoost Train: {}\nAdaBoost Test: {}\n----".format(adb_grid.score(X_train, y_train), adb_grid.score(X_test, y_test)))

KeyboardInterrupt: 