In [61]:
#dat[dat['response'].duplicated()] # The one duplicated response is the duplicated one from before
#re.sub(r"\/*u\/[\S]+", 'they', TEXT) # This works to replace usernames with they
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer

# Shared random seed: passed to train_test_split and to the tree-ensemble classifiers below
rs = 91923 

In [5]:
# Load the labelled responses from the project's data folder (relative path)
data_path = '../data/data_final.csv'
df = pd.read_csv(data_path)

In [8]:
# Features are the raw response text; the target is the `fake` flag
X, y = df['response'], df['fake']

# Hold out 25% for testing, stratified on the target so both splits keep the
# same class balance; the shared seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = rs,
    stratify = y,
)

In [10]:
# Proportion of each class in the training target
train_class_balance = y_train.value_counts(normalize = True)
train_class_balance

1    0.500062
0    0.499938
Name: fake, dtype: float64

Our baseline accuracy is 50%, since half of our data is AI-generated responses and the other half is real comments gathered from Reddit.

In [72]:
# Baseline pipelines: CountVectorizer token counts fed into each candidate
# classifier. Step names ('vect', 'mnb', ...) are kept stable so they can be
# used as parameter prefixes later.

# MultinomialNB and BernoulliNB accept scipy sparse input directly, so the
# vectorizer output is passed straight through. Densifying it first would only
# allocate an n_documents x vocabulary_size array for no benefit.
mnb_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('mnb', MultinomialNB())
    ]
)

bnb_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('bnb', BernoulliNB())
    ]
)

# GaussianNB does not accept sparse matrices, so this is the one pipeline that
# must convert the document-term matrix to a dense array before fitting.
gnb_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('densify', FunctionTransformer(lambda x: x.toarray(), accept_sparse = True)),
        ('gnb', GaussianNB())
    ]
)

# max_iter is set to 500 to give the solver a larger iteration budget
logr_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('logr', LogisticRegression(max_iter = 500, n_jobs = -1))
    ]
)

# Tree ensembles are seeded with the shared random state for reproducibility
rf_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('rf', RandomForestClassifier(random_state = rs, n_jobs = -1))
    ]
)

et_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = -1))
    ]
)

# Support vector classifiers with a linear and an RBF kernel
linear_svc_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('lsvc', SVC(kernel = 'linear'))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

In [73]:
# (report label, pipeline) pairs, in the order the scores are printed
count_pipes = [
    ('MNB', mnb_pipe),
    ('BNB', bnb_pipe),
    ('GNB', gnb_pipe),
    ('LogReg', logr_pipe),
    ('RF', rf_pipe),
    ('ET', et_pipe),
    ('LSVC', linear_svc_pipe),
    ('RSVC', rsvc_pipe),
]

# Fit every pipeline on the training split first ...
for _, pipe in count_pipes:
    pipe.fit(X_train, y_train)

# ... then report train and test accuracy side by side so overfitting is easy to spot
for label, pipe in count_pipes:
    train_acc = pipe.score(X_train, y_train)
    test_acc = pipe.score(X_test, y_test)
    print("{0} Train: {1}\n{0} Test: {2}\n----".format(label, train_acc, test_acc))

MNB Train: 0.8555240793201133
MNB Test: 0.7661618027336535
----
BNB Train: 0.6961448454243133
BNB Test: 0.6734392316217215
----
GNB Train: 0.9052839019583693
GNB Test: 0.5755448836350203
----
LogReg Train: 0.9959354600320236
LogReg Test: 0.879571481344662
----
RF Train: 1.0
RF Test: 0.8762467676394533
----
ET Train: 1.0
ET Test: 0.8943479867011451
----
LSVC Train: 0.9991378248552777
LSVC Test: 0.8507572958995198
----
RSVC Train: 0.906269244980909
RSVC Test: 0.8799408939785741
----


With no attempt at tuning or cleaning the data structurally, Multinomial Naive Bayes has the best test performance of the three Naive Bayes models but is moderately overfit, while Bernoulli Naive Bayes performs much worse but is barely overfit. Gaussian Naive Bayes defies all expectations and manages to perform worse than the other two on the test data while being the most overfit. Generally, this points towards Multinomial Naive Bayes being the best of the Naive Bayes approaches, but I'm not ready to commit to that just yet in case I can get the Bernoulli model to be somewhat competitive. Logistic Regression performs quite well, beating Multinomial Naive Bayes by 11 percentage points on the test set; however, it is very overfit, with a nearly perfect score on our training set. With proper regularization and other hyperparameter tuning, Logistic Regression could end up being one of the best models for detecting AI responses, but that will be investigated later on. Our ensemble classifiers perform similarly to Logistic Regression, with Extra Trees beating it slightly. Both have perfect scores on the training data, so tuning will be needed to address the overfitting. Linear SVC has good results but doesn't match our higher test scores and is also overfit. Radial SVC, however, is just a bit short of our best ensemble method (Extra Trees) on the test set while being much less overfit than the other high-scoring models. Radial SVC is a model that will definitely warrant further attention.

Before getting deeper into modelling, I want to try cleaning our responses slightly. There are two things I noted in our data. First, Reddit users tend to use two newline characters, since Reddit formatting requires it, while the AI responses typically don't. Second, real Reddit comments will sometimes reference other users by name in the format '/u/\<name here\>' or 'u/\<name here\>'. I plan to replace the former with a single newline character and the latter with 'they', and then see if that makes it a bit harder to predict which responses are fake.

In [243]:
# Flag responses containing a blank line (two consecutive newlines) once, then
# break the count down by class
has_double_newline = df['response'].str.contains('\n\n')
is_ai = df['fake'] == 1
is_real = df['fake'] == 0

total_count = df[has_double_newline].shape[0]
ai_count = df[has_double_newline & is_ai].shape[0]  # Not many AI models have the double newline
real_count = df[has_double_newline & is_real].shape[0]

print('Double newline count: {} \nA lot of observations have this double newline in them'.format(total_count))
print('AI double newline count: {}'.format(ai_count))
print('Real double newline count: {}'.format(real_count))

Double newline count: 2961 
A lot of observations have this double newline in them
AI double newline count: 218
Real double newline count: 2743


Most of our double newlines are in the real comments, so we'll proceed with collapsing them to a single newline in our responses.

In [209]:
# For some reason there are responses that use triple newline
has_triple_newline = df['response'].str.contains('\n\n\n')
df[has_triple_newline].shape

(165, 4)

In [210]:
# Collapse every run of two or more newlines into a single newline
newline_run = re.compile('[\\n]{2,}')
df['response_cleaned'] = df['response'].apply(lambda text: newline_run.sub('\n', text))

# Sanity check: no double newlines should remain (all gone now)
still_double = df['response_cleaned'].str.contains('\n\n')
df[still_double].shape

In [217]:
# Replace Reddit user mentions ('u/name' or '/u/name') with the pronoun 'they'.
# The match is anchored so that a 'u/' inside another token is left alone
# (e.g. 'you/me' or 'site.edu/page'), and it stops at the end of the username
# (letters, digits, '_' and '-') so trailing punctuation such as ',' or '.'
# is preserved.
user_mention = re.compile(r"(?<![\w/])/?u/[\w-]+")
df['response_cleaned'] = df['response_cleaned'].apply(lambda text: user_mention.sub('they', text))

In [218]:
# Rows whose cleaned text still contains 'u/'; an empty frame means none are left
still_has_user_ref = df['response_cleaned'].str.contains('u/')
df[still_has_user_ref]

Unnamed: 0,subreddit,response,fake,response_cleaned


With that out of the way, let's see if our performance or overfitting has changed on any of our models from earlier. The assumption is that the newlines were relevant, so our predictive power should have taken some hit, but overfitting may not be affected since these problems should have affected both training and test sets evenly.

In [219]:
# Same split as before, but on the cleaned response text
X, y = df['response_cleaned'], df['fake']

# Identical seed, test size and stratification, so the comparison with the raw
# text is like-for-like
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = rs,
    stratify = y,
)

In [220]:
# (report label, pipeline) pairs, in the order the scores are printed
count_pipes_cleaned = [
    ('MNB', mnb_pipe),
    ('BNB', bnb_pipe),
    ('GNB', gnb_pipe),
    ('LogReg', logr_pipe),
    ('RF', rf_pipe),
    ('ET', et_pipe),
    ('LSVC', linear_svc_pipe),
    ('RSVC', rsvc_pipe),
]

# Refit every pipeline on the current training split ...
for _, pipe in count_pipes_cleaned:
    pipe.fit(X_train, y_train)

# ... then report train and test accuracy for each one
for label, pipe in count_pipes_cleaned:
    train_acc = pipe.score(X_train, y_train)
    test_acc = pipe.score(X_test, y_test)
    print("{0} Train: {1}\n{0} Test: {2}\n----".format(label, train_acc, test_acc))

MNB Train: 0.8557704150757482
MNB Test: 0.7669006280014776
----
BNB Train: 0.6963911811799482
BNB Test: 0.6738086442556336
----
GNB Train: 0.9051607340805518
GNB Test: 0.5755448836350203
----
LogReg Train: 0.9959354600320236
LogReg Test: 0.8792020687107499
----
RF Train: 1.0
RF Test: 0.8825267824159586
----
ET Train: 1.0
ET Test: 0.8980421130402659
----
LSVC Train: 0.9991378248552777
LSVC Test: 0.8507572958995198
----
RSVC Train: 0.9065155807365439
RSVC Test: 0.8792020687107499
----


Our accuracy scores are about the same, so this cleaning probably wasn't necessary. Still, I can feel confident that if some later tuning would have left our models more reliant on the double newline, it won't be an issue now.

We've been using a count vectorizer so far, which has worked best for all the data in the labs and lessons we've worked with, but for the sake of it, let's see how a TF-IDF (term frequency–inverse document frequency) vectorizer works.

In [239]:
# Same set of classifiers as before, rebuilt on top of TfidfVectorizer so the
# two vectorizers can be compared. Step names are unchanged.

# MultinomialNB and BernoulliNB accept scipy sparse input directly, so the
# vectorizer output is passed straight through. Densifying it first would only
# allocate an n_documents x vocabulary_size array for no benefit.
mnb_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('mnb', MultinomialNB())
    ]
)

bnb_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('bnb', BernoulliNB())
    ]
)

# GaussianNB does not accept sparse matrices, so this is the one pipeline that
# must convert the document-term matrix to a dense array before fitting.
gnb_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('densify', FunctionTransformer(lambda x: x.toarray(), accept_sparse = True)),
        ('gnb', GaussianNB())
    ]
)

# max_iter is set to 500 to give the solver a larger iteration budget
logr_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('logr', LogisticRegression(max_iter = 500, n_jobs = -1))
    ]
)

# Tree ensembles are seeded with the shared random state for reproducibility
rf_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('rf', RandomForestClassifier(random_state = rs, n_jobs = -1))
    ]
)

et_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('et', ExtraTreesClassifier(random_state = rs, n_jobs = -1))
    ]
)

# Support vector classifiers with a linear and an RBF kernel
linear_svc_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('lsvc', SVC(kernel = 'linear'))
    ]
)

rsvc_pipe = Pipeline(
    [
        ('vect', TfidfVectorizer()),
        ('rsvc', SVC(kernel = 'rbf'))
    ]
)

# (report label, pipeline) pairs, in the order the scores are printed
tfidf_pipes = [
    ('MNB', mnb_pipe),
    ('BNB', bnb_pipe),
    ('GNB', gnb_pipe),
    ('LogReg', logr_pipe),
    ('RF', rf_pipe),
    ('ET', et_pipe),
    ('LSVC', linear_svc_pipe),
    ('RSVC', rsvc_pipe),
]

# Fit every pipeline on the training split first ...
for _, pipe in tfidf_pipes:
    pipe.fit(X_train, y_train)

# ... then report train and test accuracy for each one
for label, pipe in tfidf_pipes:
    train_acc = pipe.score(X_train, y_train)
    test_acc = pipe.score(X_test, y_test)
    print("{0} Train: {1}\n{0} Test: {2}\n----".format(label, train_acc, test_acc))

MNB Train: 0.9332430102229339
MNB Test: 0.8248984115256742
----
BNB Train: 0.6963911811799482
BNB Test: 0.6738086442556336
----
GNB Train: 0.9051607340805518
GNB Test: 0.5711119320280753
----
LogReg Train: 0.9389087326025373
LogReg Test: 0.8651643886220909
----
RF Train: 1.0
RF Test: 0.8836350203176949
----
ET Train: 1.0
ET Test: 0.8987809383080901
----
LSVC Train: 0.973518906269245
LSVC Test: 0.8714444033985962
----
RSVC Train: 0.9959354600320236
RSVC Test: 0.8762467676394533
----


Using a TF-IDF vectorizer instead of a count vectorizer gives mixed results: test accuracy is slightly worse for Gaussian Naive Bayes, Logistic Regression and radial SVC, essentially unchanged for Bernoulli Naive Bayes and the tree ensembles, and about 2 percentage points better for linear SVC. We haven't done any work tuning hyperparameters or scaling, so the fact that both vectorizers are mostly competitive means we'll want to continue to test with both. The standout here is Multinomial Naive Bayes, which improves by roughly 6 percentage points on the test set and 8 on the training set. No other model saw a similar change, but it does seem that if we wanted to consider a Multinomial Naive Bayes model for our final model, it would likely use a TF-IDF vectorizer.

In [286]:
# https://stackoverflow.com/questions/36216665/find-there-is-an-emoji-in-a-string-in-python3
# AI-labelled responses that contain the grinning-face emoji
has_grinning_face = df['response'].str.contains('😀')
is_ai_response = df['fake'] == 1
df[has_grinning_face & is_ai_response]

Unnamed: 0,subreddit,response,fake,response_cleaned


In [None]:
# Meta Features
# number of sentences
# average word length
# average sentence length
# punctuation count (exclude apostrophe?)
# newline count? might overlap with sentences
# presence of emoji
# 