# Modeling: Discord Spam Detector

### Tamara Frances

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline

In [2]:
# Parse the workbook once and take both non-spam sheets from that single read,
# instead of opening and parsing the same .xlsx file once per sheet.
# Passing a list to sheet_name returns a dict keyed by sheet name.
notspam_sheets = pd.read_excel(
    '../data/data-for-capstone-limited.xlsx',
    sheet_name=['Crypto - not spam', 'Not spam'],
)
df_notspam1 = notspam_sheets['Crypto - not spam']  # crypto chat, no spam
df_notspam2 = notspam_sheets['Not spam']  # non-crypto chat, no spam

In [3]:
# Rows of the two non-spam sheets that have no value in the 'Spam?' column
# receive the label 'N'.
for notspam_frame in (df_notspam1, df_notspam2):
    notspam_frame['Spam?'] = notspam_frame['Spam?'].fillna('N')

In [4]:
# Stack the two non-spam frames row-wise into a single non-spam frame.
notspam = [df_notspam1, df_notspam2]
df_notspam = pd.concat(objs=notspam, axis=0)

In [5]:
# Load the sheet named 'Spam' from the same workbook.
df_spam = pd.read_excel(
    '../data/data-for-capstone-limited.xlsx',
    sheet_name='Spam',
)

In [6]:
# Join the non-spam and spam frames into one modelling frame and normalise
# the column names to 'text' and 'spam'.
#
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source frame keeps its own row labels, so labels repeat in the result
# and label-based lookups such as df.loc[0] return several rows.
dataframes = [df_notspam, df_spam]
df = (
    pd.concat(dataframes, ignore_index=True)
    .rename(columns={'Text': 'text', 'Spam?': 'spam'})
)

df.head()

Unnamed: 0,text,spam
0,"they got a privacy setting on there, you might...",N
1,Not sure if anyone is still interested in step...,N
2,did anyone win the raffle for Akumu? Winners h...,N
3,Good stuff! We should see some wild price acti...,N
4,Makes me happy I'm still holding my foxes and ...,N


<br>

---

<br>

## Baseline
<br>

In [7]:
# Feature: the message text. Target: the 'spam' label column.
X, y = df['text'], df['spam']

# Stratify on the label so both splits keep the same class proportions;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [8]:
# Baseline: relative frequency of each label in the target. The share of the
# most frequent label is the accuracy obtained by always predicting that label,
# so it is the number the models below need to beat.
y.value_counts(normalize=True)


N    0.87963
Y    0.12037
Name: spam, dtype: float64

<br>
<br>

---

<br>
<br>

## Models
<br>

In [9]:
# Learn the bag-of-words vocabulary from the training messages only, then
# encode both splits with that same fitted vocabulary.
cv = CountVectorizer()
cv.fit(X_train)
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [10]:
# Untuned logistic regression on the count features (max_iter set to 10_000).
# The cell displays the pair (train accuracy, test accuracy).
lr = LogisticRegression(max_iter=10_000).fit(X_train_cv, y_train)
(lr.score(X_train_cv, y_train), lr.score(X_test_cv, y_test))

(0.9938271604938271, 0.9135802469135802)

In [22]:
# Grid search for logistic regression. The vectorizer is a pipeline step, so
# it is refit together with the classifier for every candidate and CV fold,
# and the search is fed raw text (X_train) rather than the pre-built counts.
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression(max_iter=10_000)),
])

# Keys follow the '<step name>__<parameter>' convention of Pipeline.
parameters = dict(
    cv__min_df=(1, 2),
    cv__ngram_range=((1, 1), (1, 2)),
    lr__C=[0.25, 0.5, 0.75, 1.0],
)

gs_lr = GridSearchCV(estimator=pipeline, param_grid=parameters)
gs_lr.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words='english')),
                                       ('lr',
                                        LogisticRegression(max_iter=10000))]),
             param_grid={'cv__min_df': (1, 2),
                         'cv__ngram_range': ((1, 1), (1, 2)),
                         'lr__C': [0.25, 0.5, 0.75, 1.0]})

In [23]:
# Score each model once per split and reuse the values. Every .score() call
# re-predicts the whole split, and the test split was previously evaluated
# twice for the tuned model (once to print it, once for the difference).
gs_lr_train_score = gs_lr.score(X_train, y_train)
gs_lr_test_score = gs_lr.score(X_test, y_test)
lr_test_score = lr.score(X_test_cv, y_test)

# NOTE(review): the untuned model was fit on counts from CountVectorizer()
# with no stop-word list, while the tuned pipeline uses stop_words='english',
# so the difference below reflects that change as well as the grid search.
print('gs_lr train score : ', gs_lr_train_score)
print('gs_lr test score : ', gs_lr_test_score)
print('The accuracy score with hyperparameter tuning increased by : '
      f'{round(gs_lr_test_score - lr_test_score, 4)}')

gs_lr train score :  0.9732510288065843
gs_lr test score :  0.9135802469135802
The accuracy score with hyperparameter tuning increased by : 0.0


<br>
<br>

---

<br>
<br>

In [24]:
# Untuned random forest on the count features; random_state pins the forest's
# randomness. The cell displays the pair (train accuracy, test accuracy).
rfc = RandomForestClassifier(random_state=42).fit(X_train_cv, y_train)
(rfc.score(X_train_cv, y_train), rfc.score(X_test_cv, y_test))

(1.0, 0.9074074074074074)

In [27]:
# Grid search for the random forest. The vectorizer is a pipeline step, so the
# search is fed raw text (X_train) and refits the vectorizer per candidate/fold.
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer(stop_words='english')),
    ('rfc', RandomForestClassifier(random_state=42)),
])

# Keys follow the '<step name>__<parameter>' convention of Pipeline.
parameters = dict(
    cv__min_df=(1, 3),
    cv__ngram_range=((1, 1), (1, 2)),
    rfc__n_estimators=(300, 500),
    rfc__max_depth=(None, 3, 5),
    rfc__min_samples_leaf=(1, 3),
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
gs_rfc = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)
gs_rfc.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words='english')),
                                       ('rfc',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'cv__min_df': (1, 3),
                         'cv__ngram_range': ((1, 1), (1, 2)),
                         'rfc__max_depth': (None, 3, 5),
                         'rfc__min_samples_leaf': (1, 3),
                         'rfc__n_estimators': (300, 500)})

In [28]:
# Score each model once per split and reuse the values. Every .score() call
# re-predicts the whole split, and the test split was previously evaluated
# twice for the tuned forest (once to print it, once for the difference).
gs_rfc_train_score = gs_rfc.score(X_train, y_train)
gs_rfc_test_score = gs_rfc.score(X_test, y_test)
rfc_test_score = rfc.score(X_test_cv, y_test)

# NOTE(review): the untuned model was fit on counts from CountVectorizer()
# with no stop-word list, while the tuned pipeline uses stop_words='english',
# so the difference below reflects that change as well as the grid search.
print('gs_rfc train score : ', gs_rfc_train_score)
print('gs_rfc test score : ', gs_rfc_test_score)
print('The accuracy score with hyperparameter tuning increased by : '
      f'{round(gs_rfc_test_score - rfc_test_score, 4)}')

gs_rfc train score :  0.9958847736625515
gs_rfc test score :  0.9197530864197531
The accuracy score with hyperparameter tuning increased by : 0.0123


<br>
<br>

---

<br>
<br>

In [33]:
# Untuned multinomial naive Bayes on the count features.
# The cell displays the pair (train accuracy, test accuracy).
mnb = MultinomialNB().fit(X_train_cv, y_train)
(mnb.score(X_train_cv, y_train), mnb.score(X_test_cv, y_test))

(0.9958847736625515, 0.9012345679012346)

In [34]:
# Grid search for multinomial naive Bayes. The vectorizer is a pipeline step,
# so the search is fed raw text (X_train) and refits it per candidate/fold.
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer(stop_words='english')),
    ('mnb', MultinomialNB()),
])

# Keys follow the '<step name>__<parameter>' convention of Pipeline.
parameters = dict(
    cv__min_df=(1, 2, 3),
    cv__ngram_range=((1, 1), (1, 2), (2, 2)),
    mnb__alpha=[0.01, 0.05, 0.1],
)

# n_jobs=-1 runs the candidate fits in parallel on all available cores.
gs_mnb = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)
gs_mnb.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('cv',
                                        CountVectorizer(stop_words='english')),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'cv__min_df': (1, 2, 3),
                         'cv__ngram_range': ((1, 1), (1, 2), (2, 2)),
                         'mnb__alpha': [0.01, 0.05, 0.1]})

In [35]:
# Score each model once per split and reuse the values. Every .score() call
# re-predicts the whole split, and the test split was previously evaluated
# twice for the tuned model (once to print it, once for the difference).
gs_mnb_train_score = gs_mnb.score(X_train, y_train)
gs_mnb_test_score = gs_mnb.score(X_test, y_test)
mnb_test_score = mnb.score(X_test_cv, y_test)

# NOTE(review): the untuned model was fit on counts from CountVectorizer()
# with no stop-word list, while the tuned pipeline uses stop_words='english',
# so the difference below reflects that change as well as the grid search.
print('gs_mnb train score : ', gs_mnb_train_score)
print('gs_mnb test score : ', gs_mnb_test_score)
print('The accuracy score with hyperparameter tuning increased by : '
      f'{round(gs_mnb_test_score - mnb_test_score, 4)}')

gs_mnb train score :  1.0
gs_mnb test score :  0.9320987654320988
The accuracy score with hyperparameter tuning increased by : 0.0309
