In [1]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs4
import matplotlib.pyplot as plt

# Importing and cleaning data
Only run when adding new stuff to cleaner, otherwise use spam_clean.csv

In [None]:
df_1 = pd.read_csv('spam.csv', usecols=[0,1])
df_1.columns=['spam', 'text']

df_1['spam'] = df_1['spam'].apply(lambda x: 1 if x=='spam' else 0) # spam = 1, ham = 0

In [None]:
df_1.isnull().sum()

In [None]:
stops = stopwords.words("english")
porter = PorterStemmer()

nr_reviews = df_1['text'].size

def clean_text(text, index):
    text = bs4(text).get_text()
    words = word_tokenize(text)
    words_no_punc = [word.lower() for word in words if word.isalpha()]
    no_stop = [word for word in words_no_punc if word not in stops]
    stems = [porter.stem(word) for word in no_stop]
    clean = ' '.join(stems)

    index = index + 1
    if ((index)%500 == 0):
        print('\r', end='')
        progress = int(100*index/nr_reviews)
        print(f'Processing: {progress}%', end=' ')

    return clean

df_1["clean"] = df_1.apply(lambda row: clean_text(row['text'], row.name), axis=1)
print('\rProcessing: 100%')
df_1.to_csv('spam_clean.csv', index=False)
print('Done!')

# Train, test and vectorizing

In [2]:
df = pd.read_csv('spam_clean.csv', usecols=["spam", "clean"])
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) # removing NaNs

X_train, X_test, y_train, y_test = train_test_split(df['clean'], df['spam'], test_size=0.2)

In [3]:
# Bag of Words
bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [4]:
# TF-IDF
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic regression
BoW: 98%, TFIDF: 96% accuracy

In [6]:
# BoW
logreg_bow = LogisticRegression(max_iter=1000, verbose=2)
logreg_bow.fit(X_train_bow, y_train)
pred_logreg_bow = logreg_bow.predict(X_test_bow)

print(classification_report(pred_logreg_bow, y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       980
           1       0.89      0.98      0.94       132

    accuracy                           0.98      1112
   macro avg       0.94      0.98      0.96      1112
weighted avg       0.99      0.98      0.98      1112

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [7]:
# TFIDF
logreg_tfidf = LogisticRegression(max_iter=1000, verbose=2)
logreg_tfidf.fit(X_train_tfidf, y_train)
pred_logreg_tfidf = logreg_tfidf.predict(X_test_tfidf)

print(classification_report(pred_logreg_tfidf, y_test))

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       997
           1       0.75      0.96      0.84       115

    accuracy                           0.96      1112
   macro avg       0.87      0.96      0.91      1112
weighted avg       0.97      0.96      0.97      1112

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


# ANN
BoW: 99%, TFIDF: 

In [9]:
ann_bow = Sequential()
ann_bow.add(Dense(10, activation = 'relu'))
ann_bow.add(Dense(10, activation = 'relu'))
ann_bow.add(Dense(1, activation = 'sigmoid'))
ann_bow.compile(optimizer = 'Adam', loss ='binary_crossentropy', metrics = ['accuracy'])
ann_bow.fit(X_train_bow.toarray(), y_train, batch_size=32, epochs=5, verbose=2)

pred_ann_bow = ann_bow.predict(X_test_bow) > 0.5
print(classification_report(pred_ann_bow, y_test))

Epoch 1/5
139/139 - 1s - loss: 0.4457 - accuracy: 0.8747
Epoch 2/5
139/139 - 0s - loss: 0.1336 - accuracy: 0.9732
Epoch 3/5
139/139 - 0s - loss: 0.0499 - accuracy: 0.9894
Epoch 4/5
139/139 - 0s - loss: 0.0259 - accuracy: 0.9942
Epoch 5/5
139/139 - 0s - loss: 0.0156 - accuracy: 0.9966
              precision    recall  f1-score   support

       False       1.00      0.99      0.99       975
        True       0.93      0.99      0.96       137

    accuracy                           0.99      1112
   macro avg       0.97      0.99      0.98      1112
weighted avg       0.99      0.99      0.99      1112



In [11]:
ann_tfidf = Sequential()
ann_tfidf.add(Dense(10, activation = 'relu'))
ann_tfidf.add(Dense(10, activation = 'relu'))
ann_tfidf.add(Dense(1, activation = 'sigmoid'))
ann_tfidf.compile(optimizer = 'Adam', loss ='binary_crossentropy', metrics = ['accuracy'])
ann_tfidf.fit(X_train_tfidf.toarray(), y_train, batch_size=32, epochs=5, verbose=2)

pred_ann_tfidf = ann_tfidf.predict(X_test_tfidf) > 0.5
print(classification_report(pred_ann_tfidf, y_test))

Epoch 1/5
139/139 - 1s - loss: 0.5549 - accuracy: 0.8439
Epoch 2/5
139/139 - 0s - loss: 0.2149 - accuracy: 0.9395
Epoch 3/5
139/139 - 0s - loss: 0.0883 - accuracy: 0.9784
Epoch 4/5
139/139 - 0s - loss: 0.0503 - accuracy: 0.9885
Epoch 5/5
139/139 - 0s - loss: 0.0333 - accuracy: 0.9919


InvalidArgumentError: indices[2] = [1,5015] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]