In [64]:
import pandas as pd
import nltk
import numpy as np

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.svm import SVC

In [65]:
#nltk.download('wordnet')
#nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [66]:
path = "/content/drive/MyDrive/Career Growth/Projects/Spam NLP Classification/Spam Email raw text for NLP.csv"
df = pd.read_csv(path)

df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [67]:
df.tail()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0
5795,0,"If you run Pick, and then use the ""New FTOC"" b...",00747.352d424267d36975a7b40b85ffd0885e


In [68]:
#counts of spam (0) vs not spam (1) emails

df ['CATEGORY'].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [69]:
# converting strings to tokens

tokenizer = nltk.RegexpTokenizer(r"\w+")

#testing token 
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [70]:
# making everything lowercase

test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [71]:
# linking similar words to each other, for example: rocks -> rock

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [72]:
# addressing stopwords, for example: from, it

stopwords = stopwords.words('english')

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [73]:
# function for data processing

def message_to_token_list(s):
  tokens = tokenizer.tokenize(s)
  lowercased_tokens = [t.lower() for t in tokens]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
  useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

  return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [74]:
#partition dataframe into train and test

df = df.sample(frac=1, random_state=1)
df = df.reset_index(drop=True)

split_index = int(len(df) * 0.8) #first 80% goes to training dataset, last 20% goes to testing dataset
train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(      CATEGORY                                            MESSAGE  \
 0            1  \n\n<HTML><FONT  BACK="#ffffff" style="BACKGRO...   
 1            1  <html><body bgColor="#CCCCCC" topmargin=1 onMo...   
 2            0  Quoting Paul Linehan (plinehan@yahoo.com):\n\n...   
 3            0  <a href=http://www.aaronsw.com/weblog/>\n\nAar...   
 4            0  Oh yeah, the link for more info:\n\n\n\nhttp:/...   
 ...        ...                                                ...   
 4631         0  Gregory Alan Bolcer:\n\n>I'm not sure since I ...   
 4632         1  New Account For: zzzz@spamassassin.taint.org\n...   
 4633         0  >>>>> "O" == Owen Byrne <owen@permafrost.net> ...   
 4634         0  This is an automated response to a message you...   
 4635         0  http://www.ouchytheclown.com/welcome.html\n\n\...   
 
                                    FILE_NAME  
 0     00118.141d803810acd9d4fc23db103dddfcd9  
 1     00463.0bc4e08af0529dd773d9f10f922547db  
 2     00358.8

In [75]:
token_counter = {}

for message in train_df['MESSAGE']:
  message_as_token_lst = message_to_token_list(message)

  for token in message_as_token_lst:
    if token in token_counter:
      token_counter[token] += 1
    else:
      token_counter[token] = 1

# count of total tokens
len(token_counter) 

86415

In [76]:
token_counter

{'html': 4175,
 'font': 35005,
 'back': 1055,
 'ffffff': 2535,
 'style': 3349,
 'background': 789,
 'color': 9642,
 'size': 13107,
 '3': 3581,
 'ptsize': 450,
 '12': 985,
 'b': 12856,
 'viagra': 66,
 '000000': 1923,
 '2': 7993,
 '10': 2182,
 'family': 1491,
 'sansserif': 314,
 'face': 9950,
 'arial': 6187,
 'lang': 419,
 '0': 9445,
 'br': 16013,
 'breakthrough': 22,
 'medication': 50,
 'impotence': 13,
 'delivered': 79,
 'mailbox': 71,
 'without': 658,
 'leaving': 50,
 'computer': 640,
 'simply': 377,
 'click': 2144,
 'href': 3875,
 'http': 14926,
 'host': 158,
 '1bulk': 12,
 'email': 4015,
 'software': 1129,
 'com': 11675,
 'ch4': 12,
 'pharm': 12,
 'blue': 181,
 'le': 680,
 '5': 2932,
 'minute': 366,
 'complete': 403,
 'line': 1307,
 'consultation': 68,
 'many': 1004,
 'case': 681,
 '24': 575,
 'nbsp': 9732,
 'hour': 589,
 'gt': 108,
 'website': 488,
 'treatment': 33,
 'compromised': 12,
 'sexual': 120,
 'function': 202,
 'convenient': 36,
 'affordable': 55,
 'confidential': 135,
 's

In [77]:
def keep_token(processed_token, threshold):
  if processed_token not in token_counter:
    return False
  else:
    return token_counter[processed_token] > threshold
  
keep_token('random', 100)

False

In [78]:
features = set()

for token in token_counter:
  if keep_token(token, 6000): #change the threshold number to see how model performs
    features.add(token)

features

{'0',
 '1',
 '2',
 '20',
 '3d',
 'align',
 'arial',
 'b',
 'br',
 'color',
 'com',
 'face',
 'font',
 'http',
 'list',
 'nbsp',
 'p',
 'size',
 'td',
 'tr',
 'width',
 'www'}

In [79]:
features = list(features)
features

['1',
 '0',
 '20',
 'list',
 'br',
 'td',
 'http',
 'b',
 'arial',
 'p',
 'width',
 'align',
 'com',
 'face',
 '2',
 'color',
 'size',
 'nbsp',
 'www',
 'tr',
 'font',
 '3d']

In [80]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping

{'1': 0,
 '0': 1,
 '20': 2,
 'list': 3,
 'br': 4,
 'td': 5,
 'http': 6,
 'b': 7,
 'arial': 8,
 'p': 9,
 'width': 10,
 'align': 11,
 'com': 12,
 'face': 13,
 '2': 14,
 'color': 15,
 'size': 16,
 'nbsp': 17,
 'www': 18,
 'tr': 19,
 'font': 20,
 '3d': 21}

In [81]:
message_to_token_list('3d b <br> .com bad font font com randoms')

['3d', 'b', 'br', 'com', 'bad', 'font', 'font', 'com', 'randoms']

In [82]:
# Bag of Words vector or representation

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1
  
  return count_vector

message_to_count_vector('3d b <br> .com bad font font com randoms')

array([0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 2., 0., 0., 0., 0.,
       0., 0., 0., 2., 1.])

In [83]:
# estimating if it's spam or not spam
# vectors with mainly zeros are probs low chance of being spam

message_to_count_vector(train_df['MESSAGE'].iloc[5])

array([0., 0., 0., 1., 0., 0., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [84]:
train_df.iloc[5]

# if CATEGORY = 1 then spam, if CATEGORY = 0 then not spam

CATEGORY                                                     1
MESSAGE      Low-Cost Term-Life Insurance!\n\nSAVE up to 70...
FILE_NAME               00303.7d749e4a46ceb169ea1af5b9e5ab39a9
Name: 5, dtype: object

In [85]:
def df_to_X_y(dff):
  y = dff['CATEGORY'].to_numpy().astype(int)

  message_col = dff['MESSAGE']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)
  
  X= np.array(count_vectors).astype(int)

  return X, y

In [86]:
X_train, y_train = df_to_X_y(train_df)

X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4636, 22), (4636,), (1160, 22), (1160,))

In [87]:
#from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0.        , 0.00307692, 0.        , ..., 0.        , 0.00245851,
        0.        ],
       [0.00363636, 0.        , 0.        , ..., 0.00787402, 0.00860479,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [88]:
# ML Classification Model
#Logistic Regression

# 0 = not spam, 1 = spam
lr = LogisticRegression().fit(X_train, y_train)
lr_report = classification_report(y_test, lr.predict(X_test))
print("Logistic Regression:\n", lr_report)

Logistic Regression:
               precision    recall  f1-score   support

           0       0.77      1.00      0.87       788
           1       0.99      0.36      0.53       372

    accuracy                           0.79      1160
   macro avg       0.88      0.68      0.70      1160
weighted avg       0.84      0.79      0.76      1160



In [89]:
# Compare logistic regression to random forest

rf = RandomForestClassifier().fit(X_train, y_train)
rf_report = classification_report(y_test, rf.predict(X_test))
print("Random Forest Classifier report:\n", rf_report)

Random Forest Classifier report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92       788
           1       0.88      0.74      0.81       372

    accuracy                           0.89      1160
   macro avg       0.88      0.85      0.86      1160
weighted avg       0.89      0.89      0.88      1160



In [90]:
# Compare previous models to Support Vector Machine

svm = SVC(kernel='linear', C=1.0, random_state=42)
svm.fit(X_train, y_train)
svm_report = classification_report(y_test, svm.predict(X_test))
print("SVM Classifier report:\n", svm_report)

SVM Classifier report:
               precision    recall  f1-score   support

           0       0.77      1.00      0.87       788
           1       0.99      0.37      0.54       372

    accuracy                           0.80      1160
   macro avg       0.88      0.69      0.71      1160
weighted avg       0.84      0.80      0.77      1160

