In [42]:
# Import all required modules

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem.wordnet import WordNetLemmatizer

# Instantiate lemmetizer and stopwords
lem = WordNetLemmatizer()

In [70]:
# Read in the data

data = pd.read_csv('spam_ham_dataset.csv')
data = data.dropna()          # Drop any rows with missing data

# Keep labels represented as numbers
data = data.drop('label',axis=1)
# data['class'] = data['label'].astype(int) # Convert the label column to integers (may not be required)
print(f"Data types:\n{data.dtypes}\n\nClass counts:\n{data['class'].value_counts()}\n\nData:\n{data.head()}")

print(f"\nCheck for missing value:\n{data.isna().sum()}\n\nShape of data{data.shape}")

     label                                            message  class
0      ham  Subject: enron methanol ; meter # : 988291\r\n...      0
1      ham  Subject: hpl nom for january 9 , 2001\r\n( see...      0
2      ham  Subject: neon retreat\r\nho ho ho , we ' re ar...      0
3     spam  Subject: photoshop , windows , office . cheap ...      1
4      ham  Subject: re : indian springs\r\nthis deal is t...      0
...    ...                                                ...    ...
5166   ham  Subject: put the 10 on the ft\r\nthe transport...      0
5167   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...      0
5168   ham  Subject: calpine daily gas nomination\r\n>\r\n...      0
5169   ham  Subject: industrial worksheets for august 2000...      0
5170  spam  Subject: important online banking alert\r\ndea...      1

[5171 rows x 3 columns]
Data types:
message    object
class       int64
dtype: object

Class counts:
class
0    3672
1    1499
Name: count, dtype: int64

Data:
           

In [48]:
# Text preprocessing function

def text_process(df):
    df['message'] = df.message.str.lower()   # Convert to lowercase
    df['message'] = df.message.apply(lambda x: ' '.join([word.replace(',','').replace("'",'') for word in x.split()])) # Remove specific Punctuations
    df['message'] = df.message.apply(lambda x: ' '.join([lem.lemmatize(word) for word in x.split()])) #Lemmatize
    return df

In [49]:
# Preprocess text

data = text_process(data)

print(f"Preprocessed data:\n{data.head()}")

Preprocessed data:
                                             message  class
0  subject: enron methanol ; meter # : 988291 thi...      0
1  subject: hpl nom for january 9 2001 ( see atta...      0
2  subject: neon retreat ho ho ho we re around to...      0
3  subject: photoshop window office . cheap . mai...      1
4  subject: re : indian spring this deal is to bo...      0


In [50]:
# Import sklearn modules

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [52]:
# Separate to X and y

X = data.message
y = data['class']
X.head()

0    subject: enron methanol ; meter # : 988291 thi...
1    subject: hpl nom for january 9 2001 ( see atta...
2    subject: neon retreat ho ho ho we re around to...
3    subject: photoshop window office . cheap . mai...
4    subject: re : indian spring this deal is to bo...
Name: message, dtype: object

In [53]:
# Split data into train and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

# Instantiate TF-IDF vectorizer with stop_words and fit on train data
vectorize = TfidfVectorizer(stop_words='english',max_df=0.5)
vectorize.fit(X_train)

print(f"Number of data points for training: {X_train.shape}")

Number of data points for training: (3878,)


In [54]:
# Transform train and test data to tfidf matrices

X_train = vectorize.transform(X_train)
X_test = vectorize.transform(X_test)

print(f"Shape{X_train.todense().shape}\n\nNumber of words in vocabilary: {len(vectorize.vocabulary_)}")

Shape(3878, 40693)

Number of words in vocabilary: 40693


In [55]:
# Print sample of vocabulary
count = 0
for key in vectorize.vocabulary_:
    if count < 11:
        print(key,"  ",vectorize.vocabulary_[key])
        count = count + 1
    else:
        break
    
        
    

buy    8972
cialis    10331
phentermine    28899
viagra    38389
levitra    23604
valium    38112
xanax    39837
prescription    29822
doctor    14081
pharmacy    28876
overnight    28019


In [56]:
# Convert to array matrices

X_train = X_train.toarray()
X_test =  X_test.toarray()

print(f"Glimpse of array:\n{X_train[:5]}")

Glimpse of array:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.19046898 0.         ... 0.         0.         0.        ]
 [0.         0.19470216 0.         ... 0.         0.         0.        ]
 [0.         0.37625006 0.         ... 0.         0.         0.        ]]


In [57]:
# Model with Multinomomial Naive Bayes and metrics

mnb = MultinomialNB() 
mnb.fit(X_train,y_train)
mnb_pred = mnb.predict(X_test)

print(f"Accuracy of model: {accuracy_score(mnb_pred,y_test)}\n\nConfusion Matrix:\n{confusion_matrix(mnb_pred,y_test)}")

Accuracy of model: 0.9296210363495746

Confusion Matrix:
[[929  90]
 [  1 273]]


In [58]:
# Model with Gaussian Naive Bayes and metrics

gnb =  GaussianNB()
gnb.fit(X_train,y_train)
gnb_pred = gnb.predict(X_test)

print(f"Accuracy of model: {accuracy_score(gnb_pred,y_test)}\n\nConfusion Matrix:\n{confusion_matrix(gnb_pred,y_test)}")

Accuracy of model: 0.9466357308584686

Confusion Matrix:
[[899  38]
 [ 31 325]]


In [59]:
# Model with RFC and metrics

rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)

print(f"Accuracy of model: {accuracy_score(rfc_pred,y_test)}\n\nConfusion Matrix:\n{confusion_matrix(rfc_pred,y_test)}")

Accuracy of model: 0.9791183294663574

Confusion Matrix:
[[909   6]
 [ 21 357]]


In [67]:
# New data to predict on

email = ["Hello George, how about a game of tennis tomorrow?",
         "Hello, click here if you want drugs tonight",
         "We offer free viagra!!! Click here now!!!",
         "Dear Sara, I prepared the annual report.",
         "Hi David, will we go for cinema tonight?",
         "Best holidays offers only here!!!",'Sir, Waiting for your mail.',
         '#@Photoshop a fake image!',
         'No problem. How are you doing?']

In [68]:
# Create a dataframe for new data
emails = pd.DataFrame(email,columns=['message'])

# Preprocess the same way as the original
emails = text_process(emails)

print(emails)

                                             message
0  hello george how about a game of tennis tomorrow?
1          hello click here if you want drug tonight
2          we offer free viagra!!! click here now!!!
3            dear sara i prepared the annual report.
4            hi david will we go for cinema tonight?
5                    best holiday offer only here!!!
6                         sir waiting for your mail.
7                          #@photoshop a fake image!
8                     no problem. how are you doing?


In [69]:
# TF-IDF with the same vectorizer (always use the vectorizer that was used to fit the training data)
# For me the best model was the Multinomial Naive Bayes with RandomForest being a very close 2nd.

mapping = {1:"spam",0:"ham"}

examples = vectorize.transform(emails['message'])
predictions = mnb.predict(examples)

# Print predictions

for i,j in enumerate(predictions):
    print(emails['message'].loc[i] + 5*" " + mapping[j])

hello george how about a game of tennis tomorrow?     ham
hello click here if you want drug tonight     spam
we offer free viagra!!! click here now!!!     spam
dear sara i prepared the annual report.     ham
hi david will we go for cinema tonight?     ham
best holiday offer only here!!!     spam
sir waiting for your mail.     ham
#@photoshop a fake image!     spam
no problem. how are you doing?     ham
