In [87]:
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [74]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wiledw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [75]:
# Load dataset
df = pd.read_csv('spam_ham_dataset.csv')

In [76]:
# Remove \r\n
df['text'] = df['text'].apply(lambda x: x.replace('\r\n', ' '))

In [77]:
# Check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [78]:
# Preprocessing
# 1. Remove all the punctuations
# 2. Make everything into lowercase
# 3. Stemming process

stemmer = PorterStemmer()
corpus = []

stopwords_set = set(stopwords.words('english'))

for i in range(len(df)):
    text = df['text'].iloc[i].lower()
    text = text.translate(str.maketrans('','',string.punctuation)).split()
    text = [stemmer.stem(word) for word in text if word not in stopwords_set]
    text = ' '.join(text)
    corpus.append(text)                                       

In [79]:
df.text.iloc[0]

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

In [80]:
corpus[0]

'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

In [81]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [82]:
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(X_train, y_train)

In [83]:
test_accuracy = clf.score(X_test, y_test) 
print("Model accuracy on test set: ", test_accuracy)

Model accuracy on test set:  0.9835748792270531


In [85]:
# Predictions on the training set
y_train_pred = clf.predict(X_train)

# Predictions on the testing set
y_test_pred = clf.predict(X_test)

# Calculate accuracy on both sets
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

(train_accuracy, test_accuracy)

(1.0, 0.9835748792270531)

In [88]:
# Perform K-fold cross-validation to check for overfit
scores = cross_val_score(clf, X, y, cv=5)
# Print the scores for each fold
print("Accuracy scores for each fold:", scores)
# Print the mean accuracy and the 95% confidence interval of the score estimate
print("Mean accuracy: %0.2f (± %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores for each fold: [0.98357488 0.97388781 0.97775629 0.96615087 0.9729207 ]
Mean accuracy: 0.97 (± 0.01)


In [72]:
# Try to classify random email from the dataset
email_id = 2
email_to_classify = df.text.values[email_id]

In [61]:
email_to_classify

"Subject: neon retreat ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute . on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about . i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a pote

In [62]:
email_text = email_to_classify.lower().translate(str.maketrans('','',string.punctuation)).split()
email_text = [stemmer.stem(word) for word in email_text if word not in stopwords_set]
email_text = ' '.join(email_text)

email_corpus = [email_text]
X_email = vectorizer.transform(email_corpus)

In [63]:
result = clf.predict(X_email)

In [64]:
if result[0] == 1 :
    print("Spam")
else:
    print("Not Spam")

Not Spam


In [65]:
df.iloc[email_id]

Unnamed: 0                                                 3624
label                                                       ham
text          Subject: neon retreat ho ho ho , we ' re aroun...
label_num                                                     0
Name: 2, dtype: object