In [2]:
### Import Libraries ###
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [3]:
### Text Cleaning Functions ###
def lemmatization2(text):    

    stemmer = WordNetLemmatizer()
    corpus_lem = []
    for i,row in enumerate(text):
        document = row.split()
        lem_doc=[]
        for token,tag in nltk.pos_tag(document):
            if tag.startswith('J'):
                lem_doc.append(stemmer.lemmatize(token,wordnet.ADJ))
            elif tag.startswith('V'):
                lem_doc.append(stemmer.lemmatize(token,wordnet.VERB))
            elif tag.startswith('R'):
                lem_doc.append(stemmer.lemmatize(token,wordnet.ADV))
            else:
                lem_doc.append(stemmer.lemmatize(token))        
                
        corpus_lem.append(' '.join(lem_doc))
        
    return corpus_lem

def remove_stopwords(text):
    nltk.download('stopwords')    
    stop_words = set(stopwords.words('english'))
    corpus_sw=[]
    for i in range(0,len(text)):
        review = [word for word in text[i].lower().split() if not word in stop_words]
        review = ' '.join(review)
        corpus_sw.append(review)     
        
    return corpus_sw


def clean_text(texts):
    
    clean = []
    for text in texts:
        # Removing the @
        text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
        # Removing the URL links
        text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
        # Keeping only letters
        text = re.sub(r"[^a-zA-Z.!?']", ' ', text)
        # Removing additional whitespaces
        text = re.sub(r" +", ' ', text)
        clean.append(text)
        
    return clean

In [6]:
# Read the data
raw_data = pd.read_csv('./spam_or_not_spam.csv')
# Drop the nans
raw_data = raw_data.dropna()
# Clean text
cleaned_text = clean_text(raw_data['email'].tolist())
# Remove stopwords
cleaned_text = remove_stopwords(cleaned_text)
# Lemmatize
cleaned_text = lemmatization2(cleaned_text)
# Pull the target variable (binary)
target = raw_data['label'].tolist()
# Quick glance at the dataset
raw_data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ta496711\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [4]:
### TF-IDF fit&transform ###
tfidfconverter = TfidfVectorizer(max_features = 20000,min_df = 5, max_df = 0.75)
X = tfidfconverter.fit_transform(cleaned_text).toarray() 
# Splitting the dataset into the Training set and Test set
X_train,X_test,y_train,y_test = train_test_split(X, target,  test_size = 0.20, random_state = 0)

In [7]:
# Naive-Bayes Classifier
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# Confusion Matrix
#cm = confusion_matrix(y_test,y_pred)
cl_report = classification_report(y_test,y_pred)
print(cl_report)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       510
           1       0.88      0.73      0.80        90

    accuracy                           0.94       600
   macro avg       0.92      0.86      0.88       600
weighted avg       0.94      0.94      0.94       600

