# Supervised Learning Lab

In this lab you will train and test a binary classification maching learning model using the Scikit-Learn modules.


In [2]:
# import libraries
import numpy as np
import pandas as pd
import nltk
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import datetime
import os 
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,f1_score,classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
import warnings

In [3]:
# Set numpy random seed for reproducible numbers
np.random.seed(42)

# Dataset

In [4]:
# note that this data was obtained from https://zenodo.org/records/8339691
# required citation at the end of the notebook

# Read in dataset 
df = pd.read_csv("../datasets/CEAS_08.csv")
df

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1
...,...,...,...,...,...,...,...
39149,CNN Alerts <charlene-detecton@btcmarketing.com>,email1007@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 10:34:50 -0400",CNN Alerts: My Custom Alert,\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\n ...,1,0
39150,CNN Alerts <idgetily1971@careplusnj.org>,email104@gvc.ceas-challenge.cc,"Fri, 08 Aug 2008 10:35:11 -0400",CNN Alerts: My Custom Alert,\n\nCNN Alerts: My Custom Alert\n\n\n\n\n\n\n ...,1,0
39151,Abhijit Vyas <xpojhbz@gmail.com>,fxgmqwjn@triptracker.net,"Fri, 08 Aug 2008 22:00:43 +0800",Slideshow viewer,Hello there ! \nGreat work on the slide show v...,0,0
39152,Joseph Brennan <vupzesm@columbia.edu>,zqoqi@spamassassin.apache.org,"Fri, 08 Aug 2008 09:00:46 -0500",Note on 2-digit years,"\nMail from sender , coming from intuit.com\ns...",0,0


In [5]:
df['combined'] = df['subject'] + " " + df['body']

In [6]:
new_df = df.iloc[:, [7,5]]
new_df = new_df.rename(columns={'combined':'Email Text', 'label':'Email Type'})
new_df.dropna(inplace=True,axis=0)
new_df.drop_duplicates(inplace=True)
new_df

Unnamed: 0,Email Text,Email Type
0,"Never agree to be a loser Buck up, your troubl...",1
1,Befriend Jenna Jameson \nUpgrade your sex and ...,1
2,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...,1
3,Re: svn commit: r619753 - in /spamassassin/tru...,0
4,SpecialPricesPharmMoreinfo \nWelcomeFastShippi...,1
...,...,...
39149,CNN Alerts: My Custom Alert \n\nCNN Alerts: My...,1
39150,CNN Alerts: My Custom Alert \n\nCNN Alerts: My...,1
39151,Slideshow viewer Hello there ! \nGreat work on...,0
39152,"Note on 2-digit years \nMail from sender , com...",0


In [10]:
# Label Email Type
lbl = LabelEncoder()
new_df['Email Type'] = lbl.fit_transform(new_df['Email Type'])

In [11]:
# Function to preprocess text.
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text
new_df['Email Text']=new_df['Email Text'].apply(preprocess_text)

In [12]:
new_df

Unnamed: 0,Email Text,Email Type
0,never agree to be a loser buck up your trouble...,1
1,befriend jenna jameson upgrade your sex and pl...,1
2,cnncom daily top 10 the daily top 10 from cnnc...,1
3,re svn commit r619753 in spamassassintrunk lib...,0
4,specialpricespharmmoreinfo welcomefastshipping...,1
...,...,...
39149,cnn alerts my custom alert cnn alerts my custo...,1
39150,cnn alerts my custom alert cnn alerts my custo...,1
39151,slideshow viewer hello there great work on the...,0
39152,note on 2digit years mail from sender coming f...,0


In [14]:

# Convert email text to an array of vectors, removing stop words
tf = TfidfVectorizer(stop_words='english',max_features=10000) #dimension reduction
feature_x = tf.fit_transform(new_df['Email Text']).toarray()

# convert the label into numpy array
y_tf = np.array(new_df['Email Type']) 

In [15]:
#split dataset into training and testing data groups, 80 percent training, 20 percent testing
X_tr,X_tst,y_tr,y_tst = train_test_split(feature_x,y_tf,test_size=0.2,random_state=0)

In [16]:
svm = LinearSVC()
svm.fit(X_tr,y_tr)

pred_svm = svm.predict(X_tst)



In [18]:
svm_accu = accuracy_score(y_tst,pred_svm)*100
svm_f1 = f1_score(y_tst,pred_svm)*100

print(classification_report(y_tst,pred_svm))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3449
           1       1.00      1.00      1.00      4377

    accuracy                           1.00      7826
   macro avg       1.00      1.00      1.00      7826
weighted avg       1.00      1.00      1.00      7826



In [19]:
print(svm_accu)

99.61666240736008
