# Supervised Learning Lab

In this lab you will train and test a binary classification machine learning model using the Scikit-Learn modules.


In [None]:
# import libraries
import numpy as np
import pandas as pd
import nltk
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import datetime
import os 
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,f1_score,classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
import warnings

In [None]:
# Seed NumPy's global random generator so that repeated runs of the notebook
# draw the same random numbers.
np.random.seed(seed=42)

# Dataset

In [None]:
# Data source: https://zenodo.org/records/8339691
# (the required citation is given at the end of the notebook).

# Load the CEAS_08 email dataset straight from the hosted .csv.gz file;
# pandas infers the compression from the file extension.
df = pd.read_csv(
    "https://raw.githubusercontent.com/RiverGumSecurity/Datasets/refs/heads/main/Zenodo/CEAS_08.csv.gz"
)
# Show the raw frame as the cell output.
df

In [None]:
# Build one text field per email: subject, a single space, then the body.
# A row whose subject or body is missing ends up with a missing 'combined' value.
df['combined'] = df['subject'].add(" ").add(df['body'])

In [None]:
# Build the modelling frame from the two columns we need. The columns are
# selected by name rather than by position so that a change in the CSV's
# column order cannot silently pick up the wrong data.
new_df = (
    df[['combined', 'label']]
    # Give the columns descriptive names used by the rest of the notebook.
    .rename(columns={'combined': 'Email Text', 'label': 'Email Type'})
    # Discard rows where either the text or the label is missing.
    .dropna(axis=0)
    # Discard exact duplicate (text, label) rows, keeping the first occurrence.
    .drop_duplicates()
)
# Show the cleaned frame as the cell output.
new_df

In [None]:
# Encode the 'Email Type' labels as integers. The fitted encoder is kept in
# `lbl` so the original label values can be recovered from it later.
lbl = LabelEncoder().fit(new_df['Email Type'])
new_df['Email Type'] = lbl.transform(new_df['Email Type'])

In [None]:
# Function to preprocess text.
def preprocess_text(text):
    """Return a normalized copy of *text* for vectorization.

    The steps are applied in order:

    1. Remove URLs (any run of non-whitespace starting with ``http``,
       matched case-insensitively so ``HTTP://...`` is removed too).
    2. Remove every character that is neither a word character nor
       whitespace (underscores count as word characters and are kept).
    3. Lowercase the text.
    4. Collapse each whitespace run to a single space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; an empty string if nothing remains.
    """
    # URLs must be removed before punctuation is stripped; otherwise the
    # "://" and "." separators disappear and the link becomes a junk token.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Clean every email's text element by element with preprocess_text.
new_df['Email Text'] = new_df['Email Text'].map(preprocess_text)

In [None]:
# Show the frame as the cell output to inspect the preprocessed text.
new_df

In [None]:

# Convert the email text to TF-IDF vectors, removing English stop words and
# capping the vocabulary at 10,000 terms (dimension reduction).
tf = TfidfVectorizer(stop_words='english', max_features=10000)

# Keep the SciPy sparse matrix that fit_transform returns instead of calling
# .toarray(): a dense n_samples x 10000 float array costs gigabytes of RAM,
# while scikit-learn's train_test_split and estimators accept sparse input.
# NOTE(review): the vectorizer is fitted on every row before any train/test
# split, so its vocabulary and IDF weights also reflect the held-out emails.
feature_x = tf.fit_transform(new_df['Email Text'])

# convert the label into numpy array
y_tf = np.array(new_df['Email Type'])

In [None]:
# Partition features and labels into training and testing groups:
# 80 percent for training, 20 percent held out for testing. The fixed
# random_state makes the shuffle repeatable.
X_tr, X_tst, y_tr, y_tst = train_test_split(
    feature_x,
    y_tf,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Train a linear support vector classifier on the training split
# (fit returns the fitted estimator itself, so it can be chained).
svm = LinearSVC().fit(X_tr, y_tr)

# Predict labels for the held-out test split.
pred_svm = svm.predict(X_tst)

In [None]:
# Score the SVM predictions against the true test labels, as percentages.
svm_accu = 100 * accuracy_score(y_true=y_tst, y_pred=pred_svm)
svm_f1 = 100 * f1_score(y_true=y_tst, y_pred=pred_svm)

# Per-class precision, recall, F1 and support.
svm_report = classification_report(y_tst, pred_svm)
print(svm_report)

In [None]:
# Print the SVM test accuracy (a percentage, since it was scaled by 100).
print(svm_accu)