IMPORTS

In [None]:
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
from sklearn.model_selection import train_test_split, TunedThresholdClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn import set_config
import os
import pandas as pd

LOAD

In [None]:
# Download the latest version of the dataset through kagglehub.
path = kagglehub.dataset_download("advaithsrao/enron-fraud-email-dataset")
print("Path to dataset files:", path)

# os.listdir() returns entries in arbitrary order, so blindly taking element 0
# may pick a non-CSV file. Sort the listing and select the first CSV explicitly.
dataset = sorted(os.listdir(path))
csv_files = [name for name in dataset if name.lower().endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in {path}: {dataset}")

full_path = os.path.join(path, csv_files[0])
print(full_path)
data = pd.read_csv(full_path)
data.head()

In [None]:
# Quick textual overview of the raw frame: summary statistics, shape,
# label values, columns containing nulls, and per-column dtypes.
separator = '---------------------------------------------------------'

# Series.unique() returns values in order of first appearance, so indexing it
# directly can swap the Ham/Spam labels depending on which row comes first.
# Sorting makes the order deterministic and matches the sorted group order
# used later by groupby('Label') (first group is treated as Ham).
label_values = sorted(data['Label'].dropna().unique())
null_columns = [col for col in data.columns if data[col].isnull().any()]

print(data.describe())
print(separator)
print(f"ROWS: {data.shape[0]}, COLUMNS: {data.shape[1]}")
print(separator)
print(f"Labels-\n Ham: {label_values[0]}\n Spam: {label_values[1]} ")
print(separator)
print(f"Null value columns:\n{null_columns}")
print(separator)
print(data.dtypes)

Group the column names by data type (object, numeric, boolean) into separate lists.

In [None]:
# Bucket the column names of `data` by dtype: object, numeric (int64/float64), bool.
column_dtypes = list(data.dtypes.items())
obj_col = [name for name, dtype in column_dtypes if dtype == 'object']
num_col = [name for name, dtype in column_dtypes if dtype == 'int64' or dtype == 'float64']
bool_col = [name for name, dtype in column_dtypes if dtype == 'bool']


In [None]:
# Columns that are discarded before modelling (e-mail headers and metadata).
# 'Suspicious-Folders' was previously listed twice; it now appears once.
columns_to_drop = [
    'Mime-Version', 'Content-Transfer-Encoding', 'Mail-ID', 'Folder-User', 'Folder-Name',
    'From', 'To', 'Suspicious-Folders', 'Date', 'Message-ID', 'Content-Type', 'X-From',
    'X-To', 'X-cc', 'X-bcc', 'X-Origin', 'X-Folder', 'Cc', 'Attendees',
    'Bcc', 'Time', 'X-FileName', 'Re', 'Source', 'POI-Present', 'Low-Comm',
    'Contains-Reply-Forwards', 'Sender-Type', 'Unique-Mails-From-Sender',
]

# Build the working frame without mutating `data`, so this cell can be re-run.
# The former Internal/External -> 1/0 mapping of 'Sender-Type' was removed:
# the column is dropped right here, so the mapping had no effect, and its
# .astype(int) raised on any value outside the mapping (NaN after .map()).
data_copy = data.drop(columns=columns_to_drop)


In [None]:
# Bar chart of the class distribution of the target column.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(x='Label', data=data_copy, color='yellow', edgecolor='black', ax=ax)
ax.set_title('Target')
plt.show()

In [None]:
# Number of e-mails whose Subject is missing.
subject_null_count = data_copy['Subject'].isna().sum()
print(subject_null_count)

In [None]:
# Rows per class. value_counts() counts rows directly; the previous
# groupby('Label').describe().iloc[k, 0] returned the *non-null* count of
# whichever column happened to come first, which under-counts a class when
# that column contains nulls.
# sort_index() keeps the original convention: first (smallest) label = Ham.
label_counts = data_copy['Label'].value_counts().sort_index()
ham_emails_count = int(label_counts.iloc[0])
spam_emails_count = int(label_counts.iloc[1])
total_count = ham_emails_count + spam_emails_count

# Null subjects per class in the raw frame. reindex(..., fill_value=0) keeps
# a class with no null subjects at 0 instead of raising an IndexError.
null_subject_counts = (
    data.loc[data['Subject'].isnull(), 'Label']
    .value_counts()
    .reindex(label_counts.index, fill_value=0)
)

separator = '---------------------------------------------------------'
print(f"Total count of Ham emails: {ham_emails_count}\nTotal count of Spam emails: {spam_emails_count}")
print(separator)
print(f"Proportion of Ham emails: {round(ham_emails_count/total_count *100, 2)}%\nProportion of Spam emails: {round(spam_emails_count/total_count *100, 2)}%")
print(separator)
print(f"Count of NULL valued Subject in Ham emails: {int(null_subject_counts.iloc[0])}\nCount of NULL valued Subject in Spam emails: {int(null_subject_counts.iloc[1])}")

The dataset is highly imbalanced. Additionally, the 'Subject' column contains null values, which we fill with the placeholder string 'no_subject'.

In [None]:
# Replace missing subjects with a placeholder token.
data_copy['Subject'] = data_copy['Subject'].fillna('no_subject')

# Verify the fill on the frame that was actually modified. The previous check
# queried the untouched `data` frame, so it re-printed the pre-fill counts.
# reindex(..., fill_value=0) yields 0 for every class once no nulls remain,
# instead of indexing into an empty groupby result.
label_index = data_copy['Label'].value_counts().sort_index().index
null_subject_counts = (
    data_copy.loc[data_copy['Subject'].isnull(), 'Label']
    .value_counts()
    .reindex(label_index, fill_value=0)
)
print(f"Count of NULL valued Subject in Ham emails: {int(null_subject_counts.iloc[0])}\nCount of NULL valued Subject in Spam emails: {int(null_subject_counts.iloc[1])}")

# Character lengths of subject and body, used to spot extreme outliers.
data_copy['Subject Length'] = data_copy['Subject'].apply(len)
data_copy['Body Length'] = data_copy['Body'].apply(len)

In [None]:
# E-mails whose body exceeds this many characters are treated as outliers.
MAX_BODY_LENGTH = 300000

oversized = data_copy['Body Length'] > MAX_BODY_LENGTH
# Report what gets removed. The original filter expression was not the last
# statement of the cell, so its result was silently discarded.
print(f"Dropping {int(oversized.sum())} emails with Body Length > {MAX_BODY_LENGTH}: index {data_copy.index[oversized].tolist()}")

# Drop by rule rather than by four hard-coded index labels, which raise a
# KeyError as soon as the dataset version or row order changes.
# NOTE(review): presumably the hard-coded labels (259341, 387463, 147232, 147338)
# were exactly the rows matched by this threshold -- confirm against the output above.
data_copy = (
    data_copy.loc[~oversized]
    # Join with a space so the last subject word and the first body word do not
    # fuse into a single bogus token for the vectorizer.
    .assign(Text=lambda frame: frame['Subject'] + ' ' + frame['Body'])
    .drop(columns=['Subject', 'Body', 'Subject Length', 'Body Length'])
)
print(f"Null values in Text column: {data_copy['Text'].isnull().sum()}")

In [None]:
# Features are the combined e-mail text; the target is the class label.
X = data_copy['Text']
y = data_copy['Label']

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=7,
)

In [None]:
# TF-IDF over unigrams and bigrams, capped at 2000 features.
svm_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    norm='l2',
    max_df=0.85,
    min_df=2,
    max_features=2000,
)
# Linear SVM with L1 penalty (primal formulation, dual=False).
svm_model = LinearSVC(
    C=0.90,
    max_iter=3000,
    penalty='l1',
    dual=False,
    random_state=7,
)

steps = [('tfidf1', svm_vectorizer), ('svm_classifier', svm_model)]
svm_pipe = Pipeline(steps)

# Render estimators as diagrams, then display the pipeline.
set_config(display='diagram')
svm_pipe

In [None]:
# Wrap the SVM pipeline in a decision-threshold tuner scored on F1 (3-fold CV).
svm_thresh_tuned = TunedThresholdClassifierCV(
    estimator=svm_pipe,
    scoring='f1',
    cv=3,
    random_state=7,
)
svm_thresh_tuned.fit(X_train, y_train)

# TEST

In [None]:
# Evaluate the threshold-tuned SVM on the held-out test split.
preds = svm_thresh_tuned.predict(X_test)

test_f1 = round(f1_score(y_test, preds)*100, 2)
test_precision = round(precision_score(y_test, preds)*100, 2)
test_recall = round(recall_score(y_test, preds)*100, 2)
test_confusion = confusion_matrix(y_test, preds)
separator = '---------------------------------------------------------'

print(f"F1-SCORE= {test_f1}")
print(separator)
print(f"PRECISION= {test_precision}")
print(separator)
print(f"RECALL= {test_recall}")
print(separator)
print(f"CONFUSION MATRIX=\n{test_confusion}")
print(separator)

# Performance on Train Set

In [None]:
# Same metrics on the training split, to compare against the test scores.
preds_on_train = svm_thresh_tuned.predict(X_train)

train_f1 = round(f1_score(y_train, preds_on_train)*100, 2)
train_precision = round(precision_score(y_train, preds_on_train)*100, 2)
train_recall = round(recall_score(y_train, preds_on_train)*100, 2)
train_confusion = confusion_matrix(y_train, preds_on_train)
separator = '---------------------------------------------------------'

print(f"F1-SCORE= {train_f1}")
print(separator)
print(f"PRECISION= {train_precision}")
print(separator)
print(f"RECALL= {train_recall}")
print(separator)
print(f"CONFUSION MATRIX=\n{train_confusion}")
print(separator)

In [None]:
# Best cross-validated score found while tuning the SVM decision threshold.
svm_best_score = svm_thresh_tuned.best_score_
print(svm_best_score)

In [None]:
# TF-IDF over unigrams and bigrams, capped at 2000 features.
log_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=True,
    norm='l2',
    max_df=0.85,
    min_df=2,
    max_features=2000,
)
# L1-penalised logistic regression fitted with the SAGA solver.
log_model = LogisticRegression(
    C=0.90,
    solver='saga',
    penalty='l1',
    dual=False,
    random_state=7,
    n_jobs=-1,
    max_iter=3000,
)

steps = [('tfidf2', log_vectorizer), ('log_classifier', log_model)]
log_pipe = Pipeline(steps)

# Render estimators as diagrams, then display the pipeline.
set_config(display='diagram')
log_pipe

In [None]:
# Wrap the logistic pipeline in a decision-threshold tuner scored on F1 (3-fold CV).
log_thresh_tuned = TunedThresholdClassifierCV(
    estimator=log_pipe,
    scoring='f1',
    cv=3,
    random_state=7,
)
log_thresh_tuned.fit(X_train, y_train)

# TEST

In [None]:
# Evaluate the threshold-tuned logistic model on the held-out test split.
preds = log_thresh_tuned.predict(X_test)

test_f1 = round(f1_score(y_test, preds)*100, 2)
test_precision = round(precision_score(y_test, preds)*100, 2)
test_recall = round(recall_score(y_test, preds)*100, 2)
test_confusion = confusion_matrix(y_test, preds)
separator = '---------------------------------------------------------'

print(f"F1-SCORE= {test_f1}")
print(separator)
print(f"PRECISION= {test_precision}")
print(separator)
print(f"RECALL= {test_recall}")
print(separator)
print(f"CONFUSION MATRIX=\n{test_confusion}")
print(separator)

# Performance on Train Set

In [None]:
# Same metrics on the training split, to compare against the test scores.
preds_on_train = log_thresh_tuned.predict(X_train)

train_f1 = round(f1_score(y_train, preds_on_train)*100, 2)
train_precision = round(precision_score(y_train, preds_on_train)*100, 2)
train_recall = round(recall_score(y_train, preds_on_train)*100, 2)
train_confusion = confusion_matrix(y_train, preds_on_train)
separator = '---------------------------------------------------------'

print(f"F1-SCORE= {train_f1}")
print(separator)
print(f"PRECISION= {train_precision}")
print(separator)
print(f"RECALL= {train_recall}")
print(separator)
print(f"CONFUSION MATRIX=\n{train_confusion}")
print(separator)

In [None]:
# Best cross-validated score found while tuning the logistic decision threshold.
log_best_score = log_thresh_tuned.best_score_
print(log_best_score)

# Conclusion

In this project, the F1-score is used as the primary evaluation metric because it balances precision and recall, which is crucial for our imbalanced dataset. In a typical spam classifier, we prioritize minimizing false positives (legitimate emails classified as spam), i.e. maximizing precision. In the case of the SVM model, however, optimizing for precision led to precision collapse.
Optimizing for the F1-score ensures that the model maintains high precision while still identifying a meaningful portion of spam emails, which is essential for effective email classification.