In [61]:
# Importing necessary libraries
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis

import csv

# Importing WordCloud for text visualization
from wordcloud import WordCloud

In [62]:
csv.field_size_limit(10**7)
USE_PERSONAL_DATA = False
USE_SPAM_HAM_DATA = True
if USE_SPAM_HAM_DATA:
    df = pd.read_csv('data/spam_ham_dataset.csv', encoding='latin1', engine='python')
else:
    df = pd.read_csv('data/TREC-06.csv', encoding='latin1', engine='python')
if USE_PERSONAL_DATA:
    df_personal = pd.read_csv('data/personal_spam_ham.csv', encoding='latin1', engine='python')

In [63]:
df.info()
if USE_PERSONAL_DATA:
   df_personal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [64]:
# Transform every body to "Subject: " + subject + "\n" + body
if USE_SPAM_HAM_DATA:
    df.drop(columns=['Unnamed: 0', 'label_num'], inplace=True, errors='ignore')
    df.rename(columns = {'label': 'target'}, inplace = True)
else:
    df['body'] = df.apply(lambda x: 'Subject: ' + str(x['subject']) + '\n' + str(x['body']), axis=1)
    df.drop(columns=['sender', 'receiver', 'date', 'subject', 'urls'], inplace=True, errors='ignore')
    df.rename(columns = {'label': 'target', 'body': 'text'}, inplace = True)
df.info()

if USE_PERSONAL_DATA:
    df_personal.drop(columns=['Unnamed: 0', 'label'], inplace=True, errors='ignore')
    df_personal.rename(columns = {'label_num': 'target'}, inplace = True)
    df_personal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5171 non-null   object
 1   text    5171 non-null   object
dtypes: object(2)
memory usage: 80.9+ KB


In [65]:
if USE_PERSONAL_DATA:
    df = pd.concat([df, df_personal], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5171 non-null   object
 1   text    5171 non-null   object
dtypes: object(2)
memory usage: 80.9+ KB


In [66]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [67]:
df.isnull().sum()

target    0
text      0
dtype: int64

In [68]:
df.duplicated().sum()


np.int64(178)

In [69]:
#remove Duplicate
df = df.drop_duplicates(keep = 'first')

In [70]:
df.shape

(4993, 2)

In [71]:
from utils.transformText import transform_text

In [72]:
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [73]:
df['transformed_text'] = df['text'].apply(transform_text)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tfid = TfidfVectorizer(max_features = 3000)

In [75]:
X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [76]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [78]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)

In [79]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt    
}


In [80]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train,y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [81]:
accuracy_scores = []
precision_scores = []
for name , clfs in clfs.items():
    current_accuracy, current_precision = train_classifier(clfs, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.980980980980981
Precision:  0.9639344262295082

For:  KNN
Accuracy:  0.954954954954955
Precision:  0.9326599326599326

For:  NB
Accuracy:  0.9329329329329329
Precision:  0.8549848942598187

For:  LR
Accuracy:  0.9569569569569569
Precision:  0.9085173501577287

For:  RF
Accuracy:  0.9669669669669669
Precision:  0.940983606557377

For:  Adaboost
Accuracy:  0.9229229229229229
Precision:  0.8338278931750742

For:  Bgc
Accuracy:  0.9429429429429429
Precision:  0.8888888888888888

For:  ETC
Accuracy:  0.975975975975976
Precision:  0.9572368421052632

For:  GBDT
Accuracy:  0.9359359359359359
Precision:  0.916083916083916


In [None]:
# Ensemble Learning
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt    
}
y_pred = np.zeros_like(y_test)
for name, clfs in clfs.items():
    y_pred += clfs.predict(X_test)
y_pred = (y_pred > 7).astype(int)  # Majority voting

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print("Accuracy:", accuracy, " Precision:", precision)

Accuracy: 0.95995995995996  Precision: 0.9924812030075187


In [None]:
# Save the models for future use
from joblib import dump, load

clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt    
}
for name, clfs in clfs.items():
    dump(clfs, 'trained/{}_model.joblib'.format(name.lower()))

In [None]:
# Example of loading and using the saved model
clfs = {
    'SVC': load('trained/svc_model.joblib'),
    'KNN': load('trained/knn_model.joblib'),
    'NB': load('trained/nb_model.joblib'),
    'LR': load('trained/lr_model.joblib'),
    'RF': load('trained/rf_model.joblib'),
    'Adaboost': load('trained/adaboost_model.joblib'),
    'Bgc': load('trained/bgc_model.joblib'),
    'ETC': load('trained/etc_model.joblib'),
    'GBDT': load('trained/gbdt_model.joblib')    
}
y_pred = np.zeros_like(y_test)
for name, clfs in clfs.items():
    y_pred += clfs.predict(X_test)
y_pred = (y_pred > 7).astype(int)  # Majority voting

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print("Accuracy:", accuracy, " Precision:", precision)

Accuracy: 0.95995995995996  Precision: 0.9924812030075187
