In [50]:
# Importing necessary libraries
import numpy as np        # For numerical operations
import pandas as pd       # For data manipulation and analysis

import csv

# Importing WordCloud for text visualization
from wordcloud import WordCloud

In [51]:
csv.field_size_limit(10**7)
USE_PERSONAL_DATA = False

df = pd.read_csv('data/TREC-06.csv', encoding='latin1', engine='python')
if USE_PERSONAL_DATA:
    df_personal = pd.read_csv('data/personal_spam_ham.csv', encoding='latin1', engine='python')

In [52]:
df.info()
if USE_PERSONAL_DATA:
   df_personal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16458 entries, 0 to 16457
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sender    16190 non-null  object 
 1   receiver  15920 non-null  object 
 2   date      15963 non-null  object 
 3   subject   16080 non-null  object 
 4   body      16415 non-null  object 
 5   label     16400 non-null  float64
 6   urls      16400 non-null  float64
dtypes: float64(2), object(5)
memory usage: 900.2+ KB


In [53]:
# Transform every body to "Subject: " + subject + "\n" + body
df['body'] = df.apply(lambda x: 'Subject: ' + str(x['subject']) + '\n' + str(x['body']), axis=1)
df.drop(columns=['sender', 'receiver', 'date', 'subject', 'urls'], inplace=True, errors='ignore')
df.rename(columns = {'label': 'target', 'body': 'text'}, inplace = True)
df.info()

if USE_PERSONAL_DATA:
    df_personal.drop(columns=['Unnamed: 0', 'label'], inplace=True, errors='ignore')
    df_personal.rename(columns = {'label_num': 'target'}, inplace = True)
    df_personal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16458 entries, 0 to 16457
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    16458 non-null  object 
 1   target  16400 non-null  float64
dtypes: float64(1), object(1)
memory usage: 257.3+ KB


In [54]:
if USE_PERSONAL_DATA:
    df = pd.concat([df, df_personal], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16458 entries, 0 to 16457
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    16458 non-null  object 
 1   target  16400 non-null  float64
dtypes: float64(1), object(1)
memory usage: 257.3+ KB


In [55]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [56]:
df.isnull().sum()

text      0
target    0
dtype: int64

In [57]:
df.duplicated().sum()


np.int64(41)

In [58]:
#remove Duplicate
df = df.drop_duplicates(keep = 'first')

In [59]:
df.shape

(16417, 2)

In [11]:
from utils.transformText import transform_text

[nltk_data] Downloading package stopwords to /home/thayer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/thayer/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
transform_text('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [13]:
df['transformed_text'] = df['text'].apply(transform_text)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cv = CountVectorizer()
tfid = TfidfVectorizer(max_features = 3000)

In [15]:
X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(X,y,test_size = 0.20, random_state = 2)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
svc = SVC(kernel= "sigmoid", gamma  = 1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth = 5)
lrc = LogisticRegression(solver = 'liblinear', penalty = 'l1')
rfc = RandomForestClassifier(n_estimators = 50, random_state = 2 )
abc = AdaBoostClassifier(n_estimators = 50, random_state = 2)
bc = BaggingClassifier(n_estimators = 50, random_state = 2)
etc = ExtraTreesClassifier(n_estimators = 50, random_state = 2)
gbdt = GradientBoostingClassifier(n_estimators = 50, random_state = 2)

In [19]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt    
}


In [20]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    clfs.fit(X_train,y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [21]:
accuracy_scores = []
precision_scores = []
for name , clfs in clfs.items():
    current_accuracy, current_precision = train_classifier(clfs, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9747081712062257
Precision:  0.9518900343642611

For:  KNN
Accuracy:  0.9552529182879378
Precision:  0.9175257731958762

For:  NB
Accuracy:  0.9377431906614786
Precision:  0.8526645768025078

For:  LR
Accuracy:  0.9542801556420234
Precision:  0.8903225806451613

For:  RF
Accuracy:  0.9698443579766537
Precision:  0.9271523178807947

For:  Adaboost
Accuracy:  0.9212062256809338
Precision:  0.8170731707317073

For:  Bgc
Accuracy:  0.9455252918287937
Precision:  0.8770226537216829

For:  ETC
Accuracy:  0.9688715953307393
Precision:  0.9326599326599326

For:  GBDT
Accuracy:  0.9289883268482491
Precision:  0.9186046511627907


In [22]:
clfs = {
    'SVC': svc,
    'KNN': knc,
    'NB': mnb,
    'LR': lrc,
    'RF': rfc,
    'Adaboost': abc,
    'Bgc': bc,
    'ETC': etc,
    'GBDT': gbdt    
}
y_pred = np.zeros_like(y_test)
for name, clfs in clfs.items():
    y_pred += clfs.predict(X_test)
y_pred = (y_pred > 7).astype(int)  # Majority voting

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print("Accuracy:", accuracy, " Precision:", precision)
# Accuracy: 0.95995995995996  Precision: 0.9924812030075187

Accuracy: 0.9591439688715954  Precision: 0.9768339768339769
