In [None]:
# import libraries
# All third-party dependencies of the notebook are imported in this one cell.
# It also has side effects: it downloads NLTK resources and mounts Google Drive.

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  # corpus behind the stopwords.words(...) calls in the cleaning cell
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Punkt models used by NLTK's tokenizers
from google.colab import drive
drive.mount('/content/drive')  # Colab-only: the CSV files are read from under /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# load the data

# Folder on the mounted Drive that holds every dataset; change it here only.
DATA_DIR = '/content/drive/MyDrive/spam_detection'

sms = pd.read_csv(f'{DATA_DIR}/sms_translate.csv')  # SMS spam dataset

enron = pd.read_csv(f'{DATA_DIR}/enron_full.csv')  # Enron e-mail dataset

youtube = pd.read_csv(f'{DATA_DIR}/youtube_translate.csv')  # YouTube comments dataset

#### To remove stop words the text has to be tokenized first, so tokenization is done up front;
#### the resulting token lists are then passed to TF-IDF already tokenized.

In [None]:
# Text preparation helpers.
# Each text is lowercased and tokenized with NLTK's word_tokenize, which also
# separates punctuation from the words. English text additionally has its
# stop words removed.

en_stop_words = stopwords.words('english')
gr_stop_words = stopwords.words('greek')

# Set copy used for the per-token membership test (constant-time lookup instead
# of scanning the list for every token). The list above is left untouched for
# any other code that relies on it.
_en_stop_set = frozenset(en_stop_words)


def cleaning_en(text):
    """Lowercase and tokenize an English text, dropping English stop words.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (for example the NaN pandas uses
        for an empty CSV field) is treated as an empty document.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if not isinstance(text, str):
        # Missing / non-text cell: no tokens instead of an AttributeError.
        return []

    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token not in _en_stop_set]


def cleaning_gr(text):
    """Lowercase and tokenize a Greek text; stop words are kept.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (for example the NaN pandas uses
        for an empty CSV field) is treated as an empty document.

    Returns
    -------
    list of str
        The tokens, in their original order.
    """
    if not isinstance(text, str):
        # Missing / non-text cell: no tokens instead of an AttributeError.
        return []

    # NOTE(review): word_tokenize is called without a language argument here,
    # so it uses its default rather than a Greek-specific model - confirm
    # whether language='greek' is wanted.
    tokens = word_tokenize(text.lower())
    # Stop-word removal is intentionally disabled for Greek:
    # tokens = [token for token in tokens if token not in gr_stop_words] # remove stop words

    return tokens



In [None]:
# Preprocessing: add a column of token lists to each dataset.
# Only the English Enron messages are processed at the moment; the remaining
# dataset/language combinations are kept below, disabled.

# sms['tokenize_en'] = sms.Message.apply(cleaning_en)
# sms['tokenize_gr'] = sms.gtrans_el.apply(cleaning_gr)

enron['tokenize_en'] = enron['Message'].apply(cleaning_en)
# enron['tokenize_gr'] = enron.gtrans_el.apply(cleaning_gr)

# twitter['tokenize_en'] = twitter.Message.apply(cleaning_en)
# twitter['tokenize_gr'] = twitter.gtrans_el.apply(cleaning_gr)

# youtube['tokenize_en'] = youtube.Message.apply(cleaning_en)
# youtube['tokenize_gr'] = youtube.gtrans_el.apply(cleaning_gr)


In [None]:
# Experiment configuration.
flag = 'en'          # language to train on: 'en' picks the English tokens, any other value the Greek ones
data = enron         # dataset used for training
name_data = 'enron'  # label printed together with the results

# Features are the token lists for the chosen language; the labels are the
# same column whichever language is selected.
X = data.tokenize_en if flag == 'en' else data.tokenize_gr
y = data.Category.values

In [None]:
# Two stratified splits with a fixed seed: first 20% of the data is held out
# as the test set, then 25% of the remainder (20% of the whole) becomes the
# validation set, leaving 60% for training.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=56,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.25,
    stratify=ytrain,
    random_state=56,
)

In [None]:
# Hyperparameter-tuning mode: train on the training portion and evaluate on
# the validation portion. After this cell Xtest/ytest refer to the validation
# split, so the held-out test split is no longer reachable under those names.
Xtrain, ytrain = x_train, y_train
Xtest, ytest = x_valid, y_valid

In [None]:
# TF-IDF vectorization
# The documents are already lists of tokens, so the vectorizer gets an identity
# tokenizer and lowercasing is switched off (it was already done during cleaning).
# The vocabulary is learned from the training documents only and then reused,
# unchanged, to transform the evaluation documents.

vectorizer = TfidfVectorizer(
    max_features=1000,      # keep at most 1000 vocabulary terms
    lowercase=False,
    tokenizer=lambda x: x,  # pass the pre-tokenized lists through as-is
    token_pattern=None,     # not used with a custom tokenizer; None avoids scikit-learn's unused-parameter warning
)
Xtrain_vec = vectorizer.fit_transform(Xtrain)
Xtest_vec = vectorizer.transform(Xtest)

print('features used: ',Xtrain_vec.shape[1])



features used:  1000


In [None]:
# Train several classifiers on the TF-IDF features and score each of them on
# the evaluation split.
# Models: Logistic Regression, Decision Tree, SVM and Random Forest (an
# ensemble of trees, which lowers the single tree's tendency to overfit the
# training data).
# Metrics: macro-averaged F1, accuracy and balanced accuracy.

models = [
    LogisticRegression(solver='liblinear', random_state=56),
    DecisionTreeClassifier(random_state=56),
    SVC(random_state=56),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=56),
]

# One dictionary per metric, keyed by the estimator's printed representation.
f_measures, acc, balanced_acc = {}, {}, {}

for clf in models:
    pred = clf.fit(Xtrain_vec, ytrain).predict(Xtest_vec)
    key = str(clf)
    f_measures[key] = f1_score(ytest, pred, average='macro')
    acc[key] = accuracy_score(ytest, pred)
    balanced_acc[key] = balanced_accuracy_score(ytest, pred)

print("Dataset used for training purpose is: ",name_data)

# Report metric by metric: every classifier's F1 first, then accuracy, then
# balanced accuracy.
reports = [
    ("Classifier:{} -  F1 macro:{}", f_measures),
    ("Classifier:{} -  Accuracy:{}", acc),
    ("Classifier:{} -  BalancedAccuracy:{}", balanced_acc),
]
for template, scores in reports:
    for name, score in scores.items():
        print(template.format(name, round(score, 4)))

Dataset used for training purpose is:  enron
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  F1 macro:0.97
Classifier:DecisionTreeClassifier(random_state=56) -  F1 macro:0.9377
Classifier:SVC(random_state=56) -  F1 macro:0.9751
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  F1 macro:0.9743
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  Accuracy:0.97
Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.9379
Classifier:SVC(random_state=56) -  Accuracy:0.9752
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  Accuracy:0.9743
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  BalancedAccuracy:0.9705
Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.9382
Classifier:SVC(random_state=56) -  BalancedAccuracy:0.9755
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  BalancedAccuracy:0.9749
