In [None]:
# import libraries

import gensim.downloader
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from google.colab import drive
drive.mount('/content/drive')
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# load the data

# Single place to edit when the CSV files live in a different Drive folder.
DATA_DIR = '/content/drive/MyDrive/spam_detection'

sms = pd.read_csv(f'{DATA_DIR}/sms_translate.csv') # load spam dataset

enron = pd.read_csv(f'{DATA_DIR}/enron_full.csv')

youtube = pd.read_csv(f'{DATA_DIR}/youtube_translate.csv')

In [None]:
# Stop-word lists for both languages. The cleaning functions below lower-case
# each message and tokenize it with NLTK's word_tokenize, which also splits
# punctuation off into separate tokens.

en_stop_words, gr_stop_words = [
    stopwords.words(language) for language in ('english', 'greek')
]

def cleaning_en(text, remove_stop_words=False):
  """Lower-case an English message and split it into word tokens.

  Parameters
  ----------
  text : str
      Raw message. Missing values (None / NaN, as pandas produces for empty
      CSV cells) are accepted and yield an empty token list instead of
      raising on ``.lower()``.
  remove_stop_words : bool, default False
      When True, drop every token found in the module-level ``en_stop_words``.

  Returns
  -------
  list of str
      Tokens produced by ``word_tokenize`` on the lower-cased text.
  """
  if pd.isna(text):
    return []

  tokens = word_tokenize(str(text).lower())

  if remove_stop_words:
    stop_words = set(en_stop_words)  # set gives O(1) membership tests
    tokens = [token for token in tokens if token not in stop_words]

  return tokens

def cleaning_gr(text, remove_stop_words=False):
  """Lower-case a Greek message and split it into word tokens.

  Parameters
  ----------
  text : str
      Raw message. Missing values (None / NaN, as pandas produces for empty
      CSV cells) are accepted and yield an empty token list instead of
      raising on ``.lower()``.
  remove_stop_words : bool, default False
      When True, drop every token found in the module-level ``gr_stop_words``.

  Returns
  -------
  list of str
      Tokens produced by ``word_tokenize`` on the lower-cased text.
  """
  if pd.isna(text):
    return []

  # Use the Greek Punkt model for sentence splitting; the default is English.
  tokens = word_tokenize(str(text).lower(), language='greek')

  if remove_stop_words:
    stop_words = set(gr_stop_words)  # set gives O(1) membership tests
    tokens = [token for token in tokens if token not in stop_words]

  return tokens


In [None]:
# Preprocessing: add an English and a Greek token column to the Enron frame.
# Other datasets could be processed the same way, assuming they expose the
# same `Message` / `gtrans_el` columns.
for text_col, token_col, cleaner in (
    ('Message', 'tokenize_en', cleaning_en),
    ('gtrans_el', 'tokenize_gr', cleaning_gr),
):
    enron[token_col] = enron[text_col].apply(cleaner)

In [None]:
flag = 'gr' # choose the language: 'en' (English) or 'gr' (Greek)
data = enron # choose the dataset for training
name_data = 'enron'

# Map every supported language flag to the column holding its token lists.
token_columns = {'en': 'tokenize_en', 'gr': 'tokenize_gr'}

# Fail loudly on a mistyped flag instead of silently falling back to Greek.
if flag not in token_columns:
    raise ValueError(
        f"Unknown language flag {flag!r}; expected one of {sorted(token_columns)}"
    )

X = data[token_columns[flag]]
y = data.Category.values  # targets are the same whichever language is chosen

In [None]:
# Hold out 20% of the dataset as the test set (stratified on the labels).
# The remaining 80% is split again 75/25, i.e. 60% train / 20% validation of
# the full dataset, so hyperparameters can be tuned on the validation part.
# Word2Vec expects its training data as a list of token lists, hence .tolist().

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X.tolist(),
    y,
    test_size=0.2,
    stratify=y,
    random_state=56,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.25,
    stratify=ytrain,
    random_state=56,
)

## Word2Vec implementation

In [None]:
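# Hyperparameter tuning: uncomment the lines below to train on x_train and evaluate on the validation split instead of the test split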

# Xtrain = x_train
# Xtest = x_valid
# ytest = y_valid
# ytrain = y_train

In [None]:
# Train a Word2Vec model on the tokenized training messages (sg=0 selects CBOW).
vec_size = 200  # dimensionality of each word vector (and of the summed message vector)
model = Word2Vec(
    sentences=Xtrain,
    vector_size=vec_size,
    window=10,
    min_count=5,
    sg=0,
    epochs=5,
)

### text vectorization

vocab = model.wv.index_to_key  # words present in the trained vocabulary

# Build one fixed-length vector per message by summing its word embeddings
# (data: list of lists of words).

def text_vectorization(data, keyed_vectors=None):
    """Embed every tokenized message as the sum of its word vectors.

    Parameters
    ----------
    data : sequence of sequences of str
        Tokenized messages; must support ``len()`` and iteration.
    keyed_vectors : optional
        Object exposing ``key_to_index`` (mapping of known words),
        ``vector_size`` and ``__getitem__(word)``. Defaults to ``model.wv``
        of the Word2Vec model trained in this notebook.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), keyed_vectors.vector_size)``. Words that
        are not in the vocabulary contribute nothing, so a message with no
        known words (or no words at all) maps to the zero vector.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv

    # Dict lookup is O(1); membership on the index_to_key list is O(|V|).
    known_words = keyed_vectors.key_to_index

    sentences_emb = np.zeros((len(data), keyed_vectors.vector_size))
    for row, sentence in enumerate(data):
        for word in sentence:
            if word in known_words:
                # Sum (not mean): accumulate directly into the output row.
                sentences_emb[row] += keyed_vectors[word]

    return sentences_emb

# Turn both splits (lists of token lists) into NumPy arrays of message
# embeddings so they can be fed to the scikit-learn models.

Xtrain_vec, Xtest_vec = map(text_vectorization, (Xtrain, Xtest))

In [None]:
# With the messages vectorized, fit several classifiers and compare them on
# accuracy, macro F1 and balanced accuracy:
# Logistic Regression, Decision Tree, SVM and Random Forest.
# Random Forest is an ensemble of trees, which reduces the tendency of a
# single Decision Tree to overfit the training data.

# Logistic Regression uses solver='sag' because the Enron dataset is large;
# max_iter is raised to 10000 because the solver otherwise fails to converge.
models = [
    LogisticRegression(solver='sag', random_state=56, max_iter=10000),
    DecisionTreeClassifier(random_state=56),
    SVC(random_state=56),
    RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=56),
]

f_measures, acc, balanced_acc = {}, {}, {}

for clf in models:
    pred = clf.fit(Xtrain_vec, ytrain).predict(Xtest_vec)
    key = str(clf)  # the estimator's repr doubles as its label in the report
    f_measures[key] = f1_score(ytest, pred, average='macro')
    acc[key] = accuracy_score(ytest, pred)
    balanced_acc[key] = balanced_accuracy_score(ytest, pred)

print("Dataset used for training purpose is: ",name_data)

# One (scores, template) pair per metric, printed in the same order as before.
reports = (
    (f_measures, "Classifier:{} -  F1 Macro:{}"),
    (acc, "Classifier:{} -  Accuracy:{}"),
    (balanced_acc, "Classifier:{} -  BalancedAccuracy:{}"),
)
for scores, template in reports:
    for name, score in scores.items():
        print(template.format(name, round(score, 4)))

Dataset used for training purpose is:  enron
Classifier:LogisticRegression(max_iter=10000, random_state=56, solver='sag') -  F1 Macro:0.9637
Classifier:DecisionTreeClassifier(random_state=56) -  F1 Macro:0.93
Classifier:SVC(random_state=56) -  F1 Macro:0.9671
Classifier:RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=56) -  F1 Macro:0.9729
Classifier:LogisticRegression(max_iter=10000, random_state=56, solver='sag') -  Accuracy:0.9639
Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.9303
Classifier:SVC(random_state=56) -  Accuracy:0.9671
Classifier:RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=56) -  Accuracy:0.973
Classifier:LogisticRegression(max_iter=10000, random_state=56, solver='sag') -  BalancedAccuracy:0.9637
Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.9298
Classifier:SVC(random_state=56) -  BalancedAccuracy:0.9677
Classifier:RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=56) -  Ba