In [None]:
!pip install transformers



In [None]:
# import libraries
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, classification_report,confusion_matrix
from transformers import AutoTokenizer, AutoModel
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset CSVs loaded
# later in this notebook are read from under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pick the compute device: use CUDA when PyTorch can see a GPU (faster forward
# passes), otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
# Checkpoint identifier shared by the tokenizer and the encoder, so the two
# can never drift apart.
MODEL_NAME = "nlpaueb/bert-base-greek-uncased-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)  # move the weights to the selected device

In [None]:
# Load the data.
# The Drive folder is defined once so that relocating the datasets requires a
# single edit instead of touching every read_csv call.
DATA_DIR = '/content/drive/MyDrive/spam_detection'

sms = pd.read_csv(DATA_DIR + '/sms_translate.csv')
enron = pd.read_csv(DATA_DIR + '/enron_full.csv')
youtube = pd.read_csv(DATA_DIR + '/youtube_translate.csv')

In [None]:
# Tokenize the input texts into fixed-length tensors compatible with the BERT model.

data = youtube             # data frame processed in this run
X = data.gtrans_el         # text column that is fed to the tokenizer
y = data.Category.values   # label column as an array (used for the splits below)

MAX_LENGTH = 128  # every sequence is padded / truncated to this many tokens

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which accepts the same arguments for a list of strings.
encoded_dict = tokenizer(
    X.tolist(),
    add_special_tokens=True,
    max_length=MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
input_ids = encoded_dict['input_ids']
attention_mask = encoded_dict['attention_mask']

In [None]:
# Feature extraction: run every sample through the BERT model and keep the
# vector of the first ([CLS]) token as the sentence embedding.
model.eval()  # set model to evaluation mode

dataset = TensorDataset(input_ids, attention_mask)

# batch_size only controls how many samples are sent to the device at once to
# speed up the forward pass; it has no effect on the resulting embeddings.
batch_size = 128  # lower the batch size if the GPU runs out of memory

dataloader = DataLoader(
    dataset,                # the samples
    batch_size=batch_size,  # traverse the dataset batch_size samples at a time
    shuffle=False,          # keep the original order so rows stay aligned with y
)

cls_batches = []  # one CPU tensor of [CLS] vectors per mini-batch

# Batch-local names are used on purpose: rebinding `input_ids` /
# `attention_mask` inside the loop would replace the full tokenized tensors
# with the last mini-batch, so re-running this cell would build the dataset
# from that single batch and produce embeddings that no longer match `y`.
for batch_input_ids, batch_attention_mask in tqdm(dataloader):
    batch_input_ids = batch_input_ids.to(device)
    batch_attention_mask = batch_attention_mask.to(device)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(
            batch_input_ids,
            token_type_ids=None,
            attention_mask=batch_attention_mask,
        )
    # last_hidden_state has size [batch_size, sequence_length, hidden_layer_size=768]
    last_hidden_states = outputs.last_hidden_state
    # take the vector representation of the cls token (position 0) as the embedding
    cls_batches.append(last_hidden_states[:, 0, :].cpu())

# Stack the per-batch tensors and convert to NumPy for use in sklearn models.
embeddings = torch.cat(cls_batches, dim=0).numpy()

100%|██████████| 13/13 [00:11<00:00,  1.10it/s]


In [None]:
# Train / validation / test split.
# First hold out 20% of all samples as the test set, keeping the class ratio.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    embeddings,
    y,
    test_size=0.2,
    stratify=y,
    random_state=56,
)
# Then carve 20% of the remaining training portion out as a validation set,
# again stratified on the labels.
x_train, x_valid, y_train, y_valid = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.2,
    stratify=ytrain,
    random_state=56,
)

In [None]:
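# Hyperparameter tuning: uncomment the lines below to train on the inner training split
# and evaluate on the validation split instead of the test set.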

# Xtrain = x_train
# Xtest = x_valid
# ytest = y_valid
# ytrain = y_train

In [None]:
# Fit each scikit-learn classifier on the training embeddings and print its
# metrics on the held-out samples.

models = [
    LogisticRegression(solver='liblinear', random_state=56),
    DecisionTreeClassifier(random_state=56),
    SVC(random_state=56),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=56),
]

for clf in models:
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xtest)

    # Compute every metric first, then print them in a fixed order.
    report = classification_report(ytest, pred, target_names=['ham', 'spam'])
    accuracy = round(accuracy_score(ytest, pred), 4)
    f1_macro = round(f1_score(ytest, pred, average='macro'), 4)
    balanced_accuracy = round(balanced_accuracy_score(ytest, pred), 4)
    matrix = confusion_matrix(ytest, pred)  # layout: [[TN FP],[FN TP]]

    # sep="" joins the str() of each piece, i.e. plain string concatenation.
    print("results for ", clf.__class__.__name__, "\n", sep="")
    print("Classification report:\n\n", report, sep="")
    print("accuracy is ", accuracy, "\n", sep="")
    print("f1 macro is: ", f1_macro, "\n", sep="")
    print("balanced accuracy is ", balanced_accuracy, "\n", sep="")
    print("confusion matrix", matrix, "\n\n", sep="")

results for LogisticRegression

Classification report:

              precision    recall  f1-score   support

         ham       0.87      0.90      0.89       176
        spam       0.88      0.85      0.86       153

    accuracy                           0.88       329
   macro avg       0.88      0.87      0.87       329
weighted avg       0.88      0.88      0.88       329

accuracy is 0.8754

f1 macro is: 0.8745

balanced accuracy is 0.8737

confusion matrix[[158  18]
 [ 23 130]]


results for DecisionTreeClassifier

Classification report:

              precision    recall  f1-score   support

         ham       0.71      0.69      0.70       176
        spam       0.65      0.67      0.66       153

    accuracy                           0.68       329
   macro avg       0.68      0.68      0.68       329
weighted avg       0.68      0.68      0.68       329

accuracy is 0.6809

f1 macro is: 0.6797

balanced accuracy is 0.6799

confusion matrix[[122  54]
 [ 51 102]]


results 


### Note: batch_size is only used to load the data onto the GPU in batches and speed up the forward pass through the BERT model, so it has no effect on the results
for sms with batch_size=128 and max_sequence_length = 128

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  F1 Weighted:0.9756

Classifier:DecisionTreeClassifier(random_state=56) -  F1 Weighted:0.8914
Classifier:SVC(random_state=56) -  F1 Weighted:0.98

Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  F1 Weighted:0.9211

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  Accuracy:0.9758

Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.8943

Classifier:SVC(random_state=56) -  Accuracy:0.9806

Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  Accuracy:0.9321

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  BalancedAccuracy:0.9392

Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.7318

Classifier:SVC(random_state=56) -  BalancedAccuracy:0.9252

Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  BalancedAccuracy:0.7299

for sms with batch_size=128 and max_sequence_length = 512

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  F1 Weighted:0.9756
Classifier:DecisionTreeClassifier(random_state=56) -  F1 Weighted:0.8914
Classifier:SVC(random_state=56) -  F1 Weighted:0.981
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  F1 Weighted:0.9268
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  Accuracy:0.9758
Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.8943
Classifier:SVC(random_state=56) -  Accuracy:0.9816
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  Accuracy:0.936
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  BalancedAccuracy:0.9392
Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.7318
Classifier:SVC(random_state=56) -  BalancedAccuracy:0.9291
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  BalancedAccuracy:0.7489

for twitter with batch_size=128 and max_sequence_length = 512

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  F1 Weighted:0.7165
Classifier:DecisionTreeClassifier(random_state=56) -  F1 Weighted:0.5769
Classifier:SVC(random_state=56) -  F1 Weighted:0.7313
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  F1 Weighted:0.6792
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  Accuracy:0.7166
Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.577
Classifier:SVC(random_state=56) -  Accuracy:0.7313
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  Accuracy:0.6798
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  BalancedAccuracy:0.7164
Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.5768
Classifier:SVC(random_state=56) -  BalancedAccuracy:0.7313
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  BalancedAccuracy:0.6793

for enron with batch_size=128 and max_sequence_length = 512

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  F1 Weighted:0.9495
Classifier:DecisionTreeClassifier(random_state=56) -  F1 Weighted:0.7988
Classifier:SVC(random_state=56) -  F1 Weighted:0.9659
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  F1 Weighted:0.9266
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  Accuracy:0.9495
Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.7987
Classifier:SVC(random_state=56) -  Accuracy:0.9659
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  Accuracy:0.9267
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  BalancedAccuracy:0.9493
Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.7986
Classifier:SVC(random_state=56) -  BalancedAccuracy:0.9658
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  BalancedAccuracy:0.9253

for youtube with batch_size=128 and max_sequence_length = 512

Classifier:LogisticRegression(random_state=56, solver='liblinear') -  F1 Weighted:0.8769
Classifier:DecisionTreeClassifier(random_state=56) -  F1 Weighted:0.7206
Classifier:SVC(random_state=56) -  F1 Weighted:0.8948
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  F1 Weighted:0.8795
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  Accuracy:0.8769
Classifier:DecisionTreeClassifier(random_state=56) -  Accuracy:0.7205
Classifier:SVC(random_state=56) -  Accuracy:0.8949
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  Accuracy:0.8795
Classifier:LogisticRegression(random_state=56, solver='liblinear') -  BalancedAccuracy:0.8781
Classifier:DecisionTreeClassifier(random_state=56) -  BalancedAccuracy:0.7208
Classifier:SVC(random_state=56) -  BalancedAccuracy:0.896
Classifier:RandomForestClassifier(n_jobs=-1, random_state=56) -  BalancedAccuracy:0.8801

