Using a pre-trained transformer model to extract sentence embeddings and pass them to scikit-learn classifiers

In [None]:
!pip install transformers



In [None]:
# import libraries
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, classification_report,confusion_matrix
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from google.colab import drive
# Mount Google Drive in the Colab runtime; the CSV files loaded later are read
# from paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [None]:
# Load the pretrained BERT tokenizer and encoder ('bert-base-uncased'),
# then move the encoder to the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)



In [None]:
# load the data
# All three corpora are read from the same Drive folder; keeping the folder in
# one constant means the location can be changed with a single edit.
DATA_DIR = '/content/drive/MyDrive/spam_detection'

sms = pd.read_csv(f'{DATA_DIR}/sms_translate.csv')

enron = pd.read_csv(f'{DATA_DIR}/enron_full.csv')

youtube = pd.read_csv(f'{DATA_DIR}/youtube_translate.csv')

In [None]:
# Select the corpus to embed and extract its texts and labels.
# The encoding and train/test split cells below read X, y and max_seq_length,
# so these assignments must run before them.
data = enron  # NOTE(review): presumably sms / youtube expose the same Message / Category columns — confirm before swapping

X = data.Message.tolist()   # raw message texts, as a list for the tokenizer
y = data.Category.values    # class labels, in the same order as X

max_seq_length = 512  # messages are padded / truncated to this many tokens

# Optional diagnostic: token-length statistics plus a bar chart of the length
# of every sample, with max_seq_length drawn as a red dashed line.
# Off by default because it tokenizes every message one at a time.
show_sequence_lengths = False

if show_sequence_lengths:
    encoded = [
        tokenizer.encode(text, add_special_tokens=True, return_tensors='pt').size(1)
        for text in X
    ]

    print("max sequence: ", np.max(encoded))
    print("avg sequence: ", np.mean(encoded))
    print("min sequence: ", np.min(encoded))

    # Create a bar chart
    plt.bar(range(len(encoded)), encoded)
    plt.axhline(y=max_seq_length, color='red', linestyle='--')

    # Customize the plot (optional)
    plt.title("Sequence Lengths")
    plt.xlabel("Sample Index")
    plt.ylabel("Length")

    # Show the plot
    plt.show()


In [None]:
# encoding to be compatible with BERT model:
# every message is padded / truncated to max_seq_length tokens and returned as
# PyTorch tensors. The tokenizer is called directly because that is the
# documented replacement for the deprecated batch_encode_plus method.
encoded_dict = tokenizer(
    X,
    add_special_tokens=True,
    max_length=max_seq_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
input_ids = encoded_dict['input_ids']
attention_mask = encoded_dict['attention_mask']

In [None]:
# feature extraction: run every encoded message through the model and keep the
# last-layer vector of the first ([CLS]) token as its sentence embedding
model.eval()  # set model to evaluation mode

dataset = TensorDataset(input_ids, attention_mask)

batch_size = 128  # lower batch size if we are out of gpu memory

dataloader = DataLoader(
    dataset,                 # the samples
    batch_size=batch_size,   # traversing through the dataset with batch_size
    shuffle=False,           # keep sample order so embeddings stay aligned with y
)

cls_batches = []  # one [batch, hidden_size] CPU tensor per batch

with torch.no_grad():  # inference only, no gradients needed
    # Batch-local names are used on purpose: rebinding the global input_ids /
    # attention_mask here would leave them pointing at the last batch, so
    # re-running this cell would embed only that batch.
    for batch_input_ids, batch_attention_mask in tqdm(dataloader):
        outputs = model(
            batch_input_ids.to(device),
            token_type_ids=None,
            attention_mask=batch_attention_mask.to(device),
        )
        # last_hidden_state has size [batch_size, sequence_length, hidden_size=768]
        last_hidden_states = outputs.last_hidden_state
        # take as final embedding the vector representation of the cls token
        cls_batches.append(last_hidden_states[:, 0, :].cpu())

embeddings = torch.cat(cls_batches, dim=0).numpy()  # numpy array, to use it in sklearn models


100%|██████████| 229/229 [15:50<00:00,  4.15s/it]


In [None]:
# train-validation-test split
# First hold out 20% of the embeddings as the test set, then hold out 20% of the
# remaining training portion as the validation set. Both splits are stratified
# on the labels and use the same fixed seed.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    embeddings,
    y,
    test_size=0.2,
    stratify=y,
    random_state=56,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    Xtrain,
    ytrain,
    test_size=0.2,
    stratify=ytrain,
    random_state=56,
)

In [None]:
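# Hyperparameter tuning: uncomment the lines below to train on x_train and
# evaluate on the validation split (x_valid / y_valid) instead of the test set.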

# Xtrain = x_train
# Xtest = x_valid
# ytest = y_valid
# ytrain = y_train

In [None]:
# sklearn classifiers and metrics

models = [
    LogisticRegression(solver='liblinear', random_state=56),
    DecisionTreeClassifier(random_state=56),
    SVC(random_state=56),
    RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=56),
]

# Fit each classifier on the training embeddings, predict the evaluation split
# and print the same set of metrics for every model.
for clf in models:
    clf.fit(Xtrain, ytrain)
    pred = clf.predict(Xtest)

    print(f"results for {clf.__class__.__name__}\n")

    report = classification_report(ytest, pred, target_names=['ham', 'spam'])
    print(f"Classification report:\n\n{report!s}")

    accuracy = round(accuracy_score(ytest, pred), 4)
    print(f"accuracy is {accuracy!s}\n")

    macro_f1 = round(f1_score(ytest, pred, average='macro'), 4)
    print(f"f1 macro is: {macro_f1!s}\n")

    balanced = round(balanced_accuracy_score(ytest, pred), 4)
    print(f"balanced accuracy is {balanced!s}\n")

    matrix = confusion_matrix(ytest, pred)
    print(f"confusion matrix{matrix!s}\n\n")  # [[TN FP],[FN TP]]

results for LogisticRegression

Classification report:

              precision    recall  f1-score   support

         ham       0.98      0.98      0.98      3095
        spam       0.98      0.98      0.98      2747

    accuracy                           0.98      5842
   macro avg       0.98      0.98      0.98      5842
weighted avg       0.98      0.98      0.98      5842

accuracy is 0.9784

f1 macro is: 0.9784

balanced accuracy is 0.9784

confusion matrix[[3032   63]
 [  63 2684]]


results for DecisionTreeClassifier

Classification report:

              precision    recall  f1-score   support

         ham       0.88      0.88      0.88      3095
        spam       0.87      0.87      0.87      2747

    accuracy                           0.87      5842
   macro avg       0.87      0.87      0.87      5842
weighted avg       0.87      0.87      0.87      5842

accuracy is 0.8745

f1 macro is: 0.8741

balanced accuracy is 0.8741

confusion matrix[[2730  365]
 [ 368 2379]]


