In this notebook we use a pipeline of three thematic filters (social promotion, advertisement and phishing).

We want to see whether this pipeline achieves better results on our custom dataset.

In [24]:
!pip install transformers



In [25]:
# import libraries
import numpy as np
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoConfig, AutoTokenizer, BertConfig, get_linear_schedule_with_warmup
from torch.optim import AdamW
from google.colab import drive
from google.colab import files
# Make the Google Drive files (dataset and fine-tuned checkpoints) reachable.
drive.mount('/content/drive')

# Seed the CUDA generators and the default PyTorch generator with one value.
SEED = 56
torch.cuda.manual_seed_all(SEED)
torch.manual_seed(SEED)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<torch._C.Generator at 0x7f40e675e4b0>

In [26]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device('cuda') if gpu_available else torch.device('cpu')

# Show which device the models and batches will be placed on.
print('Using device:', device)

Using device: cuda


In [27]:
# Read the custom evaluation set from Drive into a DataFrame.
# (The original author notes this is a new custom dataset with 301 samples.)
CUSTOM_SET_CSV = '/content/drive/MyDrive/spam_detection/custom_test_set.csv'
my_data = pd.read_csv(CUSTOM_SET_CSV)


In [28]:
language = 'gr' # language of message: 'en' -> English BERT, any other value -> Greek BERT

# Drive folder holding the fine-tuned state dicts of the three thematic filters.
FILTER_DIR = '/content/drive/My Drive/spam_detection/thematic_filtering/'


def _load_filter(model_cls, base_checkpoint, weights_file):
  """Build a 2-label sequence classifier and load fine-tuned weights into it.

  Args:
    model_cls: class exposing `from_pretrained` (e.g. BertForSequenceClassification).
    base_checkpoint: Hugging Face model id used to instantiate the architecture.
    weights_file: file name, inside FILTER_DIR, of the saved state dict.

  Returns:
    The model placed on `device` and switched to evaluation mode.
  """
  model = model_cls.from_pretrained(base_checkpoint, num_labels=2).to(device)
  # map_location remaps the stored tensors onto `device`, so a checkpoint that
  # was saved from a GPU can still be restored when `device` is the CPU.
  state_dict = torch.load(FILTER_DIR + weights_file, map_location=device)
  model.load_state_dict(state_dict)
  model.eval()
  return model


if language == 'en':

  # English BERT for classification
  base_checkpoint = 'bert-base-uncased'
  tokenizer = BertTokenizer.from_pretrained(base_checkpoint) # tokenizer
  model_cls = BertForSequenceClassification
  weights_prefix = 'BERT_'

  Xtest = my_data.Message

else:

  # greek BERT for classification
  base_checkpoint = 'nlpaueb/bert-base-greek-uncased-v1'
  tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)
  model_cls = AutoModelForSequenceClassification
  weights_prefix = 'Greek_BERT_'

  # Xtest = my_data.gtrans_el # machine translated
  Xtest = my_data.Message_el # human translated

# load the fine tuned thematic filters
social_prom_filter = _load_filter(model_cls, base_checkpoint, weights_prefix + 'social_promotion_filter.pth') # social promotion filter
advertisement_filter = _load_filter(model_cls, base_checkpoint, weights_prefix + 'advertisement_filter.pth') # advertisement filter
phising_filter = _load_filter(model_cls, base_checkpoint, weights_prefix + 'phising_filter.pth') # phishing filter

# Ground-truth labels are the same column for both languages.
ytest = my_data.Category.values


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
def get_prob(logits):
  """Convert raw classifier logits into probabilities.

  The softmax is taken along dimension 1 of `logits`, so the values along
  that dimension sum to 1 in the returned tensor.
  """
  return F.softmax(logits, dim=1)

In [30]:
def encode_input(Xtest, labels=None, max_length=128, batch_size=32):
  """Tokenize messages and wrap them with their labels in a DataLoader.

  Args:
    Xtest: object with a `.tolist()` method yielding the message strings
      (the notebook passes a DataFrame column).
    labels: labels aligned with `Xtest`. When omitted, the notebook-level
      `ytest` is used, which is what the original one-argument call did.
    max_length: every sequence is padded/truncated to this many tokens.
    batch_size: number of samples per evaluation batch.

  Returns:
    A non-shuffling DataLoader whose batches are
    (input_ids, attention_mask, labels), in the same order as `Xtest`.
  """
  # Calling the tokenizer directly is the documented entry point for encoding
  # a list of texts; it replaces the legacy `batch_encode_plus` method.
  encoded_test = tokenizer(
      Xtest.tolist(),
      add_special_tokens=True,
      max_length=max_length,
      padding='max_length',
      truncation=True,
      return_tensors='pt',
  )
  input_ids_test = encoded_test['input_ids']
  attention_mask_test = encoded_test['attention_mask']

  # Fall back to the global labels only when the caller supplied none.
  labels_test = torch.tensor(np.asarray(ytest if labels is None else labels))

  data_test = TensorDataset(input_ids_test, attention_mask_test, labels_test)
  dataloader_test = DataLoader(
                data_test, # The validation samples.
                batch_size = batch_size, # Evaluate with this batch size.
                shuffle = False # keep sample order so predictions line up with the labels
            )

  return dataloader_test

In [35]:
# Tokenize the selected message column and bundle it with the labels into the
# evaluation DataLoader that every filter is run on.
dataloader_test = encode_input(Xtest)

In [None]:
# The three thematic filters, in the order their predictions are read later.
models = [social_prom_filter, advertisement_filter, phising_filter]

# prob[m, s, c] = probability that filter m assigns class c to sample s.
prob = np.zeros((len(models),len(Xtest),2))

for i, model in enumerate(models):
  all_logits = []
  for batch in dataloader_test:
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    # batch[2] holds the labels; they are deliberately not sent to the model,
    # because only the logits are used and the loss would be thrown away.
    with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask)
    # Collect on the CPU so the full set of logits does not stay on the device.
    all_logits.append(outputs.logits.cpu())
  all_logits = torch.cat(all_logits, dim=0)
  # Explicit tensor -> NumPy conversion before storing into the NumPy array.
  prob[i,:,:] = get_prob(all_logits).numpy()


In [None]:
# Pipeline rule: a sample is labelled spam as soon as any single filter
# predicts spam for it.

# Hard label per filter and per sample: argmax over the class axis of `prob`.
per_filter_pred = np.argmax(prob, axis=2)
pred1, pred2, pred3 = per_filter_pred # social promotion, advertising, phishing filter

# Number of filters voting "spam" for each sample; one vote is enough.
spam_votes = pred1 + pred2 + pred3
final_pred = (spam_votes >= 1).astype(int).tolist()


report = classification_report(ytest, final_pred, target_names=['ham', 'spam'])
print("Classification report on pipeline of filters:\n\n" + str(report))

acc = round(accuracy_score(ytest, final_pred), 4)
print("accuracy is " + str(acc))

balanced_acc = round(balanced_accuracy_score(ytest, final_pred), 4)
print("balanced accuracy is " + str(balanced_acc))

macro_f1 = round(f1_score(ytest, final_pred, average='macro'), 4)
print("f1 macro is " + str(macro_f1))

conf_mat = confusion_matrix(ytest, final_pred) # [[TN FP],[FN TP]]
print("confusion matrix" + str(conf_mat) + "\n\n")