Universal Filter

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
# import libraries
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm import tqdm
from google.colab import drive
# the CSV files read below live on Google Drive
drive.mount('/content/drive')

# Single source of truth for every seed in the notebook: it is passed to
# train_test_split and also seeds the torch RNGs below.
RANDOM_STATE = 56
# numpy's global RNG is seeded as well so any library code drawing from it is covered
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Called last on purpose: it returns None, so the cell no longer echoes the
# Generator object that torch.manual_seed returns.
torch.cuda.manual_seed_all(RANDOM_STATE)

Mounted at /content/drive


<torch._C.Generator at 0x791d05fa4f70>

In [3]:
# Prefer the GPU whenever the runtime exposes one; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

print('Using device:', device)

Using device: cuda


In [4]:
# load the data

# every CSV sits in the same Drive folder, so the location is edited in one place
DATA_DIR = '/content/drive/MyDrive/spam_detection'

data1 = pd.read_csv(DATA_DIR + '/sms_translate.csv')      # SMS dataset
data2 = pd.read_csv(DATA_DIR + '/youtube_translate.csv')  # YouTube dataset

# custom evaluation set (301 samples)
# NOTE(review): the earlier comment named custom_test_setv2.csv while the code
# reads custom_test_set.csv -- confirm which file is intended.
data = pd.read_csv(DATA_DIR + '/custom_test_set.csv')


language = 'gr'

# Only the text column depends on the language: the Message column for English,
# the gtrans_el column for anything else. Labels always come from Category.
text_column = 'Message' if language == 'en' else 'gtrans_el'

X1 = data1[text_column]
Y1 = data1.Category.values

X2 = data2[text_column]
Y2 = data2.Category.values

# The custom set is always evaluated in all three text variants, whatever the
# training language; each variant shares the same labels.
my_X = [data.Message, data.Message_el, data.gtrans_el]
my_Y = [data.Category.values for _ in my_X]


In [5]:
# Load the tokenizer and a 2-label sequence classifier for the selected language.
if language != 'en':

  # greek BERT for classification
  tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
  model = AutoModelForSequenceClassification.from_pretrained(
      "nlpaueb/bert-base-greek-uncased-v1",
      num_labels=2,
  )

else:

  # English: the 12-layer BERT model with an uncased vocab
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  model = BertForSequenceClassification.from_pretrained(
      "bert-base-uncased",
      num_labels=2, # two output labels for binary classification
  )

# move the freshly loaded model onto the selected device
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/530k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/454M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/bert-base-greek-uncased-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def _balance_subset(X, Y, length):
    """Cut one dataset down to at most `length` rows, aiming for a 50/50 class mix.

    If the dataset holds fewer than `length / 2` spam rows (label 1), all spam
    is kept and the remainder is filled with the first ham rows (label 0).
    Otherwise the first `length // 2` rows of each class are kept.

    Returns the selected texts and labels, spam rows first, then ham rows.
    """
    labels = np.asarray(Y)
    spam_index = np.where(labels == 1)[0].tolist()
    ham_index = np.where(labels == 0)[0].tolist()

    if 2 * len(spam_index) < length:
        # not enough spam for an even mix: keep it all and top up with ham
        ham_index = ham_index[:length - len(spam_index)]
    else:
        spam_index = spam_index[:length // 2]
        ham_index = ham_index[:length // 2]

    index = spam_index + ham_index
    # np.where yields positions, so the texts are selected by position (.iloc)
    # rather than by index label
    return X.iloc[index], labels[index]


def split_sets(weighted,X1,Y1,X2,Y2):
    """Split two labelled text datasets and fuse the matching partitions.

    Parameters
    ----------
    weighted : int
        1 -> first subsample both datasets to the size of the second one
        (the YouTube dataset) via `_balance_subset`; any other value -> use
        the datasets as they are.
    X1, X2 : pandas.Series
        Texts of the first (SMS) and second (YouTube) dataset.
    Y1, Y2 : numpy.ndarray
        Matching labels, 0 = ham and 1 = spam.

    Returns
    -------
    tuple
        (Xtrain, ytrain, x_train, y_train, x_valid, y_valid,
         Xtest, ytest, Xtest1, ytest1, Xtest2, ytest2)
        Xtrain/ytrain hold 80% of both datasets, x_train/y_train 60%,
        x_valid/y_valid 20% and Xtest/ytest 20%. Xtest1/ytest1 and
        Xtest2/ytest2 are the per-dataset test splits before fusion.
    """
    if weighted == 1:

        length = len(X2) # length of youtube

        X1, Y1 = _balance_subset(X1, Y1, length)
        X2, Y2 = _balance_subset(X2, Y2, length)

    # per dataset: 80/20 train/test, then 75/25 of the train part -> 60/20/20 overall;
    # every split is stratified on the labels and uses the shared RANDOM_STATE
    Xtrain1, Xtest1, ytrain1, ytest1 = train_test_split(X1, Y1, random_state=RANDOM_STATE, test_size=0.2, stratify=Y1)
    x_train1, x_valid1, y_train1, y_valid1 = train_test_split(Xtrain1, ytrain1, random_state=RANDOM_STATE, test_size=0.25, stratify=ytrain1)
    Xtrain2, Xtest2, ytrain2, ytest2 = train_test_split(X2, Y2, random_state=RANDOM_STATE, test_size=0.2, stratify=Y2)
    x_train2, x_valid2, y_train2, y_valid2 = train_test_split(Xtrain2, ytrain2, random_state=RANDOM_STATE, test_size=0.25, stratify=ytrain2)

    # Xtrain, ytrain: 80% of both the sms and the youtube dataset
    Xtrain = pd.concat([Xtrain1, Xtrain2], ignore_index=True)
    ytrain = np.concatenate((ytrain1, ytrain2))

    # x_train, y_train: 60% of both datasets
    x_train = pd.concat([x_train1, x_train2], ignore_index=True)
    y_train = np.concatenate((y_train1, y_train2))

    # x_valid, y_valid: 20% of both datasets, used for evaluation during fine-tuning
    x_valid = pd.concat([x_valid1, x_valid2], ignore_index=True)
    y_valid = np.concatenate((y_valid1, y_valid2))

    # Xtest, ytest: 20% of both datasets (fused test set)
    Xtest = pd.concat([Xtest1, Xtest2], ignore_index=True)
    ytest = np.concatenate((ytest1, ytest2))

    return Xtrain,ytrain,x_train,y_train,x_valid,y_valid,Xtest,ytest,Xtest1,ytest1,Xtest2,ytest2


# weighted = 1 -> subsample both datasets to the size of the youtube dataset
# weighted = 0 -> take the whole datasets
weighted = 0

# Xtest = fused test set, Xtesti = test split of dataset_i, i={1,2}
(Xtrain, ytrain,
 x_train, y_train,
 x_valid, y_valid,
 Xtest, ytest,
 Xtest1, ytest1,
 Xtest2, ytest2) = split_sets(weighted, X1, Y1, X2, Y2)

# every evaluation set in one place: the three held-out splits followed by the
# three text variants of the custom set; `sets` names them in the same order
X_testing = [Xtest, Xtest1, Xtest2, *my_X]
y_testing = [ytest, ytest1, ytest2, *my_Y]

sets = [
    "fuse_test",
    "sms_test",
    "youtube_test",
    "my_test_en",
    "my_test_el",
    "my_test_el_machine_translated",
]

In [8]:
# Encode raw text so it is compatible with the BERT model, and store the
# encoded train / test data in DataLoaders.

def _encode_to_dataset(texts, labels, max_length):
  """Tokenize `texts` and bundle input ids, attention mask and labels in a TensorDataset."""
  encoded = tokenizer.batch_encode_plus(
      texts.tolist(),
      add_special_tokens=True,
      max_length=max_length,
      padding='max_length', # every sequence is padded / truncated to max_length
      truncation=True,
      return_tensors='pt',
  )
  return TensorDataset(encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels))


def train_test_encoding(Xtrain,Xtest,ytrain,ytest,batch_size,max_length=128):
  """Build the train and test DataLoaders for the module-level `tokenizer`.

  Parameters
  ----------
  Xtrain, Xtest : objects with `.tolist()` (e.g. pandas.Series) holding the texts
  ytrain, ytest : label sequences accepted by `torch.tensor`
  batch_size : number of samples per batch for both loaders
  max_length : token length every sequence is padded / truncated to
      (default 128, the value previously hard-coded)

  Returns
  -------
  (dataloader_train, dataloader_test); each batch is a tuple of
  (input_ids, attention_mask, labels).
  """
  dataloader_train = DataLoader(
              _encode_to_dataset(Xtrain, ytrain, max_length),  # the training samples
              batch_size = batch_size,
              shuffle = True # reshuffle every epoch so batches differ between epochs
          )

  dataloader_test = DataLoader(
              _encode_to_dataset(Xtest, ytest, max_length), # the evaluation samples
              batch_size = batch_size,
              shuffle = False # keep the order so predictions line up with the labels
          )

  return dataloader_train, dataloader_test

In [9]:
# turn logits (tensors) into class predictions stored in a numpy array

def get_predictions(logits):
  """Return the predicted class index for every row of `logits`.

  Parameters
  ----------
  logits : torch.Tensor with one row per sample and one column per class

  Returns
  -------
  numpy.ndarray of class indices, one per row.

  Softmax is strictly increasing, so the arg-max of the logits is the arg-max
  of the softmax probabilities; the softmax step is therefore skipped. The
  result is detached and moved to the CPU before the numpy conversion, so the
  function also accepts tensors that require grad or live on another device.
  """
  return torch.argmax(logits, dim=1).detach().cpu().numpy()

In [10]:
# compute the class weights

# wj=n_samples / (n_classes * n_samplesj), for j=0,1 classes


def compute_class_weights(ytrain):
  """Compute balanced class weights for the two classes 0 and 1.

  Each weight is n_samples / (2 * n_samples_of_that_class), where n_samples is
  the length of `ytrain`.

  Parameters
  ----------
  ytrain : sequence or array of labels

  Returns
  -------
  (w0, w1) : the weights of class 0 and class 1 as floats

  Raises
  ------
  ValueError
      if `ytrain` contains no sample of class 0 or no sample of class 1
      (the weight of an absent class is undefined).
  """
  labels = np.asarray(ytrain)
  total_samples = len(labels)
  count0 = int(np.count_nonzero(labels == 0))
  count1 = int(np.count_nonzero(labels == 1))

  if count0 == 0 or count1 == 0:
    raise ValueError(
        "compute_class_weights needs at least one sample of each class; "
        "got " + str(count0) + " of class 0 and " + str(count1) + " of class 1"
    )

  w0 = total_samples/(2*count0)
  w1 = total_samples/(2*count1)

  return w0,w1

In [14]:
# # RUN THIS ONLY ON VALIDATION PHASE TO FIND OPTIMAL NUMBER OF EPOCHS

# batch_size = 32 # for training
# epochs = 4 # num of epochs to train

# dataloader_train, dataloader_valid = train_test_encoding(x_train,x_valid,y_train,y_valid,batch_size)
# w0,w1 = compute_class_weights(y_train)
# weights = torch.tensor([w0, w1]).to(device)

# # apply weight decay to all trainable parameters except biases and normalization-layer weights
# no_decay = ['bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)

# scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0*epochs*len(dataloader_train), num_training_steps = epochs*len(dataloader_train))


# # metrics per epoch

# Loss_train = []


# f1_valid = []
# accuracy_valid = []
# Loss_valid = []


# for epoch in range(epochs):

#   model.train() # set model to training mode
#   train_loss = 0 # accumulate loss for every batch per epoch

#   # training loop
#   for step,batch in enumerate(tqdm(dataloader_train)):

#     input_ids = batch[0].to(device)
#     attention_mask = batch[1].to(device)
#     labels = batch[2].to(device)
#     model.zero_grad() # clear gradients
#     outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#     # weighted loss for class imbalance
#     criterion = torch.nn.CrossEntropyLoss(weight=weights,reduction='mean')
#     batch_loss = criterion(outputs.logits, labels)
#     # batch_loss = outputs.loss # loss for the batch if we dont have weighted loss function
#     train_loss += batch_loss
#     batch_loss.backward()
#     # update parameters
#     optimizer.step()
#     # Update the learning rate.
#     scheduler.step()





#   Loss_train.append(train_loss/len(dataloader_train))# compute the average loss for all the batches in epoch

#   model.eval() # set model to evaluation mode
#   valid_loss = 0 # accumulate loss for every batch
#   all_logits = [] # store logits of every batch to pass them all into function get predictions and take the predictions overall

#   # evaluation
#   for step,batch in enumerate(dataloader_valid):

#     input_ids = batch[0].to(device)
#     attention_mask = batch[1].to(device)
#     labels = batch[2].to (device)

#     with torch.no_grad():
#       outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#     batch_loss = outputs.loss # loss for the batch
#     all_logits.append(outputs.logits.cpu())
#     valid_loss += batch_loss

#   all_logits = torch.cat(all_logits, dim=0)
#   pred = get_predictions(all_logits)
#   valid_loss = (valid_loss/len(dataloader_valid)) # compute the average loss for all the batches in epoch

#    # validation metrics
#   Loss_valid.append(valid_loss)
#   accuracy_valid.append(accuracy_score(y_valid,pred))
#   f1_valid.append(f1_score(y_valid, pred, average='macro'))



# # learning curves

# epoch = [c for c in range(1,epochs+1)]

# validation_loss = [loss.cpu() for loss in Loss_valid]
# training_loss = [tensor.detach().cpu() for tensor in Loss_train]

# # plot learning curve
# plt.figure()
# plt.title('Loss')
# plt.plot(epoch,validation_loss,color='orange',label='validation')
# plt.plot(epoch,training_loss,color='blue',label='train')
# plt.xlabel('# of epochs')
# plt.xticks(epoch)
# plt.legend(['val_loss', 'loss'])
# plt.show()


# optimal_epochs = np.argmin(validation_loss) + 1
# print("optimal number of epochs found = " +str(optimal_epochs)+" with training Loss = "+str(training_loss[optimal_epochs-1])+" and validation Loss = "+str(validation_loss[optimal_epochs-1]))

In [11]:
# RUN THIS TO FINE-TUNE THE PRETRAINED MODEL ON THE WHOLE TRAINING SET

# once the optimal hyperparameters are known, train the model (starting from
# its initial pretrained state) on all the training data = train + validation,
# and finally evaluate on the test set
batch_size = 32
warm_up = 0.02 # fraction of the optimisation steps used for linear warm-up
dataloader_train, dataloader_test = train_test_encoding(Xtrain,Xtest,ytrain,ytest,batch_size)
w0,w1 = compute_class_weights(ytrain)
weights = torch.tensor([w0, w1]).to(device) # class weights for the weighted loss

# reset the optimizer: weight decay on every parameter except biases and LayerNorm weights
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=4e-5)


if language == 'en':

  optimal_epochs = 3 # for english, for both weighted and non-weighted fusion

else:

  optimal_epochs = 2 # non-weighted fusion

# new scheduler for training on all available training data; the scheduler is
# stepped once per batch, so the step counts are expressed in batches and the
# warm-up length is passed as a whole number of steps
total_steps = optimal_epochs*len(dataloader_train)
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=int(warm_up*total_steps), num_training_steps = total_steps)

In [None]:
def _evaluate(dataloader):
  """Run `model` over `dataloader` without gradients.

  Returns (loss, pred): the mean of the per-batch losses reported by the model
  as a plain float, and the predicted labels for all samples in loader order.
  """
  model.eval() # set model to evaluation mode
  total_loss = 0.0 # accumulate loss for every batch
  batch_logits = [] # logits of every batch, turned into predictions in one go

  for batch in dataloader:

    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)

    with torch.no_grad():
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    total_loss += outputs.loss.item() # .item() yields a plain float instead of a tensor on the device
    batch_logits.append(outputs.logits.cpu())

  pred = get_predictions(torch.cat(batch_logits, dim=0))
  return total_loss/len(dataloader), pred # average loss over all the batches


def _print_report(set_name, y_true, pred, loss):
  """Print the loss and the classification metrics of one evaluation set."""
  print("results on test set: "+str(set_name))
  print("Loss is "+str(round(loss,4)))
  print("Classification report:\n\n"+str(classification_report(y_true,pred,target_names=['ham','spam'])))
  print("accuracy is "+str(round(accuracy_score(y_true,pred),4)))
  print("f1 macro is "+str(round(f1_score(y_true,pred,average='macro'),4)))
  print("balanced accuracy is "+str(round(balanced_accuracy_score(y_true,pred),4)))
  print("confusion matrix"+str(confusion_matrix(y_true, pred))+"\n") # [[TN FP],[FN TP]]


# weighted loss for class imbalance; built once because it is the same for every step
criterion = torch.nn.CrossEntropyLoss(weight=weights,reduction='mean')

model.train() # set model to training mode

for epoch in range(optimal_epochs):

  # training loop
  for batch in tqdm(dataloader_train):

    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to(device)
    model.zero_grad() # clear gradients
    # forward pass; labels are not handed to the model because the loss it
    # would compute from them was never used -- the weighted criterion is
    outputs = model(input_ids, attention_mask=attention_mask)
    batch_loss = criterion(outputs.logits, labels)
    batch_loss.backward()
    # update parameters
    optimizer.step()
    # update the learning rate
    scheduler.step()


# evaluation of Xtest (the fused test set)
test_loss, pred = _evaluate(dataloader_test)
_print_report(sets[0], ytest, pred, test_loss)


# predictions[i-1] holds the predictions for sets[i]; the fused set is not stored
predictions = []

# evaluation of every test dataset except Xtest
for i in range(1,len(X_testing)):

  # encoding
  encoded_test = tokenizer.batch_encode_plus(X_testing[i].tolist(), add_special_tokens=True, max_length = 128, padding='max_length' , truncation=True, return_tensors = 'pt')
  data_test = TensorDataset(encoded_test['input_ids'], encoded_test['attention_mask'], torch.tensor(y_testing[i]))

  # a separate name keeps dataloader_test bound to the fused test set, so
  # re-running this cell still scores ytest against the right loader
  eval_loader = DataLoader(
              data_test, # the evaluation samples
              batch_size = batch_size, # evaluate with the training batch size
              shuffle = False
          )

  test_loss, pred = _evaluate(eval_loader)
  predictions.append(pred)
  _print_report(sets[i], y_testing[i], pred, test_loss)


In [13]:
# save the fine tuned model
# torch.save(model.state_dict(),'/content/drive/My Drive/spam_detection/fine_tuned_models/Universal_filter_english.pth')