In [12]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install nlpaug



In [13]:
import numpy as np
import pandas as pd
import torch
import os
import random

from transformers import DebertaForSequenceClassification, DebertaTokenizer, Trainer, TrainingArguments
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if cuda_available else torch.device('cpu')

torch.manual_seed(0)

data = pd.read_csv("dontpatronizeme_pcl.tsv",
                       sep="\t",
                       names=['par_id', 'art_id', 'keyword', 'country', 'text', 'label'],
                       skiprows=4)

data['label'] = data['label'].apply(lambda x: 0 if x in [0, 1] else 1)

trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

trids.par_id = trids.par_id
teids.par_id = teids.par_id

data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,par_id,art_id,keyword,country,text,label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0
2,3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0
4,5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0
...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,Sri Lankan norms and culture inhibit women fro...,0
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0
10466,10467,@@20282330,in-need,ng,""" She has one huge platform , and information ...",1
10467,10468,@@16753236,hopeless,in,""" Anja Ringgren Loven I ca n't find a word to ...",1


In [14]:
# Rebuild training set

rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

train_set = pd.DataFrame(rows)
# Split train into train and internal validation set 80:20

val_size = int(len(train_set) * 0.2)

train_set = train_set.sample(frac=1)
val_set = train_set.iloc[0:val_size].reset_index(drop=True).copy()
train_set = train_set.iloc[val_size:].reset_index(drop=True).copy()

In [15]:
# downsample negative instances
pcldf = train_set[train_set.label==1]
npos = len(pcldf)

train_set_downsampled = pd.concat([pcldf, train_set[train_set.label==0][:npos*2]])
train_set_downsampled_shuffled = train_set_downsampled.sample(frac=1).reset_index(drop=True)

In [16]:
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train_encodings = tokenizer(list(train_set_downsampled_shuffled["text"]), return_tensors="pt", truncation=True, padding=True).to(device)
val_encodings = tokenizer(list(val_set["text"]), return_tensors="pt", truncation=True, padding=True).to(device)
train_labels = list(train_set_downsampled_shuffled["label"])
val_labels = list(val_set["label"])

In [17]:
# Rebuild test set

# rows = [] # will contain par_id, label and text
# for idx in range(len(teids)):
#   parid = teids.par_id[idx]
#   #print(parid)
#   # select row from original dataset
#   keyword = data.loc[data.par_id == parid].keyword.values[0]
#   text = data.loc[data.par_id == parid].text.values[0]
#   label = data.loc[data.par_id == parid].label.values[0]
#   rows.append({
#       'par_id':parid,
#       'community':keyword,
#       'text':text,
#       'label':label
#   })

# test_set = pd.DataFrame(rows)
# test_set = test_set.sample(frac=1)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

# define model

def compute_metrics(input):
    y_pred = np.argmax(input.predictions, axis=1)
    y_true = input.label_ids
    accuracy = accuracy_score(y_true, y_pred)
    f1score = f1_score(y_true, y_pred)
    return {'accuracy': accuracy, 'f1 score': f1score}

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2).to(device)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
training_args = TrainingArguments(
    output_dir="Deberta/",
    learning_rate=1e-5,
    weight_decay=0.05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1 score",
    greater_is_better=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1 score
1,No log,0.279017,0.872836,0.5058
2,No log,0.358752,0.854328,0.497942
3,0.430700,0.430859,0.868657,0.511111
4,0.430700,0.572894,0.875821,0.516279
5,0.172900,0.857093,0.840597,0.485549
6,0.172900,0.864812,0.853134,0.497959
7,0.037600,0.830576,0.872836,0.503497
8,0.037600,0.886203,0.868657,0.504505
9,0.010300,1.003754,0.85194,0.489712
10,0.010300,1.014139,0.853731,0.498978


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2360, training_loss=0.13837842908451112, metrics={'train_runtime': 1081.5823, 'train_samples_per_second': 17.391, 'train_steps_per_second': 2.182, 'total_flos': 5767100841553920.0, 'train_loss': 0.13837842908451112, 'epoch': 10.0})

In [19]:
trainer.save_model(f"deberta_downsample")

In [20]:
predictions = trainer.predict(val_dataset)

# Step 1: Extract predictions and true labels
raw_predictions = predictions.predictions
true_labels = predictions.label_ids

# Step 2: Convert logits to predicted labels
predicted_labels = np.argmax(raw_predictions, axis=1)

# Step 3: Compute the F1 score
f1 = f1_score(true_labels, predicted_labels, average='binary')

print(f"F1 Score: {f1}")
# run predictions

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


F1 Score: 0.5162790697674419


In [21]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
