In [1]:
import numpy as np
import pandas as pd
import torch
import os
import random

from transformers import DebertaForSequenceClassification, DebertaTokenizer, Trainer, TrainingArguments
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score




Sanity check: evaluate the saved DeBERTa model on the dev set to make sure it generalises well.

In [7]:
# Fetch the NLTK resources imported at the top of the notebook (tokenizer
# model, stop-word list and WordNet data).  Already-present packages are
# reported as up-to-date by nltk.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if cuda_available else torch.device('cpu')

# Seed every random number generator the notebook relies on.  Seeding torch
# alone does not make pandas' `.sample()` (which draws from NumPy)
# reproducible across kernel restarts.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# The TSV has no header row; the first 4 lines are skipped before the data
# starts (presumably a disclaimer/preamble -- confirm against the file).
data = pd.read_csv("../data/dontpatronizeme_pcl.tsv",
                       sep="\t",
                       names=['par_id', 'art_id', 'keyword', 'country', 'text', 'label'],
                       skiprows=4)

# Collapse the raw label into a binary target: 0 and 1 become class 0, every
# other value becomes class 1.  `apply` with Python ints keeps the column int64.
data['label'] = data['label'].apply(lambda x: 0 if x in [0, 1] else 1)

# Paragraph ids (and labels) that define the train and dev splits.
trids = pd.read_csv('../data/train_semeval_parids-labels.csv')
teids = pd.read_csv('../data/dev_semeval_parids-labels.csv')

data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tao\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tao\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tao\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Tao\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,par_id,art_id,keyword,country,text,label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0
2,3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0
4,5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0
...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,Sri Lankan norms and culture inhibit women fro...,0
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0
10466,10467,@@20282330,in-need,ng,""" She has one huge platform , and information ...",1
10467,10468,@@16753236,hopeless,in,""" Anja Ringgren Loven I ca n't find a word to ...",1


In [17]:
# Rebuild test set: one row per par_id listed in `teids`, carrying the
# community keyword, paragraph text and binary label stored in `data`.

# Fail loudly when the split references a paragraph that `data` does not hold.
missing_ids = set(teids.par_id) - set(data.par_id)
if missing_ids:
    raise KeyError(f"par_ids missing from data: {sorted(missing_ids)[:10]}")

# Keep only the first row per par_id so that every id maps to exactly one
# paragraph, and expose the keyword under the name `community`.
paragraphs = (
    data[['par_id', 'keyword', 'text', 'label']]
    .drop_duplicates(subset='par_id', keep='first')
    .rename(columns={'keyword': 'community'})
)

# A single left join keeps the row order of `teids` and avoids scanning the
# whole of `data` several times for every id.
test_set = teids[['par_id']].merge(paragraphs, on='par_id', how='left')

# `dropna` returns a new frame, so the result has to be assigned to take
# effect.  Only a missing text makes a row unusable for evaluation.
test_set = test_set.dropna(subset=['text'])

# Shuffle with a fixed seed so the row order is identical on every run.
test_set = test_set.sample(frac=1, random_state=0).reset_index(drop=True)
test_set

Unnamed: 0,par_id,community,text,label
0,9781,immigrant,The 25-minute-long game of heated taunts was i...,0
1,10007,hopeless,It is seen in recurring violence and continuin...,1
2,650,in-need,"When contacted , Yadav said , "" There are two ...",1
3,9046,migrant,"Even so , many speakers figured out at various...",0
4,9950,homeless,"In June , several NGOs decided to reduce food ...",0
...,...,...,...,...
2089,10309,hopeless,Reading Future Sex it turns out that my friend...,0
2090,9186,women,One bright spot in all of this is that while t...,0
2091,9254,immigrant,My colleague and friend is the multi-lingual d...,0
2092,10252,hopeless,Pressure group Volta4Change is set to march ag...,0


In [20]:
# Keep only rows whose text is an actual string (e.g. a missing paragraph is
# not a string); the tokenizer call further down is fed this column directly.
# A boolean mask builds a new frame instead of dropping rows from `test_set`
# while iterating over it, so the cell is safe to re-run.
has_text = test_set['text'].map(lambda value: isinstance(value, str)).astype(bool)
test_set = test_set[has_text].reset_index(drop=True)

In [21]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: mapping from feature name to an indexable collection of
            per-example values (tensors or plain Python sequences).
        labels: indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return `value` as a new tensor that does not share storage with it.

        `torch.tensor(existing_tensor)` emits a UserWarning recommending
        `clone().detach()`, so tensors are copied that way; anything else is
        converted with `torch.tensor`.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus a 'labels' entry."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

# Tokenizer matching the DeBERTa base checkpoint.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")

# Tokenize every paragraph at once, truncating over-long texts and padding to
# the longest one.  The encodings are deliberately left on the CPU: the
# Trainer moves each batch to the model's device itself, so pushing the whole
# tokenized set to the GPU up front only costs GPU memory.
test_encodings = tokenizer(list(test_set["text"]), return_tensors="pt", truncation=True, padding=True)
test_labels = list(test_set["label"])
test_dataset = Dataset(test_encodings, test_labels)

def compute_metrics(input):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        input: object exposing `predictions` (scores whose argmax along
            axis 1 is taken as the predicted class) and `label_ids` (the
            reference labels).

    Returns:
        dict with the keys 'accuracy' and 'f1 score'.
    """
    predicted_classes = np.argmax(input.predictions, axis=1)
    reference_labels = input.label_ids
    scores = {}
    scores['accuracy'] = accuracy_score(reference_labels, predicted_classes)
    scores['f1 score'] = f1_score(reference_labels, predicted_classes)
    return scores

In [24]:
# Load the locally saved sequence-classification checkpoint, then place the
# model on the device selected earlier.
checkpoint_dir = "../models/deberta_upsample/deberta_upsample"
model = DebertaForSequenceClassification.from_pretrained(checkpoint_dir)
model = model.to(device)

In [25]:
# Evaluation-only configuration.  No training dataset is handed to the
# Trainer, so training hyper-parameters (learning rate, epochs, checkpoint
# saving, best-model selection) would never be used and are left out; this
# also avoids the `evaluation_strategy` keyword, which newer transformers
# releases renamed.
training_args = TrainingArguments(
    output_dir="Deberta/",
    per_device_eval_batch_size=8,
    do_eval=True,
    report_to="none"
)

# The Trainer is used purely as an evaluation harness: it batches
# `test_dataset`, runs the model and feeds the predictions to
# `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Returns a dict of metrics; the keys produced by `compute_metrics` appear
# with an `eval_` prefix.
trainer.evaluate(test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/262 [00:00<?, ?it/s]

{'eval_loss': 0.6886156797409058,
 'eval_accuracy': 0.9283325370281892,
 'eval_f1 score': 0.5833333333333334,
 'eval_runtime': 22.8183,
 'eval_samples_per_second': 91.725,
 'eval_steps_per_second': 11.482}