In [None]:
# Install the third-party packages imported below: imbalanced-learn provides
# RandomOverSampler, transformers provides the BERT tokenizer/model/scheduler.
# NOTE(review): neither install is version-pinned; the recorded run below
# resolved transformers 4.13.0 — consider pinning for reproducibility.
!pip install imbalanced-learn
!pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 9.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 60.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 663 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [None]:
# All common imports here
import os
import random
import numpy as np
from urllib import request
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset,random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Get the latest files from the official GitHub page

In [None]:
# Get the module from the task organisers' repo.
# File name: dont_patronize_me.py
# NOTE(review): this pulls code from the moving `master` branch and the file is
# imported further down; consider pinning a commit hash so runs are reproducible.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the raw bytes: decoding and re-writing in text mode would re-encode the
# file with the platform's default encoding and newline convention.
with request.urlopen(module_url) as response, open(module_name, 'wb') as outf:
  outf.write(response.read())

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
# File name: evaluation.py
# NOTE(review): fetched from the moving `master` branch and executed later via
# `!python3 evaluation.py`; consider pinning a commit hash.
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the raw bytes: decoding and re-writing in text mode would re-encode the
# file with the platform's default encoding and newline convention.
with request.urlopen(module_url) as response, open(module_name, 'wb') as outf:
  outf.write(response.read())

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


## Load DontPatronizeMe Module

In [None]:
from dont_patronize_me import DontPatronizeMe

In [None]:
# Build the organisers' helper object from two locations on Google Drive.
# NOTE(review): absolute Colab paths; presumably Drive must already be mounted at
# /content/drive (no mount cell is visible in this notebook) — confirm.
# NOTE(review): the meaning of the two positional arguments is defined in
# dont_patronize_me.py, which is not shown here — verify against that module.
dpm = DontPatronizeMe('/content/drive/MyDrive/NLP Final Project/data/', '/content/drive/MyDrive/NLP Final Project/data/dontpatronizeme_pcl.tsv')

## Load Subtask 1 data

In [None]:
# Load Task 1 data and get the data frame
dpm.load_task1()
train_df = dpm.train_task1_df

In [None]:
train_df.head()

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"we 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"in libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""white house press secretary sean spicer said ...",0,0
3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" just like we received migrants fleeing el ...",0,0


In [None]:
train_df.loc[0, 'text']

"we 're living in times of absolute insanity , as i 'm pretty sure most people are aware . for a while , waking up every day to check the news seemed to carry with it the same feeling of panic and dread that action heroes probably face when they 're trying to decide whether to cut the blue or green wire on a ticking bomb -- except the bomb 's instructions long ago burned in a fire and imminent catastrophe seems the likeliest outcome . it 's hard to stay that on-edge for that long , though , so it 's natural for people to become inured to this constant chaos , to slump into a malaise of hopelessness and pessimism ."

In [None]:
# Length of train data 
len(train_df)

10469

In [None]:
# Get value count of each label
train_df['label'].value_counts()

0    9476
1     993
Name: label, dtype: int64

In [None]:
# Categories of PCL counts (may not be used in binary classification)
train_df['orig_label'].value_counts()

0    8529
1     947
3     458
4     391
2     144
Name: orig_label, dtype: int64

## Oversample the minority class

In [None]:
# Pull the paragraph text and the binary label column out of the data frame as
# NumPy arrays; X and y are index-aligned (row i of X pairs with y[i]).
X = np.array(train_df['text'])
y = np.array(train_df['label'])

In [None]:
# Stratified 80/20 split. random_state pins the shuffle so the split, and every
# number derived from it below, is the same after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=42)

In [None]:
# x_train, y_train = np.array(train_df['text']), np.array(train_df['label']) 

In [None]:
x_train.shape

(8375,)

In [None]:
print(len(x_train), len(y_train))

8375 8375


In [None]:
print(Counter(y_train))

Counter({0: 7581, 1: 794})


In [None]:
print(Counter(y_test))

Counter({0: 1895, 1: 199})


In [None]:
# Instantiate RandomOverSampler; seeded so the same minority rows are duplicated
# on every run.
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)

In [None]:
# Oversample minority class
oversample_x, oversample_y = oversampler.fit_resample(x_train.reshape(-1, 1), y_train)

In [None]:
# Squeeze out the oversampled data
oversample_x = oversample_x.ravel()

In [None]:
print(oversample_x.shape, oversample_y.shape)

(15162,) (15162,)


In [None]:
# Number of samples in each class
print(Counter(oversample_y))

Counter({0: 7581, 1: 7581})


In [None]:
len(np.unique(oversample_x))

8375

## BERT - Sequence Classification

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
print(tokenizer.encode_plus(oversample_x[0]))

{'input_ids': [101, 2005, 1037, 2096, 1010, 2023, 2126, 1997, 2559, 2012, 2477, 2081, 1037, 2843, 1997, 3168, 2000, 2033, 1012, 2059, 1045, 2165, 1037, 16663, 2000, 3788, 2105, 1996, 2103, 1010, 2004, 1037, 2965, 1997, 5456, 1012, 2004, 2017, 3328, 4949, 16161, 1010, 2017, 3325, 2129, 1996, 5337, 1998, 11900, 2892, 2046, 2169, 2060, 1010, 2000, 1996, 2391, 2073, 1996, 7372, 2468, 20625, 2135, 18449, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
def encode_text(oversample_x, max_length=256):
  """Tokenize an iterable of texts with the notebook-level BERT `tokenizer`.

  Every text is wrapped in the special tokens, truncated to `max_length`
  tokens and padded up to exactly `max_length`, so all encodings have the
  same width.

  Args:
    oversample_x: iterable of strings to encode.
    max_length: fixed token length of every encoded sequence (default 256).

  Returns:
    (input_ids, attention_masks): two lists holding one PyTorch tensor per
    input text, in input order. Callers concatenate them along dim 0.
  """
  input_ids = []
  attention_masks = []

  for text in oversample_x:
      encoded_dict = tokenizer.encode_plus(
                          text,
                          add_special_tokens = True,
                          # `padding='max_length'` replaces the deprecated
                          # `pad_to_max_length=True` flag.
                          padding = 'max_length',
                          max_length = max_length,
                          return_attention_mask = True,
                          truncation = True,
                          return_tensors = 'pt'
                    )

      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])
  return input_ids, attention_masks

In [None]:
train_input_ids, train_attention_masks = encode_text(oversample_x)



In [None]:
validation_input_ids, validation_attention_masks = encode_text(x_test)



In [None]:
print(torch.cuda.is_available())

True


In [None]:
# Stack the per-text tensors returned by encode_text into one tensor each and
# turn the label array into a tensor.
# The isinstance guards make this cell safe to re-run: each name is rebound to
# its own converted value, and torch.cat expects a list/tuple of tensors, not
# the already-stacked result.
if isinstance(train_input_ids, list):
  train_input_ids = torch.cat(train_input_ids, dim=0)
if isinstance(train_attention_masks, list):
  train_attention_masks = torch.cat(train_attention_masks, dim=0)
if not isinstance(oversample_y, torch.Tensor):
  oversample_y = torch.tensor(oversample_y)

In [None]:
print(train_input_ids.shape, oversample_y.shape, train_attention_masks.shape)

torch.Size([15162, 256]) torch.Size([15162]) torch.Size([15162, 256])


In [None]:
# Same stacking as for the training tensors, applied to the held-out split.
# Guarded so the cell can be re-run without feeding an already-stacked tensor
# back into torch.cat.
if isinstance(validation_input_ids, list):
  validation_input_ids = torch.cat(validation_input_ids, dim=0)
if isinstance(validation_attention_masks, list):
  validation_attention_masks = torch.cat(validation_attention_masks, dim=0)
if not isinstance(y_test, torch.Tensor):
  y_test = torch.tensor(y_test)

In [None]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, oversample_y)
val_dataset = TensorDataset(validation_input_ids, validation_attention_masks, y_test)

In [None]:
def get_train_validation_data_loader(train_dataset, val_dataset, batch_size=128):
  """Wrap the two datasets in DataLoaders that share one batch size.

  The training loader draws samples in random order (RandomSampler); the
  validation loader walks its dataset in storage order (SequentialSampler).

  Returns:
    (train_dataloader, validation_dataloader)
  """
  shuffled_order = RandomSampler(train_dataset)
  fixed_order = SequentialSampler(val_dataset)

  return (
      DataLoader(train_dataset, batch_size=batch_size, sampler=shuffled_order),
      DataLoader(val_dataset, batch_size=batch_size, sampler=fixed_order),
  )

In [None]:
train_dataloader, validation_dataloader = get_train_validation_data_loader(train_dataset, val_dataset,32)

## BERT Model

In [None]:
# Pretrained bert-base-uncased encoder with a 2-class sequence-classification
# head; attentions and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False, 
)

# Move the model to the `device` chosen in the imports cell (CUDA when available,
# otherwise CPU). The batches are moved with `.to(device)` as well, so a
# hard-coded `.cuda()` here would break the CPU fallback.
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
def get_optimizer_scheduler(epochs=5, lr=2e-5, eps=1e-8, num_warmup_steps=0):
  """Create the AdamW optimizer and linear LR schedule for the notebook's model.

  Reads the notebook-level `model` (for its parameters) and `train_dataloader`
  (for the number of batches per epoch). Calling it with no arguments gives the
  original settings: 5 epochs, lr 2e-5, eps 1e-8, no warmup.

  Args:
    epochs: number of passes over `train_dataloader` the schedule is sized for.
    lr: AdamW learning rate.
    eps: AdamW epsilon.
    num_warmup_steps: scheduler warmup steps.

  Returns:
    (optimizer, scheduler, epochs)
  """
  # NOTE(review): newer transformers releases deprecate their own AdamW in
  # favour of torch.optim.AdamW — confirm before upgrading the library.
  optimizer = AdamW(model.parameters(), lr = lr, eps = eps)
  # The schedule steps once per batch, so its length is batches * epochs.
  total_steps = len(train_dataloader) * epochs
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = num_warmup_steps,
                                              num_training_steps = total_steps)
  return optimizer,scheduler,epochs

In [None]:
optimizer,scheduler,epochs = get_optimizer_scheduler()

In [None]:
import numpy as np
from sklearn.metrics import f1_score

def compute_f1_score(preds, labels):
  """Compute macro- and micro-averaged F1 from raw class scores.

  Args:
    preds: 2-D array of per-class scores, one row per example; the predicted
      class is the arg-max along axis 1.
    labels: array of true class ids, flattened before scoring.

  Returns:
    (macro, micro) — note the order: macro F1 first, micro F1 second.
  """
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  # scikit-learn's signature is f1_score(y_true, y_pred, ...): truth goes first.
  micro = f1_score(labels_flat, pred_flat, average='micro')
  macro = f1_score(labels_flat, pred_flat, average='macro')
  return (macro,micro)

In [None]:
def train_model(model,train_dataloader,epochs):
  """Fine-tune `model` for `epochs` passes over `train_dataloader`.

  Relies on the notebook-level `optimizer`, `scheduler` and `device`. Each
  batch is indexed as (input_ids, attention_mask, labels).

  Args:
    model: the sequence-classification model to train (modified in place).
    train_dataloader: iterable of training batches.
    epochs: number of passes over the data.

  Returns:
    A list with one dict per epoch, {'epoch': int, 'Training Loss': float},
    where the loss is the mean batch loss of that epoch only.
  """
  training_stats = []
  start = datetime.now()
  for epoch in range(0, epochs):
      print(f'Epoch : {epoch+1}')
      model.train()
      # Start a fresh list every epoch; a list shared across epochs would turn
      # the reported value into a running average over all epochs so far.
      epoch_losses = []
      for batch in train_dataloader:
          batch_input_ids = batch[0].to(device)
          batch_input_mask = batch[1].to(device)
          batch_labels = batch[2].to(device)
          # Clear gradients left over from the previous step.
          model.zero_grad()

          # With labels supplied and return_dict=False the model returns
          # (loss, logits).
          loss,logits = model(batch_input_ids,
                              token_type_ids=None,
                              attention_mask=batch_input_mask,
                              labels=batch_labels,
                              return_dict=False)
          epoch_losses.append(loss.item())
          loss.backward()
          # Clip the global gradient norm to 1.0 before the update.
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          # The LR schedule advances once per batch, not once per epoch.
          scheduler.step()

      epoch_loss = float(np.mean(epoch_losses))
      print(f'Average training loss {epoch_loss}')
      training_stats.append(
          {
              'epoch': epoch + 1,
              'Training Loss': epoch_loss,
          }
      )

      print(f'Total training time {datetime.now()-start}')
  return training_stats

In [None]:
def evaluate_model(model,dataset_loader):
  """Evaluate `model` on every batch of `dataset_loader` and report F1 and loss.

  Relies on the notebook-level `device` and `compute_f1_score`. Each batch is
  indexed as (input_ids, attention_mask, labels).

  F1 is computed once over all predictions. Averaging per-batch F1 scores is
  not equivalent to the dataset F1 and is distorted by batches that contain no
  positive examples.

  Args:
    model: the sequence-classification model to evaluate.
    dataset_loader: iterable of labelled batches.

  Returns:
    dict with keys 'f1_macro', 'f1_micro' and 'loss' (mean of the batch losses).
    The same three values are also printed.

  Raises:
    ValueError: if `dataset_loader` yields no batches.
  """
  model.eval()

  validation_losses = []
  all_logits = []
  all_labels = []

  for batch in dataset_loader:
      batch_input_ids = batch[0].to(device)
      batch_input_mask = batch[1].to(device)
      batch_labels = batch[2].to(device)

      # Inference only: no gradients needed.
      with torch.no_grad():
          (loss, logits) = model(batch_input_ids,
                                  token_type_ids=None,
                                  attention_mask=batch_input_mask,
                                  labels=batch_labels,
                                  return_dict=False)

      validation_losses.append(loss.item())
      all_logits.append(logits.detach().cpu().numpy())
      all_labels.append(batch_labels.to('cpu').numpy())

  if not validation_losses:
      raise ValueError('dataset_loader produced no batches to evaluate')

  # Score the whole set at once instead of averaging per-batch scores.
  macro, micro = compute_f1_score(np.concatenate(all_logits, axis=0),
                                  np.concatenate(all_labels, axis=0))
  mean_loss = float(np.mean(validation_losses))

  print(f'F1_macro: {macro}')
  print(f'F1_micro: {micro}')
  print(f'Validation Loss {mean_loss}')
  return {'f1_macro': macro, 'f1_micro': micro, 'loss': mean_loss}

In [None]:
# Use the epoch count returned by get_optimizer_scheduler(): the LR schedule was
# sized for exactly that many passes, so a separate literal here could drift.
train_model(model,train_dataloader,epochs)

Epoch : 1
Average training loss 0.26491181743390196
Total training time 0:06:02.905962
Epoch : 2
Average training loss 0.1599214793742145
Total training time 0:12:05.755544
Epoch : 3
Average training loss 0.11022985873673649
Total training time 0:18:08.669731
Epoch : 4
Average training loss 0.0833313927338889
Total training time 0:24:11.770438
Epoch : 5
Average training loss 0.06697923514934528
Total training time 0:30:15.047113


[{'Training Loss': 0.26491181743390196, 'epoch': 1},
 {'Training Loss': 0.1599214793742145, 'epoch': 2},
 {'Training Loss': 0.11022985873673649, 'epoch': 3},
 {'Training Loss': 0.0833313927338889, 'epoch': 4},
 {'Training Loss': 0.06697923514934528, 'epoch': 5}]

In [None]:
evaluate_model(model,validation_dataloader)

F1_macro: 0.6756678719754098
F1_micro: 0.9143668831168831
Validation Loss 0.6106060626634369


In [None]:
# Serialize the whole model object to Google Drive.
# NOTE(review): torch.save on a full module is pickle-based, so loading it needs
# the same class definitions and a trusted file. The ".h5" suffix is only a
# name (this is not an HDF5 file). Consider model.save_pretrained(...) or
# saving model.state_dict() instead.
torch.save(model,'/content/drive/MyDrive/NLP Final Project/Model/trained_modified.h5')

### Generate predictions on Train Data

In [None]:
# helper function to save predictions to an output file
def labels2file(p, outf_path):
  """Write `p` to `outf_path`, one line per entry.

  Each entry of `p` is itself an iterable; its items are converted with str()
  and joined by commas. The file is created or truncated.
  """
  rows = (','.join(map(str, row)) + '\n' for row in p)
  with open(outf_path, 'w') as sink:
    sink.writelines(rows)

In [None]:
dpm.load_test()

In [None]:
dpm.test_path

'dontpatronizeme_pcl.tsv'

In [None]:
# Encode every labelled paragraph (X is the full array built before the split).
# NOTE(review): x_train was split out of X, so the predictions made from these
# encodings, and the scores computed from them below, include paragraphs the
# model was trained on. They are not a held-out estimate.
eval_ids, eval_attention_masks = encode_text(X)



In [None]:
# Stack the per-text tensors returned by encode_text into one tensor each.
# Guarded so the cell can be re-run without feeding an already-stacked tensor
# back into torch.cat.
if isinstance(eval_ids, list):
  eval_ids = torch.cat(eval_ids, dim=0)
if isinstance(eval_attention_masks, list):
  eval_attention_masks = torch.cat(eval_attention_masks, dim=0)

In [None]:
# `eval_y` is not assigned anywhere earlier in the notebook, so this cell would
# raise NameError on a fresh kernel. Build it from `y`, the label array that is
# index-aligned with `X`.
eval_y = torch.tensor(y)
print(eval_ids.shape, eval_attention_masks.shape, eval_y.shape)

torch.Size([10469, 256]) torch.Size([10469, 256]) torch.Size([10469])


In [None]:
eval_dataset = TensorDataset(eval_ids, eval_attention_masks)

In [None]:
eval_dataloader = DataLoader(eval_dataset, batch_size = 32)

In [None]:
print(type(eval_dataloader))

<class 'torch.utils.data.dataloader.DataLoader'>


In [None]:
# Run the fine-tuned model over every batch and keep the arg-max class per row.
predictions_list = []

model.eval()
for batch in eval_dataloader:
    batch_ids = batch[0].to(device)
    batch_mask = batch[1].to(device)

    # No labels are passed, so with return_dict=False the model returns a
    # 1-tuple that holds only the logits.
    with torch.no_grad():
        logits, = model(batch_ids,
                        token_type_ids=None,
                        attention_mask=batch_mask,
                        return_dict=False)

    # .tolist() stores plain Python ints rather than NumPy scalars.
    predictions_list.extend(np.argmax(logits.cpu().numpy(), axis=-1).tolist())

In [None]:
predictions_list[:5]

[0, 0, 0, 0, 0]

In [None]:
!mkdir res ref

In [None]:
# predictions for task 1
preds_task1 = [[pred] for pred in predictions_list]
labels2file(preds_task1, os.path.join('res/', 'task1.txt'))

In [None]:
# Gold labels
labels2file(dpm.train_task1_df.label.apply(lambda x:[x]).tolist(), os.path.join('ref/', 'task1.txt'))

In [None]:
!python3 evaluation.py . .

In [None]:
!cat scores.txt

task1_precision:0.9404761904761905
task1_recall:0.8751258811681772
task1_f1:0.9066249347939488
