In [None]:
!pip install imbalanced-learn
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 8.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 31.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 474 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transfor

In [None]:
# All common imports here
import os
import random
import numpy as np
from urllib import request
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from transformers import BertTokenizer
import torch
from torch.utils.data import TensorDataset,random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
from datetime import datetime
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Get the latest files from Official Github page

In [None]:
# Get the Module from Task Organisers Repo
# File name: dont_patronize_me.py
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [None]:
# File name: evaluation.py
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/evaluation.py


## Load DontPatronizeMe Module

In [None]:
from dont_patronize_me import DontPatronizeMe

In [None]:
dpm = DontPatronizeMe('/content/drive/MyDrive/NLP Final Project/data/', 'dontpatronizeme_pcl.tsv')

## Load Subtask 1 data

In [None]:
# Load Task 1 data and get the data frame
dpm.load_task1()
train_df = dpm.train_task1_df

In [None]:
train_df.head()

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"we 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"in libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""white house press secretary sean spicer said ...",0,0
3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" just like we received migrants fleeing el ...",0,0


In [None]:
train_df.loc[0, 'text']

"we 're living in times of absolute insanity , as i 'm pretty sure most people are aware . for a while , waking up every day to check the news seemed to carry with it the same feeling of panic and dread that action heroes probably face when they 're trying to decide whether to cut the blue or green wire on a ticking bomb -- except the bomb 's instructions long ago burned in a fire and imminent catastrophe seems the likeliest outcome . it 's hard to stay that on-edge for that long , though , so it 's natural for people to become inured to this constant chaos , to slump into a malaise of hopelessness and pessimism ."

In [None]:
# Length of train data 
len(train_df)

10469

In [None]:
# Get value count of each label
train_df['label'].value_counts()

0    9476
1     993
Name: label, dtype: int64

In [None]:
# Categories of PCL counts (may not be used in binary classification)
train_df['orig_label'].value_counts()

0    8529
1     947
3     458
4     391
2     144
Name: orig_label, dtype: int64

## Oversample the minority class

In [None]:
x_train, y_train = np.array(train_df['text']), np.array(train_df['label']) 

In [None]:
x_train.shape

(10469,)

In [None]:
print(len(x_train), len(y_train))

10469 10469


In [None]:
print(Counter(y_train))

Counter({0: 9476, 1: 993})


In [None]:
# Instantiate RandomOversampler
oversampler = RandomOverSampler(sampling_strategy='minority')

In [None]:
# Oversample minority class
oversample_x, oversample_y = oversampler.fit_resample(x_train.reshape(-1, 1), y_train)

In [None]:
# Squeeze out the oversampled data
oversample_x = oversample_x.ravel()

In [None]:
print(oversample_x.shape, oversample_y.shape)

(18952,) (18952,)


In [None]:
# Number of samples in each class
print(Counter(oversample_y))

Counter({0: 9476, 1: 9476})


In [None]:
len(np.unique(oversample_x))

10469

## BERT - Sequence Classification

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
print(tokenizer.encode_plus(oversample_x[0]))

{'input_ids': [101, 2057, 1005, 2128, 2542, 1999, 2335, 1997, 7619, 19272, 1010, 2004, 1045, 1005, 1049, 3492, 2469, 2087, 2111, 2024, 5204, 1012, 2005, 1037, 2096, 1010, 12447, 2039, 2296, 2154, 2000, 4638, 1996, 2739, 2790, 2000, 4287, 2007, 2009, 1996, 2168, 3110, 1997, 6634, 1998, 14436, 2008, 2895, 7348, 2763, 2227, 2043, 2027, 1005, 2128, 2667, 2000, 5630, 3251, 2000, 3013, 1996, 2630, 2030, 2665, 7318, 2006, 1037, 28561, 5968, 1011, 1011, 3272, 1996, 5968, 1005, 1055, 8128, 2146, 3283, 5296, 1999, 1037, 2543, 1998, 17566, 25539, 3849, 1996, 2066, 21292, 9560, 1012, 2009, 1005, 1055, 2524, 2000, 2994, 2008, 2006, 1011, 3341, 2005, 2008, 2146, 1010, 2295, 1010, 2061, 2009, 1005, 1055, 3019, 2005, 2111, 2000, 2468, 1999, 12165, 2000, 2023, 5377, 8488, 1010, 2000, 28702, 2046, 1037, 28935, 5562, 1997, 20625, 2791, 1998, 21877, 18719, 26725, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
def encode_text(oversample_x):
  input_ids = []
  attention_masks = []

  for text in oversample_x:
      encoded_dict = tokenizer.encode_plus(
                          text,                      
                          add_special_tokens = True, 
                          pad_to_max_length = True,
                          max_length = 256,
                          return_attention_mask = True,
                          truncation = True, 
                          return_tensors = 'pt'
                    )
      
      
      input_ids.append(encoded_dict['input_ids'])
      attention_masks.append(encoded_dict['attention_mask'])
  return input_ids, attention_masks   

In [None]:
input_ids, attention_masks = encode_text(oversample_x)



In [None]:
print(torch.cuda.is_available())

True


In [None]:
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
oversample_y = torch.tensor(oversample_y)

In [None]:
print(input_ids.shape, oversample_y.shape, attention_masks.shape)

torch.Size([18952, 256]) torch.Size([18952]) torch.Size([18952, 256])


In [None]:
dataset = TensorDataset(input_ids, attention_masks, oversample_y)
train_dataset_size = int(0.9 * len(dataset))
validation_data_size = len(dataset) - train_dataset_size
train_dataset, val_dataset = random_split(dataset, [train_dataset_size , validation_data_size])

In [None]:
def get_train_validation_data_loader(train_dataset, val_dataset, batch_size=128):
  train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )

  validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size
        )
  
  return train_dataloader, validation_dataloader

In [None]:
train_dataloader, validation_dataloader = get_train_validation_data_loader(train_dataset, val_dataset,32)

## BERT Model

In [None]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2, 
    output_attentions = False, 
    output_hidden_states = False, 
)

model.cuda()

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
def get_optimizer_scheduler():
  optimizer = AdamW(model.parameters(),lr = 2e-5,eps = 1e-8)
  epochs = 5
  total_steps = len(train_dataloader) * epochs 
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
  return optimizer,scheduler,epochs

In [None]:
optimizer,scheduler,epochs = get_optimizer_scheduler()

In [None]:
import numpy as np
from sklearn.metrics import f1_score

def compute_f1_score(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  micro = f1_score(pred_flat, labels_flat, average='micro')
  macro = f1_score(pred_flat, labels_flat, average='macro')
  return (macro,micro)

In [None]:
def train_model(model,train_dataloader,epochs):

  training_stats = []
  train_losses = []
  start = datetime.now()
  for epoch in range(0, epochs):
      print(f'Epoch : {epoch+1}')
      model.train()
      for step, batch in enumerate(train_dataloader):
          batch_input_ids = batch[0].to(device)
          batch_input_mask = batch[1].to(device)
          batch_labels = batch[2].to(device)
          model.zero_grad()        

          loss,logits = model(batch_input_ids, 
                              token_type_ids=None, 
                              attention_mask=batch_input_mask, 
                              labels=batch_labels,
                              return_dict=False)
          train_losses.append(loss.item())
          loss.backward()
          torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
          optimizer.step()
          scheduler.step() 
      print(f'Average training loss {np.mean(train_losses)}')
      training_stats.append(
          {
              'epoch': epoch + 1,
              'Training Loss': np.mean(train_losses),  
          }
      )

      print(f'Total training time {datetime.now()-start}')
  return training_stats 

In [None]:
def evaluate_model(model,dataset_loader):

  model.eval()

  f1_macro_scores = []
  f1_micro_scores = []
  validation_losses = []

  for batch in dataset_loader:
      
      batch_input_ids = batch[0].to(device)
      batch_input_mask = batch[1].to(device)
      batch_labels = batch[2].to(device)
      
      
      with torch.no_grad():        
          (loss, logits) = model(batch_input_ids, 
                                  token_type_ids=None, 
                                  attention_mask=batch_input_mask,
                                  labels=batch_labels,
                          return_dict=False)
          
      
      # total_eval_loss += loss.item()
      validation_losses.append(loss.item())

      
      logits = logits.detach().cpu().numpy()
      label_ids = batch_labels.to('cpu').numpy()

      
      macro,micro = compute_f1_score(logits, label_ids)
      f1_macro_scores.append(macro)
      f1_micro_scores.append(micro)
      
      
  print(f'F1_macro: {np.mean(f1_macro_scores)}')
  print(f'F1_micro: {np.mean(f1_micro_scores)}')
  print(f'Validation Loss {np.mean(validation_losses)}')

In [None]:
train_model(model,train_dataloader,5)

Epoch : 1
Average training loss 0.06207996662347936
Total training time 0:06:51.553235
Epoch : 2
Average training loss 0.039019759361687646
Total training time 0:13:43.444234
Epoch : 3
Average training loss 0.027102326879212046
Total training time 0:20:35.049687
Epoch : 4
Average training loss 0.020568734338885543
Total training time 0:27:27.378902
Epoch : 5
Average training loss 0.016624867337368225
Total training time 0:34:19.319274


[{'Training Loss': 0.06207996662347936, 'epoch': 1},
 {'Training Loss': 0.039019759361687646, 'epoch': 2},
 {'Training Loss': 0.027102326879212046, 'epoch': 3},
 {'Training Loss': 0.020568734338885543, 'epoch': 4},
 {'Training Loss': 0.016624867337368225, 'epoch': 5}]

In [None]:
evaluate_model(model,validation_dataloader)

F1_macro: 0.9812441703761493
F1_micro: 0.9817708333333334
Validation Loss 0.10817408568691463


In [None]:
torch.save(model,'/content/drive/MyDrive/NLP Final Project/Model/trained.h5')