In [1]:
#Lib to be installed before running the code

!pip install numpy
!pip install torch
!pip install transformers
!pip install pandas
!pip install scikit-learn
!pip install tensorboard
!pip install sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [2]:
import os
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
#math calculation
import numpy as np
import pandas as pd

#torch lib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch import optim

#basic lib
import sys
import random
import math
import time
from tqdm import tqdm

#sklearn Lib
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

#transformer lib Autotokenizer
from transformers import BertTokenizer, AutoTokenizer
from transformers import BertModel, AutoModel, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter

#Cuda memory
use_cuda = True if torch.cuda.is_available() else False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



torch.autograd.set_detect_anomaly(True)
torch.backends.cudnn.benchmark = True
np.random.seed(0)
torch.manual_seed(0)

base_model = 'twitter-xlm-roberta-base-sentiment'
model_list = ['bert-base-uncased', 'bert-base-multilingual-uncased', 'google/muril-base-cased', 'xlm-roberta-base',
              'ai4bharat/indic-bert','cardiffnlp/twitter-xlm-roberta-base','cardiffnlp/twitter-xlm-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base', 'cardiffnlp/twitter-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base-hate', 'roberta-base']
model_path = 'mnt/saved_models/'
results_path = 'mnt/saved_results/'

In [4]:
#to set up the dataset to train and test
lang = 'hx'
model_choice = 8

In [5]:
writer = SummaryWriter(log_dir="/home/jupyter/tboard/" + base_model + "_" + lang)
device

device(type='cuda')

In [18]:
#to set pre trained tokenizer for the given dataset
tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])

#max sequence Length
MAX_SEQ_LEN = 128

label_idx = 1
text_idx = 0

#to read data and augument the data we have this function
class HateData(Dataset):
    def __init__(self, data_path, split='train', lang='bengali', aug_prob=0.2, flip_prob=0.5,sep='\t'):
        self.split = split
        self.data = pd.read_csv(data_path+ lang +'_'+split+ ".tsv", sep=sep, lineterminator='\n')
        if self.split == 'train':
            self.label2data = {0:[], 1:[], 2:[]}
            for i in tqdm(range(len(self.data))):
                row = self.data.iloc[i]
                self.label2data[row[label_idx]].append(row[text_idx])
            self.aug_prob = aug_prob
            self.flip_prob = flip_prob

    def __len__(self):
        return len(self.data)


    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        data = self.data.iloc[index]
        labels = data[label_idx]
        text = data[text_idx]
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        input_ids = inputs['input_ids']
        token_type_ids = np.zeros(MAX_SEQ_LEN)
        attn_mask = inputs['attention_mask']
        aug_text = text
        labels_aug = labels

        if self.split == 'train' and labels == 1:
            if np.random.uniform() < self.aug_prob:
                aug_text = np.random.choice(self.label2data[0])

                if np.random.uniform() < self.flip_prob:
                    aug_text = aug_text + " [SEP] " + text
                else:
                    aug_text = text + " [SEP] " + aug_text
            labels_aug = 1

        inputs_aug = tokenizer(aug_text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        input_ids_aug = inputs_aug['input_ids']
        token_type_ids_aug = np.zeros(MAX_SEQ_LEN)
        attn_mask_aug = inputs_aug['attention_mask']

        input_ids = torch.tensor(np.vstack([input_ids, input_ids_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        token_type_ids = torch.tensor(np.vstack([token_type_ids, token_type_ids_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        attn_mask = torch.tensor(np.vstack([attn_mask, attn_mask_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        labels = torch.tensor(np.vstack([labels, labels_aug]), dtype=torch.long).view(2)

        return input_ids, attn_mask, token_type_ids, labels


In [7]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        H1, H2, num_class = 768, 128, 3
        self.bert = AutoModel.from_pretrained(model_list[model_choice])
        self.clf = nn.Sequential(
            nn.Linear(H1, H2),
            nn.ReLU(),
            nn.Linear(H2, H2),
            nn.ReLU(),
            nn.Linear(H2, num_class)
        )

    def forward(self, input_ids, attn_mask, token_type_ids):
        outputs = self.bert(input_ids, attn_mask)
        cls_emb = outputs.pooler_output
        logits = self.clf(cls_emb)
        return logits


In [8]:
loss_fn = nn.CrossEntropyLoss()#

In [9]:

#We used this function to train the data

def train(input_ids, attn_mask, token_type_ids, label, model, model_opt, scdl):
    model_opt.zero_grad()
    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]
    loss = 0.0
    if use_cuda:
        input_ids = input_ids.to(device)
        attn_mask = attn_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        label = label.to(device)
    logits = model(input_ids[:,0,:], attn_mask[:,0,:], token_type_ids[:,0,:])
    logits_aug = model(input_ids[:,1,:], attn_mask[:,1,:], token_type_ids[:,1,:])
    loss = loss_fn(logits, label[:,0]) + loss_fn(logits_aug, label[:,1])
    loss.backward()
    model_opt.step()
    scdl.step()
    return float(loss.item())



In [10]:

#evaluate fuction
def evaluate(input_ids, attn_mask, token_type_ids, label, model, mode='train'):

    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]
    with torch.no_grad():
        if use_cuda:
            input_ids = input_ids.to(device)
            attn_mask = attn_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            label = label.to(device)

        logits = model(input_ids[:,0,:], attn_mask[:,0,:], token_type_ids[:,0,:])
        loss = loss_fn(logits, label[:,0])

        if mode == 'train':
            return float(loss.item())
        preds = torch.argmax(logits, dim=1).flatten()
        return float(loss.item()), preds.cpu().numpy()




In [12]:
df_test = pd.read_csv("hx_test.tsv", sep='\t', lineterminator='\n')
gt_labels = np.array(df_test['label'])

In [13]:
len(gt_labels)

1924

In [14]:
#save the model and  train the data
def trainIters(model, epochs, train_loader, test_loader, learning_rate=3e-5, log_step=168, valid_step=168, mode='train'):

    model_opt = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    num_train_steps = (len(train_loader)*epochs)
    scdl = get_linear_schedule_with_warmup(model_opt, num_warmup_steps=int(0.1*num_train_steps), num_training_steps=num_train_steps)

    print("Initialised optimizer and lr scheduler")

    # valid_best_loss = []
    best_acc = 0.0
    tot = len(train_data) // train_loader.batch_size
    tot_val = len(val_data) // test_loader.batch_size
    plot_steps = 0

    for epoch in range(epochs):
        train_loss_total = 0.0
        train_step = 0
        # Training

        model.train()
        for entry in tqdm(train_loader, total=tot, position=0, leave=True):
            loss = train(entry[0], entry[1], entry[2], entry[3], model, model_opt, scdl)
            plot_steps += 1
            train_step += 1
            # if not math.isnan(loss) :
            train_loss_total = train_loss_total + loss

            train_loss = train_loss_total / train_step

            if plot_steps % log_step == 0:
                writer.add_scalar("Train Loss", train_loss, plot_steps)

            if (plot_steps % valid_step == 0) or (plot_steps >= num_train_steps - 1):
                model.eval()
                test_pred = []

                for entry in tqdm(test_loader, total=tot_val, position=0, leave=True):
                    loss_v, pred_v = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
                    test_pred.extend([pd for pd in pred_v])
                val_acc = f1_score(gt_labels, test_pred, average='macro')
                print("Validation F1: " + str(val_acc))
                writer.add_scalar("Val F1", val_acc, plot_steps)
                if val_acc > best_acc:
                    torch.save(model.state_dict(), model_path + "model_" + base_model + "_" + lang + "_easymix_hx" + ".pth")
                    print("Model saved for step: " + str(plot_steps))
                    best_acc = val_acc

                model.train()
            writer.flush()

        print('epoch: '+str(epoch))
        print('total loss: '+str(train_loss_total/tot))




In [20]:
#Load the data
train_data = HateData(data_path="", split='train', lang=lang,sep=',')
val_data = HateData(data_path="", split='test', lang=lang,sep='\t')

100%|██████████| 15383/15383 [00:02<00:00, 7295.25it/s]


In [21]:
#Load the Data
BS = 16
dataload = DataLoader(train_data, batch_size=BS, shuffle=True)
dataload_val = DataLoader(val_data, batch_size=BS, shuffle=False)
print((len(train_data)/16)//2)

480.0


In [31]:
#Load the classifier
model = Classifier()
model.load_state_dict(torch.load("Hx_robert_tws_easymix.pth", map_location=device))
model = model.to(device)


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
#Train the data
trainIters(model, 5, dataload, dataload_val)

Initialised optimizer and lr scheduler


121it [00:13,  8.69it/s]                         


Validation F1: 0.6840707022848154


 17%|█▋        | 168/961 [03:55<1:18:47,  5.96s/it]

Model saved for step: 168


121it [00:14,  8.52it/s]                         
 35%|███▍      | 336/961 [07:54<57:40,  5.54s/it]

Validation F1: 0.6775282637495245


121it [00:14,  8.45it/s]                         


Validation F1: 0.6877716844094858


 52%|█████▏    | 504/961 [11:58<56:30,  7.42s/it]

Model saved for step: 504


121it [00:14,  8.46it/s]                         
 70%|██████▉   | 672/961 [15:57<28:00,  5.81s/it]

Validation F1: 0.6840437620833747


121it [00:14,  8.39it/s]                         
 87%|████████▋ | 840/961 [19:55<11:17,  5.60s/it]

Validation F1: 0.6692532179671241


962it [22:37,  1.41s/it]


epoch: 0
total loss: 2.0783050917437866


121it [00:14,  8.45it/s]                         
  5%|▍         | 46/961 [01:15<1:26:01,  5.64s/it]

Validation F1: 0.6052152106706453


121it [00:14,  8.44it/s]                         
 22%|██▏       | 214/961 [05:13<1:09:43,  5.60s/it]

Validation F1: 0.665328760719032


121it [00:14,  8.46it/s]                         
 40%|███▉      | 382/961 [09:14<54:34,  5.66s/it]

Validation F1: 0.6540860158854666


121it [00:14,  8.50it/s]                         
 57%|█████▋    | 550/961 [13:07<38:52,  5.67s/it]

Validation F1: 0.643694617951175


121it [00:14,  8.47it/s]                         
 75%|███████▍  | 718/961 [16:59<23:09,  5.72s/it]

Validation F1: 0.6680590880190228


121it [00:14,  8.43it/s]                         
 92%|█████████▏| 886/961 [20:57<07:03,  5.64s/it]

Validation F1: 0.6545025239464429


962it [22:39,  1.41s/it]


epoch: 1
total loss: 2.0775161955532746


121it [00:14,  8.39it/s]                         
 10%|▉         | 92/961 [02:20<1:20:47,  5.58s/it]

Validation F1: 0.6405836526727932


121it [00:14,  8.42it/s]                         
 27%|██▋       | 260/961 [06:21<1:06:34,  5.70s/it]

Validation F1: 0.6755220000439048


121it [00:14,  8.43it/s]                         
 45%|████▍     | 428/961 [10:21<50:50,  5.72s/it]

Validation F1: 0.6548303590182684


121it [00:14,  8.47it/s]                         
 62%|██████▏   | 596/961 [14:20<35:18,  5.80s/it]

Validation F1: 0.6773727054747151


121it [00:14,  8.46it/s]                         
 80%|███████▉  | 764/961 [18:18<18:12,  5.55s/it]

Validation F1: 0.6567403345887719


121it [00:14,  8.43it/s]                         
 97%|█████████▋| 932/961 [22:17<02:41,  5.56s/it]

Validation F1: 0.6730584126678879


962it [22:57,  1.43s/it]


epoch: 2
total loss: 2.0597660802033393


121it [00:14,  8.45it/s]                         
 14%|█▍        | 138/961 [03:19<1:19:10,  5.77s/it]

Validation F1: 0.6579882610969213


121it [00:14,  8.46it/s]                         
 32%|███▏      | 306/961 [07:17<1:02:26,  5.72s/it]

Validation F1: 0.6208350371499484


121it [00:14,  8.45it/s]                         
 49%|████▉     | 474/961 [11:16<45:44,  5.64s/it]

Validation F1: 0.6561780139637011


121it [00:14,  8.46it/s]                         
 67%|██████▋   | 642/961 [15:14<29:33,  5.56s/it]

Validation F1: 0.6419556287720098


121it [00:14,  8.47it/s]                         
 84%|████████▍ | 810/961 [19:13<13:59,  5.56s/it]

Validation F1: 0.676826069805036


962it [22:34,  1.41s/it]


epoch: 3
total loss: 2.0324932628318


121it [00:14,  8.47it/s]                         
  2%|▏         | 16/961 [00:34<1:27:13,  5.54s/it]

Validation F1: 0.6742389325645807


121it [00:14,  8.51it/s]                         
 19%|█▉        | 184/961 [04:28<1:11:12,  5.50s/it]

Validation F1: 0.6570096423998519


121it [00:14,  8.47it/s]                         
 37%|███▋      | 352/961 [08:22<55:48,  5.50s/it]

Validation F1: 0.6680772051302989


121it [00:14,  8.45it/s]                         
 54%|█████▍    | 520/961 [12:16<41:10,  5.60s/it]

Validation F1: 0.6534109019052886


121it [00:14,  8.42it/s]                         
 72%|███████▏  | 688/961 [16:14<25:58,  5.71s/it]

Validation F1: 0.6480083513287557


121it [00:14,  8.44it/s]                         
 89%|████████▉ | 856/961 [20:12<10:18,  5.89s/it]

Validation F1: 0.6435578852827483


121it [00:14,  8.47it/s]                         
100%|██████████| 961/961 [22:46<00:00,  5.61s/it]

Validation F1: 0.6419149462296743


121it [00:14,  8.43it/s]                         
962it [23:01,  1.44s/it]

Validation F1: 0.6419149462296743
epoch: 4
total loss: 2.0051973611581584





######################## TESTING ######################

In [34]:

#Load pre trained Model
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/test.pth", map_location=device))
model = model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
#Load the Test Data
test_data = HateData(data_path="", split='test', lang=lang)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [37]:
model.eval()
test_loss = []
test_pred = []

#Record the prediction result
wr = open(results_path + "test_prediction_" + base_model + "_" + lang + ".txt", "w")
for entry in tqdm(test_loader, total=len(test_data)//test_loader.batch_size, position=0, leave=True):
    v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
    test_loss.append(v_loss)
    test_pred.append(v_pred)
    wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)#.item()
print("Test Loss: ", test_loss)
wr.close()

100%|██████████| 1924/1924 [00:28<00:00, 67.32it/s]

Test Loss:  0.7562068308715219





In [40]:
df_test = pd.read_csv(lang+"_test.tsv", sep='\t', lineterminator='\n')
gt_labels = np.array(df_test['label'])

In [None]:
print(len(test_pred))

In [41]:
print(classification_report(gt_labels, test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6924    0.8606    0.7674       782
           1     0.6398    0.4343    0.5174       548
           2     0.7879    0.7694    0.7785       594

    accuracy                         0.7110      1924
   macro avg     0.7067    0.6881    0.6878      1924
weighted avg     0.7069    0.7110    0.6996      1924



In [None]:
ConfusionMatrixDisplay.from_predictions(gt_labels, np.array(test_pred))