In [4]:
#Lib to be installed before running the code
 
#!pip install numpy
#!pip install torch
#!pip install transformers
#!pip install pandas
#!pip install scikit-learn
#!pip install tensorboard
#!pip install sentencepiece


In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
#math calculation 
import numpy as np
import pandas as pd

#torch lib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch import optim

#basic lib
import sys
import random
import math
import time
from tqdm import tqdm

#sklearn Lib 
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

#transformer lib Autotokenizer
from transformers import BertTokenizer, AutoTokenizer
from transformers import BertModel, AutoModel, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter

#Cuda memory
use_cuda = True if torch.cuda.is_available() else False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



torch.autograd.set_detect_anomaly(True)
torch.backends.cudnn.benchmark = True
np.random.seed(0)
torch.manual_seed(0)

base_model = 'twitter-xlm-roberta-base-sentiment'
model_list = ['bert-base-uncased', 'bert-base-multilingual-uncased', 'google/muril-base-cased', 'xlm-roberta-base',
              'ai4bharat/indic-bert','cardiffnlp/twitter-xlm-roberta-base','cardiffnlp/twitter-xlm-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base', 'cardiffnlp/twitter-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base-hate', 'roberta-base']
model_path = 'mnt/saved_models/'
results_path = 'mnt/saved_results/'

In [4]:
#to set up the dataset to train and test
lang = 'hx'
model_choice = 8

In [5]:
writer = SummaryWriter(log_dir="/home/jupyter/tboard/" + base_model + "_" + lang)
device

device(type='cpu')

In [6]:
#to set pre trained tokenizer for the given dataset
tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])

#max sequence Length
MAX_SEQ_LEN = 128

label_idx = 1
text_idx = 0

#to read data and augument the data we have this function 
class HateData(Dataset):
    def __init__(self, data_path, split='train', lang='bengali', aug_prob=0.2, flip_prob=0.5):
        self.split = split
        self.data = pd.read_csv(data_path + split + "_" + lang + ".tsv", sep='\t', lineterminator='\n') 
        if self.split == 'train':
            self.label2data = {0:[], 1:[], 2:[]}
            for i in tqdm(range(len(self.data))):
                row = self.data.iloc[i]
                self.label2data[row[label_idx]].append(row[text_idx])
            self.aug_prob = aug_prob
            self.flip_prob = flip_prob

    def __len__(self):
        return len(self.data)

    
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        data = self.data.iloc[index]
        labels = data[label_idx]
        text = data[text_idx]
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        input_ids = inputs['input_ids']
        token_type_ids = np.zeros(MAX_SEQ_LEN)
        attn_mask = inputs['attention_mask']
        aug_text = text  
        labels_aug = labels
        
        if self.split == 'train' and labels == 1:
            if np.random.uniform() < self.aug_prob:
                aug_text = np.random.choice(self.label2data[0])
         
                if np.random.uniform() < self.flip_prob:
                    aug_text = aug_text + " [SEP] " + text
                else:
                    aug_text = text + " [SEP] " + aug_text 
            labels_aug = 1
      
        inputs_aug = tokenizer(aug_text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        input_ids_aug = inputs_aug['input_ids']
        token_type_ids_aug = np.zeros(MAX_SEQ_LEN)
        attn_mask_aug = inputs_aug['attention_mask']

        input_ids = torch.tensor(np.vstack([input_ids, input_ids_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        token_type_ids = torch.tensor(np.vstack([token_type_ids, token_type_ids_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        attn_mask = torch.tensor(np.vstack([attn_mask, attn_mask_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        labels = torch.tensor(np.vstack([labels, labels_aug]), dtype=torch.long).view(2)

        return input_ids, attn_mask, token_type_ids, labels


In [7]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        H1, H2, num_class = 768, 128, 3
        self.bert = AutoModel.from_pretrained(model_list[model_choice])
        self.clf = nn.Sequential(
            nn.Linear(H1, H2),
            nn.ReLU(),
            nn.Linear(H2, H2),
            nn.ReLU(),
            nn.Linear(H2, num_class)
        )
        
    def forward(self, input_ids, attn_mask, token_type_ids):  
        outputs = self.bert(input_ids, attn_mask)
        cls_emb = outputs.pooler_output 
        logits = self.clf(cls_emb)
        return logits


In [8]:
loss_fn = nn.CrossEntropyLoss()#

In [9]:

#We used this function to train the data 

def train(input_ids, attn_mask, token_type_ids, label, model, model_opt, scdl):
    model_opt.zero_grad()
    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]
    loss = 0.0
    if use_cuda:
        input_ids = input_ids.to(device)
        attn_mask = attn_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        label = label.to(device)
    logits = model(input_ids[:,0,:], attn_mask[:,0,:], token_type_ids[:,0,:])
    logits_aug = model(input_ids[:,1,:], attn_mask[:,1,:], token_type_ids[:,1,:])
    loss = loss_fn(logits, label[:,0]) + loss_fn(logits_aug, label[:,1])
    loss.backward()
    model_opt.step()
    scdl.step()
    return float(loss.item())



In [10]:

#evaluate fuction 
def evaluate(input_ids, attn_mask, token_type_ids, label, model, mode='train'):
   
    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]
    with torch.no_grad():
        if use_cuda:
            input_ids = input_ids.to(device)
            attn_mask = attn_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            label = label.to(device)
        
        logits = model(input_ids[:,0,:], attn_mask[:,0,:], token_type_ids[:,0,:])
        loss = loss_fn(logits, label[:,0])
        
        if mode == 'train':
            return float(loss.item())
        preds = torch.argmax(logits, dim=1).flatten()
        return float(loss.item()), preds.cpu().numpy()
        



In [11]:
df_test = pd.read_csv("data/multilingual/test_hx.tsv", sep='\t', lineterminator='\n')
gt_labels = np.array(df_test['label'])

In [11]:
len(gt_labels)

1924

In [13]:
#save the model and  train the data
def trainIters(model, epochs, train_loader, test_loader, learning_rate=3e-5, log_step=168, valid_step=168, mode='train'):

    model_opt = AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    num_train_steps = (len(train_loader)*epochs) 
    scdl = get_linear_schedule_with_warmup(model_opt, num_warmup_steps=int(0.1*num_train_steps), num_training_steps=num_train_steps)

    print("Initialised optimizer and lr scheduler")

    # valid_best_loss = [] 
    best_acc = 0.0 
    tot = len(train_data) // train_loader.batch_size
    tot_val = len(val_data) // test_loader.batch_size
    plot_steps = 0
    
    for epoch in range(epochs):
        train_loss_total = 0.0
        train_step = 0
        # Training
        
        model.train()        
        for entry in tqdm(train_loader, total=tot, position=0, leave=True):
            loss = train(entry[0], entry[1], entry[2], entry[3], model, model_opt, scdl)
            plot_steps += 1
            train_step += 1
            # if not math.isnan(loss) :      
            train_loss_total = train_loss_total + loss
            
            train_loss = train_loss_total / train_step
            
            if plot_steps % log_step == 0:
                writer.add_scalar("Train Loss", train_loss, plot_steps)
            
            if (plot_steps % valid_step == 0) or (plot_steps >= num_train_steps - 1):
                model.eval()
                test_pred = []

                for entry in tqdm(test_loader, total=tot_val, position=0, leave=True):
                    loss_v, pred_v = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')      
                    test_pred.extend([pd for pd in pred_v])
                val_acc = f1_score(gt_labels, test_pred, average='macro')
                print("Validation F1: " + str(val_acc))
                writer.add_scalar("Val F1", val_acc, plot_steps)
                if val_acc > best_acc:
                    torch.save(model.state_dict(), model_path + "model_" + base_model + "_" + lang + "_easymix_hx" + ".pth") 
                    print("Model saved for step: " + str(plot_steps))
                    best_acc = val_acc         

                model.train()
            writer.flush()

        print('epoch: '+str(epoch))
        print('total loss: '+str(train_loss_total/tot))


        

In [14]:
#Load the data 
train_data = HateData(data_path="data/multilingual/", split='train', lang=lang)
val_data = HateData(data_path="data/multilingual/", split='test', lang=lang)

100%|██████████| 15383/15383 [00:01<00:00, 9785.89it/s]


In [15]:
#Load the Data
BS = 16
dataload = DataLoader(train_data, batch_size=BS, shuffle=True)
dataload_val = DataLoader(val_data, batch_size=BS, shuffle=False)
print((len(train_data)/16)//2)

480.0


In [15]:
#Load the classifier 
model = Classifier()
model = model.float()
model = model.to(device)


Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
#Train the data
trainIters(model, 5, dataload, dataload_val)

Initialised optimizer and lr scheduler


121it [06:32,  3.24s/it]                         /it] 


Validation F1: 0.502426297455825


 17%|█▋        | 168/961 [1:27:41<32:18:15, 146.65s/it]

Model saved for step: 168


121it [06:21,  3.15s/it]                         /it]  


Validation F1: 0.5849010847253647


 35%|███▍      | 336/961 [2:49:58<24:04:14, 138.65s/it]

Model saved for step: 336


121it [06:21,  3.16s/it]                         /it]  


Validation F1: 0.647090475265772


 52%|█████▏    | 504/961 [4:03:36<17:36:04, 138.65s/it]

Model saved for step: 504


121it [06:22,  3.16s/it]                         /it]  
 70%|██████▉   | 672/961 [5:22:01<11:08:00, 138.69s/it]

Validation F1: 0.6387299170070943


121it [06:25,  3.19s/it]                         t]    
 87%|████████▋ | 840/961 [6:38:47<4:43:38, 140.65s/it]

Validation F1: 0.6200146401511757


962it [7:29:08, 28.01s/it]                            


epoch: 0
total loss: 1.6253475148645573


121it [06:24,  3.17s/it]                         ]
  5%|▍         | 46/961 [25:29<35:44:33, 140.63s/it]

Validation F1: 0.6382145132425631


121it [05:55,  2.94s/it]                         /it]


Validation F1: 0.6679545774810817


 22%|██▏       | 214/961 [1:36:00<27:00:24, 130.15s/it]

Model saved for step: 1176


121it [05:54,  2.93s/it]                         /it]  


Validation F1: 0.6745350918939031


 40%|███▉      | 382/961 [2:45:57<20:44:32, 128.97s/it]

Model saved for step: 1344


121it [05:57,  2.96s/it]                         /it]  


Validation F1: 0.6765054402645277


 57%|█████▋    | 550/961 [3:55:39<14:55:03, 130.67s/it]

Model saved for step: 1512


121it [05:59,  2.97s/it]                         /it]  
 75%|███████▍  | 718/961 [5:05:32<8:50:13, 130.92s/it]

Validation F1: 0.6748642204623859


121it [05:47,  2.87s/it]                         t]   


Validation F1: 0.681315937457268


 92%|█████████▏| 886/961 [6:14:44<2:38:38, 126.92s/it]

Model saved for step: 1848


962it [6:43:35, 25.17s/it]                            


epoch: 1
total loss: 1.1924661666185874


121it [05:51,  2.90s/it]                         ]
 10%|▉         | 92/961 [40:07<30:49:50, 127.72s/it]

Validation F1: 0.6762870537906297


121it [05:55,  2.94s/it]                         /it]
 27%|██▋       | 260/961 [1:49:28<25:11:34, 129.38s/it]

Validation F1: 0.6733426422692133


121it [06:01,  2.99s/it]                         /it]  
 45%|████▍     | 428/961 [2:58:43<19:26:23, 131.30s/it]

Validation F1: 0.6645915972614225


121it [05:51,  2.90s/it]                         /it]  


Validation F1: 0.6837822268370299


 62%|██████▏   | 596/961 [4:08:32<13:00:51, 128.36s/it]

Model saved for step: 2520


121it [05:46,  2.86s/it]                         /it]  
 80%|███████▉  | 764/961 [5:17:30<6:53:55, 126.07s/it]

Validation F1: 0.6803363087884705


121it [05:59,  2.97s/it]                         t]   
 97%|█████████▋| 932/961 [6:26:55<1:03:43, 131.85s/it]

Validation F1: 0.680817274478701


121it [06:04,  3.01s/it]                         t]   
100%|██████████| 961/961 [6:44:17<00:00, 132.17s/it]

Validation F1: 0.68130827347745


121it [06:13,  3.08s/it]                         
962it [6:50:43, 25.62s/it]                          

Validation F1: 0.68130827347745
epoch: 2
total loss: 0.8196352052576956





######################## TESTING ######################

In [17]:

#Load pre trained Model
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws_easymix.pth", map_location=device))
model = model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
#Load the Test Data
test_data = HateData(data_path="data/multilingual/", split='test', lang=lang)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [19]:
model.eval()
test_loss = []
test_pred = []

#Record the prediction result  
wr = open(results_path + "test_prediction_" + base_model + "_" + lang + ".txt", "w")    
for entry in tqdm(test_loader, total=len(test_data)//test_loader.batch_size, position=0, leave=True):
    v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
    test_loss.append(v_loss)
    test_pred.append(v_pred)
    wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)#.item()
print("Test Loss: ", test_loss)
wr.close()

100%|██████████| 1924/1924 [07:59<00:00,  4.01it/s]

Test Loss:  0.7636778038072549





In [20]:
df_test = pd.read_csv("data/multilingual/test_"+lang+".tsv", sep='\t', lineterminator='\n')
gt_labels = np.array(df_test['label'])

In [None]:
print(len(test_pred))

In [21]:
print(classification_report(gt_labels, test_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7666    0.7225    0.7439       782
           1     0.5615    0.5584    0.5599       548
           2     0.7368    0.7963    0.7654       594

    accuracy                         0.6985      1924
   macro avg     0.6883    0.6924    0.6897      1924
weighted avg     0.6990    0.6985    0.6981      1924



In [None]:
ConfusionMatrixDisplay.from_predictions(gt_labels, np.array(test_pred))