## Libraries used in this IPYNB file

In [8]:
import numpy as np
import pandas as pd

#torch lib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch import optim

#basic lib
import sys
import random
import math
import time
from tqdm import tqdm

#sklearn Lib 
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

#transformer lib Autotokenizer
from transformers import BertTokenizer, AutoTokenizer
from transformers import BertModel, AutoModel, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter


#NLTK lib and pandas
import pandas as pd
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import language_tool_python 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


stemmer = WordNetLemmatizer()
grammer = language_tool_python.LanguageTool('en-US')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Prerequisite functions and classes for testing the model

In [6]:
# Device selection: run on the GPU when one is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
torch.autograd.set_detect_anomaly(True)
torch.backends.cudnn.benchmark = True

# Seed every RNG this notebook draws from. The word-dropping and shuffling
# perturbation cells use the stdlib `random` module, so it is seeded as well.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)

# Tag embedded in the prediction file name written by the evaluation cells.
# NOTE(review): the evaluation cells select model_list[8]
# ('cardiffnlp/twitter-roberta-base-sentiment'), not the XLM variant named
# here; confirm whether this tag is intentional.
base_model = 'twitter-xlm-roberta-base-sentiment'

# Candidate pretrained encoders; `model_choice` indexes into this list.
model_list = ['bert-base-uncased', 'bert-base-multilingual-uncased', 'google/muril-base-cased', 'xlm-roberta-base',
              'ai4bharat/indic-bert','cardiffnlp/twitter-xlm-roberta-base','cardiffnlp/twitter-xlm-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base', 'cardiffnlp/twitter-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base-hate', 'roberta-base']

# Directory holding the saved model checkpoints.
model_path = 'mnt/saved_models/'

# Directory where prediction files are saved.
results_path = 'mnt/saved_results/'

# Dataset with optional train-time data augmentation
class HateData(Dataset):
    """Tab-separated classification dataset yielding an (original, augmented) pair per row.

    The file read is ``data_path + split + lang + ".tsv"``. Column positions,
    the tokenizer and the padded sequence length are taken from the
    module-level names ``text_idx``, ``label_idx``, ``tokenizer`` and
    ``MAX_SEQ_LEN``, which must be defined before the dataset is used.

    Args:
        data_path: Prefix of the file path.
        split: Split name inserted into the file path; augmentation is only
            active when it equals ``'train'``.
        lang: Final component of the file name (without ``.tsv``).
        aug_prob: Probability of augmenting a label-1 training sample.
        flip_prob: Probability that the sampled label-0 text is placed
            before (rather than after) the original text.
    """

    def __init__(self, data_path, split='train', lang='bengali', aug_prob=0.2, flip_prob=0.5):
        self.split = split
        self.data = pd.read_csv(data_path + split + lang + ".tsv", sep='\t', lineterminator='\n')
        # Stored for every split so the attributes always exist; they are
        # only consulted for the train split.
        self.aug_prob = aug_prob
        self.flip_prob = flip_prob
        if self.split == 'train':
            # Texts grouped by label; label-0 texts are the augmentation pool.
            self.label2data = {0: [], 1: [], 2: []}
            # Explicit positional access: integer keys on a string-labelled
            # Series rely on a deprecated pandas fallback.
            label_col = self.data.iloc[:, label_idx]
            text_col = self.data.iloc[:, text_idx]
            for label, text in tqdm(zip(label_col, text_col), total=len(self.data)):
                self.label2data[label].append(text)

    def __len__(self):
        """Return the number of rows in the loaded file."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attn_mask, token_type_ids, labels)`` for one row.

        The first three tensors have shape ``(2, MAX_SEQ_LEN)``: slot 0 is the
        original text, slot 1 the (possibly augmented) text. ``labels`` has
        shape ``(2,)`` and holds the row's label twice, because augmentation
        never changes the label.
        """
        if torch.is_tensor(index):
            index = index.tolist()
        row = self.data.iloc[index]
        label = row.iloc[label_idx]
        text = row.iloc[text_idx]

        encoded = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        encoded_aug = encoded  # reused as-is when no augmentation is applied

        # Augmentation: with probability aug_prob, join a random label-0 text
        # to a label-1 training sample (before or after it, per flip_prob).
        if self.split == 'train' and label == 1 and np.random.uniform() < self.aug_prob:
            pool = self.label2data[0]
            if pool:  # np.random.choice raises on an empty pool
                other = np.random.choice(pool)
                if np.random.uniform() < self.flip_prob:
                    aug_text = other + " [SEP] " + text
                else:
                    aug_text = text + " [SEP] " + other
                encoded_aug = tokenizer(aug_text, padding='max_length', truncation=True,
                                        max_length=MAX_SEQ_LEN)

        input_ids = torch.tensor(
            np.vstack([encoded['input_ids'], encoded_aug['input_ids']]), dtype=torch.long
        ).view(2, MAX_SEQ_LEN)
        attn_mask = torch.tensor(
            np.vstack([encoded['attention_mask'], encoded_aug['attention_mask']]), dtype=torch.long
        ).view(2, MAX_SEQ_LEN)
        # Segment ids are all zero for both slots.
        token_type_ids = torch.zeros((2, MAX_SEQ_LEN), dtype=torch.long)
        labels = torch.tensor([int(label), int(label)], dtype=torch.long)

        return input_ids, attn_mask, token_type_ids, labels


# Sequence classifier: pretrained encoder followed by an MLP head
class Classifier(nn.Module):
    """Pretrained transformer encoder with a three-layer MLP classification head.

    The encoder is loaded from ``model_list[model_choice]`` (module-level
    names that must be set before instantiation). The head maps the encoder's
    pooled output to 3 class logits.
    """

    def __init__(self):
        super().__init__()
        H2, num_class = 128, 3
        self.bert = AutoModel.from_pretrained(model_list[model_choice])
        # Take the encoder width from its config instead of assuming 768, so
        # the head also fits encoders with a different hidden size.
        H1 = self.bert.config.hidden_size
        self.clf = nn.Sequential(
            nn.Linear(H1, H2),
            nn.ReLU(),
            nn.Linear(H2, H2),
            nn.ReLU(),
            nn.Linear(H2, num_class)
        )

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return class logits for a batch.

        ``token_type_ids`` is accepted for interface compatibility with the
        callers but is not forwarded to the encoder.
        """
        # Pass the mask by keyword so it cannot be bound to a different
        # positional parameter of the encoder's forward().
        outputs = self.bert(input_ids, attention_mask=attn_mask)
        cls_emb = outputs.pooler_output
        logits = self.clf(cls_emb)
        return logits
    
# Evaluation function
def evaluate(input_ids, attn_mask, token_type_ids, label, model, mode='train'):
    """Run a gradient-free forward pass on the original (non-augmented) inputs.

    Each tensor carries the original sample in slot 0 and the augmented one
    in slot 1 along dimension 1; only slot 0 is evaluated here.

    Args:
        input_ids, attn_mask, token_type_ids: Indexed as ``[:, 0, :]``.
        label: Indexed as ``[:, 0]``.
        model: Called as ``model(input_ids, attn_mask, token_type_ids)`` and
            expected to return logits.
        mode: ``'train'`` returns only the loss; any other value also
            returns the predictions.

    Returns:
        ``loss`` as a float when ``mode == 'train'``, otherwise
        ``(loss, preds)`` where ``preds`` is a NumPy array of argmax class
        indices.
    """
    with torch.no_grad():
        # Always move inputs to `device`: this is a no-op on CPU and keeps
        # them on the same device the notebook moves the model to.
        input_ids = input_ids.to(device)
        attn_mask = attn_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        label = label.to(device)

        logits = model(input_ids[:, 0, :], attn_mask[:, 0, :], token_type_ids[:, 0, :])
        loss = loss_fn(logits, label[:, 0])

    loss_value = float(loss.item())
    if mode == 'train':
        return loss_value
    preds = torch.argmax(logits, dim=1).flatten()
    return loss_value, preds.cpu().numpy()


# Loss shared by every evaluate() call.
loss_fn = nn.CrossEntropyLoss()


### Testing the model on the first 1000 samples of the HateXplain test set

In [7]:
# Build the evaluation subset: the first 1000 rows of the test file
# (fewer if the file is shorter).
data = pd.read_csv("data/hatexplain/hx_test.tsv", sep='\t')
sample = data.head(1000)
sam1 = sample["post"].tolist()
sam2 = sample["label"].tolist()

test_sam = pd.DataFrame({'post': sam1, 'label': sam2})
test_sam.to_csv("hxsam_test.tsv", sep="\t", index=False)

# Globals read by Classifier and HateData.
model_choice = 8
label_idx = 1
MAX_SEQ_LEN = 128
text_idx = 0

tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])
# Load the pre-trained model checkpoint
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws.pth", map_location=device))
model = model.to(device)

# With data_path="" and split="", HateData reads "hxsam_test.tsv" written above.
test_data = HateData(data_path="", split='', lang="hxsam_test")
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

model.eval()
test_loss = []
test_pred = []

# Record the prediction results, one line per batch. The context manager
# closes the file even if evaluation fails part-way through.
with open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w") as wr:
    for entry in tqdm(test_loader, total=len(test_loader), position=0, leave=True):
        v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
        test_loss.append(v_loss)
        # Flatten per-batch arrays so test_pred is a plain 1-D list of labels.
        test_pred.extend(v_pred.tolist())
        wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)
print("Test Loss: ", test_loss)

# NOTE(review): df_test is not used below; kept in case other cells rely on it.
df_test = pd.read_csv("data/multilingual/test_"+"Hx"+".tsv", sep='\t', lineterminator='\n')
gt_labels = test_sam["label"]

print(classification_report(gt_labels, test_pred, digits=4))

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [04:15<00:00,  3.92it/s]

Test Loss:  0.599637269705534
              precision    recall  f1-score   support

           0     0.8643    0.8008    0.8313       477
           1     0.3617    0.3542    0.3579       144
           2     0.7890    0.8681    0.8266       379

    accuracy                         0.7620      1000
   macro avg     0.6716    0.6744    0.6720      1000
weighted avg     0.7634    0.7620    0.7614      1000






### Synonym Replacement
#### Every word that is not a stopword and is longer than 3 letters is replaced by the first WordNet lemma of its first synset; all other words, and words with no WordNet entry, are left unchanged.

In [9]:

# Build the evaluation subset: the first 1000 rows of the test file
# (fewer if the file is shorter).
data = pd.read_csv("data/hatexplain/hx_test.tsv", sep='\t')
sample = data.head(1000)
sam1 = sample["post"].tolist()
sam2 = sample["label"].tolist()
stop_words = set(stopwords.words('english'))

# Perturbation: replace each eligible word (not a stopword, longer than 3
# characters) with the first lemma of its first WordNet synset.
post = []
for sentence in sam1:
    sen = []
    for token in sentence.split():
        replacement = token
        if token not in stop_words and len(token) > 3:
            synsets = wordnet.synsets(token)
            # Words unknown to WordNet have no synsets and are kept as they are.
            if synsets:
                replacement = synsets[0].lemmas()[0].name()
        sen.append(replacement)
    post.append(' '.join(sen))

test_sam = pd.DataFrame({'post': post, 'label': sam2})
test_sam.to_csv("hxsam_test2.tsv", sep="\t", index=False)

# Globals read by Classifier and HateData.
model_choice = 8
label_idx = 1
MAX_SEQ_LEN = 128
text_idx = 0

tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])
# Load the pre-trained model checkpoint
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws.pth", map_location=device))
model = model.to(device)

# With data_path="" and split="", HateData reads "hxsam_test2.tsv" written above.
test_data = HateData(data_path="", split='', lang="hxsam_test2")
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

model.eval()
test_loss = []
test_pred = []

# Record the prediction results, one line per batch. The context manager
# closes the file even if evaluation fails part-way through.
with open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w") as wr:
    for entry in tqdm(test_loader, total=len(test_loader), position=0, leave=True):
        v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
        test_loss.append(v_loss)
        # Flatten per-batch arrays so test_pred is a plain 1-D list of labels.
        test_pred.extend(v_pred.tolist())
        wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)
print("Test Loss: ", test_loss)

# NOTE(review): df_test is not used below; kept in case other cells rely on it.
df_test = pd.read_csv("data/multilingual/test_"+"Hx"+".tsv", sep='\t', lineterminator='\n')
gt_labels = test_sam["label"]

print(classification_report(gt_labels, test_pred, digits=4))



Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [04:18<00:00,  3.86it/s]

Test Loss:  0.7526975889280438
              precision    recall  f1-score   support

           0     0.8337    0.7463    0.7876       477
           1     0.3577    0.3056    0.3296       144
           2     0.7111    0.8443    0.7720       379

    accuracy                         0.7200      1000
   macro avg     0.6342    0.6321    0.6297      1000
weighted avg     0.7187    0.7200    0.7157      1000






### Character swapping
#### In each sentence, the third and fourth characters are swapped in the first two words that are not stopwords and are longer than 3 letters; all other words are left unchanged.

In [18]:
import pandas as pd
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import language_tool_python 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


stemmer = WordNetLemmatizer()
grammer = language_tool_python.LanguageTool('en-US')

# Build the evaluation subset: the first 1000 rows of the test file
# (fewer if the file is shorter).
data = pd.read_csv("data/hatexplain/hx_test.tsv", sep='\t')
sample = data.head(1000)
sam1 = sample["post"].tolist()
sam2 = sample["label"].tolist()
stop_words = set(stopwords.words('english'))

# Perturbation: swap the 3rd and 4th characters of the first two eligible
# words (not a stopword, longer than 3 characters) in every sentence.
post = []
for sentence in sam1:
    sen = []
    count = 0  # words already perturbed in this sentence
    for token in sentence.split():
        if token in stop_words or len(token) <= 3 or count == 2:
            sen.append(token)
            continue
        chars = list(token)
        chars[2], chars[3] = chars[3], chars[2]
        sen.append("".join(chars))
        count += 1
    post.append(' '.join(sen))

test_sam = pd.DataFrame({'post': post, 'label': sam2})
test_sam.to_csv("hxsam_test2.tsv", sep="\t", index=False)

# Globals read by Classifier and HateData.
model_choice = 8
label_idx = 1
MAX_SEQ_LEN = 128
text_idx = 0

tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])
# Load the pre-trained model checkpoint
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws.pth", map_location=device))
model = model.to(device)

# With data_path="" and split="", HateData reads "hxsam_test2.tsv" written above.
test_data = HateData(data_path="", split='', lang="hxsam_test2")
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

model.eval()
test_loss = []
test_pred = []

# Record the prediction results, one line per batch. The context manager
# closes the file even if evaluation fails part-way through.
with open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w") as wr:
    for entry in tqdm(test_loader, total=len(test_loader), position=0, leave=True):
        v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
        test_loss.append(v_loss)
        # Flatten per-batch arrays so test_pred is a plain 1-D list of labels.
        test_pred.extend(v_pred.tolist())
        wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)
print("Test Loss: ", test_loss)

# NOTE(review): df_test is not used below; kept in case other cells rely on it.
df_test = pd.read_csv("data/multilingual/test_"+"Hx"+".tsv", sep='\t', lineterminator='\n')
gt_labels = test_sam["label"]

print(classification_report(gt_labels, test_pred, digits=4))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [05:39<00:00,  2.95it/s]

Test Loss:  0.6691783783510328
              precision    recall  f1-score   support

           0     0.8419    0.7925    0.8164       477
           1     0.3356    0.3403    0.3379       144
           2     0.7654    0.8179    0.7908       379

    accuracy                         0.7370      1000
   macro avg     0.6476    0.6502    0.6484      1000
weighted avg     0.7400    0.7370    0.7378      1000






### Concatenating the last character of a word to the next word in each sentence

In [17]:
import pandas as pd
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import language_tool_python 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


stemmer = WordNetLemmatizer()
grammer = language_tool_python.LanguageTool('en-US')

# Build the evaluation subset: the first 1000 rows of the test file
# (fewer if the file is shorter).
data = pd.read_csv("data/hatexplain/hx_test.tsv", sep='\t')
sample = data.head(1000)
sam1 = sample["post"].tolist()
sam2 = sample["label"].tolist()
stop_words = set(stopwords.words('english'))

# Perturbation: in every sentence, cut the last character off the first
# eligible word (not a stopword, longer than 3 characters) and prepend it to
# the following word.
post = []
for sentence in sam1:
    sen = []
    count = 0   # words already truncated in this sentence (at most one)
    word = ""   # character waiting to be prepended to the next token
    for token in sentence.split():
        if word:
            token = word + token
            word = ""
        # Eligibility is checked on the token after any carried character
        # has been prepended.
        if token in stop_words or len(token) <= 3 or count == 1:
            sen.append(token)
            continue
        # NOTE: if this is the last token of the sentence, the removed
        # character has no following word and is dropped.
        word = token[-1]
        sen.append(token[:-1])
        count += 1
    post.append(' '.join(sen))

test_sam = pd.DataFrame({'post': post, 'label': sam2})
test_sam.to_csv("hxsam_test2.tsv", sep="\t", index=False)

# Globals read by Classifier and HateData.
model_choice = 8
label_idx = 1
MAX_SEQ_LEN = 128
text_idx = 0

tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])
# Load the pre-trained model checkpoint
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws.pth", map_location=device))
model = model.to(device)

# With data_path="" and split="", HateData reads "hxsam_test2.tsv" written above.
test_data = HateData(data_path="", split='', lang="hxsam_test2")
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

model.eval()
test_loss = []
test_pred = []

# Record the prediction results, one line per batch. The context manager
# closes the file even if evaluation fails part-way through.
with open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w") as wr:
    for entry in tqdm(test_loader, total=len(test_loader), position=0, leave=True):
        v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
        test_loss.append(v_loss)
        # Flatten per-batch arrays so test_pred is a plain 1-D list of labels.
        test_pred.extend(v_pred.tolist())
        wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)
print("Test Loss: ", test_loss)

# NOTE(review): df_test is not used below; kept in case other cells rely on it.
df_test = pd.read_csv("data/multilingual/test_"+"Hx"+".tsv", sep='\t', lineterminator='\n')
gt_labels = test_sam["label"]

print(classification_report(gt_labels, test_pred, digits=4))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [05:49<00:00,  2.87it/s]

Test Loss:  0.6058173116743565
              precision    recall  f1-score   support

           0     0.8617    0.7966    0.8279       477
           1     0.3624    0.3750    0.3686       144
           2     0.7951    0.8602    0.8264       379

    accuracy                         0.7600      1000
   macro avg     0.6731    0.6773    0.6743      1000
weighted avg     0.7646    0.7600    0.7612      1000






### Dropping two random words from each sentence (the word order is also shuffled)

In [16]:
import pandas as pd
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import language_tool_python 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


stemmer = WordNetLemmatizer()
grammer = language_tool_python.LanguageTool('en-US')

# Build the evaluation subset: the first 1000 rows of the test file
# (fewer if the file is shorter).
data = pd.read_csv("data/hatexplain/hx_test.tsv", sep='\t')
sample = data.head(1000)
sam1 = sample["post"].tolist()
sam2 = sample["label"].tolist()
stop_words = set(stopwords.words('english'))

# Perturbation: shuffle the words and drop the last one, twice. This removes
# two random words and also randomizes the order of the words that remain.
post = []
for sentence in sam1:
    sen = sentence.split()
    for _ in range(2):
        random.shuffle(sen)
        # Always keep at least one word: popping from an empty list raises,
        # and an empty post would be written to the TSV as a blank field.
        if len(sen) > 1:
            sen.pop()
    post.append(' '.join(sen))

test_sam = pd.DataFrame({'post': post, 'label': sam2})
test_sam.to_csv("hxsam_test2.tsv", sep="\t", index=False)

# Globals read by Classifier and HateData.
model_choice = 8
label_idx = 1
MAX_SEQ_LEN = 128
text_idx = 0

tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])
# Load the pre-trained model checkpoint
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws.pth", map_location=device))
model = model.to(device)

# With data_path="" and split="", HateData reads "hxsam_test2.tsv" written above.
test_data = HateData(data_path="", split='', lang="hxsam_test2")
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

model.eval()
test_loss = []
test_pred = []

# Record the prediction results, one line per batch. The context manager
# closes the file even if evaluation fails part-way through.
with open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w") as wr:
    for entry in tqdm(test_loader, total=len(test_loader), position=0, leave=True):
        v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
        test_loss.append(v_loss)
        # Flatten per-batch arrays so test_pred is a plain 1-D list of labels.
        test_pred.extend(v_pred.tolist())
        wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)
print("Test Loss: ", test_loss)

# NOTE(review): df_test is not used below; kept in case other cells rely on it.
df_test = pd.read_csv("data/multilingual/test_"+"Hx"+".tsv", sep='\t', lineterminator='\n')
gt_labels = test_sam["label"]

print(classification_report(gt_labels, test_pred, digits=4))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [06:01<00:00,  2.76it/s]


Test Loss:  0.7552851028814912
              precision    recall  f1-score   support

           0     0.7851    0.8197    0.8021       477
           1     0.3376    0.3681    0.3522       144
           2     0.7971    0.7256    0.7597       379

    accuracy                         0.7190      1000
   macro avg     0.6399    0.6378    0.6380      1000
weighted avg     0.7252    0.7190    0.7212      1000



In [15]:
import pandas as pd
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import language_tool_python 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


stemmer = WordNetLemmatizer()
grammer = language_tool_python.LanguageTool('en-US')

# Word-order shuffling experiment.

# Build the evaluation subset: the first 1000 rows of the test file
# (fewer if the file is shorter).
data = pd.read_csv("data/hatexplain/hx_test.tsv", sep='\t')
sample = data.head(1000)
sam1 = sample["post"].tolist()
sam2 = sample["label"].tolist()
stop_words = set(stopwords.words('english'))

# Perturbation: randomly reorder the words of every sentence. A single
# shuffle already yields a uniformly random order, so one call is enough.
post = []
for sentence in sam1:
    sen = sentence.split()
    random.shuffle(sen)
    post.append(' '.join(sen))

test_sam = pd.DataFrame({'post': post, 'label': sam2})
test_sam.to_csv("hxsam_test2.tsv", sep="\t", index=False)

# Globals read by Classifier and HateData.
model_choice = 8
label_idx = 1
MAX_SEQ_LEN = 128
text_idx = 0

tokenizer = AutoTokenizer.from_pretrained(model_list[model_choice])
# Load the pre-trained model checkpoint
model = Classifier()
model.load_state_dict(torch.load("mnt/saved_models/Hx_robert_tws.pth", map_location=device))
model = model.to(device)

# With data_path="" and split="", HateData reads "hxsam_test2.tsv" written above.
test_data = HateData(data_path="", split='', lang="hxsam_test2")
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

model.eval()
test_loss = []
test_pred = []

# Record the prediction results, one line per batch. The context manager
# closes the file even if evaluation fails part-way through.
with open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w") as wr:
    for entry in tqdm(test_loader, total=len(test_loader), position=0, leave=True):
        v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
        test_loss.append(v_loss)
        # Flatten per-batch arrays so test_pred is a plain 1-D list of labels.
        test_pred.extend(v_pred.tolist())
        wr.write(str(v_pred)+"\n")
test_loss = np.mean(test_loss)
print("Test Loss: ", test_loss)

# NOTE(review): df_test is not used below; kept in case other cells rely on it.
df_test = pd.read_csv("data/multilingual/test_"+"Hx"+".tsv", sep='\t', lineterminator='\n')
gt_labels = test_sam["label"]

print(classification_report(gt_labels, test_pred, digits=4))

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [05:47<00:00,  2.87it/s]

Test Loss:  0.634000144816935
              precision    recall  f1-score   support

           0     0.8533    0.7925    0.8217       477
           1     0.3245    0.3403    0.3322       144
           2     0.7759    0.8311    0.8025       379

    accuracy                         0.7420      1000
   macro avg     0.6512    0.6546    0.6522      1000
weighted avg     0.7478    0.7420    0.7440      1000




