## Libraies used in this IPYNB file 

In [1]:
import numpy as np
import pandas as pd

#torch lib
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch import optim

#basic lib
import sys
import random
import math
import time
from tqdm import tqdm

#sklearn Lib 
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

#transformer lib Autotokenizer
from transformers import BertTokenizer, AutoTokenizer
from transformers import BertModel, AutoModel, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter


#NLTK lib and pandas
import pandas as pd
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
import language_tool_python 
import nltk
nltk.download('stopwords')
nltk.download('wordnet')


stemmer = WordNetLemmatizer()
grammer = language_tool_python.LanguageTool('en-US')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# pre-requisite Function to test model

In [14]:
#Cuda memory
use_cuda = True if torch.cuda.is_available() else False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.autograd.set_detect_anomaly(True)
torch.backends.cudnn.benchmark = True
np.random.seed(0)
torch.manual_seed(0)

base_model = 'twitter-xlm-roberta-base-sentiment'

#model selection 
model_list = ['bert-base-uncased', 'bert-base-multilingual-uncased', 'google/muril-base-cased', 'xlm-roberta-base',
              'ai4bharat/indic-bert','cardiffnlp/twitter-xlm-roberta-base','cardiffnlp/twitter-xlm-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base', 'cardiffnlp/twitter-roberta-base-sentiment',
              'cardiffnlp/twitter-roberta-base-hate', 'roberta-base']

#model path 
model_path = 'mnt/saved_models/'

#result are saaved in this location 
results_path = 'mnt/saved_results/'

#Data augumentation 
class HateData(Dataset):
    def __init__(self, data_path, split='train', lang='bengali', aug_prob=0.2, flip_prob=0.5):
        self.split = split
        self.data = pd.read_csv(data_path + split + lang , sep=',', lineterminator='\n') 
        if self.split == 'train':
            self.label2data = {0:[], 1:[], 2:[]}
            for i in tqdm(range(len(self.data))):
                row = self.data.iloc[i]
                self.label2data[row[label_idx]].append(row[text_idx])
            self.aug_prob = aug_prob
            self.flip_prob = flip_prob

    def __len__(self):
        return len(self.data)

    
    def __getitem__(self, index):
        if torch.is_tensor(index):
            index = index.tolist()
        data = self.data.iloc[index]
        labels = data[label_idx]
        text = data[text_idx]
        inputs = tokenizer(text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        input_ids = inputs['input_ids']
        token_type_ids = np.zeros(MAX_SEQ_LEN)
        attn_mask = inputs['attention_mask']
        aug_text = text  
        labels_aug = labels
        
        if self.split == 'train' and labels == 1:
            if np.random.uniform() < self.aug_prob:
                aug_text = np.random.choice(self.label2data[0])
         
                if np.random.uniform() < self.flip_prob:
                    aug_text = aug_text + " [SEP] " + text
                else:
                    aug_text = text + " [SEP] " + aug_text 
            labels_aug = 1
      
        inputs_aug = tokenizer(aug_text, padding='max_length', truncation=True, max_length=MAX_SEQ_LEN)
        input_ids_aug = inputs_aug['input_ids']
        token_type_ids_aug = np.zeros(MAX_SEQ_LEN)
        attn_mask_aug = inputs_aug['attention_mask']

        input_ids = torch.tensor(np.vstack([input_ids, input_ids_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        token_type_ids = torch.tensor(np.vstack([token_type_ids, token_type_ids_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        attn_mask = torch.tensor(np.vstack([attn_mask, attn_mask_aug]), dtype=torch.long).view(2, MAX_SEQ_LEN)
        labels = torch.tensor(np.vstack([labels, labels_aug]), dtype=torch.long).view(2)

        return input_ids, attn_mask, token_type_ids, labels


#data classifier 
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        H1, H2, num_class = 768, 128, 2
        self.bert = AutoModel.from_pretrained(model_list[model_choice])
        self.clf = nn.Sequential(
            nn.Linear(H1, H2),
            nn.ReLU(),
            nn.Linear(H2, H2),
            nn.ReLU(),
            nn.Linear(H2, num_class)
        )
        
    def forward(self, input_ids, attn_mask, token_type_ids):  
        outputs = self.bert(input_ids, attn_mask)
        cls_emb = outputs.pooler_output 
        logits = self.clf(cls_emb)
        return logits
    
#evaluate fuction 
def evaluate(input_ids, attn_mask, token_type_ids, label, model, mode='train'):
   
    batch_size = input_ids.shape[0]
    seq_len = input_ids.shape[1]
    with torch.no_grad():
        if use_cuda:
            input_ids = input_ids.to(device)
            attn_mask = attn_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            label = label.to(device)
        
        logits = model(input_ids[:,0,:], attn_mask[:,0,:], token_type_ids[:,0,:])
        loss = loss_fn(logits, label[:,0])
        
        if mode == 'train':
            return float(loss.item())
        preds = torch.argmax(logits, dim=1).flatten()
        return float(loss.item()), preds.cpu().numpy()
loss_fn = nn.CrossEntropyLoss()


### Multilingual Model Testing

In [30]:
model_names=["all-language-bert-base-multilingual","all_language_xlm-roberta-base","all_language_twitter-xlm-roberta-base","all_language_twitter-xlm-roberta-base-sentiment"]
model_num=[1,3,5,6]
#Load pre trained Model
for j in range(4):
    model_choice=model_num[j]
    print(model_list[model_num[j]])
    print("\n")
    tokenizer = AutoTokenizer.from_pretrained(model_list[model_num[j]])
    model = Classifier()
    model.load_state_dict(torch.load("mnt/saved_models/"+model_names[j]+".pth", map_location=device))
    model = model.to(device)
    label_idx = 1
    MAX_SEQ_LEN = 128
    text_idx = 0
    lan=['ja','it','ro','ru']
    name=['japaness', 'italin','russian','romanian']
    for i in range(4):
        print("Language \n")
        print(name[i])
        print("\n")
        test_data = HateData(data_path="data/multilingual/2/"+lan[i]+"_test.tsv", split='', lang="")
        test_loader = DataLoader(test_data, batch_size=1, shuffle=False)
        model.eval()
        test_loss = []
        test_pred = []

        #Record the prediction result  
        wr = open(results_path + "test_prediction_" + base_model + "_" + "Hx" + ".txt", "w")    
        for entry in tqdm(test_loader, total=len(test_data)//test_loader.batch_size, position=0, leave=True):
            v_loss, v_pred = evaluate(entry[0], entry[1], entry[2], entry[3], model, mode='test')
            test_loss.append(v_loss)
            test_pred.append(v_pred)
            wr.write(str(v_pred)+"\n")
        test_loss = np.mean(test_loss)#.item()
        print("Test Loss: ", test_loss)
        wr.close()


        df_test = pd.read_csv("data/multilingual/2/"+lan[i]+"_test.tsv", sep=',', lineterminator='\n')
        gt_labels = df_test["label"]

        print(classification_report(gt_labels, test_pred, digits=4))

        print("Error Analysis for "+name[i])
        count=0
        for i in range(50):
            if gt_labels[i]!=test_pred[i]:
                print(df_test['post'][i])
                count=count+1
                if count>=5:
                    break


bert-base-multilingual-uncased


Language 

japaness




100%|██████████| 52/52 [00:17<00:00,  3.01it/s]


Test Loss:  0.4407794945336019
              precision    recall  f1-score   support

           0     0.8980    0.9778    0.9362        45
           1     0.6667    0.2857    0.4000         7

    accuracy                         0.8846        52
   macro avg     0.7823    0.6317    0.6681        52
weighted avg     0.8668    0.8846    0.8640        52

Error Analysis for japaness
フランスをイスラム化する極悪非道な計画を実行しているマクロン大統領が今何をしているのかがこれだ、水起きろ… @url
@user 彼がアラブ人に変装していようが、黒人がユダヤ人に変装していようが、私には関係ありません。しかし、そこには変装が… @url
真実 #イスラム教 #イスラム教徒 #モスク #検閲 #sos #ヨーロッパ #自由表現 #歌 #左翼… @url
まあ、いいえ、でもレベウスにも巨根はあります — 良い tg @url
@user 強制的に遅刻者はくそー
Language 

italin




100%|██████████| 52/52 [00:16<00:00,  3.23it/s]


Test Loss:  0.5404823472938285
              precision    recall  f1-score   support

           0     0.8776    0.9556    0.9149        45
           1     0.3333    0.1429    0.2000         7

    accuracy                         0.8462        52
   macro avg     0.6054    0.5492    0.5574        52
weighted avg     0.8043    0.8462    0.8187        52

Error Analysis for italin
ecco cosa sta facendo Macron nel momento in cui sta attuando un piano diabolico per islamizzare la Francia, svegliati... @url
@user se è travestito da arabo o nero da ebreo o altro, non mi interessa. ma ecco il travestimento… @url
verità #islam #musulmani #moschea #censura #sos #Europa #espressionedellalibertà #ong #sinistra… @url
beh no, ma anche i rebeus hanno un grosso cazzo — bravo tg @url
@usate tutte queste persone che rifiutano di cedere al terrorismo intellettuale che deve spaventare più di una persona.
Language 

russian




100%|██████████| 52/52 [00:16<00:00,  3.20it/s]


Test Loss:  0.44519931620631653
              precision    recall  f1-score   support

           0     0.9149    0.9556    0.9348        45
           1     0.6000    0.4286    0.5000         7

    accuracy                         0.8846        52
   macro avg     0.7574    0.6921    0.7174        52
weighted avg     0.8725    0.8846    0.8763        52

Error Analysis for russian
iată ce face Macron în momentul în care implementează un plan diabolic de islamizare a Franței, trezește-te miercuri... @url
truth #islam #musulmani #moschee #cenzură #sos #europe #freedomexpression #ong #leftist… @url
@user al naibii de retardații în vigoare
Marea Britanie: rugăciunea musulmană este recitată (în arabă) înainte de deschiderea ședinței consiliului municipal din... @url
- expulzarea imediată a migranților vinovați de furt viol...
Language 

romanian




100%|██████████| 52/52 [00:15<00:00,  3.27it/s]


Test Loss:  0.5024353414594841
              precision    recall  f1-score   support

           0     0.8776    0.9556    0.9149        45
           1     0.3333    0.1429    0.2000         7

    accuracy                         0.8462        52
   macro avg     0.6054    0.5492    0.5574        52
weighted avg     0.8043    0.8462    0.8187        52

Error Analysis for romanian
вот что делает Макрон в тот момент, когда реализует дьявольский план по исламизации Франции, проснись, ср… @url
истина #ислам #мусульмане #мечеть #цензура #сос #европа #свободавыражение #онг #левый… @url
ну нет, но у ребеуса тоже большой член — хорошо, tg @url
@используйте всех этих людей, которые отказываются поддаваться интеллектуальному терроризму, который должен напугать не одного человека.
@user черт возьми, какие дебилы в силе
xlm-roberta-base


Language 

japaness




100%|██████████| 52/52 [00:16<00:00,  3.23it/s]


Test Loss:  0.3052644353861419
              precision    recall  f1-score   support

           0     0.9149    0.9556    0.9348        45
           1     0.6000    0.4286    0.5000         7

    accuracy                         0.8846        52
   macro avg     0.7574    0.6921    0.7174        52
weighted avg     0.8725    0.8846    0.8763        52

Error Analysis for japaness
真実 #イスラム教 #イスラム教徒 #モスク #検閲 #sos #ヨーロッパ #自由表現 #歌 #左翼… @url
まあ、いいえ、でもレベウスにも巨根はあります — 良い tg @url
@user 強制的に遅刻者はくそー
彼女は汚いアラブ人だった @url
イギリス：市議会の開会前にイスラム教徒の祈りが（アラビア語で）唱えられる… @url
Language 

italin




100%|██████████| 52/52 [00:16<00:00,  3.17it/s]


Test Loss:  0.2896663761482789
              precision    recall  f1-score   support

           0     0.8936    0.9333    0.9130        45
           1     0.4000    0.2857    0.3333         7

    accuracy                         0.8462        52
   macro avg     0.6468    0.6095    0.6232        52
weighted avg     0.8272    0.8462    0.8350        52

Error Analysis for italin
ecco cosa sta facendo Macron nel momento in cui sta attuando un piano diabolico per islamizzare la Francia, svegliati... @url
dc ora dicono che @user o l'@user sono una lobby ebraica e sionista sapendo che pretende di essere tale… @url
verità #islam #musulmani #moschea #censura #sos #Europa #espressionedellalibertà #ong #sinistra… @url
beh no, ma anche i rebeus hanno un grosso cazzo — bravo tg @url
@user maledetti i ritardati in vigore
Language 

russian




100%|██████████| 52/52 [00:15<00:00,  3.31it/s]


Test Loss:  0.30872314548693025
              precision    recall  f1-score   support

           0     0.8913    0.9111    0.9011        45
           1     0.3333    0.2857    0.3077         7

    accuracy                         0.8269        52
   macro avg     0.6123    0.5984    0.6044        52
weighted avg     0.8162    0.8269    0.8212        52

Error Analysis for russian
iată ce face Macron în momentul în care implementează un plan diabolic de islamizare a Franței, trezește-te miercuri... @url
acesta este rezultatul raportului despre islamism care confundă islamul cu terorismul.
truth #islam #musulmani #moschee #cenzură #sos #europe #freedomexpression #ong #leftist… @url
ei bine, nu, dar și rebeus au un cocoș mare — bun tg @url
este chiar acolo te arăți sugând rebeusul nu-ți spun, dar îți spun că dacă aș fi în locul tău mi-ar fi rușine — m… @url
Language 

romanian




100%|██████████| 52/52 [00:15<00:00,  3.38it/s]


Test Loss:  0.31877099998438585
              precision    recall  f1-score   support

           0     0.8696    0.8889    0.8791        45
           1     0.1667    0.1429    0.1538         7

    accuracy                         0.7885        52
   macro avg     0.5181    0.5159    0.5165        52
weighted avg     0.7749    0.7885    0.7815        52

Error Analysis for romanian
Со мной что-то подобное происходит, я разгоняюсь как монгол и наезжаю на людей, я сошел с ума (я плачу на гонках в реальной жизни) @url
вот что делает Макрон в тот момент, когда реализует дьявольский план по исламизации Франции, проснись, ср… @url
это результат доклада об исламизме, который путает ислам и терроризм.
dc теперь говорят, что @user или @user являются еврейским и сионистским лобби, зная, что оно претендует на таковое… @url
истина #ислам #мусульмане #мечеть #цензура #сос #европа #свободавыражение #онг #левый… @url
cardiffnlp/twitter-xlm-roberta-base




Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Language 

japaness




100%|██████████| 52/52 [00:19<00:00,  2.65it/s]


Test Loss:  0.5535262277967726
              precision    recall  f1-score   support

           0     0.8958    0.9556    0.9247        45
           1     0.5000    0.2857    0.3636         7

    accuracy                         0.8654        52
   macro avg     0.6979    0.6206    0.6442        52
weighted avg     0.8425    0.8654    0.8492        52

Error Analysis for japaness
フランスをイスラム化する極悪非道な計画を実行しているマクロン大統領が今何をしているのかがこれだ、水起きろ… @url
まあ、いいえ、でもレベウスにも巨根はあります — 良い tg @url
@複数の人を怖がらせる知的テロリズムに屈することを拒否するこれらの人々をすべて利用してください。
@user 強制的に遅刻者はくそー
イギリス：市議会の開会前にイスラム教徒の祈りが（アラビア語で）唱えられる… @url
Language 

italin




100%|██████████| 52/52 [00:16<00:00,  3.07it/s]


Test Loss:  0.9690257287046944
              precision    recall  f1-score   support

           0     0.8571    0.9333    0.8936        45
           1     0.0000    0.0000    0.0000         7

    accuracy                         0.8077        52
   macro avg     0.4286    0.4667    0.4468        52
weighted avg     0.7418    0.8077    0.7733        52

Error Analysis for italin
ecco cosa sta facendo Macron nel momento in cui sta attuando un piano diabolico per islamizzare la Francia, svegliati... @url
@user se è travestito da arabo o nero da ebreo o altro, non mi interessa. ma ecco il travestimento… @url
verità #islam #musulmani #moschea #censura #sos #Europa #espressionedellalibertà #ong #sinistra… @url
beh no, ma anche i rebeus hanno un grosso cazzo — bravo tg @url
@usate tutte queste persone che rifiutano di cedere al terrorismo intellettuale che deve spaventare più di una persona.
Language 

russian




100%|██████████| 52/52 [00:16<00:00,  3.13it/s]


Test Loss:  0.8372328490636741
              precision    recall  f1-score   support

           0     0.8627    0.9778    0.9167        45
           1     0.0000    0.0000    0.0000         7

    accuracy                         0.8462        52
   macro avg     0.4314    0.4889    0.4583        52
weighted avg     0.7466    0.8462    0.7933        52

Error Analysis for russian
iată ce face Macron în momentul în care implementează un plan diabolic de islamizare a Franței, trezește-te miercuri... @url
@utilizator indiferent dacă este deghizat în arab sau negru ca evreu sau orice altceva, nu-mi pasă. dar acolo deghizarea... @url
ei bine, nu, dar și rebeus au un cocoș mare — bun tg @url
@user al naibii de retardații în vigoare
Marea Britanie: rugăciunea musulmană este recitată (în arabă) înainte de deschiderea ședinței consiliului municipal din... @url
Language 

romanian




100%|██████████| 52/52 [00:16<00:00,  3.15it/s]


Test Loss:  0.7660414250567555
              precision    recall  f1-score   support

           0     0.8776    0.9556    0.9149        45
           1     0.3333    0.1429    0.2000         7

    accuracy                         0.8462        52
   macro avg     0.6054    0.5492    0.5574        52
weighted avg     0.8043    0.8462    0.8187        52

Error Analysis for romanian
вот что делает Макрон в тот момент, когда реализует дьявольский план по исламизации Франции, проснись, ср… @url
@user, меня не волнует, замаскирован ли он под араба или чернокожего под еврея или что-то в этом роде. но там маскировка… @url
ну нет, но у ребеуса тоже большой член — хорошо, tg @url
@используйте всех этих людей, которые отказываются поддаваться интеллектуальному терроризму, который должен напугать не одного человека.
@user черт возьми, какие дебилы в силе
cardiffnlp/twitter-xlm-roberta-base-sentiment




Some weights of XLMRobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Language 

japaness




100%|██████████| 52/52 [00:17<00:00,  3.02it/s]


Test Loss:  0.49390753698893464
              precision    recall  f1-score   support

           0     0.8776    0.9556    0.9149        45
           1     0.3333    0.1429    0.2000         7

    accuracy                         0.8462        52
   macro avg     0.6054    0.5492    0.5574        52
weighted avg     0.8043    0.8462    0.8187        52

Error Analysis for japaness
フランスをイスラム化する極悪非道な計画を実行しているマクロン大統領が今何をしているのかがこれだ、水起きろ… @url
@user 彼がアラブ人に変装していようが、黒人がユダヤ人に変装していようが、私には関係ありません。しかし、そこには変装が… @url
まあ、いいえ、でもレベウスにも巨根はあります — 良い tg @url
@複数の人を怖がらせる知的テロリズムに屈することを拒否するこれらの人々をすべて利用してください。
@user 強制的に遅刻者はくそー
Language 

italin




100%|██████████| 52/52 [00:17<00:00,  2.95it/s]


Test Loss:  0.41751978711153453
              precision    recall  f1-score   support

           0     0.8800    0.9778    0.9263        45
           1     0.5000    0.1429    0.2222         7

    accuracy                         0.8654        52
   macro avg     0.6900    0.5603    0.5743        52
weighted avg     0.8288    0.8654    0.8315        52

Error Analysis for italin
ecco cosa sta facendo Macron nel momento in cui sta attuando un piano diabolico per islamizzare la Francia, svegliati... @url
@user se è travestito da arabo o nero da ebreo o altro, non mi interessa. ma ecco il travestimento… @url
@usate tutte queste persone che rifiutano di cedere al terrorismo intellettuale che deve spaventare più di una persona.
@user maledetti i ritardati in vigore
Gran Bretagna: si recita la preghiera musulmana (in arabo) prima dell'apertura della seduta del consiglio comunale di… @url
Language 

russian




100%|██████████| 52/52 [00:17<00:00,  2.91it/s]


Test Loss:  0.6571767923350518
              precision    recall  f1-score   support

           0     0.8636    0.8444    0.8539        45
           1     0.1250    0.1429    0.1333         7

    accuracy                         0.7500        52
   macro avg     0.4943    0.4937    0.4936        52
weighted avg     0.7642    0.7500    0.7569        52

Error Analysis for russian
opozitia nu are program si se opreste pe dezbateri false matar ba ministru @url
iată ce face Macron în momentul în care implementează un plan diabolic de islamizare a Franței, trezește-te miercuri... @url
@utilizator indiferent dacă este deghizat în arab sau negru ca evreu sau orice altceva, nu-mi pasă. dar acolo deghizarea... @url
rt @user istoria Quebecului merită să-i acordăm atenție. atât de trist încât cu greu o mai predăm. text foarte interesant
@folosește pe toți acești oameni care refuză să cedeze terorismului intelectual care trebuie să sperie mai mult de o persoană.
Language 

romanian




100%|██████████| 52/52 [00:17<00:00,  2.95it/s]

Test Loss:  0.4352676968734998
              precision    recall  f1-score   support

           0     0.8958    0.9556    0.9247        45
           1     0.5000    0.2857    0.3636         7

    accuracy                         0.8654        52
   macro avg     0.6979    0.6206    0.6442        52
weighted avg     0.8425    0.8654    0.8492        52

Error Analysis for romanian
вот что делает Макрон в тот момент, когда реализует дьявольский план по исламизации Франции, проснись, ср… @url
@user, меня не волнует, замаскирован ли он под араба или чернокожего под еврея или что-то в этом роде. но там маскировка… @url
ну нет, но у ребеуса тоже большой член — хорошо, tg @url
@используйте всех этих людей, которые отказываются поддаваться интеллектуальному терроризму, который должен напугать не одного человека.
она была грязной арабкой @url



