<a href="https://colab.research.google.com/github/vlavrent/Multilingual-Hate-Speech-Detection/blob/main/Multilingual_Hate_Speech_with_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [None]:
pip install sentence-transformers

In [None]:
pip install sentencepiece

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch

# Pick the torch device used by the rest of the notebook: CUDA when
# available, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from torchsampler import ImbalancedDatasetSampler
import torch
import random
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.model_selection import KFold
import torch, gc
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score,classification_report
from transformers import RobertaTokenizer, RobertaModel, CamembertTokenizer, CamembertModel
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.model_selection import train_test_split



# Fetch the NLTK stop-word corpus read by the remove_stopwords methods below.
nltk.download('stopwords')

In [None]:
class Polish():
    """Load and clean the Polish dataset stored as two parallel text files.

    Each line of the text file is one sample and the line with the same
    index in the tag file is its integer label.
    """

    # Two or more consecutive spaces, collapsed to one by double_space().
    _SPACE_RUN = re.compile(r' {2,}')

    def __init__(self,data_path,tag_path):
        self.path = data_path
        self.tag_path = tag_path
        self.data = self.read_data()
        self.tag = self.read_tag()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Return a one-column DataFrame (column 0) with one row per text line."""
        # The context manager closes the handle; the original leaked it.
        with open(self.path, "r", encoding="utf8") as handle:
            return pd.DataFrame(list(handle))

    def read_tag(self):
        """Return a one-column DataFrame (column 0) with one row per tag line."""
        with open(self.tag_path, 'r') as handle:
            return pd.DataFrame(list(handle))

    def remove_punctuation(self,data,column):
        """Return data[column] with every non-word, non-space char replaced by a space."""
        return data[column].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def rename_columns(self,data,column):
        """Return `data` with its positional column 0 renamed to `column`."""
        return data.rename(columns={0:column})

    def remove_url(self):
        """Strip every `http...` token from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'http\S+', '',x))

    def remove_end_line(self,data,column):
        """Return data[column] with newline characters removed."""
        return data[column].str.replace('\n','')

    def concat(self):
        """Attach the label column to the text frame (index-aligned) and return it."""
        self.data[self.label] = self.tag[self.label]
        return self.data

    def convert_int(self):
        """Return the label column converted to int."""
        return self.tag[self.label].apply(lambda x: int(x))

    def double_space(self):
        """Collapse runs of spaces in the text column to a single space, in place.

        The original computed the replacement but discarded the result, so
        the step had no effect. It also only handled exact double spaces,
        while punctuation stripping can leave longer runs.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: self._SPACE_RUN.sub(' ', x))

    def clean_data(self):
        """Run the full cleaning pipeline and return a frame with text and label columns."""
        text_column = self.column
        label_column = self.label

        # Rename columns in both label and text data
        self.data = self.rename_columns(self.data,text_column)
        self.tag = self.rename_columns(self.tag, label_column)

        # Remove url
        self.remove_url()

        # Remove end line character from label and text data
        self.data[text_column] = self.remove_end_line(self.data,text_column)
        self.tag[label_column] = self.remove_end_line(self.tag,label_column)

        # Remove Punctuation
        self.data[text_column] = self.remove_punctuation(self.data,text_column)

        # Lower words in text data
        self.data[text_column] = self.lower()

        # Collapse repeated spaces left behind by the previous steps
        self.double_space()

        # Convert label to int
        self.tag[label_column] = self.convert_int()

        # Concat text and labels
        return self.concat()

In [None]:
class Slovenian():
    """Load and clean the Slovenian CSV dataset (columns `text` and `type`)."""

    # Two or more consecutive spaces, collapsed to one by double_space().
    _SPACE_RUN = re.compile(r' {2,}')

    def __init__(self,path):
        self.path = path
        self.data = self.read_data()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Read the CSV at self.path into a DataFrame."""
        return pd.read_csv(self.path)

    def rename_columns(self):
        """Keep only `text` and `type`, and return the frame with `type` renamed to the label column."""
        self.data = self.data[['text','type']]
        return self.data.rename(columns={'type':self.label})

    def remove_url(self):
        """Strip every `http...` token from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'http\S+', '',x))

    def strip_punctuation(self):
        """Return the text column with non-word, non-space chars replaced by a space."""
        return self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def remove_stopwords(self, data, column):
        """Return data[column] with NLTK Slovene stop words removed.

        The stop-word list is loaded once into a set; the original re-read
        it from the corpus for every single token.
        """
        stop = set(stopwords.words("slovene"))
        return data[column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stop))

    def change_labels(self,x):
        """Map the dataset's fine-grained label names onto 'offensive' / 'acceptable'."""
        change = {'Background offensive':'offensive', 'Acceptable speech':'acceptable', 'Background violence':'offensive',
                  'Other offensive':'offensive', 'Inappropriate':'offensive', 'Other violence':'offensive'}

        for k, v in change.items():
            x = x.replace(k, v)
        return x

    def convert_labels(self):
        """Apply change_labels to the label column in place."""
        self.data[self.label] = self.data[self.label].apply(self.change_labels)

    def binarize_labels(self):
        """Encode the label column as 1 for 'offensive' and 0 for anything else."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 1 if x=='offensive' else 0)

    def double_space(self):
        """Collapse runs of spaces in the text column to a single space, in place.

        The original discarded the result of `.apply`, so nothing changed.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: self._SPACE_RUN.sub(' ', x))

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned frame."""
        # Rename Columns
        self.data = self.rename_columns()

        # Remove url
        self.remove_url()

        # Strip Punctuation
        self.data[self.column] = self.strip_punctuation()

        # Lowercase
        self.data[self.column] = self.lower()

        # Remove stopwords
        self.data[self.column] = self.remove_stopwords(self.data,self.column)

        # Convert labels
        self.convert_labels()

        # Remove double space
        self.double_space()

        # Binarize labels
        self.binarize_labels()

        return self.data

In [None]:
class Croatian():
    """Load and clean the Croatian CSV dataset (columns `text` and `type`)."""

    # Two or more consecutive spaces, collapsed to one by double_space().
    _SPACE_RUN = re.compile(r' {2,}')

    def __init__(self,path):
        self.path = path
        self.data = self.read_data()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Read the CSV at self.path into a DataFrame."""
        return pd.read_csv(self.path)

    def rename_columns(self):
        """Keep only `text` and `type`, and return the frame with `type` renamed to the label column."""
        self.data = self.data[['text','type']]
        return self.data.rename(columns={'type':self.label})

    def remove_url(self):
        """Strip every `http...` token from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'http\S+', '',x))

    def strip_punctuation(self):
        """Return the text column with non-word, non-space chars replaced by a space."""
        return self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def remove_stopwords(self, data, column):
        """Return data[column] with stop words removed.

        Uses the NLTK "slovene" list, exactly as the original code did.
        The list is loaded once into a set instead of once per token.
        """
        stop = set(stopwords.words("slovene"))
        return data[column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stop))

    def change_labels(self,x):
        """Map the dataset's fine-grained label names onto 'offensive' / 'not offensive'."""
        change = {'Background offensive':'offensive', 'Acceptable speech':'not offensive', 'Background violence':'offensive',
                  'Other offensive':'offensive', 'Inappropriate':'offensive', 'Other violence':'offensive'}

        for k, v in change.items():
            x = x.replace(k, v)
        return x

    def convert_labels(self):
        """Normalise labels, then drop rows whose label is neither of the two classes."""
        self.data[self.label] = self.data[self.label].apply(self.change_labels)

        hate = ['offensive','not offensive']

        # .copy() so the later in-place label assignment works on an
        # independent frame rather than on a slice of the unfiltered one.
        self.data = self.data[self.data[self.label].isin(hate)].copy()

    def binarize_labels(self):
        """Encode the label column as 1 for 'offensive' and 0 otherwise."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 1 if x=='offensive' else 0)

    def double_space(self):
        """Collapse runs of spaces in the text column to a single space, in place.

        The original discarded the result of `.apply`, so nothing changed.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: self._SPACE_RUN.sub(' ', x))

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned frame."""
        # Rename Columns
        self.data = self.rename_columns()

        # Remove url
        self.remove_url()

        # Strip Punctuation
        self.data[self.column] = self.strip_punctuation()

        # Lowercase
        self.data[self.column] = self.lower()

        # Remove stopwords
        self.data[self.column] = self.remove_stopwords(self.data,self.column)

        # Remove double space
        self.double_space()

        # Normalise and filter labels, then binarize them
        self.convert_labels()

        self.binarize_labels()

        return self.data

In [None]:
from nltk.corpus import stopwords
class Greek():
    """Load and clean the Greek CSV dataset (columns `tweet` and `subtask_a`)."""

    # Two or more consecutive spaces, collapsed to one by double_space().
    _SPACE_RUN = re.compile(r' {2,}')

    def __init__(self,path):
        self.path  = path
        self.data = self.read_data()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Read the CSV and coerce the tweet and label columns to str."""
        self.data = pd.read_csv(self.path)
        self.data['tweet'] = self.data['tweet'].apply(str)
        self.data['subtask_a'] = self.data['subtask_a'].apply(str)
        return self.data

    def rename_columns(self):
        """Return the frame with `tweet`/`subtask_a` renamed to the text/label columns."""
        return self.data.rename(columns={'tweet':self.column,'subtask_a':self.label})

    def replace_hashtag(self):
        """Remove `#tag` tokens (and any literal 'hashtag' substring) from the text column.

        Each `#tag` is first rewritten to 'hashtag ' and then every
        'hashtag' is deleted, which leaves one space per removed tag.
        """
        # Raw string: "\w" in a plain literal is an invalid escape sequence.
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r"#[\w]+", "hashtag ", x))
        self.data[self.column] = self.data[self.column].apply(lambda x: x.replace('hashtag',''))
        return self.data[self.column]

    def remove_url(self):
        """Strip every `http...` token from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'http\S+', '',x))

    def strip_punctuation(self):
        """Return the text column with non-word, non-space chars replaced by a space."""
        return self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def remove_stopwords(self, data, column):
        """Return data[column] with NLTK Greek stop words removed.

        The stop-word list is loaded once into a set; the original re-read
        it from the corpus for every single token.
        """
        stop = set(stopwords.words("greek"))
        return data[column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stop))

    def binarize_labels(self):
        """Encode the label column as 0 for 'NOT' and 1 for anything else."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 0 if x=='NOT' else 1)

    def double_space(self):
        """Collapse runs of spaces in the text column to a single space, in place.

        The original discarded the result of `.apply`, so nothing changed.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: self._SPACE_RUN.sub(' ', x))

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned frame."""
        text_column = self.column

        # Rename Columns
        self.data = self.rename_columns()

        # Replace hashtag
        self.data[text_column] = self.replace_hashtag()

        # Remove urls
        self.remove_url()

        # Strip Punctuation
        self.data[text_column] = self.strip_punctuation()

        # Lower text
        self.data[text_column] = self.lower()

        # Remove Stopwords
        self.data[text_column] = self.remove_stopwords(self.data,text_column)

        # Remove double space
        self.double_space()

        # Binarize labels
        self.binarize_labels()

        return self.data

In [None]:
class English():
    """Load and clean the English CSV dataset (expects `text` and `label` columns)."""

    # Two or more consecutive spaces, collapsed to one by double_space().
    _SPACE_RUN = re.compile(r' {2,}')
    # Stand-alone "rt" token (retweet marker); never matches inside a word.
    _RETWEET = re.compile(r'\brt\b')

    def __init__(self,path):
        self.data = self.read(path)
        self.label = 'label'
        self.column = 'text'

    def read(self,path):
        """Read the CSV at `path` into a DataFrame."""
        return pd.read_csv(path)

    def replace_mentions(self):
        """Replace @user handles in the text column with the token 'mention'."""
        self.data[self.column] = self.data[self.column].apply(lambda row: re.sub("@[A-Za-z0-9]+_*[A-Za-z0-9]+|@[_]+[A-Za-z0-9]+_*[A-Za-z0-9]+|@[_]+[A-Za-z0-9]+[_]+_*[A-Za-z0-9]+[_]+", "mention", row))
        self.data[self.column] = self.data[self.column].apply(lambda row: re.sub("mention_", "mention", row))

    def remove_end_line(self):
        """Return the text column with newline characters removed."""
        return self.data[self.column].str.replace('\n','')

    def remove_punctuation(self):
        """Delete every non-word, non-space character from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    def replace_hashtag(self):
        """Remove `#tag` tokens (and any literal 'hashtag' substring) from the text column."""
        # Raw string: "\w" in a plain literal is an invalid escape sequence.
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r"#[\w]+", "hashtag ", x))
        self.data[self.column] = self.data[self.column].apply(lambda x: x.replace('hashtag',''))

    def remove_stopwords(self):
        """Drop NLTK English stop words from the text column in place.

        This step runs before lower-casing in clean_data, so tokens are
        compared case-insensitively; otherwise "The" or "And" would slip
        through. The stop-word list is loaded once into a set instead of
        once per token.
        """
        stop = set(stopwords.words("english"))
        self.data[self.column] = self.data[self.column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok.lower() not in stop))

    def remove_url(self):
        """Strip URLs from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%|\#)*\b', '',x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def double_space(self):
        """Return the text column with runs of spaces collapsed to a single space."""
        return self.data[self.column].apply(lambda x: self._SPACE_RUN.sub(' ', x))

    def remove_noise(self):
        """Return the text column with stand-alone 'rt' tokens removed.

        The original used a plain substring replace, which also deleted
        "rt" inside ordinary words ("party" became "pay").
        """
        return self.data[self.column].apply(lambda x: self._RETWEET.sub('', x))

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned frame."""
        # Remove urls
        self.remove_url()

        # Remove end-line
        self.data[self.column] = self.remove_end_line()

        # Replace mentions
        self.replace_mentions()

        # Replace hashtags
        self.replace_hashtag()

        # Remove punctuation
        self.remove_punctuation()

        # Remove stopwords
        self.remove_stopwords()

        # Lower text
        self.data[self.column] = self.lower()

        # Remove double space
        self.data[self.column] = self.double_space()

        # Remove noise
        self.data[self.column] = self.remove_noise()

        return self.data

In [None]:
class Transform_Data():
    """Tokenize a train/test split and wrap both parts in DataLoaders.

    `cmodel` is a tokenizer class exposing `from_pretrained`; `column` and
    `label` name the text and label columns of the `train` and `test` frames.
    """

    def __init__(self,train,test,language_model,column,label,cmodel):
        self.train = train
        self.test = test
        self.column = column
        self.label = label
        self.max_length = 60
        self.tokenizer = cmodel.from_pretrained(language_model)

    def _encode(self, frame):
        """Tokenize one frame; return (encodings, label tensor)."""
        texts = frame[self.column].tolist()
        encodings = self.tokenizer.batch_encode_plus(
            texts,
            add_special_tokens = True,
            truncation=True,
            padding=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        targets = torch.tensor(frame[self.label].tolist())
        return encodings, targets

    def Tokenizer(self):
        """Tokenize the train and test frames, train first."""
        train_encodings, train_y = self._encode(self.train)
        val_encodings, val_y = self._encode(self.test)
        return train_encodings, train_y, val_encodings, val_y

    def Tensors_and_DataLoaders(self,train_encodings,train_y,val_encodings, val_y):
        """Build a class-weighted training loader and a sequential validation loader."""
        # Training data: every sample is weighted by total / (size of its class)
        # so the weighted sampler draws the two classes at a similar rate.
        train_data = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_y)
        class_samples = [(train_y == 0.).sum(dim=0),(train_y == 1.).sum(dim=0)]
        total_samples = sum(class_samples)
        class_weights = [total_samples/count for count in class_samples]

        n_draws = int(total_samples)
        weights = [class_weights[target] for target in train_y[:n_draws]]
        train_sampler = WeightedRandomSampler(torch.DoubleTensor(weights), n_draws)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

        # Validation data is read in its original order.
        val_data = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_y)
        val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=32)

        return train_dataloader, val_dataloader

    def Execute(self):
        """Tokenize, build the loaders and return everything in one tuple."""
        encoded = self.Tokenizer()
        train_dataloader, val_dataloader = self.Tensors_and_DataLoaders(*encoded)
        return (*encoded, train_dataloader, val_dataloader)


In [None]:
import torch.nn as nn

class Model_Arch(nn.Module):
    """Binary classifier: a pretrained encoder followed by one linear unit and a sigmoid.

    `ROBERTA` is the encoder module. Its second output (taken with
    `return_dict=False`) is fed to the linear layer. When `freeze_bert`
    is truthy, all encoder parameters are excluded from gradient updates.
    """

    def __init__(self, ROBERTA,freeze_bert):
        super().__init__()

        self.bert = ROBERTA
        self.freeze_bert = freeze_bert

        # Created but not applied in forward(); kept so the module keeps
        # the same attributes.
        self.dropout = nn.Dropout(0.3)
        self.relu =  nn.ReLU()

        # Single output unit on top of the encoder's hidden size.
        self.fc1 = nn.Linear(self.bert.config.hidden_size,1)

        # Maps the linear output into (0, 1).
        self.sigmoid =  nn.Sigmoid()

        if freeze_bert:
            for encoder_param in self.bert.parameters():
                encoder_param.requires_grad = False

    def forward(self, sent_id, mask):
        """Return sigmoid outputs for a batch of token ids and attention masks."""
        encoder_out = self.bert(sent_id, attention_mask=mask,return_dict=False)
        _, cls_hs = encoder_out
        return self.sigmoid(self.fc1(cls_hs))

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import get_scheduler


def Optimizer_Scheduler(model,train_dataloader,num_epochs):
  """Create the AdamW optimizer and a linear schedule without warm-up.

  Returns (optimizer, lr_scheduler, num_training_steps), where the step
  count is one optimizer step per batch per epoch.
  """
  steps_per_epoch = len(train_dataloader)
  num_training_steps = num_epochs * steps_per_epoch

  optimizer = AdamW(model.parameters(), lr=5e-5) #weight_decay=0.02
  schedule_kwargs = dict(
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps,
  )
  lr_scheduler = get_linear_schedule_with_warmup(**schedule_kwargs)

  return optimizer, lr_scheduler, num_training_steps

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

def Mean(data):
    """Return the arithmetic mean of a sized collection of numbers."""
    total = sum(data)
    count = len(data)
    return total / count

def train(model, num_epochs, train_dataloader,val_dataloader,best_valid_loss,language,lanmodel):
  """Train `model` for `num_epochs`, validating and checkpointing after every epoch.

  Args:
    model: module called as `model(sent_id, mask)`; its output is rounded to
      obtain predictions and passed to `nn.BCELoss`.
    num_epochs: number of passes over `train_dataloader`.
    train_dataloader: yields batches that unpack to (sent_id, mask, labels).
    val_dataloader: passed unchanged to `validate` after every epoch.
    best_valid_loss: validation loss an epoch must beat for its weights to
      be saved.
    language, lanmodel: strings used only to build the checkpoint file name.

  Prints one table row per epoch plus a final row of per-epoch averages.
  Nothing is returned. Relies on the module-level `device`, `validate`,
  `Class_F1_score` and `Optimizer_Scheduler`.
  """

  # Optimizer and Scheduler
  optimizer, lr_scheduler, num_training_steps = Optimizer_Scheduler(model,train_dataloader,num_epochs)

  progress_bar = tqdm(range(num_training_steps))
  loss_fn = nn.BCELoss()
  #loss_fn = nn.CrossEntropyLoss()


  # Per-epoch history. NOTE: the *_acc lists hold F1 scores, not accuracy.
  train_acc = []
  val_acc = [] 
  train_loss = []  
  val_loss = []
  avg_acc_0 = []
  avg_acc_1 = []
  total_avg_loss_train = []
  total_avg_loss_val = []

  # True until the table header has been printed once
  flag = True
  
  for epoch in range(num_epochs):
    
    total_loss, batch_loss, batch_counts = 0, 0, 0

    # Rounded predictions and true labels collected over the whole epoch
    predictions = []
    real_label = []

    model.train()
    for step, batch in enumerate(train_dataloader):
      batch_counts += 1
      
      # move every tensor of the batch to the selected device
      batch = [r.to(device) for r in batch]

      sent_id, mask, labels = batch
      real_label.append(labels.detach().cpu().numpy())
      
      
      # clear previous gradients
      model.zero_grad()
       
      # predictions for current batch
      output = model(sent_id, mask)

      # round the model output to the nearest integer to get a hard prediction
      pred = torch.round(output)
      #pred = torch.argmax(output,1)

      pred = pred.detach().cpu().numpy()
      pred = pred.flatten()
      predictions.append(pred)

      
      
      # compute loss for current batch; labels get a trailing axis and a
      # float dtype before being compared with the model output
      loss = loss_fn(output, labels.unsqueeze(1).float())
      #loss = loss_fn(output, labels)

      # accumulate the epoch's total loss
      total_loss = total_loss + loss.item()


      # backpropagation to calculate gradients
      loss.backward()

      # prevent the exploding gradient problem: clip the gradient norm to 1.0
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)


      # update parameters
      optimizer.step()

      # update scheduler 
      lr_scheduler.step()

      # clear optimizer gradients
      optimizer.zero_grad()

      progress_bar.update(1)

    # average training loss per batch for this epoch
    avg_loss = total_loss / len(train_dataloader)

    # flatten labels and predictions
    flat_label = np.concatenate(real_label).astype(int).ravel().tolist()
    
    flat_predictions = np.concatenate(predictions).astype(int).ravel().tolist()

    # Record this epoch's training F1 score and average loss
    train_acc.append(f1_score(flat_label,flat_predictions))
    train_loss.append(avg_loss)

    # Validation
    val_flat_label,val_flat_predictions, val_avg_loss = validate(model,val_dataloader,loss_fn)

    # Save the weights whenever the validation loss improves on the best so far
    if val_avg_loss < best_valid_loss:
        best_valid_loss = val_avg_loss
        torch.save(model.state_dict(), '/content/drive/My Drive/Datasets/All/'+lanmodel+'_'+language+'_weights.pt')     
    
    
    # Record this epoch's validation F1 score and average loss
    val_acc.append(f1_score(val_flat_label,val_flat_predictions))
    val_loss.append(val_avg_loss)

    # Per-class F1 score on the validation set
    acc_0,acc_1 = Class_F1_score(val_flat_label,val_flat_predictions)
    avg_acc_0.append(acc_0)
    avg_acc_1.append(acc_1)

    # Print table with insights (header only once)
    if flag:
      print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Train F1-score':^9} | {'Val F1-score':^9} | {'Class_0 F1-score':^10} | {'Class_1 F1-score':^10} ")
      flag = False

    print("-"*85)
    print(f"{str(epoch + 1) +'/'+ str(num_epochs):^7} | {avg_loss:^12.6f} | {val_avg_loss:^10.6f} | {f1_score(flat_label,flat_predictions):^9.2f} | {f1_score(val_flat_label,val_flat_predictions):^9.2f} | {acc_0:^10.2f} | {acc_1:^10.2f} ")
  
  

  # Print the mean of every tracked metric over all epochs
  print("-"*85)
  print(f"{'Average':^7} | {Mean(train_loss):^12.6f} | {Mean(val_loss):^10.6f} | {Mean(train_acc):^9.2f} | {Mean(val_acc):^9.2f} | {Mean(avg_acc_0):^10.2f} | {Mean(avg_acc_1):^10.2f} ")

  




In [None]:
def validate(model,val_dataloader,loss_fn):
    """Evaluate `model` on `val_dataloader`.

    Returns (flat_label, flat_predictions, avg_loss): the true labels and
    the rounded model outputs as flat int lists, and the mean batch loss.
    Also prints the total and average loss. Uses the module-level `device`.
    """
    model.eval()

    running_loss = 0
    label_chunks = []
    pred_chunks = []

    for batch in val_dataloader:
        sent_id, mask, labels = [t.to(device) for t in batch]
        label_chunks.append(labels.detach().cpu().numpy())

        # Forward pass only; gradients are not needed for evaluation.
        with torch.no_grad():
            output = model(sent_id,mask)

        batch_loss = loss_fn(output,labels.unsqueeze(1).float())
        running_loss = running_loss + batch_loss.item()

        rounded = torch.round(output).detach().cpu().numpy()
        pred_chunks.append(rounded.flatten())

    # mean loss per validation batch
    avg_loss = running_loss / len(val_dataloader)
    print('Total Loss: '+str(running_loss))
    print('Avg Loss: '+str(avg_loss))

    flat_label = np.concatenate(label_chunks).astype(int).ravel().tolist()
    flat_predictions = np.concatenate(pred_chunks).astype(int).ravel().tolist()

    return flat_label,flat_predictions, avg_loss

In [None]:
def Class_F1_score(val_y,new_preds):
  """Return the F1 score of class 0 and of class 1 as a tuple.

  Both classes are requested explicitly through `labels=[0, 1]`. Without
  it, a class that appears in neither `val_y` nor `new_preds` is missing
  from the report and the lookup below raised KeyError. `zero_division=0`
  makes such a class score 0.0 without emitting a warning.
  """
  report = classification_report(
      val_y,
      new_preds,
      labels=[0, 1],
      output_dict=True,
      zero_division=0,
  )

  return report['0']['f1-score'], report['1']['f1-score']

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import random
from past.builtins import xrange

def kMedoids(D, k, tmax=100):
    """Cluster points into `k` groups around medoids, given a distance matrix.

    Args:
        D: square 2-D array of pairwise distances.
        k: number of medoids (clusters).
        tmax: maximum number of update iterations.

    Returns:
        (M, C): `M` is an array of `k` medoid indices, and `C` maps each
        cluster number to the array of point indices assigned to it.

    Raises:
        ValueError: if `k` exceeds the number of points, or the number of
            distinct points once exact duplicates (distance 0) are removed.

    Initial medoids are drawn with `np.random`, so seed it for reproducibility.
    """
    # determine dimensions of distance matrix D
    m, n = D.shape

    if k > n:
        raise ValueError('too many medoids')

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set()
    rs, cs = np.where(D == 0)
    # the rows, cols must be shuffled because we will keep the first duplicate below
    index_shuf = list(range(len(rs)))
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise ValueError('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    # Built-in range replaces past.builtins.xrange (a Python 2 shim).
    for _ in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            members = C[kappa]
            if members.size == 0:
                # An empty cluster has no candidate medoid; keep the old
                # one instead of crashing in argmin on an empty array.
                continue
            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(mean_dist)]
        # The original called np.sort(Mnew) here and discarded the result.
        # The call was a no-op and is dropped, so medoid order still
        # matches the cluster numbering in C.
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # final update of cluster memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C

def balance(data,sample):
      """Pick `sample` representative rows of `data` via k-medoids on sentence embeddings.

      The `text` column is embedded with a SentenceTransformer model, the
      pairwise cosine distances are clustered into `sample` clusters, and
      the rows at the medoid positions are returned.
      """
      encoder = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
      texts = data['text'].to_numpy()
      embeddings = encoder.encode(texts)

      distances = pairwise_distances(embeddings, metric='cosine')

      # Only the medoid positions are needed; cluster membership is ignored.
      medoid_positions, _ = kMedoids(distances, sample)
      return data.iloc[medoid_positions]



def execute(data,sample):
    """Return `balance(data, sample)`; a thin alias with no logic of its own."""
   
    return balance(data,sample)

In [None]:
import pandas as pd

def find_length(language):
  """Return the row count of the smallest training set among `language`.

  Args:
    language: iterable of language names (matched case-insensitively).
      Names without a size rule here, such as 'english', are ignored.

  Returns:
    The smallest row count, or `float('inf')` when no listed language
    contributes one. The original crashed in `min([])` in that case,
    which made an English-only run impossible.
  """
  sizes = []

  for value in language:
    name = value.lower()

    if name == 'slovenian':
      # Slovenian
      text_path = "/content/drive/My Drive/Datasets/Slovenian/Slovene_train_set.csv"
      sizes.append(len(pd.read_csv(text_path).index))

    elif name == 'polish':
      # Polish: one tag per line with no header row. Count the lines
      # directly; pd.read_csv used the first tag as a header and so
      # reported one row too few. The handle is now closed as well.
      tag_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_tags.txt"
      with open(tag_path, "r", encoding="utf8") as handle:
        sizes.append(sum(1 for _ in handle))

    elif name == 'croatian':
      # Croatian
      text_path = "/content/drive/My Drive/Datasets/Croatian/Croatian_train_set.csv"
      sizes.append(len(pd.read_csv(text_path).index))

    elif name == 'greek':
      # Greek
      text_path = "/content/drive/My Drive/Datasets/Greek/offenseval-gr_train.csv"
      sizes.append(len(pd.read_csv(text_path).index))

  # No size recorded means there is nothing to cap against.
  return min(sizes) if sizes else float('inf')
 


def preprocess_language(language,length,number_language):
  """Load and clean the training data of one language.

  Args:
    language: language name, matched case-insensitively.
    length: target size. When more than one language is trained and the
      cleaned Polish, Croatian or Greek set is larger than this, it is
      reduced to `length` rows with `execute`.
    number_language: how many languages are being combined.

  Returns:
    The cleaned DataFrame for that language.

  Raises:
    ValueError: for an unsupported language (the original failed with
      UnboundLocalError on the final `return data`).
  """
  name = language.lower()

  if name == 'slovenian':
    # Slovenian
    # NOTE(review): unlike the other languages this set is never
    # down-sampled; confirm that is intended.
    text_path = "/content/drive/My Drive/Datasets/Slovenian/Slovene_train_set.csv"

    slovenian = Slovenian(text_path)
    data = slovenian.clean_data()

  elif name == 'polish':
    # Polish
    text_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_text.txt"
    tag_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_tags.txt"

    # Polish.__init__ takes exactly (data_path, tag_path). The original
    # passed a third "translated CSV" path and raised TypeError.
    polish = Polish(text_path,tag_path)
    data = polish.clean_data()
    print(data)
    print(data.label.value_counts())
    if number_language>1 and len(data)>length:
      data = execute(data,length)

  elif name == 'croatian':
    # Croatian
    text_path = "/content/drive/My Drive/Datasets/Croatian/Croatian_train_set.csv"

    croatian = Croatian(text_path)
    data = croatian.clean_data()
    if number_language>1 and len(data)>length:
      data = execute(data,length)

  elif name == 'greek':
    # Greek
    text_path = "/content/drive/My Drive/Datasets/Greek/offenseval-gr_train-translated.csv"

    greek = Greek(text_path)
    data = greek.clean_data()
    print(data)
    if number_language>1 and len(data)>length:
      data = execute(data,length)

  elif name == 'english':
    # English (never down-sampled)
    text_path = "/content/drive/My Drive/Datasets/English/English_12514.csv"

    english = English(text_path)
    data = english.clean_data()
    print(data)

  else:
    raise ValueError('unsupported training language: {!r}'.format(language))

  return data

def languages(lang):
  """Clean every language in `lang` and return the frames concatenated.

  The smallest training-set size is computed first and handed to
  `preprocess_language` together with the number of languages. The number
  of languages is also printed.
  """
  length = find_length(lang)
  n_languages = len(lang)
  print(n_languages)

  frames = [preprocess_language(name, length, n_languages) for name in lang]

  return pd.concat(frames)


In [None]:
def choose_model(cmodel):
  """Return the (model class, tokenizer class) pair selected by `cmodel`.

  Unrecognised names fall through and yield None.
  """
  if cmodel in ('Bert', 'BERT', 'bert'):
    return BertModel, BertTokenizer
  if cmodel in ('ROBERTA', 'XLMRoberta', 'Roberta', 'roberta'):
    return XLMRobertaModel, XLMRobertaTokenizer
  if cmodel in ('camembert', 'Camembert'):
    return CamembertModel, CamembertTokenizer
  return None

In [None]:
def language_model(mlang):
  """Translate a settings key into a pretrained checkpoint identifier.

  Returns None when `mlang` is not one of the known keys.
  """
  checkpoints = (
      ('multilingual_roberta', "xlm-roberta-base"),
      ('bert_greek', "nlpaueb/bert-base-greek-uncased-v1"),
      ('bert_polish', "dkleczek/bert-base-polish-uncased-v1"),
      ('bert_croatian', "EMBEDDIA/crosloengual-bert"),
      ('bert_slovenian', "EMBEDDIA/sloberta"),
      ('multilingual_bert', "bert-base-multilingual-cased"),
  )

  for key, checkpoint in checkpoints:
    if mlang == key:
      return checkpoint

  return None

In [None]:
# Run configuration read by the training cell below: the checkpoint key
# (resolved by language_model), the architecture key (resolved by
# choose_model), the number of epochs and the list of training languages.
settings = {
    'language_model':'multilingual_bert', #multilingual_roberta, multilingual_bert, bert_greek, bert_polish, bert_croatian, bert_slovenian
    'model': 'bert',                #roberta, bert, camembert
    'num_epochs': 5,
    'training_language': ['polish','croatian'] #greek, english, slovenian, polish, croatian
}


In [None]:
 best_valid_loss = float('inf')  # start at infinity so the first finite validation loss is checkpointed

# Load, clean and concatenate the data of every configured training language
data = languages(settings['training_language'])

# Resolve the configured key to a pretrained checkpoint identifier
lang_model = language_model(settings['language_model'])

# Resolve the configured architecture to its model and tokenizer classes
cmodel,ctokenizer = choose_model(settings['model'])

# Train/validation split: 15% held out, fixed seed for reproducibility
train_dataset,val_dataset = train_test_split(data,test_size=0.15,random_state=21)
  
# Tokenize both splits and wrap them in DataLoaders
transform = Transform_Data(train_dataset,val_dataset,lang_model,'text','label',ctokenizer)
train_encodings, train_y, val_encodings,val_y, train_dataloader, val_dataloader = transform.Execute()

# Initiate Model: pretrained encoder wrapped by the classification head,
# with the encoder left trainable (freeze_bert=False)

model_arch = cmodel.from_pretrained(lang_model)
model = Model_Arch(model_arch,False)
model.to(device)


# Train Model; `language` and `lanmodel` only name the checkpoint file and
# are read again as globals by predict()
language = settings['training_language']
language = '_'.join(language)
lanmodel = settings['model']

train(model, settings['num_epochs'], train_dataloader,val_dataloader,best_valid_loss,language,lanmodel)

In [None]:
def predict(model, test_dataloader):
    """Load the saved checkpoint into `model` and return 0/1 predictions.

    The checkpoint path is built from the module-level `lanmodel` and
    `language` strings, matching the file name written by `train`.

    Args:
        model: module called as `model(input_ids, attention_mask)`.
        test_dataloader: yields batches whose first two tensors are the
            input ids and the attention mask; any further tensors are ignored.

    Returns:
        A flat list of ints, one rounded model output per sample, or an
        empty list when the loader yields no batches.
    """
    # File in drive
    path = '/content/drive/My Drive/Datasets/All/'+lanmodel+'_'+language+'_weights.pt'
    # map_location lets a checkpoint saved on a GPU runtime load on a
    # CPU-only one. Without it torch.load tries to restore CUDA tensors.
    model.load_state_dict(torch.load(path, map_location=device))
    model.eval()

    batch_preds = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to the selected device
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Forward pass without gradient tracking
        with torch.no_grad():
            outputs = model(b_input_ids, b_attn_mask)

        batch_preds.append(torch.round(outputs))

    # torch.cat rejects an empty list, so handle "nothing to predict" here.
    if not batch_preds:
        return []

    # Concatenate the rounded outputs of all batches and flatten them
    all_preds = torch.cat(batch_preds, dim=0)
    preds = all_preds.detach().cpu().numpy()
    return preds.astype(int).ravel().tolist()

In [None]:
def choose_language(clanguage):
  """Load and clean the test set of the requested language.

  Args:
    clanguage: 'slovenian', 'polish', 'croatian' or 'greek', matched
      case-insensitively.

  Returns:
    The cleaned test DataFrame.

  Raises:
    ValueError: for any other language (the original failed with
      UnboundLocalError on the final `return test_data`).
  """
  name = clanguage.lower()

  if name == 'slovenian':
    # Test path Slovenian
    text_path = "/content/drive/My Drive/Datasets/Slovenian/Slovene_test_set.csv"

    # Clean Slovenian data
    return Slovenian(text_path).clean_data()

  if name == 'polish':
    # Text path Polish
    text_path = "/content/drive/My Drive/Datasets/Polish/test_set_clean_only_text.txt"
    tag_path = "/content/drive/My Drive/Datasets/Polish/test_set_clean_only_tags.txt"

    # Clean Polish data
    return Polish(text_path,tag_path).clean_data()

  if name == 'croatian':
    # Text path Croatian
    text_path = "/content/drive/My Drive/Datasets/Croatian/Croatian_test_set.csv"

    # Clean Croatian Data
    return Croatian(text_path).clean_data()

  if name == 'greek':
    # Text path Greek
    text_path = "/content/drive/My Drive/Datasets/Greek/offenseval-gr-test.csv"

    # Clean data
    return Greek(text_path).clean_data()

  raise ValueError('unsupported test language: {!r}'.format(clanguage))

In [None]:
class Polish():
    """Loader and cleaner for a Polish dataset stored as two line-aligned files.

    Every line of ``data_path`` becomes one row of text and every line of
    ``tag_path`` becomes that row's label (parsed with ``int``).
    ``clean_data()`` returns a single DataFrame with a 'text' and a 'label'
    column.
    """

    def __init__(self,data_path,tag_path):
        self.path = data_path
        self.tag_path = tag_path
        self.data = self.read_data()
        self.tag = self.read_tag()
        self.column = 'text'
        self.label = 'label'

    @staticmethod
    def _read_lines(path):
        """Read *path* as UTF-8 and return a one-column DataFrame, one row per line."""
        # Context manager so the file handle is closed instead of leaked.
        with open(path, "r", encoding="utf8") as handle:
            return pd.DataFrame(handle.readlines())

    def read_data(self):
        """Return the raw text lines as a DataFrame (single column named 0)."""
        return self._read_lines(self.path)

    def read_tag(self):
        """Return the raw tag lines as a DataFrame (single column named 0)."""
        return self._read_lines(self.tag_path)

    def remove_punctuation(self,data,column):
        """Return ``data[column]`` with every non-word, non-space character replaced by a space."""
        return data[column].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def rename_columns(self,data,column):
        """Return *data* with its default column ``0`` renamed to *column*."""
        return data.rename(columns={0:column})

    def remove_url(self):
        """Strip ``http...`` tokens from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'http\S+', '',x))

    def remove_end_line(self,data,column):
        """Return ``data[column]`` with newline characters removed."""
        # regex=False: '\n' is a literal here, independent of the pandas default.
        return data[column].str.replace('\n','',regex=False)

    def concat(self):
        """Attach the label column to the text frame and return the frame."""
        self.data[self.label] = self.tag[self.label]

        return self.data

    def convert_int(self):
        """Return the label column converted to ``int``."""
        return self.tag[self.label].apply(int)

    def double_space(self):
        """Collapse runs of two or more spaces in the text column, in place.

        The previous version discarded the result of ``apply`` and therefore
        left the column unchanged.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: re.sub(r' {2,}', ' ', x)
        )

    def clean_data(self):
        """Run the full cleaning pipeline and return the text/label DataFrame."""
        text_column = self.column
        label_column = self.label

        # Rename columns in both label and text data
        self.data = self.rename_columns(self.data,text_column)
        self.tag = self.rename_columns(self.tag, label_column)

        # Remove url (before punctuation stripping, which would break URLs apart)
        self.remove_url()

        # Remove end line character from label and text data
        self.data[text_column] = self.remove_end_line(self.data,text_column)
        self.tag[label_column] = self.remove_end_line(self.tag,label_column)

        # Remove Punctuation
        self.data[text_column] = self.remove_punctuation(self.data,text_column)

        # Lower words in text data
        self.data[text_column] = self.lower()

        # Remove double space
        self.double_space()

        # Convert label to int
        self.tag[label_column] = self.convert_int()

        # Concat text and labels
        return self.concat()

In [None]:
from nltk.corpus import stopwords
class Greek():
    """Loader and cleaner for a Greek CSV dataset.

    The CSV is read with ``pd.read_csv``; its 'tweet' and 'subtask_a' columns
    are renamed to 'text' and 'label'. Labels equal to 'NOT' become 0 and
    every other value becomes 1.

    Relies on the module-level names ``pd``, ``re`` and ``stopwords``
    (``nltk.corpus.stopwords``).
    """

    def __init__(self,path):
        self.path  = path
        self.data = self.read_data()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Return the CSV at ``self.path`` as a DataFrame."""
        return pd.read_csv(self.path)

    def rename_columns(self):
        """Return the frame with 'tweet'/'subtask_a' renamed to the text/label names."""
        return self.data.rename(columns={'tweet':self.column,'subtask_a':self.label})

    def replace_hashtag(self):
        """Blank out ``#word`` hashtags in the text column; return the column.

        Each hashtag is first turned into the placeholder 'hashtag ' and the
        placeholder word is then deleted.
        NOTE(review): the second step also deletes any literal occurrence of
        'hashtag' in the original text; kept as-is to preserve existing
        preprocessing.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: re.sub(r"#\w+", "hashtag ", x).replace('hashtag','')
        )
        return self.data[self.column]

    def remove_url(self):
        """Strip ``http...`` tokens from the text column in place."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'http\S+', '',x))

    def strip_punctuation(self):
        """Return the text column with non-word, non-space characters replaced by spaces."""
        return self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def remove_stopwords(self, data, column):
        """Return ``data[column]`` with Greek stopwords removed.

        Each text is split on whitespace, stopwords are dropped, and the
        remaining tokens are re-joined with single spaces. ``data`` itself is
        not modified.
        """
        # Load the stopword list once; the old code re-read it for every token.
        greek_stopwords = frozenset(stopwords.words("greek"))
        return data[column].apply(
            lambda text: " ".join(
                token for token in text.split() if token not in greek_stopwords
            )
        )

    def binarize_labels(self):
        """Map the label column in place: 'NOT' -> 0, anything else -> 1."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 0 if x=='NOT' else 1)

    def double_space(self):
        """Collapse runs of two or more spaces in the text column, in place.

        The previous version discarded the result of ``apply`` and therefore
        left the column unchanged.
        """
        self.data[self.column] = self.data[self.column].apply(
            lambda x: re.sub(r' {2,}', ' ', x)
        )

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned DataFrame."""
        text_column = self.column

        # Rename Columns
        self.data = self.rename_columns()

        # Replace hashtag
        self.data[text_column] = self.replace_hashtag()

        # Remove urls (before punctuation stripping, which would break URLs apart)
        self.remove_url()

        # Strip Punctuation
        self.data[text_column] = self.strip_punctuation()

        # Lower text
        self.data[text_column] = self.lower()

        # Remove Stopwords
        self.data[text_column] = self.remove_stopwords(self.data,text_column)

        # Remove double space
        self.double_space()

        # Binarize labels
        self.binarize_labels()

        return self.data

In [None]:
import numpy
import torch, gc


# Choose Testing dataset
test_data = choose_language('polish')

lang_model = language_model(settings['language_model'])
cmodel,ctokenizer = choose_model(settings['model'])

# Tokenize and Encode Data
tokenizer = ctokenizer.from_pretrained(lang_model)
test_encodings = tokenizer.batch_encode_plus(
    test_data['text'].tolist(),
    add_special_tokens = True,
    truncation=True,
    padding=True,
    max_length=60,
    return_tensors='pt',
)

# return_tensors='pt' already yields torch tensors; use them directly rather
# than re-wrapping with torch.tensor(), which copies and emits a UserWarning.
test_seq = test_encodings['input_ids']
test_mask = test_encodings['attention_mask']

# Batch the test set in its original order so predictions line up with labels
test_dataset = TensorDataset(test_seq, test_mask)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

test_y = torch.tensor(test_data['label'].tolist())

# Model
bert = cmodel.from_pretrained(lang_model)
model = Model_Arch(bert,False)
model.to(device)


# Predict
test_pred = predict(model, test_dataloader)
test_label = test_y.detach().cpu().numpy()

# Print Results
print(test_label)
print(len(test_label))
print(len(test_pred))
print(classification_report(test_label,test_pred))
