<a href="https://colab.research.google.com/github/vlavrent/Multilingual-Hate-Speech-Detection/blob/main/Polish%2C_Slovenian_and_Croatian.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Polish, Slovenian and Croatian Hate Speech</h1>

In [28]:
pip install transformers



In [29]:
pip install sentencepiece



In [30]:
pip install sentence_transformers



In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
import torch

# Pick the compute device once; later cells read the module-level `device`.
if not torch.cuda.is_available():
    # No CUDA runtime visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla K80


In [33]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from torchsampler import ImbalancedDatasetSampler
import torch
import random
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.model_selection import KFold
import torch, gc
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score,classification_report
from transformers import RobertaTokenizer, RobertaModel
from transformers import XLMRobertaTokenizer, XLMRobertaModel



# Fetch the NLTK stop-word corpus; the preprocessing classes below call
# stopwords.words(...) and need it to be present locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<h1>Preprocess English</h1>

In [34]:
class English():
    """Load the English CSV dataset and clean its text and labels.

    The frame read from ``path`` must contain a ``text`` column and a
    ``label`` column. ``clean_data`` runs the whole pipeline and returns the
    cleaned frame with integer labels (1 = hateful/offensive, 0 = not).
    """

    def __init__(self,path):
        """Read the CSV at ``path`` and remember the column names used below."""
        self.data = self.read(path)
        self.label = 'label'
        self.column = 'text'

    def read(self,path):
        """Return the CSV at ``path`` as a DataFrame."""
        return pd.read_csv(path)

    def replace_label(self,x):
        """Map a raw label to 'HOF' or 'NOT'.

        The string values '1', '0' and 'HOF' become 'HOF'; anything else
        becomes 'NOT'.
        NOTE(review): the comparison is against strings, so integer labels
        (e.g. the int 1) fall through to 'NOT' -- confirm the CSV dtype.
        """
        if x in ('1', '0', 'HOF'):
            return 'HOF'
        return 'NOT'

    def fix_label(self):
        """Normalise the label column to 'HOF' / 'NOT' in place."""
        self.data[self.label] = self.data[self.label].apply(self.replace_label)

    def replace_mentions(self):
        """Replace @user mentions with the literal token 'mention'."""
        self.data[self.column] = self.data[self.column].apply(lambda row: re.sub("@[A-Za-z0-9]+_*[A-Za-z0-9]+", "mention", row))
        # Clean up a trailing underscore the first pattern may leave behind.
        self.data[self.column] = self.data[self.column].apply(lambda row: re.sub("mention_", "mention", row))

    def remove_punctuation(self):
        """Delete every character that is neither a word character nor whitespace."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    def replace_hashtag(self):
        """Replace each #hashtag with the literal token 'hashtag'."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r"#[\w]+", "hashtag", x))

    def remove_stopwords(self):
        """Drop English stop words (case-sensitive match) and re-join the tokens."""
        # Build the lookup once instead of once per token.
        stop_words = set(stopwords.words("english"))
        self.data[self.column] = self.data[self.column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stop_words))

    def remove_url(self):
        """Strip http/https (or scheme-less '://') URLs from the text."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '',x))

    def lower(self):
        """Return the text column lower-cased (does not modify ``self.data``)."""
        return self.data[self.column].str.lower()

    def binarize_labels(self):
        """Convert 'NOT' to 0 and every other label to 1, in place."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 0 if x=='NOT' else 1)

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned DataFrame."""
        self.fix_label()
        self.remove_url()
        self.replace_mentions()
        self.replace_hashtag()
        self.remove_punctuation()
        self.remove_stopwords()
        # lower() only returns the lower-cased column; it has to be stored,
        # otherwise the lower-casing step is silently lost.
        self.data[self.column] = self.lower()
        self.binarize_labels()

        return self.data

<h1>Preprocess Polish</h1>

In [35]:
class Polish():
    """Load the Polish dataset (one text file, one tag file) and clean it.

    Both files are read line by line; line *i* of the tag file is the label
    of line *i* of the text file. ``clean_data`` returns one DataFrame with a
    ``text`` column and an integer ``label`` column.
    """

    def __init__(self,data_path,tag_path):
        """Read both files and remember the column names used below."""
        self.path = data_path
        self.tag_path = tag_path
        self.data = self.read_data()
        self.tag = self.read_tag()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Return the text file as a one-column DataFrame (column ``0``), one row per line."""
        # The context manager closes the handle; the original leaked it.
        with open(self.path, "r", encoding="utf8") as handle:
            return pd.DataFrame(list(handle))

    def read_tag(self):
        """Return the tag file as a one-column DataFrame (column ``0``), one row per line."""
        # Same encoding as read_data so both files are decoded consistently.
        with open(self.tag_path, "r", encoding="utf8") as handle:
            return pd.DataFrame(list(handle))

    def remove_punctuation(self,data,column):
        """Return ``data[column]`` with every non-word, non-space character replaced by a space."""
        return data[column].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def rename_columns(self,data,column):
        """Return ``data`` with the default column ``0`` renamed to ``column``."""
        return data.rename(columns={0:column})

    def remove_mentions(self):
        """Return the text column with ``@name_name`` style mentions removed."""
        return self.data[self.column].apply(lambda row: re.sub("@[A-Za-z0-9]+_[A-Za-z0-9]+","",row))

    def remove_end_line(self,data,column):
        """Return ``data[column]`` with newline characters removed."""
        return data[column].str.replace('\n','',regex=False)

    def concat(self):
        """Attach the label column to the text frame and return it."""
        self.data[self.label] = self.tag[self.label]

        return self.data

    def convert_int(self):
        """Return the label column converted to ``int``."""
        return self.tag[self.label].apply(int)

    def clean_data(self):
        """Run the full cleaning pipeline and return the combined DataFrame."""
        text_column = self.column
        label_column = self.label

        # Rename columns in both label and text data
        self.data = self.rename_columns(self.data,text_column)
        self.tag = self.rename_columns(self.tag, label_column)

        # Remove user mentions FIRST: the punctuation step replaces '@' with a
        # space, after which the mention pattern can no longer match.
        self.data[text_column] = self.remove_mentions()

        # Remove punctuation
        self.data[text_column] = self.remove_punctuation(self.data,text_column)

        # Lower-case the text
        self.data[text_column] = self.lower()

        # Remove end-of-line characters from label and text data
        self.data[text_column] = self.remove_end_line(self.data,text_column)
        self.tag[label_column] = self.remove_end_line(self.tag,label_column)

        # Convert label to int
        self.tag[label_column] = self.convert_int()

        # Combine text and labels
        return self.concat()

<h1>Preprocess Slovenian</h1>

In [36]:
class Slovenian():
    """Load the Slovenian CSV dataset and clean its text and labels.

    The CSV must contain ``text`` and ``type`` columns. ``clean_data``
    returns a frame with ``text`` and a binary integer ``label`` column.
    """

    def __init__(self,path):
        """Read the CSV at ``path`` and remember the column names used below."""
        self.path = path
        self.data = self.read_data()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Return the CSV at ``self.path`` as a DataFrame."""
        return pd.read_csv(self.path)

    def rename_columns(self):
        """Keep only ``text``/``type`` and return the frame with ``type`` renamed to the label column."""
        self.data = self.data[['text','type']]
        return self.data.rename(columns={'type':self.label})

    def strip_punctuation(self):
        """Return the text column with non-word, non-space characters replaced by spaces."""
        return self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def remove_stopwords(self, data, column):
        """Return ``data[column]`` with Slovene stop words removed and tokens re-joined by single spaces.

        ``data`` itself is not modified; the caller stores the result.
        """
        # Build the lookup once; calling stopwords.words() for every token
        # re-creates the whole list each time.
        stop_words = set(stopwords.words("slovene"))
        return data[column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stop_words))

    def change_labels(self,x):
        """Collapse the dataset's fine-grained label names into coarse ones."""
        change = {'Background offensive':'offensive', 'Acceptable speech':'not offensive', 'Background violence':'violence',
                  'Other offensive':'offensive', 'Inappropriate':'innappropriate', 'Other violence':'violence'}

        for k, v in change.items():
            x = x.replace(k, v)
        return x

    def convert_labels(self):
        """Apply ``change_labels`` to the label column in place."""
        self.data[self.label] = self.data[self.label].apply(self.change_labels)

    def binarize_labels(self):
        """Set the label to 1 for 'offensive' and 0 for everything else.

        NOTE(review): 'violence' and 'innappropriate' therefore become 0 here,
        whereas the Croatian class maps those categories to 'offensive' --
        confirm this difference is intended.
        """
        self.data[self.label] = self.data[self.label].apply(lambda x: 1 if x=='offensive' else 0)

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned DataFrame."""
        # Rename Columns
        self.data = self.rename_columns()

        # Strip Punctuation
        self.data[self.column] = self.strip_punctuation()

        # Lowercase
        self.data[self.column] = self.lower()

        # Remove stopwords
        self.data[self.column] = self.remove_stopwords(self.data,self.column)

        self.convert_labels()

        self.binarize_labels()

        return self.data

<h1>Preprocess Croatian</h1>

In [37]:
class Croatian():
    """Load the Croatian CSV dataset and clean its text and labels.

    The CSV must contain ``text`` and ``type`` columns. ``clean_data``
    returns a frame with ``text`` and a binary integer ``label`` column;
    rows whose label is not one of the known categories are dropped.
    """

    def __init__(self,path):
        """Read the CSV at ``path`` and remember the column names used below."""
        self.path = path
        self.data = self.read_data()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Return the CSV at ``self.path`` as a DataFrame."""
        return pd.read_csv(self.path)

    def rename_columns(self):
        """Keep only ``text``/``type`` and return the frame with ``type`` renamed to the label column."""
        self.data = self.data[['text','type']]
        return self.data.rename(columns={'type':self.label})

    def strip_punctuation(self):
        """Return the text column with non-word, non-space characters replaced by spaces."""
        return self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))

    def lower(self):
        """Return the text column lower-cased."""
        return self.data[self.column].str.lower()

    def remove_stopwords(self, data, column):
        """Return ``data[column]`` with stop words removed and tokens re-joined by single spaces.

        ``data`` itself is not modified; the caller stores the result.
        NOTE(review): the Slovene stop-word list is used for Croatian text --
        presumably a deliberate stand-in; confirm.
        """
        # Build the lookup once; calling stopwords.words() for every token
        # re-creates the whole list each time.
        stop_words = set(stopwords.words("slovene"))
        return data[column].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stop_words))

    def change_labels(self,x):
        """Collapse the fine-grained label names into 'offensive' / 'not offensive'."""
        change = {'Background offensive':'offensive', 'Acceptable speech':'not offensive', 'Background violence':'offensive',
                  'Other offensive':'offensive', 'Inappropriate':'offensive', 'Other violence':'offensive'}

        for k, v in change.items():
            x = x.replace(k, v)
        return x

    def convert_labels(self):
        """Map labels to the two coarse classes and drop rows with any other label."""
        self.data[self.label] = self.data[self.label].apply(self.change_labels)

        hate = ['offensive','not offensive']

        # .copy() detaches the filtered rows so the later column assignment in
        # binarize_labels does not write into a slice of the original frame.
        self.data = self.data[self.data[self.label].isin(hate)].copy()

    def binarize_labels(self):
        """Set the label to 1 for 'offensive' and 0 otherwise."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 1 if x=='offensive' else 0)

    def clean_data(self):
        """Run the full cleaning pipeline and return the cleaned DataFrame."""
        # Rename Columns
        self.data = self.rename_columns()

        # Strip Punctuation
        self.data[self.column] = self.strip_punctuation()

        # Lowercase
        self.data[self.column] = self.lower()

        # Remove stopwords
        self.data[self.column] = self.remove_stopwords(self.data,self.column)

        self.convert_labels()

        self.binarize_labels()

        return self.data

<h1>BERT Tokenizer, Tensors and Dataloaders</h1>


In [38]:
class Transform_Data():
  """Tokenize train/validation frames and wrap them in PyTorch DataLoaders.

  Parameters
  ----------
  train, test : DataFrames holding a text column and a 0/1 label column.
  Bert_model  : name or path passed to ``BertTokenizer.from_pretrained``.
  column      : name of the text column.
  label       : name of the label column.
  max_length  : maximum token length used for truncation (default 60).
  batch_size  : batch size of both DataLoaders (default 32).
  """

  def __init__(self,train,test,Bert_model,column,label,max_length=60,batch_size=32):
    self.max_length = max_length
    self.batch_size = batch_size
    self.tokenizer = BertTokenizer.from_pretrained(Bert_model)
    self.train = train
    self.test = test
    self.column = column
    self.label = label

  def BERTTokenizer(self):
    """Tokenize both frames.

    Returns ``(train_encodings, train_y, val_encodings, val_y)`` where the
    encodings come from ``batch_encode_plus(..., return_tensors='pt')`` and
    the ``*_y`` values are label tensors.
    """
    train_encodings = self.tokenizer.batch_encode_plus(self.train[self.column].tolist(),add_special_tokens = True, truncation=True, padding=True, max_length=self.max_length,return_tensors='pt')
    train_y = torch.tensor(self.train[self.label].tolist())

    val_encodings = self.tokenizer.batch_encode_plus(self.test[self.column].tolist(),add_special_tokens = True, truncation=True, padding=True, max_length=self.max_length,return_tensors='pt')
    val_y = torch.tensor(self.test[self.label].tolist())

    return train_encodings, train_y, val_encodings,val_y

  def Tensors_and_DataLoaders(self,train_encodings,train_y,val_encodings, val_y):
    """Build the training and validation DataLoaders.

    Training batches are drawn by a ``WeightedRandomSampler`` whose
    per-sample weight is ``total / count(class of the sample)``, so both
    classes are drawn with equal probability. Validation batches are served
    in their original order. Labels are expected to be 0 or 1.

    Returns ``(train_dataloader, val_dataloader)``.
    """
    #====================
    #Train data
    #====================
    train_data = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_y)

    # Number of samples in class 0 and class 1.
    class_counts = torch.stack([(train_y == 0).sum(), (train_y == 1).sum()])
    total_samples = int(class_counts.sum())

    # Inverse-frequency weight per class, looked up for every sample in one
    # indexing operation instead of a Python loop over tensor elements.
    class_weights = total_samples / class_counts.double()
    weights = class_weights[train_y.long()]

    train_sampler = WeightedRandomSampler(weights, total_samples)

    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size)

    #====================
    #Test/Validation data
    #====================
    val_data = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_y)

    val_sampler = SequentialSampler(val_data)

    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=self.batch_size)

    return train_dataloader, val_dataloader

  def Execute(self):
    """Tokenize and build loaders; returns encodings, label tensors and both DataLoaders."""
    train_encodings, train_y, val_encodings,val_y = self.BERTTokenizer()
    train_dataloader, val_dataloader = self.Tensors_and_DataLoaders(train_encodings,train_y,val_encodings, val_y)
    return train_encodings, train_y, val_encodings,val_y, train_dataloader, val_dataloader


<H1>BERT Model</H1>

In [39]:
import torch.nn as nn

class BERT_Arch(nn.Module):
    """Binary classifier: a BERT encoder followed by one linear unit and a sigmoid.

    Parameters
    ----------
    bert        : encoder module; called as
                  ``bert(ids, attention_mask=mask, return_dict=False)`` and
                  expected to return a pair whose second item is the pooled
                  sentence representation.
    freeze_bert : when true, the encoder parameters are excluded from
                  gradient updates.
    """

    def __init__(self, bert,freeze_bert):

        super(BERT_Arch, self).__init__()

        self.bert = bert
        self.freeze_bert = freeze_bert

        # dropout layer (defined but not applied in forward)
        self.dropout = nn.Dropout(0.3)

        # relu activation function (defined but not applied in forward)
        self.relu =  nn.ReLU()

        # Width of the pooled output: taken from the encoder's config when it
        # exposes one, otherwise the 768 that was previously hard-coded.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # dense layer 1: pooled representation -> single score
        self.fc1 = nn.Linear(hidden_size,1)

        # sigmoid activation function
        self.sigmoid =  nn.Sigmoid()

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    #define the forward pass
    def forward(self, sent_id, mask):
        """Return a ``(batch, 1)`` tensor of probabilities in [0, 1]."""
        # pass the inputs to the encoder; keep only the pooled output
        _, cls_hs = self.bert(sent_id, attention_mask=mask,return_dict=False)

        x = self.fc1(cls_hs)

        # squash the score into a probability (sigmoid, not softmax)
        x = self.sigmoid(x)

        return x

<h1>Optimizer and Scheduler</h1>


In [40]:
from transformers import AdamW
from transformers import get_scheduler


def Optimizer_Scheduler(model,train_dataloader,num_epochs,lr=5e-5,num_warmup_steps=0):
  """Create the AdamW optimizer and a linear learning-rate schedule.

  Parameters
  ----------
  model            : module whose parameters are optimised.
  train_dataloader : only its length (batches per epoch) is used.
  num_epochs       : number of training epochs.
  lr               : initial learning rate (default 5e-5).
  num_warmup_steps : warm-up steps of the schedule (default 0).

  Returns ``(optimizer, lr_scheduler, num_training_steps)``.
  """
  # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
  # weight_decay are set explicitly to the values the transformers class
  # used by default, so the update rule stays the same.
  optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

  # One scheduler step per batch over the whole run.
  num_training_steps = num_epochs * len(train_dataloader)
  lr_scheduler = get_scheduler(
                       "linear",
                       optimizer=optimizer,
                       num_warmup_steps=num_warmup_steps,
                       num_training_steps=num_training_steps)

  return optimizer, lr_scheduler, num_training_steps

<h1>Train Model</h1>

In [41]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm

def Mean(data):
    """Return the arithmetic mean of a sized collection of numbers."""
    total = sum(data)
    count = len(data)
    return total / count

def train(model, num_epochs, train_dataloader,val_dataloader,best_valid_loss,language,new):
  """Train ``model`` for ``num_epochs`` epochs, validating after each one.

  Besides its arguments the function reads these module-level names:
  ``device``, ``optimizer``, ``lr_scheduler`` and ``num_training_steps``,
  and it calls ``validate`` and ``Class_F1_score``.

  Parameters
  ----------
  model            : classifier returning probabilities of shape (batch, 1).
  num_epochs       : number of passes over ``train_dataloader``.
  train_dataloader : yields ``(input_ids, attention_mask, labels)`` batches.
  val_dataloader   : same layout, used for validation after each epoch.
  best_valid_loss  : lowest validation loss seen so far; when an epoch beats
                     it the weights are saved to Drive and snapshotted.
  language         : inserted into the checkpoint file name.
  new              : list that receives a snapshot of each improved state.

  Returns ``(train_loss, val_loss, mean_train_loss, mean_val_loss,
  best_valid_loss, new)`` where the first two are per-epoch lists.
  """
  progress_bar = tqdm(range(num_training_steps))
  loss_fn = nn.BCELoss()

  # Per-epoch histories
  train_acc = []
  val_acc = []
  train_loss = []
  val_loss = []
  avg_acc_0 = []
  avg_acc_1 = []

  # Print the table header only once
  flag = True

  for epoch in range(num_epochs):

    total_loss = 0

    predictions = []
    real_label = []

    model.train()
    for step, batch in enumerate(train_dataloader):

      # batch to GPU
      batch = [r.to(device) for r in batch]

      sent_id, mask, labels = batch
      real_label.append(labels.detach().cpu().numpy())

      # clear previous gradients
      model.zero_grad()

      # predictions for current batch
      output = model(sent_id, mask)

      # threshold the probabilities at 0.5 for the F1 bookkeeping
      pred = torch.round(output).detach().cpu().numpy().flatten()
      predictions.append(pred)

      # compute loss for current batch
      loss = loss_fn(output, labels.unsqueeze(1).float())

      # add total loss
      total_loss = total_loss + loss.item()

      # backpropagation to calculate gradients
      loss.backward()

      # prevent the exploding gradient problem
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # update parameters
      optimizer.step()

      # update scheduler
      lr_scheduler.step()

      # clear optimizer gradients
      optimizer.zero_grad()

      progress_bar.update(1)

    # average training loss over the batches of this epoch
    avg_loss = total_loss / len(train_dataloader)

    # flatten labels and predictions
    flat_label = np.concatenate(real_label).astype(int).ravel().tolist()
    flat_predictions = np.concatenate(predictions).astype(int).ravel().tolist()

    # Training F1 and loss for this epoch
    train_f1 = f1_score(flat_label,flat_predictions)
    train_acc.append(train_f1)
    train_loss.append(avg_loss)

    # Validation
    val_flat_label,val_flat_predictions, val_avg_loss = validate(model,val_dataloader,loss_fn)

    if val_avg_loss < best_valid_loss:
        best_valid_loss = val_avg_loss
        torch.save(model.state_dict(), '/content/drive/My Drive/Datasets/All/best_same_'+language+'_weights.pt')
        # state_dict() hands out references to the live parameters, so the
        # tensors are cloned (onto the CPU); otherwise every stored entry
        # would end up holding the final weights instead of the best ones.
        new.append({name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()})

    # Validation F1 and loss for this epoch
    val_f1 = f1_score(val_flat_label,val_flat_predictions)
    val_acc.append(val_f1)
    val_loss.append(val_avg_loss)

    # Per-class F1 on the validation set
    acc_0,acc_1 = Class_F1_score(val_flat_label,val_flat_predictions)
    avg_acc_0.append(acc_0)
    avg_acc_1.append(acc_1)

    # Print table with insights
    if flag:
      print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Train F1-score':^9} | {'Val F1-score':^9} | {'Class_0 F1-score':^10} | {'Class_1 F1-score':^10} ")
      flag = False

    print("-"*85)
    print(f"{str(epoch + 1) +'/'+ str(num_epochs):^7} | {avg_loss:^12.6f} | {val_avg_loss:^10.6f} | {train_f1:^9.2f} | {val_f1:^9.2f} | {acc_0:^10.2f} | {acc_1:^10.2f} ")

  # Print the means over all epochs
  print("-"*85)
  print(f"{'Average':^7} | {Mean(train_loss):^12.6f} | {Mean(val_loss):^10.6f} | {Mean(train_acc):^9.2f} | {Mean(val_acc):^9.2f} | {Mean(avg_acc_0):^10.2f} | {Mean(avg_acc_1):^10.2f} ")

  return train_loss,val_loss, Mean(train_loss), Mean(val_loss), best_valid_loss, new


<h1>Evaluate Model</h1>

In [42]:
def validate(model,val_dataloader,loss_fn):
    """Evaluate ``model`` on ``val_dataloader``.

    Each batch is ``(input_ids, attention_mask, labels)`` and is moved to the
    module-level ``device``. Returns ``(labels, predictions, avg_loss)``:
    two flat lists of ints (predictions are the model outputs rounded to 0/1)
    and the mean of the per-batch losses.
    """
    model.eval()

    running_loss = 0
    collected_preds = []
    collected_labels = []

    for batch in val_dataloader:
        sent_id, mask, labels = [t.to(device) for t in batch]
        collected_labels.append(labels.detach().cpu().numpy())

        # No gradients are needed for evaluation.
        with torch.no_grad():
            output = model(sent_id,mask)
            running_loss = running_loss + loss_fn(output,labels.unsqueeze(1).float()).item()

        rounded = torch.round(output).detach().cpu().numpy()
        collected_preds.append(rounded.flatten())

    # mean loss over the validation batches
    avg_loss = running_loss / len(val_dataloader)

    # flatten labels and predictions into plain Python lists
    flat_label = np.concatenate(collected_labels).astype(int).ravel().tolist()
    flat_predictions = np.concatenate(collected_preds).astype(int).ravel().tolist()

    return flat_label,flat_predictions, avg_loss

<h1>Class F1-score</h1>

In [43]:
def Class_F1_score(val_y,new_preds):
  """Return ``(f1_class_0, f1_class_1)`` for integer 0/1 labels and predictions.

  Both classes are requested explicitly so the report always contains the
  '0' and '1' entries, even when one class is absent from both inputs; such
  a class scores 0.0 instead of raising ``KeyError``.
  """
  report = classification_report(val_y,new_preds, labels=[0, 1], output_dict=True, zero_division=0)

  return report['0']['f1-score'], report['1']['f1-score']

<h1>Class Accuracy</h1>

In [44]:
import numpy

def Class_Accuracy_1(all):
  """Fraction of rows with real label 1 that were also predicted as 1.

  ``all`` needs the columns 'real label' and 'predicted label'; a boolean
  column 'new' marking the correctly predicted class-1 rows is written to it.
  """
  def _is_hit(row):
    return row['predicted label']==1 and row['real label']==1

  all['new'] = all.apply(lambda row: bool(_is_hit(row)), axis=1)

  hits = all[all['new']==True]
  positives = all[all['real label']==1]
  return len(hits)/len(positives)

def Class_Accuracy_0(all):
  """Fraction of rows with real label 0 that were also predicted as 0.

  ``all`` needs the columns 'real label' and 'predicted label'; a boolean
  column 'new' marking the correctly predicted class-0 rows is written to it.
  """
  def _is_hit(row):
    return row['predicted label']==0 and row['real label']==0

  all['new'] = all.apply(lambda row: bool(_is_hit(row)), axis=1)

  hits = all[all['new']==True]
  negatives = all[all['real label']==0]
  return len(hits)/len(negatives)


def Class_Accuracy(val_y,new_preds):
  """Return ``(accuracy_class_0, accuracy_class_1)``.

  Labels and predictions are placed side by side in one frame (columns
  'real label' and 'predicted label') and handed to the per-class helpers.
  """
  truth = pd.DataFrame(numpy.asarray(val_y)).rename(columns={0:'real label'})
  guessed = pd.DataFrame(new_preds).rename(columns={0:'predicted label'})

  paired = pd.concat([truth,guessed],axis=1)

  # Class 1 is evaluated first, then class 0 (each call rewrites the 'new' column).
  acc_1 = Class_Accuracy_1(paired)
  acc_0 = Class_Accuracy_0(paired)

  return acc_0,acc_1



<h1>Choose Language</h1>

In [45]:
def language_model(lang):
  """Return the checkpoint name for ``lang``.

  'multilingual' selects multilingual cased BERT; any other value selects
  the Polish uncased BERT checkpoint.
  """
  multilingual_checkpoint = 'bert-base-multilingual-cased'
  polish_checkpoint = "dkleczek/bert-base-polish-uncased-v1"

  return multilingual_checkpoint if lang=='multilingual' else polish_checkpoint

<h1>Undersample</h1>

In [46]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import random
from past.builtins import xrange

def kMedoids(D, k, tmax=100):
    """Cluster points with k-medoids, given their pairwise distance matrix.

    Parameters
    ----------
    D    : 2-D array of pairwise distances.
    k    : number of medoids (clusters).
    tmax : maximum number of update iterations (default 100).

    Returns
    -------
    M : array of ``k`` medoid indices.
    C : dict mapping cluster number to the array of its member indices.

    Raises
    ------
    Exception : when ``k`` exceeds the number of points, or the number of
                points left after discarding zero-distance duplicates.
    """
    # determine dimensions of distance matrix D
    _, n = D.shape

    if k > n:
        raise Exception('too many medoids')

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set()
    rs,cs = np.where(D==0)
    # the rows, cols must be shuffled because we will keep the first duplicate below
    index_shuf = list(range(len(rs)))
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r,c in zip(rs,cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise Exception('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    # built-in range replaces past.builtins.xrange (same iteration in Python 3)
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]
        # update cluster medoids: the member with the smallest mean distance
        # to the other members of its cluster
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa],C[kappa])],axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        # The original called np.sort(Mnew) here and discarded the result, so
        # the medoids were never reordered; the no-op call is dropped and
        # Mnew[kappa] stays the medoid of cluster kappa.
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax reached without convergence: final update of cluster memberships
        J = np.argmin(D[:,M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J==kappa)[0]

    # return results
    return M, C

def _medoid_rows(frame, k):
    """Return the ``k`` rows of ``frame`` chosen as k-medoids of their sentence embeddings."""
    model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
    embeddings = model.encode(frame['text'].to_numpy())

    D = pairwise_distances(embeddings, metric='cosine')

    M, C = kMedoids(D, k)
    return frame.iloc[M]


def compare(ratio,data,balance):
    """Undersample ``data`` down to ``balance`` rows using k-medoids.

    Parameters
    ----------
    ratio   : imbalance ratio as returned by ``check_imbalance_ratio``.
    data    : DataFrame with 'text' and 'label' columns.
    balance : target number of rows.

    When ``ratio >= 0.5`` every row of the least frequent label is kept and
    ``balance - minority_count`` medoids are drawn from the most frequent
    label. Otherwise ``balance`` medoids are drawn from the whole frame.
    """
    counts = data['label'].value_counts()
    if ratio>=0.5:
        total = balance - int(counts.min())

        # Use the label VALUES stored in the value_counts index. The previous
        # code used positions in the counts list as labels, which is only
        # right when the majority label is 0 and the minority label is 1.
        majority_label = counts.idxmax()
        minority_label = counts.idxmin()

        new = data[data['label']==majority_label]
        print(new['label'].value_counts())

        data1 = _medoid_rows(new, total)

        data2 = data[data['label']==minority_label]
        return pd.concat([data1,data2],ignore_index=True)
    else:
        return _medoid_rows(data, balance)


def check_imbalance_ratio(data):
    """Return ``(largest - smallest) / largest`` over the class counts of ``data['label']``."""
    counts = data['label'].value_counts().to_list()
    largest = max(counts)
    smallest = min(counts)
    return (largest - smallest) / largest

def balance(data,sample):
    """Measure the class imbalance of ``data`` and undersample it to ``sample`` rows via ``compare``."""
    imbalance = check_imbalance_ratio(data)
    return compare(imbalance, data, sample)

def execute(data,sample):
    """Entry point of the undersampling step; forwards to ``balance``."""
    balanced = balance(data,sample)
    return balanced

<h1>Train</h1>

In [47]:
# --- Slovenian training data: cleaned, not undersampled ---
text_path = "/content/drive/My Drive/Datasets/Slovenian/Slovene_train_set.csv"

slovenian = Slovenian(text_path)
slovenian_data = slovenian.clean_data()

# --- Polish training data: text and tags live in two parallel files ---
text_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_text.txt"
tag_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_tags.txt"

polish = Polish(text_path,tag_path)
polish_data = polish.clean_data()
print(len(polish_data))

# Undersample with k-medoids towards 7990 rows (see `execute` / `compare`).
polish_data = execute(polish_data,7990)


# --- Croatian training data: cleaned, then undersampled to the same target ---
text_path = "/content/drive/My Drive/Datasets/Croatian/Croatian_train_set.csv"
croatian = Croatian(text_path)
croatian_data = croatian.clean_data()
print(len(croatian_data))
croatian_data  = execute(croatian_data,7990)

# English is currently disabled; text_path is reassigned here but not used
# again in this cell.
text_path = "/content/drive/My Drive/Datasets/English/English_8743_train.csv"

#english = English(text_path)
#english_data = english.clean_data()
#english_data  = execute(english_data,7990)

# Combined training pool. The row indices of the three frames are kept as-is
# (no ignore_index), so index values may repeat.
data = pd.concat([polish_data,croatian_data,slovenian_data])


# NOTE(review): kfold, accuracy and the fold_* lists are prepared here, but the
# visible training cell uses a single train_test_split rather than the folds.
kfold=KFold(n_splits=5,shuffle=True,random_state=23)

num_epochs = 5

accuracy = []
fold_loss_train = []
fold_loss_val = []

# Lowest validation loss seen so far; train() saves a checkpoint when it is beaten.
best_valid_loss = float('inf')

language = 'multilingual'
Bert_model = language_model(language)

# Receives the model state snapshots collected by train().
new= []

10041
0    9190
Name: label, dtype: int64
8851


In [48]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the combined data for validation (fixed seed for repeatability).
train_dataset,val_dataset = train_test_split(data,test_size=0.15,random_state=21)


# Transfom Data: tokenize both splits and build the DataLoaders
transform = Transform_Data(train_dataset,val_dataset,Bert_model,'text','label')
train_encodings, train_y, val_encodings,val_y, train_dataloader, val_dataloader = transform.Execute()

# Bert Model: encoder plus classification head; False = encoder is fine-tuned, not frozen
bert = BertModel.from_pretrained(Bert_model)
model = BERT_Arch(bert,False)
model.to(device)

# Optimizer and Scheduler
# train() reads optimizer, lr_scheduler and num_training_steps as globals,
# so these names must exist before the call below.
optimizer, lr_scheduler, num_training_steps = Optimizer_Scheduler(model,train_dataloader,num_epochs)

# Train Model
train_loss,val_loss, total_loss_train, total_loss_val, best_valid_loss, new = train(model, num_epochs, train_dataloader,val_dataloader,best_valid_loss,language,new)
fold_loss_train.append(total_loss_train)
fold_loss_val.append(total_loss_val)



Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/3185 [00:00<?, ?it/s]

 Epoch  |  Train Loss  |  Val Loss  | Train F1-score | Val F1-score | Class_0 F1-score | Class_1 F1-score 
-------------------------------------------------------------------------------------
  1/5   |   0.493329   |  0.674940  |   0.78    |   0.71    |    0.68    |    0.71    
-------------------------------------------------------------------------------------
  2/5   |   0.372824   |  0.596239  |   0.84    |   0.74    |    0.78    |    0.74    
-------------------------------------------------------------------------------------
  3/5   |   0.266230   |  0.603857  |   0.90    |   0.69    |    0.80    |    0.69    
-------------------------------------------------------------------------------------
  4/5   |   0.193209   |  0.707355  |   0.94    |   0.71    |    0.80    |    0.71    
-------------------------------------------------------------------------------------
  5/5   |   0.147557   |  0.829226  |   0.95    |   0.70    |    0.80    |    0.70    
----------------------------

In [49]:
def predict(model,test_seq,test_mask,test_y,path='/content/drive/My Drive/Datasets/Slovene_Polish_Croatian/best_Loss_multilingual_weights.pt'):
  """Load saved weights into ``model`` and predict 0/1 labels in a single forward pass.

  Parameters
  ----------
  model     : classifier returning probabilities of shape (batch, 1).
  test_seq  : tensor of input ids.
  test_mask : tensor with the matching attention mask.
  test_y    : unused; kept so existing calls keep working.
  path      : checkpoint holding the state dict to load (defaults to the
              Drive location used by this notebook).

  Returns a flat list of int predictions.
  """
  torch.cuda.empty_cache()

  # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
  model.load_state_dict(torch.load(path, map_location=device))

  # Evaluation mode turns dropout off so predictions are deterministic.
  model.eval()

  with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

  # Threshold the probabilities at 0.5 and flatten to a plain list.
  preds = numpy.round(preds)
  final = np.concatenate(preds).astype(int).ravel().tolist()

  return final

In [50]:
def bert_predict(model, test_dataloader, path='/content/drive/My Drive/Datasets/All/best_same_multilingual_weights.pt'):
    """Load saved weights into ``model`` and predict 0/1 labels batch by batch.

    Parameters
    ----------
    model           : classifier returning probabilities of shape (batch, 1).
    test_dataloader : iterable of batches whose first two items are the
                      input ids and the attention mask.
    path            : checkpoint holding the state dict to load (defaults to
                      the Drive location used by this notebook).

    Returns a flat list of int predictions in dataloader order.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(path, map_location=device))

    rounded_batches = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to the selected device
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # The model already applies a sigmoid, so these are probabilities.
        with torch.no_grad():
            probabilities = model(b_input_ids, b_attn_mask)

        # Threshold at 0.5
        rounded_batches.append(torch.round(probabilities))

    # Concatenate the batches and flatten to a plain list of ints
    stacked = torch.cat(rounded_batches, dim=0)
    preds = stacked.detach().cpu().numpy().astype(int).ravel().tolist()

    return preds

In [51]:
import numpy
import torch, gc

# Evaluation cell: exactly one test set is active at a time. The Slovenian
# and Polish variants are commented out; the Croatian test set is evaluated.

# Test paths
#text_path = "/content/drive/My Drive/Datasets/Slovenian/Slovene_test_set.csv"


# Clean data
#slovenian = Slovenian(text_path)
#test_data = slovenian.clean_data()

# paths
# (assigned but unused while the Polish block below stays commented out;
#  text_path is overwritten by the Croatian path further down)
text_path = "/content/drive/My Drive/Datasets/Polish/test_set_clean_only_text.txt"
tag_path = "/content/drive/My Drive/Datasets/Polish/test_set_clean_only_tags.txt"

# Clean data
#polish = Polish(text_path,tag_path)
#test_data = polish.clean_data()

#Clean data
text_path = "/content/drive/My Drive/Datasets/Croatian/Croatian_test_set.csv"
croatian = Croatian(text_path)
test_data = croatian.clean_data()


#test_data = pd.concat([polish_data,slovenian_data,croatian_data])

language = 'multilingual'
Bert_model = language_model(language)

# Tokenize Data (same max_length of 60 as Transform_Data uses for training)
tokenizer = BertTokenizer.from_pretrained(Bert_model)

test_encodings = tokenizer.batch_encode_plus(test_data['text'].tolist(),add_special_tokens = True, truncation=True, padding=True, max_length=60,return_tensors='pt')

# NOTE(review): with return_tensors='pt' the encodings are already tensors, so
# torch.tensor(...) only makes a copy (PyTorch warns about this pattern).
test_seq = torch.tensor(test_encodings['input_ids'])
test_mask = torch.tensor(test_encodings['attention_mask'])

# Labels are kept out of the dataset; the loader preserves row order so the
# predictions line up with test_y.
test_dataset = TensorDataset(test_seq, test_mask)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

test_y = torch.tensor(test_data['label'].tolist())

# Fresh model; bert_predict loads the saved weights into it before predicting.
bert = BertModel.from_pretrained(Bert_model)
model = BERT_Arch(bert,False)
model.to(device)



test_pred = bert_predict(model, test_dataloader)
#test_pred = predict(model,test_seq,test_mask,test_y)
test_label = test_y.detach().cpu().numpy()
print(len(test_label))
print(len(test_pred))
print(classification_report(test_label,test_pred))


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2120
2120
              precision    recall  f1-score   support

           0       0.76      0.54      0.63       829
           1       0.75      0.89      0.81      1291

    accuracy                           0.75      2120
   macro avg       0.75      0.71      0.72      2120
weighted avg       0.75      0.75      0.74      2120



In [52]:
# Sanity check: number of test labels, followed by the raw predictions.
# NOTE(review): the second print writes the entire prediction list to the output.
print(len(test_label))
print(test_pred)

2120
[1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0