<a href="https://colab.research.google.com/github/vlavrent/Multilingual-Hate-Speech-Detection/blob/main/Translate_Polish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [None]:
pip install sentencepiece

In [None]:
# Mount Google Drive (Colab only) at /content/drive; the dataset paths used
# further down in this notebook all live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch

# Select the compute device once; later cells move the model and batches to it.
if not torch.cuda.is_available():
    # No CUDA device is visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

In [None]:
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
#from torchsampler import ImbalancedDatasetSampler
import torch
import random
import numpy as np
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.model_selection import KFold
import torch, gc
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import f1_score,classification_report
from transformers import RobertaTokenizer, RobertaModel
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from transformers import AutoTokenizer, AutoModelForMaskedLM
from sklearn.model_selection import train_test_split
from transformers import MarianMTModel, MarianTokenizer
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer


<h1>Preprocess Polish</h1>

In [None]:
class Polish():
    """Load and clean a Polish dataset stored as two line-aligned files.

    The text file and the tag file are each read line by line into a
    one-column DataFrame. ``clean_data`` normalises the text, converts the
    tags to integers and returns a single DataFrame with the columns
    ``'text'`` and ``'label'``.
    """

    def __init__(self,data_path,tag_path):
        """Read both files immediately.

        Args:
            data_path: path of the UTF-8 text file, one sample per line.
            tag_path: path of the tag file, one integer label per line.
        """
        self.path = data_path
        self.tag_path = tag_path
        self.data = self.read_data()
        self.tag = self.read_tag()
        self.column = 'text'
        self.label = 'label'

    def read_data(self):
        """Return the lines of the text file as a one-column DataFrame (column ``0``)."""
        # Read inside a context manager so the file handle is always closed.
        with open(self.path, "r", encoding="utf8") as text_file:
            return pd.DataFrame(text_file.readlines())

    def read_tag(self):
        """Return the lines of the tag file as a one-column DataFrame (column ``0``)."""
        # Read inside a context manager so the file handle is always closed.
        with open(self.tag_path,'r') as tag_file:
            return pd.DataFrame(tag_file.readlines())

    def remove_punctuation(self,data,column):
        """Return ``data[column]`` with every non-word, non-space character replaced by a space."""
        return data[column].apply(lambda x: re.sub(r'[^\w\s]',' ',x))

    def lower(self):
        """Return the text column in lower case."""
        return self.data[self.column].str.lower()

    def rename_columns(self,data,column):
        """Return ``data`` with its default column ``0`` renamed to ``column``."""
        return data.rename(columns={0:column})

    def remove_mentions(self):
        """Return the text column with ``@name_name`` style mentions deleted.

        The pattern needs the leading ``@``, so this has to run before
        ``remove_punctuation`` (which replaces ``@`` with a space).
        """
        return self.data[self.column].apply(lambda row: re.sub("@[A-Za-z0-9]+_[A-Za-z0-9]+","",row))

    def remove_end_line(self,data,column):
        """Return ``data[column]`` with newline characters removed."""
        return data[column].str.replace('\n','')

    def concat(self):
        """Attach the label column to the text frame (aligned on the row index) and return it."""
        self.data[self.label] = self.tag[self.label]

        return self.data

    def convert_int(self):
        """Return the label column converted to ``int``."""
        return self.tag[self.label].apply(lambda x: int(x))

    def clean_data(self):
        """Run the full cleaning pipeline and return the text/label DataFrame.

        Returns:
            ``self.data`` with the cleaned ``'text'`` column and the integer
            ``'label'`` column.
        """
        text_column = self.column
        label_column = self.label

        # Rename columns in both label and text data
        self.data = self.rename_columns(self.data,text_column)
        self.tag = self.rename_columns(self.tag, label_column)

        # Remove user mentions first: once punctuation is stripped the '@'
        # is gone and the mention pattern can no longer match.
        self.data[text_column] = self.remove_mentions()

        # Remove punctuation
        self.data[text_column] = self.remove_punctuation(self.data,text_column)

        # Lower words in text data
        self.data[text_column] = self.lower()

        # Remove end line character from label and text data
        self.data[text_column] = self.remove_end_line(self.data,text_column)
        self.tag[label_column] = self.remove_end_line(self.tag,label_column)

        # Convert label to int
        self.tag[label_column] = self.convert_int()

        # Concat text and labels
        return self.concat()

<h1>Preprocess English</h1>

In [None]:
class English():
    """Load and clean an English CSV dataset with ``text`` and ``label`` columns."""

    def __init__(self,path):
        """Read the CSV at ``path`` and remember the column names used for cleaning."""
        self.data = self.read(path)
        self.label = 'label'
        self.column = 'text'

    def read(self,path):
        """Return the CSV at ``path`` as a DataFrame."""
        return pd.read_csv(path)

    def replace_label(self,x):
        """Map a raw label to ``'NOT'`` or ``'HOF'``.

        ``'NOT'`` is returned for the integer ``2``, for any string containing
        ``'normal'`` and for an already normalised ``'NOT'`` (so applying the
        mapping twice does not flip labels). Every other value becomes ``'HOF'``.
        """
        if isinstance(x, str):
            # The substring test is only valid on strings; doing it on an
            # integer label raises TypeError.
            return 'NOT' if ('normal' in x or x == 'NOT') else 'HOF'
        return 'NOT' if x == 2 else 'HOF'

    def fix_label(self):
        """Normalise the label column to ``'NOT'``/``'HOF'`` in place and return the frame."""
        self.data[self.label] = self.data[self.label].apply(lambda x: self.replace_label(x))
        return self.data

    def replace_mentions(self):
        """Replace ``@user`` mentions in the text column with the token ``mention``."""
        self.data[self.column] = self.data[self.column].apply(lambda row: re.sub("@[A-Za-z0-9]+_*[A-Za-z0-9]+", "mention", row))
        # Collapse a trailing underscore left behind by the pattern above.
        self.data[self.column] = self.data[self.column].apply(lambda row: re.sub("mention_", "mention", row))

    def remove_punctuation(self):
        """Delete every non-word, non-space character from the text column."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    def replace_hashtag(self):
        """Replace ``#tag`` hashtags in the text column with the token ``hashtag``."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r"#[\w]+", "hashtag", x))

    def remove_stopwords(self):
        """Drop NLTK English stop words (case-sensitive match) from the text column."""
        # Load the list once into a set instead of re-reading it for every token.
        stop_words = set(stopwords.words("english"))
        self.data[self.column] = self.data[self.column].apply(
            lambda text: " ".join(token for token in text.split() if token not in stop_words)
        )

    def remove_url(self):
        """Delete URLs from the text column."""
        self.data[self.column] = self.data[self.column].apply(lambda x: re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '',x))

    def lower(self):
        """Return the text column in lower case (the frame itself is not modified)."""
        return self.data[self.column].str.lower()

    def binarize_labels(self):
        """Encode the label column as ``0`` for ``'NOT'`` and ``1`` for anything else."""
        self.data[self.label] = self.data[self.label].apply(lambda x: 0 if x=='NOT' else 1)

    def clean_data(self):
        """Run the full cleaning pipeline in place and return the DataFrame."""
        # Fix labels
        self.data = self.fix_label()

        # Remove urls
        self.remove_url()

        # Replace mentions
        self.replace_mentions()

        # Replace hashtags
        self.replace_hashtag()

        # Remove punctuation
        self.remove_punctuation()

        # Remove stopwords
        self.remove_stopwords()

        # Lower text: lower() only returns the lowered column, so store it.
        self.data[self.column] = self.lower()

        # Binarize labels
        self.binarize_labels()

        return self.data

<h1>Load English Data</h1>

In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus used by English.remove_stopwords.
nltk.download('stopwords')

text_path = "/content/drive/My Drive/Datasets/English/English_train_set.csv"

# Load the English training set and normalise its labels to 'NOT'/'HOF'.
english = English(text_path)
data = english.fix_label()

# Encode the labels as integers: 1 for 'HOF', 0 for everything else.
data['label'] = data['label'].map(lambda tag: int(tag == 'HOF'))

<h1>Find imbalanced class</h1>

In [None]:
text_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_text.txt"
tag_path = "/content/drive/My Drive/Datasets/Polish/training_set_clean_only_tags.txt"

# Clean Polish data (labels)
polish = Polish(text_path,tag_path)
data_p = polish.clean_data()

# Count the samples per class once; value_counts() lists the most frequent class first.
label_counts = data_p['label'].value_counts()
classes = label_counts.to_list()

# Difference of samples between the two most frequent classes.
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print('Class difference:', abs(classes[0]-classes[1]))

# Imbalance in classes
print(label_counts)

In [None]:
from sklearn.model_selection import train_test_split

# Count the Polish labels once; value_counts() sorts by frequency, most common first.
label_counts = data_p['label'].value_counts()
class_tags = label_counts.to_list()

# Find class to balance (the least frequent Polish label)
balance_class_index = class_tags.index(min(class_tags))
class_index = label_counts.index.to_list()

# Find class not to balance (the most frequent Polish label)
balance_class_index_not = class_tags.index(max(class_tags))
class_index_not = label_counts.index.to_list()

# Always start from the full English frame held by `english`. On a fresh run
# this is the same object as `data`, but unlike `data` it is not rebound at the
# end of this cell, so re-running the cell gives the same result.
english_full = english.data

# Choose class to sample
sample_data = english_full[english_full['label']==class_index[balance_class_index]]

# Choose class not to sample
not_sample_data = english_full[english_full['label']==class_index_not[balance_class_index_not]]

# Split into sample to translate and sample to not translate.
# The number of rows to translate equals the Polish class gap; the fixed
# random_state makes the split reproducible across runs.
use_sample ,translate_sample = train_test_split(
    sample_data,
    test_size=abs(class_tags[0]-class_tags[1]),
    random_state=42,
)

# Data to use for training
data = pd.concat([use_sample,not_sample_data])

<h1>Load Model</h1>

In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Tokenizer for the pretrained M2M100 checkpoint.
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

# Load the model from the same checkpoint and move it to the selected device.
# `.to()` returns the module itself, so `model` is the device-resident model.
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(device)

# Last expression of the cell: displays the model architecture.
model

<h1>Translate in Batches</h1>

In [None]:
# Choose a part of the data
new = translate_sample[:200]

# M2M100 reads the source language from the tokenizer; set it explicitly
# before encoding the English text.
tokenizer.src_lang = "en"

# Tokenize data (padded to the longest sample, truncated to the model's maximum length)
encoded_hi = tokenizer(new['text'].tolist(), return_tensors="pt", padding=True, truncation=True)

# Create DataLoader
test_dataset = TensorDataset(encoded_hi['input_ids'],encoded_hi['attention_mask'])
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset,sampler=test_sampler,batch_size=8)


# All translations, in input order
translated_text = []

# Translate in batches; each batch is also written to its own CSV file
for i, batch in enumerate(test_dataloader, start=1):

    b_input,b_attn_mask = [t.to(device) for t in batch]

    # Pass the attention mask so the padding tokens added by the tokenizer
    # are ignored by the encoder instead of being treated as real input.
    generated_tokens = model.generate(
        input_ids=b_input,
        attention_mask=b_attn_mask,
        forced_bos_token_id=tokenizer.get_lang_id("pl"),
    )
    encoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    translated_text.extend(encoded)
    pd.DataFrame(encoded).to_csv('/content/drive/My Drive/Datasets/Polish/Batch_'+str(i)+'.csv')


<h1></h1>

<h1>Concatenate the batch CSV files into one</h1>

In [None]:
import glob, os
import pandas as pd

# The working directory is switched to the dataset root. The batch files are
# located with an absolute pattern below, so the lookup does not depend on it.
os.chdir("/content/drive/My Drive/Datasets")

# The translation step writes its Batch_<n>.csv files into the Polish folder.
batch_files = glob.glob("/content/drive/My Drive/Datasets/Polish/Batch_*.csv")

# glob returns files in arbitrary order; sorting by (length, name) restores the
# numeric batch order (Batch_2 before Batch_10).
batch_files = sorted(batch_files, key=lambda path: (len(path), path))

if not batch_files:
    raise FileNotFoundError(
        "No Batch_*.csv files found in /content/drive/My Drive/Datasets/Polish"
    )

add = []
for file in batch_files:
    add.append(pd.read_csv(file))

pd.concat(add).to_csv('/content/drive/My Drive/Datasets/Translated_Polish.csv',index=False)