# mBERT Model Multilingual Experiments

## Imports

In [None]:
! pip install transformers datasets --quiet

[K     |████████████████████████████████| 3.1 MB 4.3 MB/s 
[K     |████████████████████████████████| 298 kB 91.9 MB/s 
[K     |████████████████████████████████| 596 kB 58.3 MB/s 
[K     |████████████████████████████████| 61 kB 625 kB/s 
[K     |████████████████████████████████| 3.3 MB 72.2 MB/s 
[K     |████████████████████████████████| 895 kB 67.2 MB/s 
[K     |████████████████████████████████| 132 kB 68.4 MB/s 
[K     |████████████████████████████████| 243 kB 65.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 55.3 MB/s 
[K     |████████████████████████████████| 271 kB 90.3 MB/s 
[K     |████████████████████████████████| 192 kB 99.0 MB/s 
[K     |████████████████████████████████| 160 kB 89.9 MB/s 
[?25h

In [None]:
! sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 3s (850 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories cur

In [None]:
# Interactive Hugging Face Hub login (prompts for username and password).
# NOTE(review): this command echoes the access token into the cell output —
# clear the output before committing or sharing the notebook, and revoke any
# token that has already been exposed.
! transformers-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: vidhur2k
Password: 
ERROR:root:HfApi.login: This method is deprecated in favor of `set_access_token`.
Login successful
Your token: [REDACTED]

Your token has been saved to /root/.huggingface/token


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
import datasets
from datasets import load_dataset, Dataset, load_metric
from tqdm.auto import tqdm

In [None]:
# Checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = 'bert-base-multilingual-cased'

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [None]:
def get_gpu_info():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)

## Preprocess the data using the tokenizer

In [None]:
# Load the tokenizer that matches the MODEL_NAME checkpoint; it is used as a
# module-level global by the tokenization and collation helpers below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [None]:
def tokenize_function(example):
    """Tokenize the `text` field of a single example.

    A missing text (`None`) is tokenized as the empty string. Sequences are
    truncated and padded with `padding='max_length'`, so every example is padded
    to the tokenizer's maximum length rather than to the longest item in a batch.

    Args:
        example: mapping with a `'text'` entry (a string or `None`).

    Returns:
        Whatever the module-level `tokenizer` returns for that text.
    """
    text = example['text']
    if text is None:
        text = ''
    return tokenizer(text, truncation=True, padding='max_length')

In [None]:
def load_and_tokenize_dataset(csv_file: str, test_size: float = 0.2, seed=None):
    """Load a CSV, split it into train/test, and tokenize both splits.

    Args:
        csv_file: path or URL passed to `load_dataset('csv', data_files=...)`.
        test_size: fraction of rows held out for the test split (default 0.2,
            the value that was previously hard-coded).
        seed: optional seed forwarded to `train_test_split`; pass an integer
            for a reproducible split. `None` keeps the previous unseeded behavior.

    Returns:
        The tokenized splits keyed by `'train'` and `'test'`, with the `hs`
        column renamed to `labels`, bookkeeping/text columns removed, and the
        output format set to `'torch'`.
    """
    raw = load_dataset('csv', data_files=csv_file)
    print(raw['train'].column_names)
    has_unnamed_col = 'Unnamed: 0' in raw['train'].column_names
    if has_unnamed_col:
        raw = raw.rename_column('Unnamed: 0', 'idx')
    splits = raw['train'].train_test_split(test_size=test_size, seed=seed)

    tokenized_datasets = splits.map(tokenize_function)

    # Columns the model must not receive. 'idx' only exists when the CSV
    # carried a pandas index column that was renamed above.
    droppable = ['id', 'text', 'token_type_ids']
    if has_unnamed_col:
        droppable.append('idx')

    for split_name in ['train', 'test']:
        split = tokenized_datasets[split_name]
        # Remove only what is actually present, so a tokenizer that emits no
        # token_type_ids does not make remove_columns raise.
        present = [col for col in droppable if col in split.column_names]
        split = split.remove_columns(present)
        split = split.rename_column('hs', 'labels')
        split.set_format('torch')
        tokenized_datasets[split_name] = split
    return tokenized_datasets

In [None]:
# Languages covered by the experiments; each name is substituted into the
# per-language CSV file name when the data is downloaded.
languages = [
    'arabic',
    'danish',
    'english',
    'french',
    'german',
    'hindi',
    'indonesian',
    'italian',
    'portuguese',
    'spanish',
    'turkish',
]

In [None]:
# Download one processed CSV per language and strip bookkeeping columns
# ('Unnamed: 0' and 'id') when they are present. `ds[i]` lines up with `languages[i]`.
ds = []
for lang in languages:
  url = f'https://raw.githubusercontent.com/vidhur2k/Multilngual-Hate-Speech/main/data/all-processed/B_{lang}_processed.csv'
  df = pd.read_csv(url)
  for unwanted in ('Unnamed: 0', 'id'):
    if unwanted in df.columns:
      df = df.drop([unwanted], axis=1)
  ds.append(df)

In [None]:
# Split every language 80/20. The split is seeded so the held-out test sets
# (and therefore the reported metrics) are the same on every Restart & Run All.
SPLIT_SEED = 42

train_datasets = []   # one training split per language, in `languages` order
test_datasets = {}    # language name -> held-out test split
for i, lang in enumerate(languages):
  dataset = Dataset.from_pandas(ds[i])
  dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
  train_datasets.append(dataset['train'])
  test_datasets[lang] = dataset['test']

In [None]:
# Merge the per-language training splits into one multilingual training set.
train_dataset = datasets.concatenate_datasets(train_datasets)

In [None]:
# Tokenize the combined training set; `map` is called without `batched=True`,
# so `tokenize_function` receives one example at a time.
tokenized_dataset = train_dataset.map(tokenize_function)

  0%|          | 0/153102 [00:00<?, ?ex/s]

In [None]:
# Keep only what the model consumes: drop the raw text and token_type_ids,
# expose the `hs` column under the name `labels`, and return torch tensors.
# Note: this cell rebinds `tokenized_dataset`, so it cannot be run twice in a
# row (the dropped columns are gone after the first run).
tokenized_dataset = (
    tokenized_dataset
    .remove_columns(['text', 'token_type_ids'])
    .rename_column('hs', 'labels')
)
tokenized_dataset.set_format('torch')

In [None]:
# Display the remaining feature columns and the row count as a sanity check.
tokenized_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 153102
})

In [None]:
# Print the nvidia-smi report for the runtime before training starts.
get_gpu_info()

Sun Dec  5 14:34:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    28W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Define Train and Test Loaders

In [None]:
def get_train_loader(tokenized_dataset, batch_size):
    """Build a shuffling DataLoader over `tokenized_dataset`.

    Batches are assembled by a `DataCollatorWithPadding` built from the
    module-level `tokenizer`.

    Args:
        tokenized_dataset: dataset handed to the DataLoader unchanged.
        batch_size: number of examples per batch.

    Returns:
        The configured `DataLoader`.
    """
    return DataLoader(
        tokenized_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    )

## Model Training

#### Plan

We train mBERT under the following classification scenarios:

1. **Monolingual-Train Monolingual-Test**: Train it on language X and test on X as well.
2. **Multilingual-Train Monolingual-Test**: Train it on a set of languages ($X_1, X_2, \dots, X_n$) and test on Y. We run this scenario both with and without Y in the training set.

In [None]:
# Define training hyperparameters for the monolingual scenario
# NOTE(review): `train` reads `n_epochs` and `lr` as globals, and the visible
# call to it trains the multilingual model — confirm the "monolingual" label.
n_epochs = 5      # full passes over the training loader
lr = 5e-5         # initial learning rate handed to the optimizer
# NOTE(review): the visible get_train_loader call passes batch_size=8 literally,
# so this value is not what the training loader uses — confirm which is intended.
batch_size = 64

def train(train_loader, num_epochs=None, learning_rate=None):
  """Fine-tune a fresh `MODEL_NAME` sequence classifier on `train_loader`.

  A new two-label classifier is loaded from the `MODEL_NAME` checkpoint on
  every call, optimized with AdamW and a linear learning-rate schedule without
  warm-up.

  Args:
    train_loader: sized iterable of batches. Each batch is a dict whose values
      support `.to(device)`; the dict is forwarded to the model as keyword
      arguments, and the model output must expose `.loss` (presumably the
      batch therefore carries `labels` — confirm against the data pipeline).
    num_epochs: number of passes over `train_loader`. Defaults to the
      module-level `n_epochs`.
    learning_rate: initial learning rate. Defaults to the module-level `lr`.

  Returns:
    The trained model, on `device` and still in training mode.
  """
  # Fall back to the notebook-level hyperparameters so existing
  # `train(train_loader)` calls behave exactly as before.
  epochs = n_epochs if num_epochs is None else num_epochs
  step_size = lr if learning_rate is None else learning_rate

  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
  model.to(device)

  optimizer = AdamW(model.parameters(), lr=step_size)
  n_training_steps = epochs * len(train_loader)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=n_training_steps,
  )

  progress = tqdm(range(n_training_steps))
  model.train()

  try:
    for _ in range(epochs):
      for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the schedule, then clear gradients so
        # they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress.update(1)
  finally:
    # Always release the progress bar, even if training is interrupted.
    progress.close()

  return model


def test(test_loader, model):
  progress = tqdm(range(len(test_loader)))
  accuracy_metric = load_metric("accuracy")
  model.to(device)
  model.eval()
  preds = []
  trues = []
  for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
    preds.extend(predictions.tolist())
    trues.extend(batch['labels'].tolist())
    progress.update(1)
  
  print(accuracy_metric.compute())
  print(f'F1 Score: {f1_score(trues, preds, average="weighted")}')
  print(f'AUC Score: {roc_auc_score(trues, preds, average="weighted")}')

In [None]:
# Build the shuffled training loader over the combined multilingual set.
# NOTE(review): the batch size is the literal 8 here; the module-level
# `batch_size = 64` is not used by this call — confirm which is intended.
train_loader = get_train_loader(tokenized_dataset, batch_size=8)

In [None]:
# Fine-tune mBERT on all languages at once, using the module-level
# `n_epochs` and `lr` hyperparameters.
multilingual_model = train(train_loader)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  0%|          | 0/95690 [00:00<?, ?it/s]

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
# NOTE(review): the argument contains two '/' separators — confirm that it
# resolves to the intended repository with the installed transformers version.
multilingual_model.push_to_hub("vidhur2k/multilingual-hate-speech/mBERT-Multi")

## Evaluation

We evaluate the performance of the model on a language-by-language basis.