# mBERT Language Family Experiments

## Imports

In [2]:
! pip install transformers datasets --quiet

[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
[K     |████████████████████████████████| 298 kB 57.2 MB/s 
[K     |████████████████████████████████| 596 kB 56.4 MB/s 
[K     |████████████████████████████████| 61 kB 495 kB/s 
[K     |████████████████████████████████| 895 kB 59.8 MB/s 
[K     |████████████████████████████████| 3.3 MB 46.5 MB/s 
[K     |████████████████████████████████| 132 kB 59.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 46.0 MB/s 
[K     |████████████████████████████████| 243 kB 54.7 MB/s 
[K     |████████████████████████████████| 192 kB 58.8 MB/s 
[K     |████████████████████████████████| 271 kB 54.8 MB/s 
[K     |████████████████████████████████| 160 kB 57.8 MB/s 
[?25h

In [3]:
! sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,374 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 155222 files and directories c

In [3]:
! transformers-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: vidhur2k
Password: 
ERROR:root:HfApi.login: This method is deprecated in favor of `set_access_token`.
Login successful
Your token: KzAihDRDhKJPpeIYHSuCobGeBLQrKfLphUuNMGfEvFhsgTosGnKOXMRtpMcOjYOwZkKowiOuxfxbgXebInUtEGpAKPkdPcqUFWwktmWphjaYRysxJKigjQmvJUiNWCGm 

Your token has been saved to /root/.huggingface/token


In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score
import datasets
from datasets import load_dataset, Dataset, load_metric
from tqdm.auto import tqdm

In [5]:
MODEL_NAME = 'bert-base-multilingual-cased'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [6]:
def get_gpu_info():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)

## Preprocess the data using the tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [8]:
def tokenize_function(example):
    if example['text'] is None:
      return tokenizer('', truncation=True, padding='max_length')
    return tokenizer(example['text'], truncation=True, padding='max_length')

In [9]:
def load_and_tokenize_dataset(csv_file: str):
    dataset = load_dataset('csv', data_files=csv_file)
    print(dataset['train'].column_names)
    has_unnamed_col = 'Unnamed: 0' in dataset['train'].column_names
    if has_unnamed_col:
      dataset = dataset.rename_column('Unnamed: 0', 'idx')
    dataset = dataset['train'].train_test_split(test_size=0.2)
    
    tokenized_datasets = dataset.map(tokenize_function)
    for dataset in ['train', 'test']:
        if 'id' in tokenized_datasets[dataset].column_names:
          tokenized_datasets[dataset] = tokenized_datasets[dataset].remove_columns(['id'])
        if has_unnamed_col:
          tokenized_datasets[dataset] = tokenized_datasets[dataset].remove_columns(['text', 'idx', 'token_type_ids'])
        else:
          tokenized_datasets[dataset] = tokenized_datasets[dataset].remove_columns(['text', 'token_type_ids'])
        tokenized_datasets[dataset] = tokenized_datasets[dataset].rename_column('hs', 'labels')
        tokenized_datasets[dataset].set_format('torch')
    return tokenized_datasets

### Romance Languages

In [10]:
languages = ['french', 'italian', 'portuguese', 'spanish']

In [11]:
ds = []
for lang in languages:
  df = pd.read_csv(f'https://raw.githubusercontent.com/vidhur2k/Multilngual-Hate-Speech/main/data/all-processed/B_{lang}_processed.csv')
  if 'Unnamed: 0' in df.columns:
    df = df.drop(['Unnamed: 0'], axis=1)
  if 'id' in df.columns:
    df = df.drop(['id'], axis=1)
  ds.append(df)  

In [12]:
train_datasets = []
test_datasets = {}
for i in range(len(languages)):
  dataset = Dataset.from_pandas(ds[i])
  dataset = dataset.train_test_split(test_size=0.2)
  train_datasets.append(dataset['train'])
  test_datasets[languages[i]] = dataset['test']

In [13]:
train_dataset = datasets.concatenate_datasets(train_datasets)

In [14]:
tokenized_dataset = train_dataset.map(tokenize_function)

  0%|          | 0/25128 [00:00<?, ?ex/s]

In [15]:
tokenized_dataset = tokenized_dataset.remove_columns(['text', 'token_type_ids'])
tokenized_dataset = tokenized_dataset.rename_column('hs', 'labels')        
tokenized_dataset.set_format('torch')

In [16]:
tokenized_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 25128
})

In [17]:
get_gpu_info()

Mon Dec  6 04:29:32 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    28W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Define Train and Test Loaders

In [18]:
def get_train_loader(tokenized_dataset, batch_size):
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_loader = DataLoader(tokenized_dataset, shuffle=True, batch_size=batch_size, collate_fn=data_collator)
    return train_loader

## Model Training

In [24]:
# Define training hyperparameters for the monolingual scenario
n_epochs = 5
lr = 5e-5
batch_size = 64

def train(train_loader):
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
  model.to(device)

  optimizer = AdamW(model.parameters(), lr=lr)
  n_training_steps = n_epochs * len(train_loader)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=n_training_steps,
  )

  progress = tqdm(range(n_training_steps))
  model.train()

  for epoch in range(n_epochs):
    for batch in train_loader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress.update(1)

  return model


def test(test_loader, model):
  progress = tqdm(range(len(test_loader)))
  accuracy_metric = load_metric("accuracy")
  model.to(device)
  model.eval()
  preds = []
  trues = []
  for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      outputs = model(**batch)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    accuracy_metric.add_batch(predictions=predictions, references=batch["labels"])
    preds.extend(predictions.tolist())
    trues.extend(batch['labels'].tolist())
    progress.update(1)
  
  print(accuracy_metric.compute())
  print(f'F1 Score: {f1_score(trues, preds, average="weighted")}')
  print(f'AUC Score: {roc_auc_score(trues, preds, average="weighted")}')

In [20]:
train_loader = get_train_loader(tokenized_dataset, batch_size=8)

In [21]:
romance_langauge_model = train(train_loader)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  0%|          | 0/15705 [00:00<?, ?it/s]

In [23]:
romance_langauge_model.push_to_hub("vidhur2k/multilingual-hate-speech/mBERT-RomanceLang")

Cloning https://huggingface.co/vidhur2k/mBERT-RomanceLang into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.38k/679M [00:00<?, ?B/s]

To https://huggingface.co/vidhur2k/mBERT-RomanceLang
   4c94813..e2dc0e8  main -> main



'https://huggingface.co/vidhur2k/mBERT-RomanceLang/commit/e2dc0e87d49e901f5c294d2cc2b3b814d8bd4622'

### Germanic Languages

In [36]:
languages = ['danish', 'english', 'german']

In [37]:
ds = []
for lang in languages:
  df = pd.read_csv(f'https://raw.githubusercontent.com/vidhur2k/Multilngual-Hate-Speech/main/data/all-processed/B_{lang}_processed.csv')
  if 'Unnamed: 0' in df.columns:
    df = df.drop(['Unnamed: 0'], axis=1)
  if 'id' in df.columns:
    df = df.drop(['id'], axis=1)
  ds.append(df)  

In [38]:
train_datasets = []
test_datasets = {}
for i in range(len(languages)):
  dataset = Dataset.from_pandas(ds[i])
  dataset = dataset.train_test_split(test_size=0.2)
  train_datasets.append(dataset['train'])
  test_datasets[languages[i]] = dataset['test']

In [27]:
train_dataset = datasets.concatenate_datasets(train_datasets)

In [28]:
tokenized_dataset = train_dataset.map(tokenize_function)

  0%|          | 0/73743 [00:00<?, ?ex/s]

In [29]:
tokenized_dataset = tokenized_dataset.remove_columns(['text', 'token_type_ids'])
tokenized_dataset = tokenized_dataset.rename_column('hs', 'labels')        
tokenized_dataset.set_format('torch')

In [30]:
tokenized_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 73743
})

In [31]:
train_loader = get_train_loader(tokenized_dataset, batch_size=8)

In [32]:
germanic_language_model = train(train_loader)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

  0%|          | 0/46090 [00:00<?, ?it/s]

In [33]:
germanic_language_model.push_to_hub("vidhur2k/multilingual-hate-speech/mBERT-GermanicLang")

Cloning https://huggingface.co/vidhur2k/mBERT-GermanicLang into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.38k/679M [00:00<?, ?B/s]

To https://huggingface.co/vidhur2k/mBERT-GermanicLang
   0c4a95b..aad867f  main -> main



'https://huggingface.co/vidhur2k/mBERT-GermanicLang/commit/aad867fc2edb0aa7d46fea8046523572794b44c9'

## Evaluation

We evaluate the performance of the model on a language-by-language basis

In [13]:
romance_langauge_model = AutoModelForSequenceClassification.from_pretrained('vidhur2k/mBERT-RomanceLang')

Downloading:   0%|          | 0.00/895 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679M [00:00<?, ?B/s]

In [16]:
def tokenize_test_dataset(dataset):
  tokenized_dataset = dataset.map(tokenize_function)
  tokenized_dataset = tokenized_dataset.remove_columns(['text', 'token_type_ids'])
  tokenized_dataset = tokenized_dataset.rename_column('hs', 'labels')        
  tokenized_dataset.set_format('torch')
  return tokenized_dataset

In [20]:
def get_test_loader(tokenized_dataset, batch_size):
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_loader = DataLoader(tokenized_dataset, shuffle=True, batch_size=batch_size, collate_fn=data_collator)
    return train_loader

### French

In [18]:
french_tokenized_dataset = tokenize_test_dataset(test_datasets['french'])

  0%|          | 0/206 [00:00<?, ?ex/s]

In [22]:
french_loader = get_test_loader(french_tokenized_dataset, batch_size = 8)

In [25]:
test(french_loader, romance_langauge_model)

  0%|          | 0/26 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

{'accuracy': 0.7961165048543689}
F1 Score: 0.7057465232222513
AUC Score: 0.5


### Portuguese

In [26]:
portuguese_tokenized_dataset = tokenize_test_dataset(test_datasets['portuguese'])

  0%|          | 0/1134 [00:00<?, ?ex/s]

In [27]:
portuguese_loader = get_test_loader(portuguese_tokenized_dataset, batch_size = 8)

In [28]:
test(portuguese_loader, romance_langauge_model)

  0%|          | 0/142 [00:00<?, ?it/s]

{'accuracy': 0.7874779541446209}
F1 Score: 0.6938508268881562
AUC Score: 0.5


### Italian

In [29]:
italian_tokenized_dataset = tokenize_test_dataset(test_datasets['italian'])

  0%|          | 0/2424 [00:00<?, ?ex/s]

In [30]:
italian_loader = get_test_loader(italian_tokenized_dataset, batch_size = 8)

In [31]:
test(italian_loader, romance_langauge_model)

  0%|          | 0/303 [00:00<?, ?it/s]

{'accuracy': 0.7058580858085809}
F1 Score: 0.5841466432012005
AUC Score: 0.5


### Spanish

In [32]:
spanish_tokenized_dataset = tokenize_test_dataset(test_datasets['spanish'])

  0%|          | 0/2520 [00:00<?, ?ex/s]

In [33]:
spanish_loader = get_test_loader(spanish_tokenized_dataset, batch_size = 8)

In [34]:
test(spanish_loader, romance_langauge_model)

  0%|          | 0/315 [00:00<?, ?it/s]

{'accuracy': 0.6543650793650794}
F1 Score: 0.5176531618484126
AUC Score: 0.5


## Germanic Languages

In [35]:
germanic_language_model = AutoModelForSequenceClassification.from_pretrained('vidhur2k/mBERT-GermanicLang')

Downloading:   0%|          | 0.00/895 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679M [00:00<?, ?B/s]

### English

In [40]:
english_tokenized_dataset = tokenize_test_dataset(test_datasets['english'])

  0%|          | 0/16389 [00:00<?, ?ex/s]

In [41]:
english_loader = get_test_loader(english_tokenized_dataset, batch_size = 8)

In [42]:
test(english_loader, germanic_language_model)

  0%|          | 0/2049 [00:00<?, ?it/s]

{'accuracy': 0.8228689974983221}
F1 Score: 0.742909543113799
AUC Score: 0.5


### German

In [43]:
german_tokenized_dataset = tokenize_test_dataset(test_datasets['german'])

  0%|          | 0/1393 [00:00<?, ?ex/s]

In [44]:
german_loader = get_test_loader(german_tokenized_dataset, batch_size = 8)

In [45]:
test(german_loader, germanic_language_model)

  0%|          | 0/175 [00:00<?, ?it/s]

{'accuracy': 0.7803302225412778}
F1 Score: 0.6840475418567492
AUC Score: 0.5


### Danish

In [46]:
danish_tokenized_dataset = tokenize_test_dataset(test_datasets['danish'])

  0%|          | 0/655 [00:00<?, ?ex/s]

In [47]:
danish_loader = get_test_loader(danish_tokenized_dataset, batch_size = 8)

In [48]:
test(danish_loader, germanic_language_model)

  0%|          | 0/82 [00:00<?, ?it/s]

{'accuracy': 0.867175572519084}
F1 Score: 0.805487694506688
AUC Score: 0.5
