This notebook shows **how to train** a BERT2BERT model, initialized with AraBERT pre-trained parameters, on the Arabic empathetic message-response dataset. A Gradio demo is provided at the end.

In [1]:
#Install dependencies
# NOTE(review): the install log below shows git-python 1.0.3 is a ~2 kB wheel that
# in turn collects GitPython -- confirm which of the two is actually required.
!pip install git-python==1.0.3
!pip install sacrebleu==1.4.2
# NOTE(review): rouge_score, farasapy, pyarabic and datasets are not pinned, so a
# fresh run may resolve different versions than the ones recorded in the log below.
!pip install rouge_score
!pip install farasapy
# The arabert repo is cloned (not pip-installed); a later cell imports
# ArabertPreprocessor from arabert.preprocess.
!git clone https://github.com/aub-mind/arabert
!pip install pyarabic
!pip install datasets
!pip install transformers==4.2
# NOTE(review): nothing in the visible cells references the dialectal-conv
# checkout -- confirm it is still needed.
!git clone  https://github.com/tareknaous/dialectal-conv/

Collecting git-python==1.0.3
  Downloading git_python-1.0.3-py2.py3-none-any.whl (1.9 kB)
Collecting gitpython
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[?25l[K     |█▉                              | 10 kB 13.5 MB/s eta 0:00:01[K     |███▋                            | 20 kB 8.9 MB/s eta 0:00:01[K     |█████▍                          | 30 kB 6.7 MB/s eta 0:00:01[K     |███████▎                        | 40 kB 6.6 MB/s eta 0:00:01[K     |█████████                       | 51 kB 3.6 MB/s eta 0:00:01[K     |██████████▉                     | 61 kB 4.3 MB/s eta 0:00:01[K     |████████████▋                   | 71 kB 4.4 MB/s eta 0:00:01[K     |██████████████▌                 | 81 kB 5.0 MB/s eta 0:00:01[K     |████████████████▎               | 92 kB 5.0 MB/s eta 0:00:01[K     |██████████████████              | 102 kB 4.2 MB/s eta 0:00:01[K     |████████████████████            | 112 kB 4.2 MB/s eta 0:00:01[K     |█████████████████████▊          | 122 kB 4.

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14
Cloning into 'arabert'...
remote: Enumerating objects: 564, done.[K
remote: Counting objects: 100% (350/350), done.[K
remote: Compressing objects: 100% (252/252), done.[K
remote: Total 564 (delta 188), reused 247 (delta 92), pack-reused 214[K
Receiving objects: 100% (564/564), 9.16 MiB | 9.02 MiB/s, done.
Resolving deltas: 100% (311/311), done.
Collecting pyarabic
  Downloading PyArabic-0.6.14-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 4.3 MB/s 
Installing collected packages: pyarabic
Successfully installed pyarabic-0.6.14
Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |███████████████████████████

In [2]:
#Fetch dataset
# Downloads the conversations CSV into the working directory.
# NOTE(review): the data is later loaded through the ArabicEmpatheticDialogues.py
# script; whether that script reads this local CSV is not visible here -- confirm.
!wget https://raw.githubusercontent.com/aub-mind/Arabic-Empathetic-Chatbot/master/arabic-empathetic-conversations.csv

--2022-03-06 19:15:23--  https://raw.githubusercontent.com/aub-mind/Arabic-Empathetic-Chatbot/master/arabic-empathetic-conversations.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7873052 (7.5M) [text/plain]
Saving to: ‘arabic-empathetic-conversations.csv’


2022-03-06 19:15:24 (65.0 MB/s) - ‘arabic-empathetic-conversations.csv’ saved [7873052/7873052]



In [6]:
# Fetch the dataset loading script that is passed to load_dataset() further down.
# NOTE(review): this comes from the tareknaous fork while the CSV above comes from
# aub-mind -- confirm the two are meant to be mixed.
!wget https://raw.githubusercontent.com/tareknaous/Arabic-Empathetic-Chatbot/master/model/ArabicEmpatheticDialogues.py

--2022-03-06 19:16:28--  https://raw.githubusercontent.com/tareknaous/Arabic-Empathetic-Chatbot/master/model/ArabicEmpatheticDialogues.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2896 (2.8K) [text/plain]
Saving to: ‘ArabicEmpatheticDialogues.py’


2022-03-06 19:16:28 (35.2 MB/s) - ‘ArabicEmpatheticDialogues.py’ saved [2896/2896]



In [3]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset 
import transformers
from transformers import BertTokenizer, EncoderDecoderModel
from sacrebleu import corpus_bleu
from transformers import BertTokenizerFast, EncoderDecoderModel
from transformers import TrainingArguments
from dataclasses import dataclass, field
from typing import Optional

In [4]:
# Maximum token lengths used when padding/truncating contexts (encoder side)
# and responses (decoder side) in process_data_to_model_inputs.
encoder_max_length=75
decoder_max_length=75
# Checkpoint name used for the tokenizer here and for the encoder/decoder
# initialisation further down.
model_name = "aubmindlab/bert-base-arabert"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

Downloading:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/637 [00:00<?, ?B/s]

In [8]:
all_data = load_dataset("ArabicEmpatheticDialogues.py")

# Split once per level and reuse the result, instead of recomputing the same
# seeded split for each side.
# 90% train / 10% held out.
train_holdout_split = all_data['train'].train_test_split(test_size=0.1, seed=42)
train_data = train_holdout_split['train']
val_data = train_holdout_split['test']

# The held-out 10% is halved into dev and test.
dev_test_split = val_data.train_test_split(test_size=0.5, seed=42)
dev_data = dev_test_split['train']
test_data = dev_test_split['test']



100%|██████████| 241M/241M [00:14<00:00, 16.6MiB/s]




Downloading and preparing dataset arabic_emp_conv/arabic_emp_conv to /root/.cache/huggingface/datasets/arabic_emp_conv/arabic_emp_conv/1.0.0/af81e5e61abf371a71d8d27b4483bf0141f8b51c942075e3fed5df7b73769b6d...


Downloading:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

Dataset arabic_emp_conv downloaded and prepared to /root/.cache/huggingface/datasets/arabic_emp_conv/arabic_emp_conv/1.0.0/af81e5e61abf371a71d8d27b4483bf0141f8b51c942075e3fed5df7b73769b6d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/arabic_emp_conv/arabic_emp_conv/1.0.0/af81e5e61abf371a71d8d27b4483bf0141f8b51c942075e3fed5df7b73769b6d/cache-a5353ab9425d6084.arrow and /root/.cache/huggingface/datasets/arabic_emp_conv/arabic_emp_conv/1.0.0/af81e5e61abf371a71d8d27b4483bf0141f8b51c942075e3fed5df7b73769b6d/cache-03b6f1e917fb3633.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/arabic_emp_conv/arabic_emp_conv/1.0.0/af81e5e61abf371a71d8d27b4483bf0141f8b51c942075e3fed5df7b73769b6d/cache-3e839d01210aa172.arrow and /root/.cache/huggingface/datasets/arabic_emp_conv/arabic_emp_conv/1.0.0/af81e5e61abf371a71d8d27b4483bf0141f8b51c942075e3fed5df7b73769b6d/cache-73a31fa43d2af265.arrow


In [9]:
# Report the number of examples in each split.
for _split_name, _split in (("train", train_data), ("dev", dev_data), ("test", test_data)):
    print(f"Length of {_split_name} data", len(_split))

Length of train data 32965
Length of dev data 1831
Length of test data 1832


In [10]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of context/response pairs into encoder-decoder inputs.

    Reads ``batch["context"]`` and ``batch["response"]``, pads/truncates them to
    ``encoder_max_length`` and ``decoder_max_length`` respectively, and stores
    ``input_ids``, ``attention_mask``, ``decoder_input_ids``, ``labels`` and
    ``decoder_attention_mask`` on the batch.

    Args:
        batch: mapping with "context" and "response" entries.

    Returns:
        The same ``batch`` object with the model input fields added.
    """
    pad_id = tokenizer.pad_token_id
    encoded_context = tokenizer(
        batch["context"], padding="max_length", truncation=True, max_length=encoder_max_length
    )
    encoded_response = tokenizer(
        batch["response"], padding="max_length", truncation=True, max_length=decoder_max_length
    )

    batch["input_ids"] = encoded_context.input_ids
    batch["attention_mask"] = encoded_context.attention_mask
    batch["decoder_input_ids"] = encoded_response.input_ids
    # Labels are the response ids with every padding position replaced by -100
    # (mask loss for padding).
    batch["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in encoded_response.input_ids
    ]
    batch["decoder_attention_mask"] = encoded_response.attention_mask

    return batch

In [11]:
# Shared batch size: used for dataset.map() batching and as the per-device
# training batch size in the training arguments.
batch_size=16

In [12]:

# Columns exposed to the model as torch tensors.
MODEL_INPUT_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def _tokenize_split(split):
    """Tokenize one dataset split and expose the model columns as torch tensors.

    Args:
        split: a dataset split with "context" and "response" columns.

    Returns:
        The tokenized split. A split that already has an "input_ids" column is
        returned unchanged, so re-running this cell does not fail on the
        already-removed "context"/"response" columns.
    """
    if "input_ids" in split.column_names:
        return split
    tokenized = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["context", "response"],
    )
    # set_format works in place; it restricts returned columns and converts them to torch.
    tokenized.set_format(type="torch", columns=MODEL_INPUT_COLUMNS)
    return tokenized


train_data = _tokenize_split(train_data)
dev_data = _tokenize_split(dev_data)
test_data = _tokenize_split(test_data)

  0%|          | 0/2061 [00:00<?, ?ba/s]

  0%|          | 0/115 [00:00<?, ?ba/s]

  0%|          | 0/115 [00:00<?, ?ba/s]

In [23]:
from transformers import EncoderDecoderModel

# Initialise both the encoder and the decoder from the same checkpoint
# (model_name), without tying their weights (tie_encoder_decoder=False).
# The load log below lists the decoder's cross-attention weights as newly
# initialised, i.e. they are not part of the checkpoint.
arabert2arabert = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name, tie_encoder_decoder=False)

Some weights of the model checkpoint at aubmindlab/bert-base-arabert were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias

In [24]:
#set special tokens
# The tokenizer's [CLS] id starts decoding, [SEP] ends it, and [PAD] is used for padding.
arabert2arabert.config.decoder_start_token_id = tokenizer.cls_token_id                                             
arabert2arabert.config.eos_token_id = tokenizer.sep_token_id
arabert2arabert.config.pad_token_id = tokenizer.pad_token_id

#sensible parameters for beam search
#set decoding params                               
# NOTE(review): max_length is 64 here while decoder_max_length is 75 for the
# training targets -- confirm the shorter generation cap is intended.
arabert2arabert.config.max_length = 64
# NOTE(review): num_beams is set to 1 below, so despite the "beam search"
# comment above no beam search is configured; early_stopping presumably has
# no effect in that case -- confirm.
arabert2arabert.config.early_stopping = True

arabert2arabert.config.num_beams = 1
# Top-level vocab size is copied from the encoder config.
arabert2arabert.config.vocab_size = arabert2arabert.config.encoder.vocab_size

In [25]:
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """TrainingArguments extended with sequence-to-sequence specific options.

    Every field below is optional and has a default, so the class can be
    constructed with the same keyword arguments as ``TrainingArguments`` plus
    any of the extras.

    NOTE(review): transformers appears to ship its own
    ``Seq2SeqTrainingArguments``; confirm this local subclass is still needed
    and that the trainer reads the fields declared here.
    """

    label_smoothing: Optional[float] = field(
        default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
    )
    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSampler or not."})
    predict_with_generate: bool = field(
        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
    )
    adafactor: bool = field(default=False, metadata={"help": "whether to use adafactor"})
    encoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Encoder layer dropout probability. Goes into model.config."}
    )
    decoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Decoder layer dropout probability. Goes into model.config."}
    )
    dropout: Optional[float] = field(default=None, metadata={"help": "Dropout probability. Goes into model.config."})
    attention_dropout: Optional[float] = field(
        default=None, metadata={"help": "Attention dropout probability. Goes into model.config."}
    )
    lr_scheduler: Optional[str] = field(
        default="linear", metadata={"help": "Which lr scheduler to use."}
    )

In [26]:
import torch
import torch.nn as nn

def compute_metrics(pred):
  """Compute corpus-level BLEU between predictions and reference labels.

  Args:
    pred: object exposing ``predictions`` (token ids that are decoded as-is)
      and ``label_ids`` (reference token ids in which -100 marks positions to
      be treated as padding).

  Returns:
    dict with a single key ``"bleu"`` holding the sacrebleu corpus score
    rounded to 4 decimals.
  """
  pred_ids = pred.predictions

  # Build a new array rather than writing into pred.label_ids, so the caller's
  # labels are left untouched. -100 is swapped for the pad id so the labels
  # can be decoded.
  labels_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

  # all unnecessary tokens are removed
  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  return {"bleu": round(corpus_bleu(pred_str, [label_str]).score, 4)}

In [27]:
#Set training arguments
# Number of batches accumulated per optimizer step; also used to derive save_steps.
gradient_accumulation_steps = 2

training_args = Seq2SeqTrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    gradient_accumulation_steps=gradient_accumulation_steps,
    predict_with_generate=True,
    do_eval=True,
    evaluation_strategy ="epoch",
    do_train=True,
    logging_steps=500,
    # Derived from the actual training set size instead of a hard-coded example
    # count, so it stays correct if the split changes.
    save_steps=len(train_data) // (batch_size * gradient_accumulation_steps),
    warmup_steps=100,
    # NOTE(review): evaluation_strategy is "epoch", so eval_steps presumably has
    # no effect -- confirm.
    eval_steps=10,
    #max_steps=16, # uncomment for a quick smoke run instead of full training
    num_train_epochs=5, # full training
    overwrite_output_dir=True,
    save_total_limit=0,
    fp16=True,
)

In [28]:
# instantiate trainer
from transformers import Seq2SeqTrainer
# train_data is used for training and dev_data for evaluation; compute_metrics
# supplies the BLEU score. test_data is not passed to the trainer here.
trainer = Seq2SeqTrainer(
    model=arabert2arabert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=dev_data,
    tokenizer=tokenizer
)

In [29]:
#Train
# Fine-tunes arabert2arabert on train_data; the table below shows one
# evaluation row per epoch (loss and BLEU on the dataset given as eval_dataset).
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Runtime,Samples Per Second
0,3.3627,3.091511,0.6558,98.2532,18.636
1,2.9483,2.8987,1.033,74.9163,24.441
2,2.6832,2.824916,1.3317,93.0634,19.675
3,2.4887,2.814663,1.5541,86.5416,21.157
4,2.3271,2.836491,1.5754,89.5628,20.444


TrainOutput(global_step=5150, training_loss=2.8740955063903217, metrics={'train_runtime': 2038.0928, 'train_samples_per_second': 2.527, 'total_flos': 44326223502336000, 'epoch': 5.0})

In [30]:
#Save tokenizer and model
# The demo cells further down reload BOTH the model and the tokenizer from
# "./arabert2arabert", so the model must be written there as well (previously
# only the tokenizer was). save_model() is the public counterpart of _save().
local_model_dir = "./arabert2arabert"
trainer.save_model(local_model_dir)

# Optional backup to Google Drive. Only attempted when Drive is mounted, since
# the mount cell may not have been run before this one.
drive_model_dir = "/content/drive/MyDrive/thesis work/bert2bert_repo_updated/Arabic-Empathetic-Chatbot/model/model"
if os.path.isdir("/content/drive/MyDrive"):
    trainer.save_model(drive_model_dir)

tokenizer.save_pretrained(local_model_dir)

('./arabert2arabert/tokenizer_config.json',
 './arabert2arabert/special_tokens_map.json',
 './arabert2arabert/vocab.txt',
 './arabert2arabert/added_tokens.json')

In [31]:
#Evaluate
# No dataset argument is passed; the metrics displayed further down match the
# last epoch's validation row of the training table.
# NOTE(review): test_data is prepared earlier but is never evaluated in the
# visible cells -- consider evaluating on it as well.
eval_output = trainer.evaluate()

In [22]:
# Mount Google Drive (Colab only).
# NOTE(review): this cell carries execution count 22 but is positioned after the
# save cell that references a path under /content/drive; on a top-to-bottom run
# it would execute too late -- consider moving it above the training cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [34]:
eval_output

{'epoch': 5.0,
 'eval_bleu': 1.5754,
 'eval_loss': 2.8364908695220947,
 'eval_runtime': 93.5242,
 'eval_samples_per_second': 19.578}

In [32]:
#Compute perplexity
import math

# Perplexity is taken as the exponential of the loss returned by trainer.evaluate().
eval_loss = eval_output["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"\nEvaluate Perplexity: {perplexity:10,.2f}")


Evaluate Perplexity:      17.06


**Gradio Demo** \\
This section creates a shareable web application for the model.

In [35]:
!pip install gradio
import gradio as gr

Collecting gradio
  Downloading gradio-2.8.7-py3-none-any.whl (656 kB)
[K     |████████████████████████████████| 656 kB 4.2 MB/s 
[?25hCollecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)
Collecting orjson
  Downloading orjson-3.6.7-cp37-cp37m-manylinux_2_24_x86_64.whl (255 kB)
[K     |████████████████████████████████| 255 kB 64.7 MB/s 
Collecting paramiko
  Downloading paramiko-2.9.2-py2.py3-none-any.whl (210 kB)
[K     |████████████████████████████████| 210 kB 51.5 MB/s 
[?25hCollecting pycryptodome
  Downloading pycryptodome-3.14.1-cp35-abi3-manylinux2010_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 43.2 MB/s 
Collecting uvicorn
  Downloading uvicorn-0.17.5-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.2 MB/s 
[?25hCollecting python-multipart
  Downloading python-multipart-0.0.5.tar.gz (32 kB)
Collecting fastapi
  Downloading fastapi-0.75.0-py3-none-any.whl (54 kB)
[K     |████████

In [36]:
from transformers import EncoderDecoderModel, AutoTokenizer
from datasets import load_dataset 
from arabert.preprocess import ArabertPreprocessor
from torch.utils.data.dataloader import DataLoader
from transformers import default_data_collator
from torch.utils.data.sampler import SequentialSampler
import torch
from tqdm.notebook import tqdm

In [38]:
# NOTE(review): this rebinds the global `model_name`, which an earlier cell sets
# to "aubmindlab/bert-base-arabert". Cells that read model_name (e.g. the
# EncoderDecoderModel initialisation) would see the short name if re-run after
# this one -- consider a separate variable.
model_name="bert-base-arabert"
# Text preprocessor from the cloned arabert repo; generate_response uses its
# preprocess() and desegment() methods.
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)



In [None]:
# Load the tokenizer and the model from ./arabert2arabert (both must have been
# saved into that directory).
tokenizer = AutoTokenizer.from_pretrained("./arabert2arabert")
model = EncoderDecoderModel.from_pretrained("./arabert2arabert")

# Use the GPU when one is available; otherwise stay on CPU instead of failing
# on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Inference mode.
model.eval()
print("done")

In [37]:
def generate_response(text, minimum_length, k):
  """Generate a sampled response to ``text`` with the fine-tuned model.

  Args:
    text: raw user message; it is run through ``arabert_prep.preprocess``
      before tokenization.
    minimum_length: minimum length passed to ``generate`` as ``min_length``.
      Cast to int because it comes from a UI slider.
    k: top-k sampling cutoff passed to ``generate`` as ``top_k``. Cast to int
      for the same reason.

  Returns:
    The generated response as a string, with special tokens removed and
    ``arabert_prep.desegment`` applied.
  """
  text_clean = arabert_prep.preprocess(text)
  inputs = tokenizer.encode_plus(text_clean, return_tensors='pt')

  # Send the inputs to whatever device the model lives on instead of assuming "cuda".
  device = model.device
  outputs = model.generate(input_ids=inputs.input_ids.to(device),
                   attention_mask=inputs.attention_mask.to(device),
                   num_beams=1,
                   do_sample=True,
                   min_length=int(minimum_length),
                   top_k=int(k),
                   temperature=1,
                   # NOTE(review): length_penalty presumably only matters for
                   # beam search; with num_beams=1 it may have no effect -- confirm.
                   length_penalty=2)

  # Decode the single generated sequence and let the tokenizer drop the special
  # tokens. The previous str(list) + replace("[[CLS]") approach left the markers
  # in place whenever the text contained an apostrophe (the list repr then uses
  # double quotes) and stripped every apostrophe from the reply.
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return str(arabert_prep.desegment(response))

In [40]:
# Build the Gradio UI around generate_response. The three inputs are handed to
# it positionally as (text, minimum_length, k).
# share=True requests a public URL (see the gradio.app link in the output below).
gr.Interface(fn=generate_response,
              inputs=[
          gr.inputs.Textbox(),
          gr.inputs.Slider(5, 20, step=1, label='Minimum Output Length'),
          gr.inputs.Slider(10, 1000, step=10, label='Top-K'),
          ],
             outputs="text").launch(share=True)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://55439.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<fastapi.applications.FastAPI at 0x7f6733d3f790>,
 'http://127.0.0.1:7860/',
 'https://55439.gradio.app')