In this notebook, you are shown **how to train** a BERT2BERT model initialized with AraBERT pre-trained parameters on the Arabic empathetic message-response dataset. A gradio demo is also provided at the end.

In [1]:
#Install dependencies
!pip install git-python==1.0.3
!pip install sacrebleu==1.4.2
!pip install rouge_score
!pip install farasapy
!git clone https://github.com/aub-mind/arabert
!pip install pyarabic
!pip install datasets
!pip install transformers==4.27.0
#!git clone  https://github.com/tareknaous/dialectal-conv/

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git-python==1.0.3
  Downloading git_python-1.0.3-py2.py3-none-any.whl (1.9 kB)
Collecting gitpython (from git-python==1.0.3)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gitdb<5,>=4.0.1 (from gitpython->git-python==1.0.3)
  Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython->git-python==1.0.3)
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython, git-python
Successfully installed git-python-1.0.3 gitdb-4.0.10 gitpython-3.1.31 smmap-5.0.0
Looking in indexes: https://pypi.org/simple, https://us-pyth

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=938537ad33141d9a4d8fd9d3289eab0d192fc36875b8533cd89c589b0a57bcb0
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14
Cloning into 'arabert'...


In [2]:
#Fetch dataset
!wget https://raw.githubusercontent.com/sylvanayakhni/Arabic-empathetic-dialogues/main/arabic_empathetic_dialogues_above_2turns.csv

--2023-06-08 11:03:23--  https://raw.githubusercontent.com/sylvanayakhni/Arabic-empathetic-dialogues/main/arabic_empathetic_dialogues_above_2turns.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10034879 (9.6M) [text/plain]
Saving to: ‘arabic_empathetic_dialogues_above_2turns.csv’


2023-06-08 11:03:24 (211 MB/s) - ‘arabic_empathetic_dialogues_above_2turns.csv’ saved [10034879/10034879]



In [3]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset 
import transformers
from transformers import BertTokenizer, EncoderDecoderModel
from sacrebleu import corpus_bleu
from transformers import BertTokenizerFast, EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments
from dataclasses import dataclass, field
from typing import Optional

In [4]:
encoder_max_length=150
decoder_max_length=150
model_name = "aubmindlab/bert-base-arabert"

tokenizer = BertTokenizerFast.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.26M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

In [5]:
vocab = tokenizer.get_vocab()
len(vocab)

64000

In [6]:
special_tokens = {
    'bos_token': '<|startoftext|>',
    'additional_special_tokens': ['< | speaker - 1 | >', '< | speaker - 2 | >', '<|pad|>', '<|mask|>']
}

In [7]:
_ = tokenizer.add_special_tokens(special_tokens)
vocab = tokenizer.get_vocab()
len(vocab)

64005

In [8]:
bos_id = vocab['<|startoftext|>']
speaker_1_id = vocab['< | speaker - 1 | >']
speaker_2_id = vocab['< | speaker - 2 | >']
mask = vocab['<|mask|>']
pad_id = vocab['[PAD]']

In [9]:
all_data = load_dataset("ArabicEmpatheticDialogues.py")
train_data = all_data['train'].train_test_split(test_size=0.1,seed=42)['train']
val_data = all_data['train'].train_test_split(test_size=0.1,seed=42)['test']
dev_data = val_data.train_test_split(test_size=0.5,seed=42)['train']
test_data = val_data.train_test_split(test_size=0.5,seed=42)['test']



100%|██████████| 241M/241M [00:19<00:00, 12.5MiB/s]




Downloading and preparing dataset arabic_empathetic_dialogues/arabic_emp_conv to /root/.cache/huggingface/datasets/arabic_empathetic_dialogues/arabic_emp_conv/1.0.0/1b9df87a8aa595b32fc415699e2322bf31e086e84b03e3505e7d0efe63f0233b...


Downloading data:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset arabic_empathetic_dialogues downloaded and prepared to /root/.cache/huggingface/datasets/arabic_empathetic_dialogues/arabic_emp_conv/1.0.0/1b9df87a8aa595b32fc415699e2322bf31e086e84b03e3505e7d0efe63f0233b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



In [10]:
print("Length of train data",len(train_data))
print("Length of dev data",len(dev_data))
print("Length of test data",len(test_data))

Length of train data 17495
Length of dev data 972
Length of test data 972


In [11]:
def generate_token_speaker_ids(input_ids):
  
  dialogues_with_speaker_ids = []
  for dialogue in input_ids:
    token_type_ids = []
    type_id = speaker_1_id
    for token in dialogue:

      if token == speaker_1_id:
          type_id = speaker_1_id
          token_type_ids.append(type_id)
      elif token == speaker_2_id:
          type_id = speaker_2_id
          token_type_ids.append(type_id)
      elif token == pad_id:
          type_id = speaker_1_id
          token_type_ids.append(type_id)        
      else:
          token_type_ids.append(type_id)
    token_type_ids = token_type_ids[:150]
    dialogues_with_speaker_ids.append(token_type_ids)
  return dialogues_with_speaker_ids

In [12]:
def generate_inputs(input_ids):

    input_dialogues = []

    for dialogue in input_ids:
      input_tokens =[]
      last_index = -1
      
      for i in range(len(dialogue) - 1, -1, -1):
          if dialogue[i] == speaker_2_id:
            last_index = i
            break

      for i in range(len(dialogue)):
        if i<= last_index:
          input_tokens.append(dialogue[i])
        else: 
          input_tokens.append(tokenizer.pad_token_id)
      input_tokens = input_tokens[:150]
      input_dialogues.append(input_tokens)

    return input_dialogues

In [13]:
def mask_except_reply(input_ids, speaker_2_id):

    masked_dialogues = []

    for dialogue in input_ids:
      masked_tokens =[]
      last_index = -1
      
      for i in range(len(dialogue) - 1, -1, -1):
          if dialogue[i] == speaker_2_id:
            last_index = i
            break
      
      count =0 
      for j in range(last_index, len(dialogue)-1):
        masked_tokens.append(dialogue[j])
        count = count+1

      for i in range(count, len(dialogue)-1):
        masked_tokens.append(tokenizer.pad_token_id)

      masked_tokens.append(tokenizer.pad_token_id)
      masked_tokens = masked_tokens[:150]
      masked_dialogues.append(masked_tokens)

    return masked_dialogues

In [14]:
def process_data_to_model_inputs(batch):                                                             
    # Tokenizer will automatically set [BOS] <text> [EOS]
    inputs = tokenizer(batch["dialogue"], padding="max_length", truncation=True, max_length=encoder_max_length)

    batch['input_ids'] = generate_inputs(inputs.input_ids)

    batch["attention_mask"] = [[1 if token!=tokenizer.pad_token_id else 0 for token in inputs] for inputs in batch['input_ids']]
    
    batch['token_type_ids'] = generate_token_speaker_ids(batch['input_ids'])
    
    batch["decoder_input_ids"] = mask_except_reply(inputs.input_ids, speaker_2_id)

    #batch['decoder_speaker_ids'] = [[speaker_2_id if token!=tokenizer.pad_token_id else token for token in encoder_ids] for encoder_ids in batch['decoder_input_ids']]

    batch["decoder_token_type_ids"] = [[speaker_2_id for token in encoder_ids] for encoder_ids in batch["decoder_input_ids"]]
    
    batch["decoder_attention_mask"] = [[0 if token == tokenizer.pad_token_id else 1 for token in labels] for labels in batch['decoder_input_ids']]

    batch["labels"] = [                                                                                 
    [-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch['decoder_input_ids']]

    return batch

In [15]:
batch_size=16

In [16]:
train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size,
    remove_columns=["dialogue", "emotion"]
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

dev_data = dev_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size,
    remove_columns=["dialogue"]
)
dev_data.set_format(
    type="torch", columns=["input_ids", "attention_mask","decoder_input_ids", "decoder_attention_mask", "labels"],
)

test_data = test_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size,
    remove_columns=["dialogue"]
)
test_data.set_format(
    type="torch", columns=["input_ids", "attention_mask","decoder_input_ids", "decoder_attention_mask", "labels"],
)

Map:   0%|          | 0/17495 [00:00<?, ? examples/s]

Map:   0%|          | 0/972 [00:00<?, ? examples/s]

Map:   0%|          | 0/972 [00:00<?, ? examples/s]

In [17]:
train_data[100]

{'input_ids': tensor([29756, 64001,  6145, 32026,   126,   816,  3000,   486,  2781,  4274,
           125, 15577,   125,   818, 11226,  2475,   124,   897,  1016,   126,
           781,  3823,   125,  4817,   125, 64002,  7153,   401, 14419,   448,
          3000,  6990,   125,  6834,   124,   834,    11,   893, 15937,   126,
          3000, 47951,  2781,   404, 38874,  1012, 20727,    83, 64001,  6145,
          3099,   126,   816,  3000,   486,   781,  3000,  3091,  5467,  3000,
         47204,   816, 21865,   125,   834,   894, 40302,   781, 51699, 64002,
         29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757,
         29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757,
         29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757,
         29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757,
         29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757, 29757,
         29757, 29757, 29757, 29757, 29

In [19]:
print(len(train_data[200]['input_ids']))
print(len(train_data[200]['attention_mask']))
#print(len(train_data[200]['token_type_ids']))
print(len(train_data[200]['decoder_input_ids']))
print(len(train_data[200]['decoder_attention_mask']))
print(len(train_data[200]['labels']))

150
150
150
150
150


In [20]:
# Create an instance of your custom model
arabert2arabert = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name, tie_encoder_decoder=False)

Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabert were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at aubmindlab/bert-base-arabert were not used when initializing BertLMHeadModel: ['cls.seq_relationship.wei

In [21]:
arabert2arabert.encoder.resize_token_embeddings(len(vocab))

Embedding(64005, 768)

In [22]:
arabert2arabert.decoder.resize_token_embeddings(len(vocab))

Embedding(64005, 768)

In [23]:
import torch.nn as nn
#arabert2arabert.encoder.embeddings.token_type_embeddings = nn.Embedding(64003, 768)

In [None]:
#arabert2arabert.decoder.bert.embeddings.token_type_embeddings = nn.Embedding(64003, 768)

In [None]:
arabert2arabert

In [25]:
#set special tokens
arabert2arabert.config.decoder_start_token_id = tokenizer.cls_token_id                                             
arabert2arabert.config.eos_token_id = tokenizer.sep_token_id
arabert2arabert.config.pad_token_id = tokenizer.pad_token_id


#sensible parameters for beam search
#set decoding params                               
arabert2arabert.config.max_length = 150
arabert2arabert.config.early_stopping = True

arabert2arabert.config.num_beams = 1
arabert2arabert.config.vocab_size = arabert2arabert.config.encoder.vocab_size

In [26]:
def compute_metrics(pred):
  labels_ids = pred.label_ids
  #pred_ids = torch.argmax(pred.predictions,dim=2)
  pred_ids = pred.predictions  

  # all unnecessary tokens are removed
  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  labels_ids[labels_ids == -100] = tokenizer.pad_token_id
  label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  return {"bleu": round(corpus_bleu(pred_str , [label_str]).score, 4)}

In [None]:
!pip install accelerate -U

In [28]:
#Set training arguments 
training_args = Seq2SeqTrainingArguments(
    output_dir="./dialogue_model",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps = 2,
    predict_with_generate=True,
    do_eval=True,
    evaluation_strategy ="epoch",
    do_train=True,
    logging_steps=500,  
    save_steps= 32965 // ( batch_size * 2),  
    warmup_steps=100,
    eval_steps=10,
    #max_steps=16, # delete for full training
    num_train_epochs=5,# uncomment for full training
    overwrite_output_dir=True,
    save_total_limit=0,
    fp16=True, 
)

In [29]:
from transformers import Seq2SeqTrainer

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=arabert2arabert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=dev_data,
)

In [30]:
#Train
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu
1,0.2938,22.466631,0.0
2,0.0007,22.953604,0.0003




Epoch,Training Loss,Validation Loss,Bleu
1,0.2938,22.466631,0.0
2,0.0007,22.953604,0.0003
3,0.0002,23.039093,0.0012
4,0.0015,23.603058,0.0006
5,0.0001,23.552732,0.0007




TrainOutput(global_step=2735, training_loss=0.05417548025629638, metrics={'train_runtime': 3553.648, 'train_samples_per_second': 24.616, 'train_steps_per_second': 0.77, 'total_flos': 1.57238950483575e+16, 'train_loss': 0.05417548025629638, 'epoch': 5.0})

In [None]:
#Evaluate
eval_output = trainer.evaluate()

In [None]:
#Compute perplexity
import math
perplexity = math.exp(eval_output["eval_loss"])
print('\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))

In [31]:
#Save tokenizer and model
trainer._save("./multi_turn_arabert2arabert")
tokenizer.save_pretrained("./multi_turn_arabert2arabert")

('./multi_turn_arabert2arabert/tokenizer_config.json',
 './multi_turn_arabert2arabert/special_tokens_map.json',
 './multi_turn_arabert2arabert/vocab.txt',
 './multi_turn_arabert2arabert/added_tokens.json',
 './multi_turn_arabert2arabert/tokenizer.json')

In [32]:
from arabert.preprocess import ArabertPreprocessor
from itertools import chain
from transformers import AutoTokenizer

In [33]:
tokenizer = AutoTokenizer.from_pretrained("./multi_turn_arabert2arabert")
model = EncoderDecoderModel.from_pretrained("./multi_turn_arabert2arabert", ignore_mismatched_sizes=True)

In [None]:
import torch.nn as nn
model.encoder.embeddings.token_type_embeddings = nn.Embedding(64003, 768)
model.decoder.bert.embeddings.token_type_embeddings = nn.Embedding(64003, 768)

In [None]:
model

In [35]:
vocab = tokenizer.get_vocab()
len(vocab)

64005

In [36]:
import torch
device = torch.device('cuda')
model.to(device)
model.eval()
print("done")

done


In [37]:
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)



In [39]:
RESET_PROMPT = 'reset'
MAX_LEN = 150
TOP_P = 1

In [61]:
def chat():
    print('[Entering chat session ...]')
    print(f'To quit the conversation and reset memory, please type "{RESET_PROMPT}"')
    
    query_history = []
            
    while True:
        utterance = input('You: ')
        
        # Exit session if user types the RESET prompt
        if utterance == RESET_PROMPT:
            print(f'[Exiting chat session]')
            break
            
        # Add speaker 1 id to start of query and encode it using the tokenizer
        utterance = "<|speaker-1|>" + utterance
        print(utterance)
        utterance = arabert_prep.preprocess(utterance)
        print(utterance)
        inputs = tokenizer.encode_plus(utterance,return_tensors='pt')
        #inputs = tokenizer.encode_plus(utterance)
        outputs = model.generate(input_ids = inputs.input_ids.to("cuda"),
                  attention_mask = inputs.attention_mask.to("cuda"),
                  do_sample = True,
                  min_length=10,
                  top_k = 100,
                  temperature = 1.5,
                  length_penalty =0.5)
        print("outputs", outputs)
        preds = tokenizer.batch_decode(outputs) 
        response = str(preds)
        print("response")
        response = response.replace("\'", '')
        response = response.replace("[[CLS]", '')
        response = response.replace("[SEP]]", '')
        response = str(arabert_prep.desegment(response))
        print(f'Bot: {response}')

In [46]:
def chat():
    print('[Entering chat session ...]')
    print(f'To quit the conversation and reset memory, please type "{RESET_PROMPT}"')
    
    query_history = []
            
    while True:
        utterance = input('You: ')
        
        # Exit session if user types the RESET prompt
        if utterance == RESET_PROMPT:
            print(f'[Exiting chat session]')
            break
            
        # Add speaker 1 id to start of query and encode it using the tokenizer
        utterance = "<|speaker-1|>" + utterance
    
        # if len(query_history) >= MAX_TURNS:
        #    num_exceeded = len(query_history) - MAX_TURNS
        #   query_history = query_history[num_exceeded:]
            
        # Add beginning of sequence and end of sequence ids to input_ids, and convert it to a tensor
        #input_ids = [bos_id] + list(chain.from_iterable(query_history)) + [speaker_2_id]
         # Determine the speaker of the first turn based on the first speaker id
        start_sp_id = query_history[0][1]
        
        # Determine the speaker of the next turn
        next_sp_id = speaker_2_id 

        # Create token type ids for each turn based on the speaker of the turn
        token_type_ids = [[start_sp_id] * len(turn) if h % 2 == 0 else [next_sp_id] * len(turn) for h, turn in enumerate(query_history)]
        print(token_type_ids)
        # Add beginning of sequence and end of sequence ids to token_type_ids, and convert it to a tensor
        #token_type_ids = [start_sp_id] + list(chain.from_iterable(token_type_ids)) + [speaker_2_id]

        # Determine the length of the input_ids tensor
        input_len = len(input_ids)
        
        # Convert input_ids and token_type_ids to PyTorch tensors, add an extra dimension, and move to the device (GPU)
        input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)
        attention_mask = torch.LongTensor(attention_mask).unsqueeze(0).to(device)
        token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(device)

         # generate a response from the model given some input
        output_ids = model.generate(input_ids=input_ids, 
                                    #token_type_ids=token_type_ids,
                                    attention_mask = attention_mask,
                                    #pad_token_id=eos_id, 
                                    do_sample=True, 
                                    top_p=TOP_P, 
                                    max_length=MAX_LEN)
        print("output_ids", output_ids)
        preds = tokenizer.batch_decode(output_ids) 
        print("preds", preds)
        response = str(preds)
        print("response", response)
        response = response.replace("\'", '')
        response = response.replace("[[CLS]", '')
        response = response.replace("[SEP]]", '')
        response = str(arabert_prep.desegment(response))

        # extract the generated sequence from the output and remove the input sequence
        #output_ids = output_ids[0].tolist()[input_len:]
        
        # convert the generated sequence of token ids into text
        #response = tokenizer.decode(output_ids, skip_special_tokens=True)
        print(f'Bot: {response}')
        
        # append the generated sequence to the query history as token ids
        query_history.append([speaker_2_id] + tokenizer.encode(response))    
        

In [62]:
chat()

[Entering chat session ...]
To quit the conversation and reset memory, please type "reset"
You: أنا أشعر بالحرج الشديد
<|speaker-1|>أنا أشعر بالحرج الشديد
< | speaker - 1 | > أنا أشعر ب+ ال+ حرج ال+ شديد




outputs tensor([[29756, 64002,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,  6077,
          6077,  6077,  6077,  6077,  6077, 

**Gradio Demo** \\
This allows you to create a sharable web application of the model

In [None]:
!pip install gradio
import gradio as gr

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer
from datasets import load_dataset 
from arabert.preprocess import ArabertPreprocessor
from torch.utils.data.dataloader import DataLoader
from transformers import default_data_collator
from torch.utils.data.sampler import SequentialSampler
import torch
from tqdm.notebook import tqdm

In [None]:
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("./arabert2arabert")
model = EncoderDecoderModel.from_pretrained("./arabert2arabert")

model.to("cuda")
model.eval()
print("done")

In [None]:
def generate_response(text, minimum_length, k):
  text_clean = arabert_prep.preprocess(text)
  inputs = tokenizer.encode_plus(text_clean,return_tensors='pt')
  outputs = model.generate(input_ids = inputs.input_ids.to("cuda"),
                   attention_mask = inputs.attention_mask.to("cuda"),
                   num_beams=1,
                   do_sample = True,
                   min_length=minimum_length,
                   top_k = k,
                   temperature = 1,
                   length_penalty =2)
  preds = tokenizer.batch_decode(outputs) 
  response = str(preds)
  response = response.replace("\'", '')
  response = response.replace("[[CLS]", '')
  response = response.replace("[SEP]]", '')
  response = str(arabert_prep.desegment(response))
  return response

In [None]:
gr.Interface(fn=generate_response,
              inputs=[
          gr.inputs.Textbox(),
          gr.inputs.Slider(5, 20, step=1, label='Minimum Output Length'),
          gr.inputs.Slider(10, 1000, step=10, label='Top-K'),
          ],
             outputs="text").launch(share=True)