<a href="https://colab.research.google.com/github/zeyadahmed10/Arabic-MRC/blob/Training/AraBERT_Base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install & import required packages

---



In [1]:
import os
import shutil
from collections import Counter, deque
#import numpy as np
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel , AutoTokenizer, ElectraForQuestionAnswering
from Preprocess.arabertpreprocess import ArabertPreprocessor
torch.manual_seed(3407)


<torch._C.Generator at 0x7f4b04a04ab0>

## Create the Data directory and download the AAQAD dataset
---

In [2]:
# Google Drive links for the three AAQAD files, listed in the order train / validation / test.
data_urls = ['https://drive.google.com/uc?id=1V5ziIZe__pGg14nH42WyMEFz444XPWf7','https://drive.google.com/uc?id=19nj9jiCdJlHwAfgUTJ_Z8jg1cB34yfjv','https://drive.google.com/uc?id=1z0XksuTwnqhiX1guxkmjYmoNA_JZ6SUN' ]
# The download only runs when the Data directory does not exist yet, so re-running the cell is a no-op.
if not os.path.exists('Data'):
  os.mkdir('Data')
  %cd Data
  # Record "<split name>,<url>" for each file in data_url.txt.
  with open('data_url.txt','a') as f:
    names = ['train', 'validation', 'test']
    for i in range(3):
      f.write(names[i]+','+data_urls[i]+'\n')
  !pip install gdown
  # NOTE(review): the URLs below repeat the entries of data_urls instead of reading them from the list.
  !gdown https://drive.google.com/uc?id=1V5ziIZe__pGg14nH42WyMEFz444XPWf7 -O AAQAD\-train.json
  !gdown https://drive.google.com/uc?id=19nj9jiCdJlHwAfgUTJ_Z8jg1cB34yfjv -O AAQAD\-dev.json
  !gdown https://drive.google.com/uc?id=1z0XksuTwnqhiX1guxkmjYmoNA_JZ6SUN -O AAQAD\-test.json
  # Return to the directory the notebook started from.
  %cd .. 


## Load data and preprocessing
---

In [3]:
##DATA TREE AND TYPE##
'''
aaqad_dev_dict['data']##list of articles
aaqad_dev_dict['data'][0]## dictionary of paragraph -- keys(title, paragraph)
aaqad_dev_dict['data'][0]['paragraphs'] ##list of contexts
aaqad_dev_dict['data'][0]['paragraphs'][0] ## dictionary of context and crossponding QAs pairs --keys(context, qas)
aaqad_dev_dict['data'][0]['paragraphs'][0]['qas'] ##list of QAs pair
aaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0] ##dictionary of the elements of each question --keys(id, is_impossible,question, answers)
aaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0]['answers'] ##dictionary of start index and answer text --keys(answer_start, text)'''

"\naaqad_dev_dict['data']##list of articles\naaqad_dev_dict['data'][0]## dictionary of paragraph -- keys(title, paragraph)\naaqad_dev_dict['data'][0]['paragraphs'] ##list of contexts\naaqad_dev_dict['data'][0]['paragraphs'][0] ## dictionary of context and crossponding QAs pairs --keys(context, qas)\naaqad_dev_dict['data'][0]['paragraphs'][0]['qas'] ##list of QAs pair\naaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0] ##dictionary of the elements of each question --keys(id, is_impossible,question, answers)\naaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0]['answers'] ##dictionary of start index and answer text --keys(answer_start, text)"

In [4]:
def add_end_index(answer, context):
  """Add ``answer['answer_end']`` and report whether the span is unusable.

  The labelled ``answer_start`` is tried first; if the text is not found
  there, offsets that are 1 or 2 characters too far to the right are also
  accepted and ``answer_start`` is corrected accordingly.

  Args:
    answer: dict with keys ``'text'`` and ``'answer_start'``; updated in place.
    context: the passage the answer was taken from.

  Returns:
    False when the span was located (``answer_start`` / ``answer_end`` are
    then consistent with ``context``), True when the text could not be
    matched, i.e. the sample should be dropped.
  """
  text = answer['text']
  start_idx = answer['answer_start']
  end_idx = start_idx + len(text)
  for shift in range(3):
    shifted_start = start_idx - shift
    # A negative start would silently wrap around to the end of the string.
    if shifted_start < 0:
      break
    if text == context[shifted_start:end_idx - shift]:
      # Move both ends by the shift that actually matched.
      answer['answer_start'] = shifted_start
      answer['answer_end'] = end_idx - shift
      return False
  return True

In [5]:
def _preprocess(context,question, answer, prep):
    answer['text'] = prep.preprocess(answer['text'])
    context = prep.preprocess(context)
    question = prep.preprocess(question)
    res = context.find(answer['text'])
    answer['answer_start'] = res
    return context, question, answer, res

In [6]:
def Read_AAQAD(path,electra_prep):
  """Load an AAQAD JSON file and return aligned, preprocessed samples.

  Every answer of every question becomes one sample. A sample is dropped
  (and counted in the returned error count) when its preprocessed answer
  text cannot be found in the preprocessed context.

  Args:
    path: path to the JSON file; read as UTF-8.
    electra_prep: object exposing ``preprocess(str) -> str``.

  Returns:
    (contexts, questions, answers, plausible, cnt). The first four lists
    have the same length and are index-aligned. ``plausible[i]`` is True
    when the answer came from the ``'answers'`` key and False when it came
    from ``'plausible_answers'``. ``cnt`` is the number of dropped answers.
  """
  contexts = []
  answers = []
  questions = []
  plausible = []
  cnt = 0
  with open(path, encoding='utf-8') as f:
    aaqad_dict = json.load(f)
  for article in aaqad_dict['data']:
    for passage in article['paragraphs']:
      raw_context = passage['context']
      for qa in passage['qas']:
        raw_question = qa['question']
        # Questions that carry 'plausible_answers' use those instead of 'answers'.
        uses_plausible_key = 'plausible_answers' in qa
        access = 'plausible_answers' if uses_plausible_key else 'answers'
        for answer in qa[access]:
          # Always start from the raw strings so no text is preprocessed twice.
          context, question, answer, res = _preprocess(raw_context, raw_question, answer, electra_prep)
          if res == -1:
            cnt += 1
            continue
          flag = add_end_index(answer, context)  # True means the span is unusable
          cnt = cnt + flag
          if not flag:
            contexts.append(context)
            answers.append(answer)
            questions.append(question)
            # Append per kept answer so the flag list stays aligned with the samples.
            plausible.append(not uses_plausible_key)
  return contexts,questions,answers,plausible,cnt

In [7]:
# Build one preprocessor and reuse it for all three splits.
# NOTE(review): model_name presumably selects the model-specific rules inside ArabertPreprocessor — confirm.
model_name = 'araelectra-base-discriminator'
electraprep= ArabertPreprocessor(model_name=model_name)
# Each call returns contexts, questions, answers, the plausible flags and the number of dropped answers.
train_contexts, train_questions, train_answers,train_plausible, train_span_error = Read_AAQAD('Data/AAQAD-train.json', electraprep)
val_contexts, val_questions, val_answers,val_plausible, val_span_error = Read_AAQAD('Data/AAQAD-dev.json', electraprep)
test_contexts, test_questions, test_answers,test_plausible, test_span_error = Read_AAQAD('Data/AAQAD-test.json',electraprep)

In [8]:
# Summarise how many answers were dropped while reading the three splits.
total_error = train_span_error + val_span_error + test_span_error
# NOTE: ratio is a fraction of the hard-coded original size (17817), not a value scaled to 0-100.
ratio = total_error/17817 #initial size of the data
print(f"Size of the data set before dropping the misslabeled spans: 17817 & after: {len(train_answers)+len(val_answers)+len(test_answers)}")
print(f"Size of each split: \n 1-Train: {len(train_answers)} \n 2-Validation: {len(val_answers)} \n 3-Test: {len(test_answers)}")
print(f"percentage of span's error {ratio}")
print(f"Number of errors for each split:\n 1-Train: {train_span_error} \n 2-Validation: {val_span_error}\n 3-Test: {test_span_error}")


Size of the data set before dropping the misslabeled spans: 17817 & after: 17816
Size of each split: 
 1-Train: 12629 
 2-Validation: 1926 
 3-Test: 3261
percentage of span's error 5.6126171633832856e-05
Number of errors for each split:
 1-Train: 0 
 2-Validation: 0
 3-Test: 1


## Tokenization
---

In [9]:

#model_name = "aubmindlab/bert-base-arabertv02"
#arabert_tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False)
#model = AutoModel.from_pretrained(model_name)

In [10]:
#arabert_tokenizer.save_pretrained('CachedPretrained/bert-base-arabertv02')
#model.save_pretrained('CachedPretrained/bert-base-arabertv02')

In [11]:
# Create the tokenizer and encode every split.
# The question is passed as the first sequence and the context as the second,
# which is why later char_to_token calls use sequence index 1 for answer offsets.
model_name = 'aubmindlab/araelectra-base-discriminator'
electra_tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False)
# Each split is encoded in a single call with truncation and padding enabled, returning PyTorch tensors.
train_encodings = electra_tokenizer(train_questions, train_contexts, truncation=True, padding=True, return_tensors="pt")
val_encodings = electra_tokenizer(val_questions, val_contexts, truncation=True, padding=True, return_tensors="pt")
test_encodings = electra_tokenizer(test_questions, test_contexts,truncation= True, padding= True, return_tensors="pt")


The encoding is a dictionary with the keys ['input_ids', 'token_type_ids', 'attention_mask']. <br>
`input_ids` holds the token ids of each sequence.

In [12]:
train_answers[0]

{'text': 'تصل المسافة التي', 'answer_start': 213, 'answer_end': 229}

In [13]:
train_contexts[0]
train_questions[0]

'ما الذي تعتمد عليه بعض الطيور الكبيرة ذات الجناحين لمساعدتها على الارتفاع ؟'

In [14]:
train_encodings.char_to_token(0,train_answers[0]['answer_start'], 1)

61

In [15]:
def index_to_token_position(encodings , answers, max_length=None):
  """Convert character spans to token positions and store them in ``encodings``.

  Adds ``'start_positions'`` and ``'end_positions'`` to ``encodings``, each a
  tensor of shape ``(len(answers), 1)``. The end position is the token that
  contains the last character of the answer (inclusive).

  Args:
    encodings: tokenizer output exposing ``char_to_token(batch, char, sequence)``
      and ``update``; answers are looked up in sequence index 1.
    answers: list of dicts with ``'answer_start'`` and an exclusive
      ``'answer_end'`` character offset.
    max_length: sentinel written to both positions when the answer start has
      no token (e.g. it was truncated away). Defaults to
      ``electra_tokenizer.model_max_length``.
      NOTE(review): this relies on the model ignoring out-of-range positions
      in its loss — confirm against the model documentation.
  """
  if max_length is None:
    # Same source the original code used: the notebook-level tokenizer.
    max_length = electra_tokenizer.model_max_length
  start_positions = []
  end_positions = []
  for i, answer in enumerate(answers):
    char_start = answer['answer_start']
    char_end = answer['answer_end']
    start_token = encodings.char_to_token(i, char_start, 1)
    if start_token is None:
      # No token for the answer start: mark the whole span with the sentinel
      # so start and end stay consistent.
      start_positions.append(max_length)
      end_positions.append(max_length)
      continue
    # answer_end is exclusive, so the last answer character is answer_end - 1.
    # Walk backwards over characters without a token (spaces, truncated tail),
    # but never past the start of the answer.
    end_token = None
    for char_idx in range(char_end - 1, char_start - 1, -1):
      end_token = encodings.char_to_token(i, char_idx, 1)
      if end_token is not None:
        break
    if end_token is None:
      # Empty span: fall back to a single-token answer.
      end_token = start_token
    start_positions.append(start_token)
    end_positions.append(end_token)
  encodings.update({'start_positions': torch.tensor(start_positions), 'end_positions': torch.tensor(end_positions)})
  encodings['start_positions'] = encodings['start_positions'].view(len(answers), 1)
  encodings['end_positions'] = encodings['end_positions'].view(len(answers), 1)

In [16]:
index_to_token_position(train_encodings, train_answers)
index_to_token_position(val_encodings, val_answers)
index_to_token_position(test_encodings, test_answers)
#index_to_token_position(model_encodings, train_answers[:2])

In [17]:
def add_weights_tensor(encodings, plausible):
  """Store a per-sample weight column in ``encodings['weights']``.

  Samples whose flag equals True get weight 2.0, all others 1.0. The result
  is a column tensor with one row per flag.
  """
  flags = torch.tensor(plausible)
  # ones + boolean mask yields 2.0 where the flag is set and 1.0 elsewhere.
  weights = torch.ones(flags.shape) + (flags == True)
  encodings.update(weights=weights.view(-1, 1))

In [18]:
add_weights_tensor(train_encodings, train_plausible)
add_weights_tensor(val_encodings, val_plausible)
add_weights_tensor(test_encodings, test_plausible)
#add_weights_tensor(model_encodings,train_plausible[:2])

In [19]:
def is_truncated(start_pos, max_length=512):
  """Count the start positions that equal the truncation sentinel.

  Args:
    start_pos: iterable of start positions (ints or one-element tensors).
    max_length: sentinel value that marks an answer without a start token;
      defaults to 512, the value previously hard-coded here.

  Returns:
    Number of entries equal to ``max_length``.
  """
  return sum(1 for pos in start_pos if pos == max_length)

print(is_truncated(train_encodings['start_positions']))
print(is_truncated(val_encodings['start_positions']))
print(is_truncated(test_encodings['start_positions']))

20
0
3


In [20]:
test_encodings['start_positions'].shape

torch.Size([3261, 1])

In [21]:
#for metrics
#decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
#print(decoded_string)

## Dataset and Dataloader
---

In [22]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [23]:
class AqadDataset(torch.utils.data.Dataset):
    """Dataset view over a tokenizer encoding.

    Item ``idx`` is a dict holding row ``idx`` of every field in the
    encoding; the dataset length is the number of rows in ``input_ids``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample

train_dataset = AqadDataset(train_encodings)
val_dataset = AqadDataset(val_encodings)
#model_dataset = AqadDataset(model_encodings)

In [24]:
# Both loaders yield batches of 8 samples and reshuffle on every pass.
# NOTE(review): shuffle=True on the validation loader looks unnecessary for evaluation — confirm it is intentional.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = 8, shuffle= True)
#model_loader = DataLoader(model_dataset, batch_size = 2, shuffle= True)

In [25]:
print(train_loader.batch_size, val_loader.batch_size)

8 8


## Evaluation Metrics
---

In [26]:
def EM_score(pred, GT):
  """Exact match: 1 when ``pred`` and ``GT`` are equal tensors, otherwise 0."""
  return 1 if torch.equal(pred, GT) else 0

In [27]:
def F1_score(prediction, ground_truth):
    """Token-overlap F1 between two 1-D tensors of token ids.

    The overlap is the multiset intersection of the two token lists.
    Returns 0 when nothing overlaps, otherwise the harmonic mean of
    precision and recall.
    """
    pred_ids = prediction.tolist()
    gold_ids = ground_truth.tolist()
    overlap = sum((Counter(pred_ids) & Counter(gold_ids)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(pred_ids)
    recall = 1.0 * overlap / len(gold_ids)
    return (2 * precision * recall) / (precision + recall)

## Checkpoint Saving And Loading
---

In [28]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint and optionally copy it as the best one.

    state: checkpoint object to serialise with ``torch.save``
    is_best: when true, the saved file is also copied to ``best_model_path``
    checkpoint_path: destination of the checkpoint
    best_model_path: destination of the copy made for the best checkpoint
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # The best checkpoint is a plain file copy of the one just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [29]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a saved checkpoint.

    checkpoint_fpath: path to the saved checkpoint
    model: model to load the checkpoint parameters into
    optimizer: optimizer to load the saved optimizer state into
    map_location: optional device remapping forwarded to ``torch.load``
        (e.g. ``'cpu'`` to open a checkpoint written on a GPU machine);
        ``None`` keeps the previous behaviour

    Returns (model, optimizer, epoch stored in the checkpoint, stored
    validation loss). The checkpoint must contain the keys ``'state_dict'``,
    ``'optimizer'``, ``'val_loss'`` and ``'epoch'``.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['val_loss']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [30]:
def order_exp(base_path, exp_name):
  """Create the directory of an experiment and return its checkpoint paths.

  Args:
    base_path: directory that holds all experiments; created if missing.
    exp_name: name of the experiment sub-directory.

  Returns:
    (path of ``curr.pt``, path of ``best.pt``, experiment directory).
  """
  exp_path = os.path.join(base_path, exp_name)
  # makedirs also creates missing parent directories and tolerates an existing one.
  os.makedirs(exp_path, exist_ok=True)
  curr_ckp_path = os.path.join(exp_path, 'curr.pt')
  best_ckp_path = os.path.join(exp_path, 'best.pt')
  return curr_ckp_path, best_ckp_path, exp_path

## Modeling
---

In [31]:
model_name = 'aubmindlab/araelectra-base-discriminator'
QA_AraElectra = ElectraForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at aubmindlab/araelectra-base-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/araelectra-base-discriminator and are newly initialized: ['qa_outputs.weight'

In [32]:
#print(QA_AraElectra)
# Freeze the first four encoder layers by disabling gradients for their parameters.
for layer in QA_AraElectra.electra.encoder.layer[:4]:
    for param in layer.parameters():
        param.requires_grad = False
# Total number of parameters, then the number that still require gradients.
print(sum(p.numel() for p in QA_AraElectra.parameters()))
print(sum(p.numel() for p in QA_AraElectra.parameters() if p.requires_grad == True))

134604290
106252802


In [33]:
def evaluate(data_loader, model, log, log_path=None, train_loss=None):
  """Run the model over ``data_loader`` and report EM, F1 and mean loss.

  Args:
    data_loader: yields batches with the keys ``input_ids``,
      ``attention_mask``, ``token_type_ids``, ``start_positions`` and
      ``end_positions``.
    model: question-answering model whose output exposes ``loss``,
      ``start_logits`` and ``end_logits``.
    log: when true, append one result line to ``<log_path>/res.txt``.
    log_path: directory of the log file; required when ``log`` is true.
    train_loss: training loss written next to the validation results;
      logged as ``nan`` when not given.

  Returns:
    (EM, F1, total_loss): EM and F1 as percentages over all samples and the
    running mean of the batch losses. All three are 0 for an empty loader.

  The model is switched to eval mode for the pass and to train mode afterwards.
  """
  # Follow the model's own device instead of depending on a notebook-level global.
  device = next(model.parameters()).device
  model.eval()
  F1 = EM = Total = 0
  total_loss = 0.0
  with torch.no_grad():
    for batch_idx, batch in enumerate(data_loader):
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_start = batch['start_positions'].to(device)
      gt_end = batch['end_positions'].to(device)
      outputs = model(tokens, masks, tokens_type, start_positions=gt_start, end_positions=gt_end)
      loss = outputs.loss
      # Incremental mean: avg_k = avg_{k-1} + (loss_k - avg_{k-1}) / k
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (loss - total_loss))
      # Most likely start / end token of every sample in the batch.
      pred_starts = torch.argmax(outputs.start_logits, dim=1).tolist()
      pred_ends = torch.argmax(outputs.end_logits, dim=1).tolist()
      for i in range(gt_start.shape[0]):
        input_ids = batch['input_ids'][i]
        start_gt = batch['start_positions'][i].item()
        end_gt = batch['end_positions'][i].item()
        start_pred, end_pred = pred_starts[i], pred_ends[i]
        # Spans are inclusive on both ends, hence the +1.
        gt_tokens = input_ids[start_gt:end_gt + 1]
        pred_tokens = input_ids[start_pred:end_pred + 1]
        F1 += F1_score(pred_tokens, gt_tokens)
        EM += EM_score(torch.tensor([start_pred, end_pred]), torch.tensor([start_gt, end_gt]))
        Total += 1
  if Total:
    EM = 100.0 * EM / Total
    F1 = 100.0 * F1 / Total
  else:
    # Empty loader: report zeros instead of dividing by zero.
    EM = F1 = 0.0

  if log:
    log_path = os.path.join(log_path, 'res.txt')
    if not os.path.exists(log_path):
      with open(log_path, 'w') as f:
        f.write('EM,f1,ValidationLoss,TrainLoss \n')
    # A missing training loss is logged as nan rather than failing on the format spec.
    logged_train_loss = float('nan') if train_loss is None else train_loss
    with open(log_path, 'a') as f:
      f.write(f"{EM:.2f},{F1:.2f},{total_loss:.2f},{logged_train_loss:.2f} \n")
  model.train()
  print(f"Validation Results: EM:{EM:.2f}, f1: {F1:.2f}, loss: {total_loss:.2f}")
  return EM, F1, total_loss

In [34]:
def train(model,start_epoch, num_epochs, optimizer,max_compined_metric, train_loader, val_loader, log, exp_name):
  """Fine-tune ``model`` and checkpoint it after every epoch.

  Args:
    model: question-answering model whose output exposes ``loss``.
    start_epoch: first epoch index (inclusive).
    num_epochs: last epoch index (exclusive).
    optimizer: optimizer stepping the model parameters.
    max_compined_metric: best combined metric seen so far; a checkpoint that
      reaches or beats it is also stored as the best checkpoint.
    train_loader: loader of training batches.
    val_loader: loader passed to ``evaluate`` after every epoch.
    log: forwarded to ``evaluate`` to enable result logging.
    exp_name: sub-directory of ``Runs/AraElectraAQAD/train`` for this run.

  Returns:
    The trained model.

  The combined metric is ``EM + 1.5 * F1``. Training stops early once the
  metric of an epoch is within ``eps`` of the mean of the previous five.
  """
  base_dir = 'Runs/AraElectraAQAD/train'
  # Make sure the parent run directory exists before the experiment folder is created.
  os.makedirs(base_dir, exist_ok=True)
  curr_ckp_path, best_ckp_path, exp_path = order_exp(base_dir, exp_name)
  # Follow the model's own device instead of depending on a notebook-level global.
  device = next(model.parameters()).device
  # Holds at most the five most recent metrics; the oldest is dropped on append.
  recent_metrics = deque(maxlen=5)
  eps = 1e-4
  model.train()
  for epoch in range(start_epoch, num_epochs):
    total_loss = 0.0
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f'Epoch {epoch}')
    for batch_idx, batch in enumerate(loop):
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_start = batch['start_positions'].to(device)
      gt_end = batch['end_positions'].to(device)
      # Clear gradients first so leftovers from an interrupted step cannot leak in.
      optimizer.zero_grad()
      outputs = model(tokens, masks, tokens_type, start_positions=gt_start, end_positions=gt_end)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      # Track the running mean as a plain float so no autograd graph is kept alive.
      batch_loss = loss.item()
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (batch_loss - total_loss))
      loop.set_postfix(loss=batch_loss)

    val_em, val_f1, val_loss = evaluate(val_loader, model, log, exp_path, total_loss)
    curr_compined_metric = val_em + (1.5 * val_f1)
    checkpoint = {
            'epoch': epoch + 1,
            'val_loss': val_loss,
            'val_em': val_em,
            'val_f1': val_f1,
            'compined_metric': curr_compined_metric,
            'train_loss': total_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
    is_best = curr_compined_metric >= max_compined_metric
    if is_best:
      max_compined_metric = curr_compined_metric
    save_ckp(checkpoint, is_best, curr_ckp_path, best_ckp_path)

    # Early stopping: compare with a tolerance, since exact float equality
    # with eps would practically never trigger.
    if len(recent_metrics) == 5:
      recent_mean = sum(recent_metrics) / 5
      if abs(recent_mean - curr_compined_metric) <= eps:
        return model
    recent_metrics.append(curr_compined_metric)
  return model


In [35]:
# Training hyper-parameters.
num_epochs = 10
learning_rate = 3e-5
# The optimizer receives every model parameter, including those with requires_grad=False.
optimizer = torch.optim.Adam(QA_AraElectra.parameters(), lr=learning_rate)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
QA_AraElectra.to(device)

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [36]:
trained_model = train(QA_AraElectra,0, num_epochs, optimizer,0.0, train_loader, val_loader, True, 'first')

  0%|          | 0/1579 [00:00<?, ?it/s]  0%|          | 0/1579 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 11.17 GiB total capacity; 793.62 MiB already allocated; 26.50 MiB free; 834.00 MiB reserved in total by PyTorch)