In [2]:
# Standard library
import json
import os
import shutil
from collections import Counter, deque  # the collections class is `deque`; `dqueue` raises ImportError

# Third-party
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, ElectraForQuestionAnswering

# Project-local
from Preprocess.arabertpreprocess import ArabertPreprocessor

# Seed PyTorch's global random number generator.
torch.manual_seed(3407)

<torch._C.Generator at 0x7f203c10adb0>

In [3]:
def add_end_index(answer, context):
  """Attach an exclusive ``answer_end`` character offset to ``answer`` in place.

  After a successful call ``context[answer['answer_start']:answer['answer_end']]``
  equals ``answer['text']``. If the recorded start is one or two characters too
  far to the right, both offsets are moved left to the position that matches.

  Args:
    answer: dict with keys ``'text'`` (str) and ``'answer_start'`` (int).
      Mutated in place.
    context: string the answer span is checked against.

  Returns:
    False when a matching span was found and ``answer`` was updated;
    True when no tolerated shift matched, in which case ``answer`` is left
    unchanged (no ``'answer_end'`` key is added).
  """
  text = answer['text']
  start_idx = answer['answer_start']
  end_idx = start_idx + len(text)
  # Shift 0 is the exact match; shifts 1 and 2 tolerate small offset drift.
  for shift in range(3):
    shifted_start = start_idx - shift
    if shifted_start < 0:
      # A negative slice start would wrap around to the end of the string.
      break
    shifted_end = end_idx - shift
    if text == context[shifted_start:shifted_end]:
      # Store the offsets that actually matched (shifted by `shift`, not by 1).
      answer['answer_start'] = shifted_start
      answer['answer_end'] = shifted_end
      return False
  return True

In [31]:
def electra_preprocess(context, question, answer, electraprep):
    """Preprocess one QA example and re-locate its answer in the cleaned context.

    Only the first entry of ``answer['text']`` is used. ``answer`` is modified
    in place: ``'text'`` becomes the preprocessed answer string and
    ``'answer_start'`` becomes the offset of that string in the preprocessed
    context as reported by ``str.find`` (``-1`` when it is not found).

    Args:
        context: raw context string.
        question: raw question string.
        answer: dict whose ``'text'`` value is an indexable collection of strings.
        electraprep: object exposing ``preprocess(text)``.

    Returns:
        ``(context, question, answer, offset)`` with the preprocessed context
        and question, the same (mutated) ``answer`` dict, and the found offset.
    """
    first_answer = answer['text'][0]
    answer['text'] = electraprep.preprocess(first_answer)
    clean_context = electraprep.preprocess(context)
    clean_question = electraprep.preprocess(question)
    # First occurrence of the cleaned answer inside the cleaned context.
    offset = clean_context.find(answer['text'])
    answer.update(answer_start=offset)
    return clean_context, clean_question, answer, offset

In [32]:
def generate_examples(filepath):
    """Yield ``(id, example)`` pairs in raw text form from a JSON dataset file.

    The file is expected to hold ``{"data": [article, ...]}`` where each article
    has an optional ``"title"`` and a ``"paragraphs"`` list; each paragraph has a
    ``"context"`` and a ``"qas"`` list; each qa has ``"id"``, ``"question"`` and
    ``"answers"`` (a list of ``{"text", "answer_start"}`` dicts).

    Args:
        filepath: path of the UTF-8 encoded JSON file.

    Yields:
        ``(id_, example)`` where ``example`` has the keys ``"title"``,
        ``"context"``, ``"question"``, ``"id"`` and ``"answers"``; the latter is
        ``{"answer_start": [...], "text": [...]}``. Title, context, question and
        answer texts are whitespace-stripped; answer offsets are passed through
        unchanged.
    """
    # json.load reads the whole file, so close the handle before yielding
    # instead of keeping it open for as long as the consumer iterates.
    with open(filepath, encoding="utf-8") as f:
        arcd = json.load(f)

    for article in arcd["data"]:
        title = article.get("title", "").strip()
        for paragraph in article["paragraphs"]:
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                id_ = qa["id"]
                qa_answers = qa["answers"]
                # Features currently used downstream are "context", "question"
                # and "answers"; the others are kept for future use.
                yield id_, {
                    "title": title,
                    "context": context,
                    "question": qa["question"].strip(),
                    "id": id_,
                    "answers": {
                        "answer_start": [a["answer_start"] for a in qa_answers],
                        "text": [a["text"].strip() for a in qa_answers],
                    },
                }

In [36]:
def Read_ARCD(path, electraprep):
  """Load a dataset file into three parallel lists of preprocessed examples.

  Every example produced by ``generate_examples(path)`` is passed through
  ``electra_preprocess``. Examples whose answer is not found in the
  preprocessed context (offset ``-1``) are skipped; for the rest,
  ``add_end_index`` is called on the answer before the example is kept.

  Args:
    path: path handed to ``generate_examples``.
    electraprep: preprocessor handed to ``electra_preprocess``.

  Returns:
    ``(contexts, questions, answers)`` as three lists of equal length.
  """
  kept = []
  for _, example in generate_examples(path):
    ctx, qst, ans, offset = electra_preprocess(
        example['context'], example['question'], example['answers'], electraprep)
    if offset == -1:
      # Answer text no longer occurs in the preprocessed context.
      continue
    add_end_index(ans, ctx)
    kept.append((ctx, qst, ans))
  contexts = [item[0] for item in kept]
  questions = [item[1] for item in kept]
  answers = [item[2] for item in kept]
  return contexts, questions, answers

# Preprocessor built for the 'araelectra-base-discriminator' model name; it is
# passed to Read_ARCD, which applies it to contexts, questions and answers.
model_name = 'araelectra-base-discriminator'
electraprep= ArabertPreprocessor(model_name=model_name)
# Training lists come from arcd-train.json; the lists named val_* come from arcd-test.json.
train_contexts, train_questions, train_answers = Read_ARCD('Data/arcd-train.json', electraprep)
val_contexts, val_questions, val_answers = Read_ARCD('Data/arcd-test.json', electraprep)

In [37]:
# Number of examples returned by Read_ARCD for each split.
print(len(train_contexts))
print(len(val_contexts))

693
702


## Encodings

In [39]:
# Create the tokenizer from the Hugging Face hub id of the model.
# NOTE: `model_name` is re-bound here to the full hub id (it held the short name above).
model_name = 'aubmindlab/araelectra-base-discriminator'
electra_tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False)
# Encode (question, context) pairs, question first, so the context is the
# second sequence of each pair; results are requested as PyTorch tensors.
train_encodings = electra_tokenizer(train_questions, train_contexts, truncation=True, padding=True, return_tensors="pt")
val_encodings = electra_tokenizer(val_questions, val_contexts, truncation=True, padding=True, return_tensors="pt")
#test_encodings = electra_tokenizer(test_questions, test_contexts,truncation= True, padding= True, return_tensors="pt")

Downloading:   0%|          | 0.00/503 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/825k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/392 [00:00<?, ?B/s]

In [42]:
# Shape of the validation input-id tensor.
val_encodings.input_ids.shape

torch.Size([702, 472])

In [43]:
def index_to_token_position(encodings , answers, max_length=None):
  """Add token-level ``start_positions`` / ``end_positions`` to ``encodings``.

  For example ``i`` the character offsets ``answers[i]['answer_start']`` and
  ``answers[i]['answer_end']`` are mapped to token indices with
  ``encodings.char_to_token(i, char, 1)`` (sequence index 1, i.e. the second
  text of each encoded pair). ``answer_end`` is treated as an exclusive
  offset (``answer_start + len(text)``), so the last answer character is at
  ``answer_end - 1``; the end label is the token covering that character,
  i.e. an inclusive token index.

  Args:
    encodings: tokenizer output exposing ``char_to_token`` and ``update``.
      Modified in place.
    answers: sequence of dicts with integer ``'answer_start'`` and
      ``'answer_end'`` keys, aligned with the rows of ``encodings``.
    max_length: label used when the start character has no token (treated as
      truncated context). Defaults to ``electra_tokenizer.model_max_length``,
      looked up only when such a case occurs.

  Returns:
    None. ``encodings['start_positions']`` and ``encodings['end_positions']``
    are set to integer tensors of shape ``(len(answers), 1)``.
  """
  start_positions = list()
  end_positions = list()
  for i, answer in enumerate(answers):
    start_char = answer['answer_start']
    end_char = answer['answer_end']
    start_token = encodings.char_to_token(i, start_char, 1)

    # Start from the last answer character (end_char - 1), not the character
    # after the answer, and step left over characters that map to no token
    # (e.g. whitespace). The walk is bounded by the start of the answer.
    end_token = None
    for char_idx in range(end_char - 1, start_char - 1, -1):
      end_token = encodings.char_to_token(i, char_idx, 1)
      if end_token is not None:
        break

    if start_token is None:
      # Start character has no token: use the out-of-range sentinel label.
      if max_length is None:
        max_length = electra_tokenizer.model_max_length
      start_token = max_length
    if end_token is None:
      # No answer character mapped to a token: keep the pair consistent.
      end_token = start_token

    start_positions.append(start_token)
    end_positions.append(end_token)

  n_examples = len(answers)
  encodings.update({
      'start_positions': torch.tensor(start_positions, dtype=torch.long).view(n_examples, 1),
      'end_positions': torch.tensor(end_positions, dtype=torch.long).view(n_examples, 1),
  })

In [44]:
# Adds 'start_positions' and 'end_positions' to both encodings in place.
index_to_token_position(train_encodings, train_answers)
index_to_token_position(val_encodings, val_answers)

In [45]:
def is_truncated(start_pos, max_length=512):
  """Count the start positions that equal the truncation sentinel.

  Despite its name this returns a count, not a boolean.

  Args:
    start_pos: iterable of start positions (e.g. the ``start_positions``
      tensor, whose rows are single-element tensors).
    max_length: sentinel value marking a truncated answer. Defaults to 512,
      the value previously hard-coded here.

  Returns:
    int: number of entries equal to ``max_length``.
  """
  return sum(1 for pos in start_pos if pos == max_length)

# How many start positions in each split carry the value is_truncated counts (512).
print(is_truncated(train_encodings['start_positions']))
print(is_truncated(val_encodings['start_positions']))


0
0


## Dataset and DataLoader

In [46]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [47]:
class AqadDataset(torch.utils.data.Dataset):
    """Dataset that serves one example at a time out of tokenizer encodings."""

    def __init__(self, encodings):
        # Stored as given; each value is indexed by example position in __getitem__.
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of ``encodings.input_ids``."""
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return ``{key: value[idx]}`` for every key in the encodings."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = values[idx]
        return example

# Wrap the encodings (which now include the start/end labels) as datasets.
train_dataset = AqadDataset(train_encodings)
val_dataset = AqadDataset(val_encodings)

In [48]:
# Batches of 8 examples for both splits.
# NOTE(review): the validation loader is shuffled as well; the metrics computed
# in evaluate() are sums/means over examples — confirm shuffling is intended.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = 8, shuffle= True)

In [49]:
def EM_score(pred, GT):
  """Exact-match indicator: 1 when ``torch.equal(pred, GT)`` holds, otherwise 0."""
  return 1 if torch.equal(pred, GT) else 0

In [50]:
def F1_score(prediction, ground_truth):
    """Token-overlap F1 between two token-id tensors.

    Both arguments are converted with ``.tolist()`` and compared as multisets:
    the overlap is the size of the intersection of their ``Counter``s.

    Args:
        prediction: tensor of predicted token ids.
        ground_truth: tensor of reference token ids.

    Returns:
        0 when the two share no token, otherwise the harmonic mean of
        precision (overlap / predicted length) and recall (overlap / reference
        length) as a float.
    """
    pred_ids = prediction.tolist()
    gold_ids = ground_truth.tolist()
    overlap = sum((Counter(pred_ids) & Counter(gold_ids)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(pred_ids)
    recall = 1.0 * overlap / len(gold_ids)
    return (2 * precision * recall) / (precision + recall)

In [51]:
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint and, when it is the best so far, duplicate it.

    Args:
        state: checkpoint object handed to ``torch.save``.
        is_best: truthy when this checkpoint should also become the best one.
        checkpoint_path: file the checkpoint is always written to.
        best_model_path: file the checkpoint is copied to when ``is_best``.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Best checkpoint: copy the file that was just written.
    shutil.copyfile(checkpoint_path, best_model_path)

In [52]:
def load_ckp(checkpoint_fpath, model, optimizer, map_location=None):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: path of a checkpoint holding the keys
            ``'state_dict'``, ``'optimizer'``, ``'val_loss'`` and ``'epoch'``.
        model: model whose ``load_state_dict`` receives ``'state_dict'``.
        optimizer: optimizer whose ``load_state_dict`` receives ``'optimizer'``.
        map_location: forwarded to ``torch.load``; pass e.g. ``'cpu'`` to open
            a checkpoint saved on a GPU on a machine without one. The default
            ``None`` keeps ``torch.load``'s own default behaviour.

    Returns:
        ``(model, optimizer, epoch, valid_loss_min)`` where the last two are the
        checkpoint's ``'epoch'`` and ``'val_loss'`` entries.
    """
    checkpoint = torch.load(checkpoint_fpath, map_location=map_location)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['val_loss']
    return model, optimizer, checkpoint['epoch'], valid_loss_min

In [53]:
def order_exp(base_path, exp_name):
  """Create the directory of an experiment and return its checkpoint paths.

  Args:
    base_path: directory under which experiments are stored. It is created,
      together with any missing parents, if it does not exist yet.
    exp_name: name of the experiment sub-directory.

  Returns:
    ``(curr_ckp_path, best_ckp_path, exp_path)`` where the first two are
    ``curr.pt`` and ``best.pt`` inside ``exp_path``.
  """
  exp_path = os.path.join(base_path, exp_name)
  # makedirs also creates missing parents and tolerates an existing directory.
  os.makedirs(exp_path, exist_ok=True)
  curr_ckp_path = os.path.join(exp_path, 'curr.pt')
  best_ckp_path = os.path.join(exp_path, 'best.pt')
  return curr_ckp_path, best_ckp_path, exp_path

## Modeling

In [57]:
# Load the pretrained checkpoint into a question-answering model class.
model_name = 'aubmindlab/araelectra-base-discriminator'
QA_AraElectra = ElectraForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at aubmindlab/araelectra-base-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at aubmindlab/araelectra-base-discriminator and are newly initialized: ['qa_outputs.weight'

In [67]:
def evaluate(data_loader, model, log, log_path=None, train_loss=None):
  """Evaluate ``model`` on ``data_loader`` and report EM, F1 and mean loss.

  The model is switched to eval mode for the pass and back to train mode
  before returning. Tensors are moved to the module-level ``device``.

  For each example the predicted span is the argmax of the start logits and
  the argmax of the end logits. F1 is computed with ``F1_score`` on the
  ``input_ids`` inside the predicted and reference spans (end inclusive);
  EM is computed with ``EM_score`` on the (start, end) index pairs.

  Args:
    data_loader: iterable of batches with the keys ``'input_ids'``,
      ``'attention_mask'``, ``'token_type_ids'``, ``'start_positions'`` and
      ``'end_positions'``.
    model: called as ``model(input_ids, attention_mask, token_type_ids,
      start_positions=..., end_positions=...)``; its output must expose
      ``loss``, ``start_logits`` and ``end_logits``.
    log: when truthy, append one CSV row to ``<log_path>/res.txt``.
    log_path: directory of the log file; required when ``log`` is truthy.
    train_loss: training loss written to the log row; the field is left
      empty when it is None.

  Returns:
    ``(EM, F1, total_loss)``: EM and F1 as percentages and the running mean
    of the per-batch losses. All three are 0 / 0.0 for an empty loader.

  Raises:
    TypeError: if ``log`` is truthy and ``log_path`` is None.
  """
  if log and log_path is None:
    # Fail before the evaluation pass rather than inside os.path.join after it.
    raise TypeError("log_path must be provided when log is True")

  model.eval()
  with torch.no_grad():
    F1 = EM = Total = 0
    total_loss = 0.0
    for batch_idx, batch in enumerate(data_loader):
      # Move the batch tensors to the target device.
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_start = batch['start_positions'].to(device)
      gt_end = batch['end_positions'].to(device)
      outputs = model(tokens, masks, tokens_type, start_positions=gt_start, end_positions=gt_end)
      loss = outputs.loss
      # Incremental mean of the per-batch losses.
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (loss - total_loss))
      # The last batch may be smaller than the loader's batch size.
      curr_batch_size = gt_start.shape[0]
      for i in range(curr_batch_size):
        start_gt, end_gt = batch['start_positions'][i], batch['end_positions'][i]
        gt_tokens = batch['input_ids'][i][start_gt.item():end_gt.item()+1]
        start_pred = torch.argmax(outputs.start_logits[i], dim=0)
        end_pred = torch.argmax(outputs.end_logits[i], dim=0)
        pred_tokens = batch['input_ids'][i][start_pred.item():end_pred.item()+1]
        F1 += F1_score(pred_tokens, gt_tokens)
        EM += EM_score(torch.tensor([start_pred, end_pred]), torch.tensor([start_gt, end_gt]))
        Total += 1
    if Total:
      EM = 100.0 * EM / Total
      F1 = 100.0 * F1 / Total
    else:
      # Empty loader: avoid dividing by zero.
      EM = F1 = 0.0

  # Save the evaluation results.
  if log:
    log_path = os.path.join(log_path, 'res.txt')
    if not os.path.exists(log_path):
      with open(log_path, 'w') as f:
        # Terminate the header so the first row starts on its own line.
        f.write('EM,f1,ValidationLoss,TrainLoss\n')
    train_field = '' if train_loss is None else f"{train_loss:.2f}"
    with open(log_path, 'a') as f:
      f.write(f"{EM:.2f},{F1:.2f},{total_loss:.2f},{train_field} \n")
  model.train()
  print(f"Validation Results: EM:{EM:.2f}, f1: {F1:.2f}, loss: {total_loss:.2f}")
  return EM, F1, total_loss

In [64]:
def train(model,start_epoch, num_epochs, optimizer,min_val_loss, train_loader, val_loader, log, exp_name):
  """Fine-tune ``model`` and checkpoint it after every epoch.

  Each epoch runs one pass over ``train_loader`` (forward, backward,
  optimizer step), then calls ``evaluate`` on ``val_loader`` and saves a
  checkpoint to ``curr.pt`` in the experiment directory. When the validation
  loss is not above the best seen so far, the checkpoint is also copied to
  ``best.pt``. Tensors are moved to the module-level ``device``.

  Args:
    model: model to train; called with input ids, attention mask, token type
      ids and the start/end labels, and expected to return an object with
      a ``loss`` attribute.
    start_epoch: first epoch index (inclusive).
    num_epochs: end of the epoch range (exclusive).
    optimizer: optimizer stepping the model parameters.
    min_val_loss: best validation loss so far (e.g. ``np.inf`` for a new run).
    train_loader: iterable of training batches.
    val_loader: iterable of validation batches passed to ``evaluate``.
    log: forwarded to ``evaluate``.
    exp_name: sub-directory name under ``Runs/AraElectraArcd/train``.

  Returns:
    The model after the last epoch (not necessarily the best checkpoint).
  """
  base_path = 'Runs/AraElectraArcd/train'
  # Create the run root up front so the experiment directory can be made under it.
  os.makedirs(base_path, exist_ok=True)
  curr_ckp_path, best_ckp_path, exp_path = order_exp(base_path, exp_name)
  model.train()
  for epoch in range(start_epoch, num_epochs):
    total_loss = 0.0
    loop = tqdm(train_loader, leave=True)
    for batch_idx, batch in enumerate(loop):
      tokens = batch['input_ids'].to(device)
      masks = batch['attention_mask'].to(device)
      tokens_type = batch['token_type_ids'].to(device)
      gt_start = batch['start_positions'].to(device)
      gt_end = batch['end_positions'].to(device)
      outputs = model(tokens, masks, tokens_type, start_positions=gt_start, end_positions=gt_end)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      # Detach before averaging: otherwise the running mean keeps a reference
      # to the autograd graph of every batch seen in the epoch.
      total_loss = total_loss + ((1 / (batch_idx + 1)) * (loss.detach() - total_loss))
      loop.set_description(f'Epoch {epoch}')
      loop.set_postfix(loss=loss.item())

    val_em, val_f1, val_loss = evaluate(val_loader, model, log, exp_path, total_loss)
    checkpoint = {
            'epoch': epoch + 1,
            'val_loss': val_loss,
            'val_em': val_em,
            'val_f1': val_f1,
            'train_loss':total_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
    # Ties count as an improvement, as before.
    is_best = bool(val_loss <= min_val_loss)
    if is_best:
      min_val_loss = val_loss
    save_ckp(checkpoint, is_best, curr_ckp_path, best_ckp_path)
  return model


In [65]:
# Training hyper-parameters.
num_epochs = 30
learning_rate = 3e-5
optimizer = torch.optim.Adam(QA_AraElectra.parameters(), lr=learning_rate)
# Use the GPU when one is available; `device` is read as a global by train() and evaluate().
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
QA_AraElectra.to(device)

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(64000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [68]:
# Train from epoch 0 with no previous best validation loss (np.inf), with
# logging enabled (the positional True), under the experiment name 'first'.
trained_model = train(QA_AraElectra,0, num_epochs, optimizer,np.inf, train_loader, val_loader, True, 'first')

Epoch 0: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=2.7]  
Epoch 1: 100%|██████████| 87/87 [02:16<00:00,  1.57s/it, loss=0.767]
Epoch 2: 100%|██████████| 87/87 [02:16<00:00,  1.57s/it, loss=1.16] 
Epoch 3: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.11]  
Epoch 4: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.122] 
Epoch 5: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.942] 
Epoch 6: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.157] 
Epoch 7: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.0696]
Epoch 8: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.0236] 
Epoch 9: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.0968] 
Epoch 10: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.0261] 
Epoch 11: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.0446] 
Epoch 12: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.00361]
Epoch 13: 100%|██████████| 87/87 [02:17<00:00,  1.58s/it, loss=0.00398]
Epoch 14: 100

Validation Results: EM:28.49, f1: 59.56, loss: 2.07
Validation Results: EM:30.06, f1: 60.22, loss: 2.12
Validation Results: EM:28.63, f1: 61.03, loss: 2.33
Validation Results: EM:30.20, f1: 61.92, loss: 2.40
Validation Results: EM:29.63, f1: 59.77, loss: 2.71
Validation Results: EM:31.34, f1: 61.47, loss: 2.72
Validation Results: EM:30.48, f1: 60.96, loss: 2.95
Validation Results: EM:29.63, f1: 60.47, loss: 2.99
Validation Results: EM:30.48, f1: 61.26, loss: 3.09
Validation Results: EM:30.77, f1: 61.64, loss: 3.13
Validation Results: EM:32.62, f1: 62.49, loss: 3.14
Validation Results: EM:29.20, f1: 61.27, loss: 3.29
Validation Results: EM:30.91, f1: 61.62, loss: 3.27
Validation Results: EM:29.91, f1: 61.21, loss: 3.44
Validation Results: EM:32.76, f1: 63.06, loss: 3.36
Validation Results: EM:28.92, f1: 60.83, loss: 3.46
Validation Results: EM:29.06, f1: 60.56, loss: 3.36
Validation Results: EM:30.48, f1: 62.68, loss: 3.57
Validation Results: EM:30.06, f1: 60.11, loss: 3.36
Validation R