<a href="https://colab.research.google.com/github/zeyadahmed10/Arabic-MRC/blob/Training/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install & import required packages

---



In [1]:
!pip install transformers



In [2]:
import os
import numpy as np
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel , AutoTokenizer

## Mount google drive & creating new directories for AAQAD
---

In [3]:
if not os.path.exists('/content/drive/MyDrive/MRC'):
  os.mkdir('/content/drive/MyDrive/MRC')
  os.mkdir('/content/drive/MyDrive/MRC/aaqad')

In [4]:
#uncomment if you did not download the data
""" !pip install gdown
!gdown https://drive.google.com/uc?id=1jhUmWb9eHVATqhrWKAXxSE2gqJ53-wk6 -O AAQAD.json
!gdown https://drive.google.com/uc?id=1V5ziIZe__pGg14nH42WyMEFz444XPWf7 -O AAQAD\-train.json
!gdown https://drive.google.com/uc?id=19nj9jiCdJlHwAfgUTJ_Z8jg1cB34yfjv -O AAQAD\-dev.json
!gdown https://drive.google.com/uc?id=1z0XksuTwnqhiX1guxkmjYmoNA_JZ6SUN -O AAQAD\-test.json 

src_path ='/content/'
dest_path = '/content/drive/MyDrive/MRC/aaqad/'
files_name =['AAQAD.json', 'AAQAD-train.json', 'AAQAD-dev.json', 'AAQAD-test.json']
for name in files_name:
  os.rename(src_path+name, dest_path+name)
"""

" !pip install gdown\n!gdown https://drive.google.com/uc?id=1jhUmWb9eHVATqhrWKAXxSE2gqJ53-wk6 -O AAQAD.json\n!gdown https://drive.google.com/uc?id=1V5ziIZe__pGg14nH42WyMEFz444XPWf7 -O AAQAD\\-train.json\n!gdown https://drive.google.com/uc?id=19nj9jiCdJlHwAfgUTJ_Z8jg1cB34yfjv -O AAQAD\\-dev.json\n!gdown https://drive.google.com/uc?id=1z0XksuTwnqhiX1guxkmjYmoNA_JZ6SUN -O AAQAD\\-test.json \n\nsrc_path ='/content/'\ndest_path = '/content/drive/MyDrive/MRC/aaqad/'\nfiles_name =['AAQAD.json', 'AAQAD-train.json', 'AAQAD-dev.json', 'AAQAD-test.json']\nfor name in files_name:\n  os.rename(src_path+name, dest_path+name)\n"

## Load data and preprocessing
---

In [5]:
##DATA TREE AND TYPE##
'''
aaqad_dev_dict['data']##list of articles
aaqad_dev_dict['data'][0]## dictionary of paragraph -- keys(title, paragraph)
aaqad_dev_dict['data'][0]['paragraphs'] ##list of contexts
aaqad_dev_dict['data'][0]['paragraphs'][0] ## dictionary of context and crossponding QAs pairs --keys(context, qas)
aaqad_dev_dict['data'][0]['paragraphs'][0]['qas'] ##list of QAs pair
aaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0] ##dictionary of the elements of each question --keys(id, is_impossible,question, answers)
aaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0]['answers'] ##dictionary of start index and answer text --keys(answer_start, text)'''

"\naaqad_dev_dict['data']##list of articles\naaqad_dev_dict['data'][0]## dictionary of paragraph -- keys(title, paragraph)\naaqad_dev_dict['data'][0]['paragraphs'] ##list of contexts\naaqad_dev_dict['data'][0]['paragraphs'][0] ## dictionary of context and crossponding QAs pairs --keys(context, qas)\naaqad_dev_dict['data'][0]['paragraphs'][0]['qas'] ##list of QAs pair\naaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0] ##dictionary of the elements of each question --keys(id, is_impossible,question, answers)\naaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0]['answers'] ##dictionary of start index and answer text --keys(answer_start, text)"

In [6]:
def add_end_index(answer, context):
  ## 1 if span mathc the context 0 otherwise
  text = answer['text']
  start_idx = answer['answer_start']
  end_idx = start_idx + len(text)
  if text == context[start_idx:end_idx]:
    answer['answer_end'] = end_idx
    return False
  for i in range(1,3):
    if text == context[start_idx-i:end_idx-i]:
      answer['answer_end']= end_idx-1
      answer['answer_start'] = start_idx-1
      return False
  return True

In [7]:
def Read_AAQAD(path):
  contexts =[]
  answers =[]
  questions =[]
  plausible = []
  cnt = 0
  with open(path) as f:
    aaqad_dict = json.load(f)
    for article in aaqad_dict['data']:
      for passage in article['paragraphs']:
        context = passage['context']
        for qa in passage['qas']:
          question = qa['question']
          if 'plausible_answers' in qa.keys():# there is two cases if the question have no answer then use plausible answer
            access = 'plausible_answers'
            plausible.append(1)
          else:
            access = 'answers'
            plausible.append(0)
          for answer in qa[access]:
            flag = add_end_index(answer, context) #if false dont add the 
            cnt =cnt + flag
            if not flag:
              contexts.append(context)
              answers.append(answer)
              questions.append(question)
  return contexts,questions,answers,plausible,cnt

In [8]:
train_contexts, train_questions, train_answers,train_plausible, train_span_error = Read_AAQAD('/content/drive/MyDrive/MRC/aaqad/AAQAD-train.json')
val_contexts, val_questions, val_answers,val_plausible, val_span_error = Read_AAQAD('/content/drive/MyDrive/MRC/aaqad/AAQAD-dev.json')
test_contexts, test_questions, test_answers,test_plausible, test_span_error = Read_AAQAD('/content/drive/MyDrive/MRC/aaqad/AAQAD-test.json')

In [9]:
total_error = train_span_error + val_span_error + test_span_error
ratio = total_error/17817 #initial size of the data
print(f"Size of the data set before dropping the misslabeled spans: 17817 & after: {len(train_answers)+len(val_answers)+len(test_answers)}")
print(f"Size of each split: \n 1-Train: {len(train_answers)} \n 2-Validation: {len(val_answers)} \n 3-Test: {len(test_answers)}")
print(f"percentage of span's error {ratio}")
print(f"Number of errors for each split:\n 1-Train: {train_span_error} \n 2-Validation: {val_span_error}\n 3-Test: {test_span_error}")


Size of the data set before dropping the misslabeled spans: 17817 & after: 17753
Size of each split: 
 1-Train: 12595 
 2-Validation: 1915 
 3-Test: 3243
percentage of span's error 0.0035920749845653028
Number of errors for each split:
 1-Train: 34 
 2-Validation: 11
 3-Test: 19


## Tokenization
---

In [10]:
#Creating the tokenizer
model_name = "aubmindlab/bert-base-arabertv02"
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False)

train_encodings = arabert_tokenizer(train_questions, train_contexts, truncation=True, padding=True, return_tensors="pt")
val_encodings = arabert_tokenizer(val_questions, val_contexts, truncation=True, padding=True, return_tensors="pt")
test_encodings = arabert_tokenizer(test_questions, test_contexts,truncation= True, padding= True, return_tensors="pt")

The encoding is dictionary of ['input_ids', 'token_type_ids', 'attention_mask'] <br>
Input_ids: are the token of each sequence

In [11]:
train_answers[0]

{'answer_end': 227, 'answer_start': 211, 'text': 'تصل المسافة التي'}

In [12]:
train_contexts[0]

'هجرة الطيور هي رحلة موسمية تقوم بها أسراب من الطيور قاطعين مسافات هائلة عبر الصحاري وقمم الجبال العالية والمحيطات. تصل هذه الطيور إلى هدفها في وقت واحد يتطابق مع الوقت التي وصلت فيه في العام السابق. بعض الأنواع تصل المسافة التي تقطعها في هجرتها إلى 50 الف كيلومتر في السنة، البعض الآخر تستمر بالطيران بدون انقطاع لمدة تصل إلى 100 ساعة مع منظومة تحديد دقيقة للاتجاهات عند تلك الطيور. بعض الأنواع لها القدرة على الطيران لمسافات طويلة، ليلا ونهارا، دون توقف. هذه القدرة هامة للغاية للتمكن من عبور الصحاري الكبرى الممتدة لالاف الكيلومترات بدون طعام أو ماء. قبل بدء رحلتهم لعبور الصحراء تقوم الطيور بأكل طعام غني بالدهون مثل حبوب الذرة.\n'

In [13]:
train_encodings.char_to_token(0,train_answers[0]['answer_start'], 1)

61

In [14]:
def index_to_token_position(encodings , answers):
  start_positions = list()
  end_positions = list()
  for i in range(len(answers)):
    start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'], 1))
    end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'], 1))
    #if context truncated
    if start_positions[-1] is None: 
      start_positions[-1] = arabert_tokenizer.model_max_length
    #if end index is space
    itt = 1
    while end_positions[-1] is None: 
      end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end']-itt, 1)
      itt = itt + 1 
  encodings.update({'start_positions': torch.tensor(start_positions), 'end_positions': torch.tensor(end_positions)})

In [15]:
index_to_token_position(train_encodings, train_answers)
index_to_token_position(val_encodings, val_answers)
index_to_token_position(test_encodings, test_answers)

In [16]:
def is_truncated(start_pos):
  cnt = 0
  for pos in start_pos:
    if pos==512:
      cnt+=1
  return cnt

print(is_truncated(train_encodings['start_positions']))
print(is_truncated(val_encodings['start_positions']))
print(is_truncated(test_encodings['start_positions']))

20
0
3


In [17]:
#for metrics
#decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
#print(decoded_string)

In [18]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

## Modeling
---

In [19]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

In [20]:
class AqadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = AqadDataset(train_encodings)
val_dataset = AqadDataset(val_encodings)

In [21]:
AraBert = AutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
AraBert.parameters()

<generator object Module.parameters at 0x7f760a2a9f50>

In [23]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = 16, shuffle= True)

In [24]:
first = train_dataset.__getitem__(0)
first.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [25]:
second = {
    'input_ids':first['input_ids'],
    'token_type_ids':first['token_type_ids'],
    'attention_mask':first['attention_mask']
}


In [26]:
encodings = arabert_tokenizer(train_questions[0:3], train_contexts[0:3], truncation=True, padding=True, return_tensors="pt")
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained(model_name)
outputs = model(**encodings, start_positions=train_encodings['start_positions'][:3], end_positions=train_encodings['end_positions'][:3])
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv02 were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized fr

In [31]:
loss

tensor(5.1345, grad_fn=<DivBackward0>)

In [36]:
starts = torch.argmax(start_scores, dim=-1)
ends = torch.argmax(end_scores, dim=-1)
print(starts)
print(ends)

tensor([15, 85, 14])
tensor([ 9, 37, 23])


In [27]:
#for i, x in enumerate(train_loader):
#  output= AraBert(**x)

In [28]:
output = AraBert(**second)

ValueError: ignored

In [None]:
AraBert(input_ids = first['input_ids'], token_type_ids = first['token_type_ids'],attention_mask = first['attention_mask'])

In [None]:
class AraBertBase(nn.Module):
  def __init__(self, BERT):
    super().__init__()
    self.base = BERT
  def forward(self, X):
    output_dict = self.base(**X)
    
