<a href="https://colab.research.google.com/github/zeyadahmed10/Arabic-MRC/blob/DataCleaning/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install & import required packages

---



In [1]:
!pip install transformers



In [2]:
import os
import numpy as np
import json
import tensorflow as tf

## Mount Google Drive & create directories for AAQAD
---

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# paths under /content/drive/MyDrive used by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Create the dataset directory on Drive. makedirs also creates the
# intermediate 'MRC' folder, and exist_ok=True makes the cell safe to re-run.
# This also covers the case where 'MRC' already exists but 'aaqad' does not.
os.makedirs('/content/drive/MyDrive/MRC/aaqad', exist_ok=True)

In [5]:
#uncomment if you did not download the data
""" !pip install gdown
!gdown https://drive.google.com/uc?id=1jhUmWb9eHVATqhrWKAXxSE2gqJ53-wk6 -O AAQAD.json
!gdown https://drive.google.com/uc?id=1V5ziIZe__pGg14nH42WyMEFz444XPWf7 -O AAQAD\-train.json
!gdown https://drive.google.com/uc?id=19nj9jiCdJlHwAfgUTJ_Z8jg1cB34yfjv -O AAQAD\-dev.json
!gdown https://drive.google.com/uc?id=1z0XksuTwnqhiX1guxkmjYmoNA_JZ6SUN -O AAQAD\-test.json 

src_path ='/content/'
dest_path = '/content/drive/MyDrive/MRC/aaqad/'
files_name =['AAQAD.json', 'AAQAD-train.json', 'AAQAD-dev.json', 'AAQAD-test.json']
for name in files_name:
  os.rename(src_path+name, dest_path+name)
"""

" !pip install gdown\n!gdown https://drive.google.com/uc?id=1jhUmWb9eHVATqhrWKAXxSE2gqJ53-wk6 -O AAQAD.json\n!gdown https://drive.google.com/uc?id=1V5ziIZe__pGg14nH42WyMEFz444XPWf7 -O AAQAD\\-train.json\n!gdown https://drive.google.com/uc?id=19nj9jiCdJlHwAfgUTJ_Z8jg1cB34yfjv -O AAQAD\\-dev.json\n!gdown https://drive.google.com/uc?id=1z0XksuTwnqhiX1guxkmjYmoNA_JZ6SUN -O AAQAD\\-test.json \n\nsrc_path ='/content/'\ndest_path = '/content/drive/MyDrive/MRC/aaqad/'\nfiles_name =['AAQAD.json', 'AAQAD-train.json', 'AAQAD-dev.json', 'AAQAD-test.json']\nfor name in files_name:\n  os.rename(src_path+name, dest_path+name)\n"

## Load data and preprocessing
---

In [6]:
# Load the dev split for interactive inspection of the dataset layout.
# The encoding is explicit so the Arabic text is decoded as UTF-8 regardless
# of the platform's default locale encoding.
with open('/content/drive/MyDrive/MRC/aaqad/AAQAD-dev.json', encoding='utf-8') as f:
  aaqad_dev_dict = json.load(f)

In [7]:
## Data tree and types: each line goes one level deeper; only the last expression is displayed ##
aaqad_dev_dict['data']## list of articles
aaqad_dev_dict['data'][0]## dictionary for one article -- keys(title, paragraphs)
aaqad_dev_dict['data'][0]['paragraphs'] ## list of paragraphs
aaqad_dev_dict['data'][0]['paragraphs'][0] ## dictionary of a context and its corresponding QA pairs -- keys(context, qas)
aaqad_dev_dict['data'][0]['paragraphs'][0]['qas'] ## list of QA pairs
aaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0] ## dictionary of the elements of one question -- keys(id, is_impossible, question, answers)
aaqad_dev_dict['data'][0]['paragraphs'][0] ['qas'][0]['answers'] ## list of answer dictionaries -- keys(answer_start, text)

[{'answer_start': 6,
  'text': 'لديفيس تطوير مصطلح تبديل الرزم الذي ألهم مُطوري الشبكات'}]

In [8]:
def add_end_index(answer, context, max_shift=2):
  """Validate an answer span against its context and record the end index.

  The labelled `answer_start` is tried first; if the text is not found there,
  the span is retried shifted left by 1 .. `max_shift` characters. On a match
  the answer dict is updated in place: `answer_start` is set to the position
  that actually matched and `answer_end` to the exclusive end of the span, so
  that `context[answer_start:answer_end] == answer['text']`.

  Args:
    answer: dict with keys 'text' and 'answer_start'; mutated on success.
    context: the passage the answer was extracted from.
    max_shift: largest left shift (in characters) to try; defaults to 2.

  Returns:
    False when the span was aligned (no error), True when no alignment was
    found and the answer should be counted as a span error.
  """
  text = answer['text']
  start_idx = answer['answer_start']
  for shift in range(max_shift + 1):
    shifted_start = start_idx - shift
    if shifted_start < 0:
      # A negative index would wrap around to the end of the context.
      break
    shifted_end = shifted_start + len(text)
    if context[shifted_start:shifted_end] == text:
      # Store the offsets that actually matched (shift by `shift`, not by 1).
      answer['answer_start'] = shifted_start
      answer['answer_end'] = shifted_end
      return False
  return True

In [9]:
def Read_AAQAD(path):
  """Load one AAQAD split and flatten it into parallel lists.

  Every answer of every question is checked with `add_end_index`; answers
  whose span can be aligned with the context are kept (and gain an
  'answer_end' key), the others are dropped and counted.

  Args:
    path: path of the JSON file of the split.

  Returns:
    A tuple (contexts, questions, answers, cnt) where the three lists have one
    entry per kept answer and `cnt` is the number of dropped answers.
  """
  contexts = []
  answers = []
  questions = []
  cnt = 0
  # Decode explicitly as UTF-8 so the Arabic text does not depend on the
  # platform's default encoding; the file is closed before the traversal.
  with open(path, encoding='utf-8') as f:
    aaqad_dict = json.load(f)
  for article in aaqad_dict['data']:
    for passage in article['paragraphs']:
      context = passage['context']
      for qa in passage['qas']:
        question = qa['question']
        # When a question carries 'plausible_answers', use those; otherwise
        # use the regular 'answers' list.
        access = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
        for answer in qa[access]:
          if add_end_index(answer, context):
            # The span could not be aligned with the context: count and skip.
            cnt += 1
            continue
          contexts.append(context)
          answers.append(answer)
          questions.append(question)
  return contexts, questions, answers, cnt

In [10]:
# Read the three splits. Each call returns the kept contexts, questions and
# answers, plus the number of answers dropped because their span did not
# match the context.
train_contexts, train_questions, train_answers, train_span_error = Read_AAQAD('/content/drive/MyDrive/MRC/aaqad/AAQAD-train.json')
val_contexts, val_questions, val_answers, val_span_error = Read_AAQAD('/content/drive/MyDrive/MRC/aaqad/AAQAD-dev.json')
test_contexts, test_questions, test_answers, test_span_error = Read_AAQAD('/content/drive/MyDrive/MRC/aaqad/AAQAD-test.json')

In [11]:
# Summary of how many answers were dropped because of misaligned spans.
total_error = train_span_error + val_span_error + test_span_error
kept = len(train_answers) + len(val_answers) + len(test_answers)
# Every answer is either kept or counted as a span error, so the size of the
# data before filtering is derived from the data instead of being hard-coded.
initial_size = kept + total_error
ratio = total_error / initial_size if initial_size else 0.0
print(f"Size of the data set before dropping the misslabeled spans: {initial_size} & after: {kept}")
print(f"Size of each split: \n 1-Train: {len(train_answers)} \n 2-Validation: {len(val_answers)} \n 3-Test: {len(test_answers)}")
# `ratio` is a fraction; format it as a percentage to match the label.
print(f"percentage of span's error {ratio:.2%}")
print(f"Number of errors for each split:\n 1-Train: {train_span_error} \n 2-Validation: {val_span_error}\n 3-Test: {test_span_error}")


Size of the data set before dropping the misslabeled spans: 17817 & after: 17753
Size of each split: 
 1-Train: 12595 
 2-Validation: 1915 
 3-Test: 3243
percentage of span's error 0.0035920749845653028
Number of errors for each split:
 1-Train: 34 
 2-Validation: 11
 3-Test: 19


## Tokenization
---

In [12]:
from transformers import AutoTokenizer
# Create the tokenizer from the pretrained AraBERT checkpoint.
model_name = "aubmindlab/bert-base-arabertv02"
arabert_tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False)

# Encode each split as (context, question) pairs: the context is the first
# sequence and the question the second. Truncation and padding are enabled.
train_encodings = arabert_tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = arabert_tokenizer(val_contexts, val_questions, truncation=True, padding=True)
test_encodings = arabert_tokenizer(test_contexts,test_questions, truncation= True, padding= True)

The encoding is a dictionary with the keys `['input_ids', 'token_type_ids', 'attention_mask']`. <br>
`input_ids`: the token ids of each sequence.

In [13]:
def index_to_token_position(encodings, answers, max_length=None):
  """Convert character-level answer spans into token-level positions.

  Args:
    encodings: tokenizer output exposing `char_to_token(batch_index, char_index)`,
      which returns a token index or None when the character has no token.
    answers: list of dicts with 'answer_start' and 'answer_end' (exclusive)
      character offsets; entry i belongs to example i of `encodings`.
    max_length: sentinel position stored when the answer start has no token
      (treated as "context truncated"). Defaults to
      `arabert_tokenizer.model_max_length`.

  Returns:
    (start_positions, end_positions): two lists of ints, one entry per answer.
    The end position is the token holding the last character of the answer.
  """
  if max_length is None:
    max_length = arabert_tokenizer.model_max_length
  start_positions = []
  end_positions = []
  for i, answer in enumerate(answers):
    char_start = answer['answer_start']
    char_end = answer['answer_end']
    start_token = encodings.char_to_token(i, char_start)
    if start_token is None:
      # Answer start has no token (context truncated): mark both ends with the
      # sentinel so that start and end stay consistent.
      start_positions.append(max_length)
      end_positions.append(max_length)
      continue
    # 'answer_end' is exclusive, so the last answer character is char_end - 1.
    # Walk back over characters without a token (e.g. spaces), but never past
    # the start of the answer.
    end_token = None
    for char_idx in range(char_end - 1, char_start - 1, -1):
      end_token = encodings.char_to_token(i, char_idx)
      if end_token is not None:
        break
    if end_token is None:
      # Only reachable for an empty span (char_end == char_start).
      end_token = start_token
    start_positions.append(start_token)
    end_positions.append(end_token)
  return start_positions, end_positions

In [14]:
# Convert the character-level answer spans of each split to token positions.
train_start_pos, train_end_pos =index_to_token_position(train_encodings, train_answers)
val_start_pos, val_end_pos = index_to_token_position(val_encodings, val_answers)
test_start_pos ,test_end_pos = index_to_token_position(test_encodings, test_answers)

In [15]:
# Show which fields the tokenizer produced for the training split.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [16]:
def if_any_truncated(positions, max_token = arabert_tokenizer.model_max_length):
  """Count the entries of `positions` that equal `max_token`.

  `max_token` defaults to `arabert_tokenizer.model_max_length`, read once when
  the function is defined.
  """
  return sum(1 for pos in positions if pos == max_token)
# Per split: number of start positions equal to the tokenizer's
# model_max_length, the value index_to_token_position stores when
# char_to_token finds no token for the answer start.
print(if_any_truncated(train_start_pos))
print(if_any_truncated(val_start_pos))
print(if_any_truncated(test_start_pos))

20
0
3


## Converting data to tensor for modeling
---

In [17]:
# Pack the start/end token positions of each split into a single tensor.
# Converting [starts, ends] yields two rows; the transpose turns them into
# one (start, end) row per example.
train_span = tf.transpose(tf.convert_to_tensor([train_start_pos, train_end_pos]))
val_span = tf.transpose(tf.convert_to_tensor([val_start_pos, val_end_pos]))
test_span = tf.transpose(tf.convert_to_tensor([test_start_pos, test_end_pos]))

In [18]:
# Sanity check: expect one (start, end) row per training example.
print(train_span.shape)

(12595, 2)
