In [3]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import farasa
from farasa.segmenter import FarasaSegmenter 




In [4]:
# Load the dataset
# Specify the file path
file_path = "./dataset/train.txt"

# Read the contents of the file located at file_path 
# and append each line to the list data_before_preprocessing
with open(file_path, 'r', encoding='utf-8') as file:
    data_before_preprocessing = file.readlines()
    # remove '\n' from each line
    data_before_preprocessing = [line.strip() for line in data_before_preprocessing]
    
    

In [5]:
# Remove diacritics
def remove_diacritics(text):
    text = araby.strip_tashkeel(text)
    return text

# Remove any non-Arabic letters
def remove_non_arabic(text):
    text = re.sub(r'[^\u0600-\u06FF\s]|،|؛', '', text)
    return text

def input_preprocessing_text(text):
    # Correct most common errors on word like repetetion of harakats, or tanween before alef
    text = araby.autocorrect(text)

    # Remove any non-Arabic letters
    text = remove_non_arabic(text)

    # Remove diacritics
    text = remove_diacritics(text)

    # Tokenize
    text = araby.tokenize(text)

    return text

def save_tokenized_input(text):
    words = input_preprocessing_text(text)
    # Write and append on the tokenized input to a file
    with open('./generatedFiles/tokenized_input.txt', 'a', encoding='utf-8') as file:
        for word in words:
            file.write(word + '\n')

def save_gold_output(text):
    # Remove any non-Arabic letters and extra spaces
    text = remove_non_arabic(text)

    # Tokenize
    text = araby.tokenize(text)

    # Write and append on the gold output to a file
    with open('./generatedFiles/gold_output.txt', 'a', encoding='utf-8') as file:
        for word in text:
            # if last word in the text don't add '\n'
            file.write(word + '\n')

# Extract diacritics by returning a list containing a tuple of 3 elements: (letter, tashkeel, shadda)
def extract_arabic_diacritics(tokenized_text):
    diacritics_list = []
    for word in tokenized_text:
        extracted_word, tashkeel, shadda = araby.separate(word, extract_shadda=True)
        for i in range(len(extracted_word)):
            diacritics_list.append((extracted_word[i], araby.name(tashkeel[i]), araby.name(shadda[i])))
    return diacritics_list

In [4]:
# # RUN ONE TIME ONLY THIS CODE AGAIN 
# # Generate Gold Input file
# for i in range(len(data_before_preprocessing)):
#     save_tokenized_input(data_before_preprocessing[i])

In [5]:
# #RUN ONE TIME ONLY THIS CODE AGAIN
# # Generate Gold Output file
# for i in range(len(data_before_preprocessing)):
#     test = data_before_preprocessing[i]
#     text1 = save_gold_output(test)

In [6]:
# For testing

test = "قَالَ ابْنُ الْقَاسِمِ : قَالَ مَالِكٌ فِي مَكِّيٍّ أَحْرَمَ بِحَجَّةٍ مِنْ الْحَرَمِ ثُمَّ أُحْصِرَ ، أَنَّهُ يَخْرُجُ إلَى الْحِلِّ فَيُلَبِّي مِنْ هُنَاكَ لِأَنَّهُ أَمَرَ مَنْ فَاتَهُ الْحَجُّ وَقَدْ أَحْرَمَ مِنْ مَكَّةَ ، أَنْ يَخْرُجَ إلَى الْحِلِّ فَيَعْمَلَ فِيمَا بَقِيَ عَلَيْهِ مَا يَعْمَلُ الْمُعْتَمِرُ وَيُحِلُّ .( 2 / 437 ) "
text2 = input_preprocessing_text(test)
print(text2)
text3 = remove_non_arabic(test)
text3 = araby.tokenize(text3)
diacritics_list = extract_arabic_diacritics(text3)
print(diacritics_list)

['قال', 'ابن', 'القاسم', 'قال', 'مالك', 'في', 'مكي', 'أحرم', 'بحجة', 'من', 'الحرم', 'ثم', 'أحصر', 'أنه', 'يخرج', 'إلى', 'الحل', 'فيلبي', 'من', 'هناك', 'لأنه', 'أمر', 'من', 'فاته', 'الحج', 'وقد', 'أحرم', 'من', 'مكة', 'أن', 'يخرج', 'إلى', 'الحل', 'فيعمل', 'فيما', 'بقي', 'عليه', 'ما', 'يعمل', 'المعتمر', 'ويحل']
[('ق', 'فتحة', 'تطويل'), ('ا', 'تطويل', 'تطويل'), ('ل', 'فتحة', 'تطويل'), ('ا', 'تطويل', 'تطويل'), ('ب', 'سكون', 'تطويل'), ('ن', 'ضمة', 'تطويل'), ('ا', 'تطويل', 'تطويل'), ('ل', 'سكون', 'تطويل'), ('ق', 'فتحة', 'تطويل'), ('ا', 'تطويل', 'تطويل'), ('س', 'كسرة', 'تطويل'), ('م', 'كسرة', 'تطويل'), ('ق', 'فتحة', 'تطويل'), ('ا', 'تطويل', 'تطويل'), ('ل', 'فتحة', 'تطويل'), ('م', 'فتحة', 'تطويل'), ('ا', 'تطويل', 'تطويل'), ('ل', 'كسرة', 'تطويل'), ('ك', 'ضمتان', 'تطويل'), ('ف', 'كسرة', 'تطويل'), ('ي', 'تطويل', 'تطويل'), ('م', 'فتحة', 'تطويل'), ('ك', 'سكون', 'شدة'), ('ي', 'كسرة', 'شدة'), ('أ', 'فتحة', 'تطويل'), ('ح', 'سكون', 'تطويل'), ('ر', 'فتحة', 'تطويل'), ('م', 'فتحة', 'تطويل'), ('ب', 'كسرة', 

In [7]:
# Important functions in PyArabic

# araby.tokenize(text) # Tokenize the sentence text into words
# araby.is_arabicrange(text) # Check if the text is Arabic
# araby.sentence_tokenize(text) # Tokenize the text into sentences
# araby.strip_tashkeel(text) # Remove diacritics (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN)
# araby.strip_diacritics(text) # Remove diacritics (Small Alef الألف الخنجرية, Harakat + Shadda, Quranic marks)
# araby.strip_tatweel(text) # Remove tatweel
# araby.strip_shadda(text) # Remove shadda
# araby.autocorrect(text) # Correct most common errors on word like repetetion of harakats,or tanwin befor alef
# araby.arabicrange() # Return a list of Arabic characters

# New Functions in PyArabic
# araby.vocalized_similarity(word1, word2) # if the two words has the same letters and the same harakats, this function return True. 
# The two words can be full vocalized, or partial vocalized

# araby.vocalizedlike(word1, word2) Same as vocalized_similarity but return True and False

# araby.joint(word1, word2) # joint the letters with the marks the length ot letters and marks must be equal return word



# Return the text, its tashkeel and shadda if extract_shadda is True
# text, marks, shada = araby.separate(text,extract_shadda=True) # Separate diacritics from the text
# print (text)
# for m in marks:
#     print (araby.name(m))

# for s in shada:
#     print (araby.name(s))

In [7]:
# read the tokenized input file
with open('./generatedFiles/tokenized_input.txt', 'r', encoding='utf-8') as file:
    tokenized_input = file.readlines()
    print(len(tokenized_input))
    # remove '\n' from each line
    tokenized_input = [line.strip() for line in tokenized_input]
    # put in tokenized_input list the tokenized input of length 1
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]
    # get the inde

print(tokenized_input[:10])



2104308
['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']


In [6]:
print(len(tokenized_input))

2104308


# **Core Word (CW) Diacritization**

## **Feature Extraction**

### 1 - Characters: here we extract each character from all tokenized words and create a vector of size 50 for each character.

In [8]:
tokenizer_char = Tokenizer(char_level=True)
tokenizer_char.fit_on_texts(tokenized_input)
sequences_char = tokenizer_char.texts_to_sequences(tokenized_input)
char_features = pad_sequences(sequences_char)   # padding the sequences to have the same length as the longest sequence (word)
char_embeddings = np.random.rand(len(tokenizer_char.word_index) + 1, 50)

In [9]:
print(char_features.shape) # (number of words, max length of word in the dataset)


print(char_embeddings.shape)

# 38 rows: 37 unique characters identified by the tokenizer, 1 row for handling characters not seen in the training data
# 50 columns: Each character is encoded as a 50-dimensional vector

(2104308, 13)
(38, 50)


In [10]:
print(char_features[0]) 
# the number of non zero elements corresponds to the length of the word 
# and the value of each element corresponds to the index of the character in the tokenizer
# which means that every character now is encoded as a number and this number is the index of the character in the tokenizer

[ 0  0  0  0  0  0  0  0  0 13  5  1  7]


In [10]:
print(char_embeddings[0])
# this is the embedding of each character in the tokenizer

[0.98932928 0.643462   0.77532937 0.1227929  0.73089188 0.21288858
 0.99034029 0.15313401 0.44557446 0.12192174 0.44356161 0.12061557
 0.8478695  0.76412183 0.7398288  0.04048527 0.08625787 0.03671214
 0.3239838  0.75342392 0.92016011 0.11164682 0.36365469 0.55828169
 0.47184493 0.41743149 0.58200443 0.81894359 0.52881037 0.70788089
 0.74585946 0.9119129  0.72889559 0.41261597 0.81483764 0.03791911
 0.7136551  0.4430647  0.53942118 0.49148931 0.47247786 0.39238231
 0.49273873 0.51496099 0.08505062 0.17991849 0.14699755 0.07638961
 0.4219707  0.02121006]


In [11]:
print(char_embeddings[char_features[0]].shape)
# 13 is the word of characters and 50 is the embedding size of each character

print(char_embeddings[char_features[0]])
# this is the embedding of each character in the first tokenized word, this is the 1st feature and the input of the model

(13, 50)
[[0.98932928 0.643462   0.77532937 0.1227929  0.73089188 0.21288858
  0.99034029 0.15313401 0.44557446 0.12192174 0.44356161 0.12061557
  0.8478695  0.76412183 0.7398288  0.04048527 0.08625787 0.03671214
  0.3239838  0.75342392 0.92016011 0.11164682 0.36365469 0.55828169
  0.47184493 0.41743149 0.58200443 0.81894359 0.52881037 0.70788089
  0.74585946 0.9119129  0.72889559 0.41261597 0.81483764 0.03791911
  0.7136551  0.4430647  0.53942118 0.49148931 0.47247786 0.39238231
  0.49273873 0.51496099 0.08505062 0.17991849 0.14699755 0.07638961
  0.4219707  0.02121006]
 [0.98932928 0.643462   0.77532937 0.1227929  0.73089188 0.21288858
  0.99034029 0.15313401 0.44557446 0.12192174 0.44356161 0.12061557
  0.8478695  0.76412183 0.7398288  0.04048527 0.08625787 0.03671214
  0.3239838  0.75342392 0.92016011 0.11164682 0.36365469 0.55828169
  0.47184493 0.41743149 0.58200443 0.81894359 0.52881037 0.70788089
  0.74585946 0.9119129  0.72889559 0.41261597 0.81483764 0.03791911
  0.7136551  0

### 2 - The position of the character in a word segment. For example, given the word “wAlktAb” , which is composed of three segments “w+Al+ktAb”. Letters were marked as “B” if they begin a segment, “M” if they are in the middle of a segment, “E” if they end a segment, and “S” if they are single letter segments. So for “w+Al+ktAb”, the corresponding character positions are “S+BE+BMME.”

In [12]:
segmenter = FarasaSegmenter(interactive=True) # The default behaviour




100%|██████████| 241M/241M [05:22<00:00, 749kiB/s] 




In [14]:
def get_seg_tags(word):                 # word = "wAlktAb"
    segments = segmenter.segment(word)  # segments will be a list: ["w", "Al", "ktAb"]
    segments = segments.split('+')
    seg_tags = []
    for segment in segments:
        if len(segment) == 1:
            seg_tags.append("S")
        else:
            seg_tags.append("B")  # First letter
            seg_tags.extend("M" * (len(segment) - 2))  # Middle letters
            seg_tags.append("E")  # Last letter
    return segments, seg_tags

word = "كقلمه"
segments, seg_tags = get_seg_tags(word)
print("Segmented word:", segments)
print("SEG tags:", seg_tags)


Segmented word: ['ك', 'قلم', 'ه']
SEG tags: ['S', 'B', 'M', 'E', 'S']


In [55]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 25m 4.5s TO RUN
# # The Output of this code is the input_segments.txt file

# for i in range(len(tokenized_input)):
#     segments, seg_tags = get_seg_tags(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/input_segments.txt', 'a', encoding='utf-8') as file:
#         for tag in seg_tags:
#             file.write(tag)
#         file.write('\n')

In [15]:
# read the input_segments file
with open('./generatedFiles/input_segments.txt', 'r', encoding='utf-8') as file:
    input_segments = file.readlines()
    print(len(input_segments))
    # remove '\n' from each line
    input_segments = [line.strip() for line in input_segments]
    # put in tokenized_input list the tokenized input of length 1
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]
    # get the inde

print(input_segments[:10])

2104308
['BMES', 'BE', 'BME', 'BEBME', 'BES', 'BME', 'BME', 'BEBMMME', 'BME', 'BMES']


In [16]:
tokenizer_tags = Tokenizer(char_level=True)
tokenizer_tags.fit_on_texts(input_segments)
sequences_tags = tokenizer_tags.texts_to_sequences(input_segments)
tags_features = pad_sequences(sequences_tags)   
tags_embeddings = np.random.rand(len(tokenizer_tags.word_index) + 1, 50)

In [17]:
print(tags_features.shape) 
print(tags_embeddings.shape)

(2104308, 13)
(5, 50)


### 3 - PRIOR: diacritics seen in the training set per segment. Since we used a character-level model, this feature informed the model with word-level information. For example, the word “ktAb”  was observed to have two diacritized forms in the training set, namely “kitaAb” ( – book) and “kut∼aAb” ( – writers). The first letter in the word (“k”) accepted the diacritics “i” and “u.” Thus, given a binary vector representing whether a character is allowed to assume any of the eight primitive Arabic diacritic marks (a, i, u, o, K, N, F, and ∼ in order), the first letter would be given the following vector “01100000.” If a word segment was never observed during training, then the vector for all letters therein would be set to 11111111.

In [11]:
# read the gold_output file
with open('./generatedFiles/gold_output.txt', 'r', encoding='utf-8') as file:
    gold_output = file.readlines()
    print(len(gold_output))
    # remove '\n' from each line
    gold_output = [line.strip() for line in gold_output]
    # put in tokenized_input list the tokenized input of length 1
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]
    # get the inde

print(gold_output[:10])

2104308
['قَوْلُهُ', 'أَوْ', 'قَطَعَ', 'الْأَوَّلُ', 'يَدَهُ', 'إلَخْ', 'قَالَ', 'الزَّرْكَشِيُّ', 'ابْنُ', 'عَرَفَةَ']


In [12]:

print(gold_output[:10])

['قَوْلُهُ', 'أَوْ', 'قَطَعَ', 'الْأَوَّلُ', 'يَدَهُ', 'إلَخْ', 'قَالَ', 'الزَّرْكَشِيُّ', 'ابْنُ', 'عَرَفَةَ']


In [13]:
print(len(tokenized_input))
print(tokenized_input[:10])

2104308
['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']


In [2]:
# Map each diacrtics to its unicode
diacritics_mapping = {
    'FATHA': '\u064E',
    'DAMMA': '\u064F',
    'KASRA': '\u0650',
    'SUKUN': '\u0652',
    'SHADDA': '\u0651',
    'FATHATAN': '\u064B',
    'DAMMATAN': '\u064C',
    'KASRATAN': '\u064D',
    'TANWEEN': '\u0640'
}

In [35]:
# firstly, initialize an empty dictionary for the 'prior' feature
# the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, SUKUN, SHADDA with FATHA, SHADDA with DAMMA, SHADDA with KASRA, NULL) as a binary vector

# then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
def get_prior(tokenized_input, gold_output):
    prior = {} # this dictionary will hold a key of tuple of 2 elements (word, character) and 
    for i in range(len(tokenized_input)):
        if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
            # get the indices of the word in the tokenized input array
            indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
            print(indices)
            # get the words in the gold_output array with the same indices
            words = [gold_output[j] for j in indices]
            for indx, charac in enumerate(tokenized_input[i]):
                for word in words:
                    # extract the diacritics of word[indx]
                    prior[(tokenized_input[i], charac, indx)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
                    if diacritics_mapping['SHADDA'] in word[indx]:
                        prior[(tokenized_input[i], charac, indx)][5] = 1 if diacritics_mapping['DAMMA'] in word[indx: indx+2] else 0
                        prior[(tokenized_input[i], charac, indx)][4] = 1 if diacritics_mapping['FATHA'] in word[indx: indx+2] else 0
                        prior[(tokenized_input[i], charac, indx)][6] = 1 if diacritics_mapping['KASRA'] in word[indx: indx+2] else 0
                        prior[(tokenized_input[i], charac, indx)][7] = 1 if not  diacritics_mapping['FATHA'] in word[indx: indx+2] and not diacritics_mapping['DAMMA'] in word[indx: indx+2]  and not diacritics_mapping['KASRA'] in word[indx: indx+2] else 0
                    else:
                        prior[(tokenized_input[i], charac,indx)][0] = 1 if diacritics_mapping['FATHA'] in word[indx: indx+2] else 0
                        prior[(tokenized_input[i], charac,indx)][1] = 1 if diacritics_mapping['DAMMA'] in word[indx: indx+2] else 0
                        prior[(tokenized_input[i], charac,indx)][2] = 1 if diacritics_mapping['KASRA'] in word[indx: indx+2] else 0
                        prior[(tokenized_input[i], charac,indx)][3] = 1 if diacritics_mapping['SUKUN'] in word[indx: indx+2] else 0
    return prior

In [36]:
test_tokenized_input = ['كإنكار', 'كإنكار', 'بقذر','بقذر', 'أكثر', 'أكثر']
test_gold_output = ['كَإِنْكَارِ','كَإِنْكَارٍ', 'بِقَذَر', 'بِقَذَرٍ','أكْثَرَ', 'أَكْثَرُ']
print (get_prior(test_tokenized_input, test_gold_output))

[0, 1]
[2, 3]
[4, 5]
{('كإنكار', 'ك', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'إ', 1): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ن', 2): [0, 0, 1, 0, 0, 0, 0, 0], ('كإنكار', 'ك', 3): [0, 0, 1, 0, 0, 0, 0, 0], ('كإنكار', 'ا', 4): [0, 0, 0, 1, 0, 0, 0, 0], ('كإنكار', 'ر', 5): [0, 0, 0, 1, 0, 0, 0, 0], ('بقذر', 'ب', 0): [0, 0, 1, 0, 0, 0, 0, 0], ('بقذر', 'ق', 1): [0, 0, 1, 0, 0, 0, 0, 0], ('بقذر', 'ذ', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ر', 3): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'أ', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ك', 1): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ث', 2): [0, 0, 0, 1, 0, 0, 0, 0], ('أكثر', 'ر', 3): [0, 0, 0, 1, 0, 0, 0, 0]}


In [38]:
letter, tashkeel, shadda = araby.separate('زَّ', extract_shadda=True)
enkar = 'كَإِنْكَارِ'
print(enkar[4:6])
print( diacritics_mapping['FATHA'] in enkar[0:1])
print( diacritics_mapping['SHADDA'] in 'زَّ')
print( diacritics_mapping['DAMMA'] in 'زَّ')

نْ
False
True
False
