In [72]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import farasa
from farasa.segmenter import FarasaSegmenter 
import unicodedata

In [73]:
# Load the dataset
# Specify the file path
file_path = "./dataset/train.txt"

# Read the contents of the file located at file_path 
# and append each line to the list data_before_preprocessing
with open(file_path, 'r', encoding='utf-8') as file:
    data_before_preprocessing = file.readlines()
    # remove '\n' from each line
    data_before_preprocessing = [line.strip() for line in data_before_preprocessing]
    
    

In [74]:
# Remove diacritics
def remove_diacritics(text):
    text = araby.strip_tashkeel(text)
    return text

# Remove any non-Arabic letters
def remove_non_arabic(text):
    text = re.sub(r'[^\u0600-\u06FF\s]|،|؛', '', text)
    return text

def input_preprocessing_text(text):
    # Correct most common errors on word like repetetion of harakats, or tanween before alef
    text = araby.autocorrect(text)

    # Remove any non-Arabic letters
    text = remove_non_arabic(text)

    # Remove diacritics
    text = remove_diacritics(text)

    # Tokenize
    text = araby.tokenize(text)

    return text

def save_tokenized_input(text):
    words = input_preprocessing_text(text)
    # Write and append on the tokenized input to a file
    with open('./generatedFiles/tokenized_input.txt', 'a', encoding='utf-8') as file:
        for word in words:
            file.write(word + '\n')

def save_gold_output(text):
    # Remove any non-Arabic letters and extra spaces
    text = remove_non_arabic(text)

    # Tokenize
    text = araby.tokenize(text)

    # Write and append on the gold output to a file
    with open('./generatedFiles/gold_output.txt', 'a', encoding='utf-8') as file:
        for word in text:
            # if last word in the text don't add '\n'
            file.write(word + '\n')


def is_not_arabic_diacritic(char):
   category = unicodedata.category(char)
   return not (category == 'Mn' or category == 'Mc')


In [75]:
# Example usage:
character = 'ذْ'
if is_not_arabic_diacritic(character[1]):
   print("The character is not an Arabic diacritic.")
else:
   print("The character is an Arabic diacritic.")


# Testing of is_not_arabic_diacritic() function with gettting the index of the first non diacritic character in the word
word = 'زَّراع'
 
for i in range(1, len(word)): # start from 1 because the first character is not a diacritic
    if is_not_arabic_diacritic(word[i]):
        print(i)
        break 

The character is an Arabic diacritic.
3


In [76]:
# # RUN ONE TIME ONLY THIS CODE AGAIN 
# # Generate Gold Input file
# for i in range(len(data_before_preprocessing)):
#     save_tokenized_input(data_before_preprocessing[i])

In [77]:
# #RUN ONE TIME ONLY THIS CODE AGAIN
# # Generate Gold Output file
# for i in range(len(data_before_preprocessing)):
#     test = data_before_preprocessing[i]
#     text1 = save_gold_output(test)

In [78]:
# For testing
test = "قَالَ ابْنُ الْقَاسِمِ : قَالَ مَالِكٌ فِي مَكِّيٍّ أَحْرَمَ بِحَجَّةٍ مِنْ الْحَرَمِ ثُمَّ أُحْصِرَ ، أَنَّهُ يَخْرُجُ إلَى الْحِلِّ فَيُلَبِّي مِنْ هُنَاكَ لِأَنَّهُ أَمَرَ مَنْ فَاتَهُ الْحَجُّ وَقَدْ أَحْرَمَ مِنْ مَكَّةَ ، أَنْ يَخْرُجَ إلَى الْحِلِّ فَيَعْمَلَ فِيمَا بَقِيَ عَلَيْهِ مَا يَعْمَلُ الْمُعْتَمِرُ وَيُحِلُّ .( 2 / 437 ) "
text2 = input_preprocessing_text(test)
print(text2)
text3 = remove_non_arabic(test)

['قال', 'ابن', 'القاسم', 'قال', 'مالك', 'في', 'مكي', 'أحرم', 'بحجة', 'من', 'الحرم', 'ثم', 'أحصر', 'أنه', 'يخرج', 'إلى', 'الحل', 'فيلبي', 'من', 'هناك', 'لأنه', 'أمر', 'من', 'فاته', 'الحج', 'وقد', 'أحرم', 'من', 'مكة', 'أن', 'يخرج', 'إلى', 'الحل', 'فيعمل', 'فيما', 'بقي', 'عليه', 'ما', 'يعمل', 'المعتمر', 'ويحل']


In [79]:
# Important functions in PyArabic

# araby.tokenize(text) # Tokenize the sentence text into words
# araby.is_arabicrange(text) # Check if the text is Arabic
# araby.sentence_tokenize(text) # Tokenize the text into sentences
# araby.strip_tashkeel(text) # Remove diacritics (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN)
# araby.strip_diacritics(text) # Remove diacritics (Small Alef الألف الخنجرية, Harakat + Shadda, Quranic marks)
# araby.strip_tatweel(text) # Remove tatweel
# araby.strip_shadda(text) # Remove shadda
# araby.autocorrect(text) # Correct most common errors on word like repetetion of harakats,or tanwin befor alef
# araby.arabicrange() # Return a list of Arabic characters

# New Functions in PyArabic
# araby.vocalized_similarity(word1, word2) # if the two words has the same letters and the same harakats, this function return True. 
# The two words can be full vocalized, or partial vocalized

# araby.vocalizedlike(word1, word2) Same as vocalized_similarity but return True and False

# araby.joint(word1, word2) # joint the letters with the marks the length ot letters and marks must be equal return word



# Return the text, its tashkeel and shadda if extract_shadda is True
# text, marks, shada = araby.separate(text,extract_shadda=True) # Separate diacritics from the text
# print (text)
# for m in marks:
#     print (araby.name(m))

# for s in shada:
#     print (araby.name(s))

In [80]:
# read the tokenized input file
with open('./generatedFiles/tokenized_input.txt', 'r', encoding='utf-8') as file:
    tokenized_input = file.readlines()
    print(len(tokenized_input))
    # Remove '\n' from each line
    tokenized_input = [line.strip() for line in tokenized_input]
    # Put the tokenized input of length 1 in tokenized_input list 
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]

print(tokenized_input[:10])



2104308
['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']


In [81]:
print(len(tokenized_input))

2104308


# **Core Word (CW) Diacritization**

## **Feature Extraction**

### 1 - Characters: 
Here we extract each character from all tokenized words and create a vector of size 50 for each character.

In [82]:
tokenizer_char = Tokenizer(char_level=True)
tokenizer_char.fit_on_texts(tokenized_input)
sequences_char = tokenizer_char.texts_to_sequences(tokenized_input)
char_features = pad_sequences(sequences_char)   # padding the sequences to have the same length as the longest sequence (word)
char_embeddings = np.random.rand(len(tokenizer_char.word_index) + 1, 50)

In [83]:
print(char_features.shape) # (number of words, max length of word in the dataset)


print(char_embeddings.shape)

# 38 rows: 37 unique characters identified by the tokenizer, 1 row for handling characters not seen in the training data
# 50 columns: Each character is encoded as a 50-dimensional vector

(2104308, 13)
(38, 50)


In [84]:
print(char_features[0]) 
# the number of non zero elements corresponds to the length of the word 
# and the value of each element corresponds to the index of the character in the tokenizer
# which means that every character now is encoded as a number and this number is the index of the character in the tokenizer

[ 0  0  0  0  0  0  0  0  0 13  5  1  7]


In [85]:
print(char_embeddings[0])
# this is the embedding of each character in the tokenizer

[0.87308275 0.55668268 0.52625606 0.32360008 0.44546368 0.76296881
 0.7079179  0.92835719 0.44056443 0.85968953 0.63172786 0.93754451
 0.01035784 0.9892755  0.24747822 0.02079794 0.19932182 0.03765827
 0.62777767 0.42783875 0.00906027 0.80896086 0.74178992 0.89335734
 0.88450038 0.71197471 0.84951462 0.38654383 0.73169906 0.11028904
 0.34822191 0.36890854 0.46469679 0.00348348 0.31818113 0.12862515
 0.47998968 0.08691513 0.86585093 0.35933273 0.28516861 0.03217353
 0.72911621 0.6652026  0.20728891 0.74144347 0.29083527 0.88022849
 0.01210624 0.56578011]


In [86]:
print(char_embeddings[char_features[0]].shape)
# 13 is the word of characters and 50 is the embedding size of each character

print(char_embeddings[char_features[0]])
# this is the embedding of each character in the first tokenized word, this is the 1st feature and the input of the model

(13, 50)
[[0.87308275 0.55668268 0.52625606 0.32360008 0.44546368 0.76296881
  0.7079179  0.92835719 0.44056443 0.85968953 0.63172786 0.93754451
  0.01035784 0.9892755  0.24747822 0.02079794 0.19932182 0.03765827
  0.62777767 0.42783875 0.00906027 0.80896086 0.74178992 0.89335734
  0.88450038 0.71197471 0.84951462 0.38654383 0.73169906 0.11028904
  0.34822191 0.36890854 0.46469679 0.00348348 0.31818113 0.12862515
  0.47998968 0.08691513 0.86585093 0.35933273 0.28516861 0.03217353
  0.72911621 0.6652026  0.20728891 0.74144347 0.29083527 0.88022849
  0.01210624 0.56578011]
 [0.87308275 0.55668268 0.52625606 0.32360008 0.44546368 0.76296881
  0.7079179  0.92835719 0.44056443 0.85968953 0.63172786 0.93754451
  0.01035784 0.9892755  0.24747822 0.02079794 0.19932182 0.03765827
  0.62777767 0.42783875 0.00906027 0.80896086 0.74178992 0.89335734
  0.88450038 0.71197471 0.84951462 0.38654383 0.73169906 0.11028904
  0.34822191 0.36890854 0.46469679 0.00348348 0.31818113 0.12862515
  0.47998968 0

### 2 - The position of the character in a word segment:
For example, given the word “wAlktAb” , which is composed of three segments “w+Al+ktAb”. Letters were marked as “B” if they begin a segment, “M” if they are in the middle of a segment, “E” if they end a segment, and “S” if they are single letter segments. So for “w+Al+ktAb”, the corresponding character positions are “S+BE+BMME.”

In [87]:
segmenter = FarasaSegmenter(interactive=True) # The default behaviour



In [88]:
def get_seg_tags(word):                 # word = "wAlktAb"
    segments = segmenter.segment(word)  # segments will be a list: ["w", "Al", "ktAb"]
    segments = segments.split('+')
    seg_tags = []
    for segment in segments:
        if len(segment) == 1:
            seg_tags.append("S")
        else:
            seg_tags.append("B")  # First letter
            seg_tags.extend("M" * (len(segment) - 2))  # Middle letters
            seg_tags.append("E")  # Last letter
    return segments, seg_tags

word = "كقلمه"
segments, seg_tags = get_seg_tags(word)
print("Segmented word:", segments)
print("SEG tags:", seg_tags)


Segmented word: ['ك', 'قلم', 'ه']
SEG tags: ['S', 'B', 'M', 'E', 'S']


In [89]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 25m 4.5s TO RUN
# # The Output of this code is the input_segments.txt file

# for i in range(len(tokenized_input)):
#     segments, seg_tags = get_seg_tags(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/input_segments.txt', 'a', encoding='utf-8') as file:
#         for tag in seg_tags:
#             file.write(tag)
#         file.write('\n')

In [90]:
# Read the input_segments file
with open('./generatedFiles/input_segments.txt', 'r', encoding='utf-8') as file:
    input_segments = file.readlines()
    print(len(input_segments))
    # Remove '\n' from each line
    input_segments = [line.strip() for line in input_segments]
    # Put the tokenized input of length 1 in the tokenized_input list
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]

print(input_segments[:10])

2104308
['BMES', 'BE', 'BME', 'BEBME', 'BES', 'BME', 'BME', 'BEBMMME', 'BME', 'BMES']


In [91]:
tokenizer_tags = Tokenizer(char_level=True)
tokenizer_tags.fit_on_texts(input_segments)
sequences_tags = tokenizer_tags.texts_to_sequences(input_segments)
tags_features = pad_sequences(sequences_tags)   
tags_embeddings = np.random.rand(len(tokenizer_tags.word_index) + 1, 50)

In [92]:
print(tags_features.shape) 
print(tags_embeddings.shape)

(2104308, 13)
(5, 50)


### 3 - PRIOR: 
diacritics seen in the training set per segment. Since we used a character-level model, this feature informed the model with word-level information. For example, the word “ktAb”  was observed to have two diacritized forms in the training set, namely “kitaAb” ( – book) and “kut∼aAb” ( – writers). The first letter in the word (“k”) accepted the diacritics “i” and “u.” Thus, given a binary vector representing whether a character is allowed to assume any of the eight primitive Arabic diacritic marks (a, i, u, o, K, N, F, and ∼ in order), the first letter would be given the following vector “01100000.” If a word segment was never observed during training, then the vector for all letters therein would be set to 11111111.

In [93]:
# read the gold_output file
with open('./generatedFiles/gold_output.txt', 'r', encoding='utf-8') as file:
    gold_output = file.readlines()
    print(len(gold_output))
    # remove '\n' from each line
    gold_output = [line.strip() for line in gold_output]
    # put in tokenized_input list the tokenized input of length 1
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]
    # get the inde

print(gold_output[:10])

2104308
['قَوْلُهُ', 'أَوْ', 'قَطَعَ', 'الْأَوَّلُ', 'يَدَهُ', 'إلَخْ', 'قَالَ', 'الزَّرْكَشِيُّ', 'ابْنُ', 'عَرَفَةَ']


In [94]:
print(len(tokenized_input))
print(tokenized_input[:10])

2104308
['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']


In [95]:
# Map each diacrtics to its unicode
diacritics_mapping = {
    'FATHA': '\u064E',
    'DAMMA': '\u064F',
    'KASRA': '\u0650',
    'SHADDA': '\u0651',
    'SUKUN': '\u0652',
    'FATHATAN': '\u064B',
    'DAMMATAN': '\u064C',
    'KASRATAN': '\u064D'
}

In [96]:
# # Extract diacritics by returning a list containing a tuple of 3 elements: (letter, tashkeel, shadda)
# def extract_arabic_diacritics(word):
#     diacritics_list = []
#     extracted_word, tashkeel, shadda = araby.separate(word, extract_shadda=True)
#     for i in range(len(extracted_word)):
#         print(f'{araby.name(extracted_word[i])} {araby.name(tashkeel[i])} {araby.name(shadda[i])}')
#         diacritics_list.append((extracted_word[i], (tashkeel[i].encode("utf8")).decode(), (shadda[i].encode("utf8")).decode()))
#     return diacritics_list

In [97]:
# # firstly, initialize an empty dictionary for the 'prior' feature
# # the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA) as a binary vector

# # then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
# def get_prior(tokenized_input, gold_output):
#     prior = {} # this dictionary will hold a key of tuple of 3 elements (word, character, index of character in the word) and the value will be the 8 arabic marks
#     for i in range(len(tokenized_input)):
#         if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
#             # get the indices of the word in the tokenized input array
#             indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
#             print(indices)
#             # get the words in the gold_output array with the same indices
#             words = [gold_output[j] for j in indices]
#             extracted_diac_all_words = []
#             for word in words:
#                 extracted_diac_all_words.append(extract_arabic_diacritics(word))
#             for indx, charac in enumerate(tokenized_input[i]):
#                 for extracted_diac_per_word in extracted_diac_all_words:
#                     # extract the diacritics of word[indx]
#                     prior[(tokenized_input[i], charac, indx)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
#                     if diacritics_mapping['SHADDA'] in extracted_diac_per_word[indx]:
#                         prior[(tokenized_input[i], charac, indx)][4] = 1 if diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][5] = 1 if diacritics_mapping['DAMMA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][6] = 1 if diacritics_mapping['KASRA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][7] = 1 if not  diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] and not diacritics_mapping['DAMMA'] in word[indx: indx+2]  and not diacritics_mapping['KASRA'] in word[indx: indx+2] else 0
#                     else:
#                         prior[(tokenized_input[i], charac,indx)][0] = 1 if diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][1] = 1 if diacritics_mapping['DAMMA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][2] = 1 if diacritics_mapping['KASRA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][3] = 1 if diacritics_mapping['SUKUN'] in extracted_diac_per_word[indx] else 0
#     return prior

In [98]:
letter, tashkeel, shadda = araby.separate('زَّ', extract_shadda=True)   # SHADDA + FATHA Example
# letter, tashkeel, shadda = araby.separate('وَ', extract_shadda=True)   # FATHA Example
# letter, tashkeel, shadda = araby.separate('مً', extract_shadda=True)   # FATHATAN Example
# letter, tashkeel, shadda = araby.separate('عٌ', extract_shadda=True)   # DAMMATAN Example
# letter, tashkeel, shadda = araby.separate('يُّ', extract_shadda=True)   # SHADDA + DAMMA Example
# letter, tashkeel, shadda = araby.separate('ذْ', extract_shadda=True)   # SUKUN Example
enkar = 'كَإِنْكَارِ'
# print(enkar[4:6])
# print( diacritics_mapping['FATHA'] in enkar[0:1])
# print( diacritics_mapping['SHADDA'] in 'زَّ')
# print( diacritics_mapping['DAMMA'] in 'زَّ')

print('FATHA in tashkeel: ', diacritics_mapping['FATHA'] in tashkeel)
print('DAMMA in tashkeel: ', diacritics_mapping['DAMMA'] in tashkeel)
print('KASRA in tashkeel: ', diacritics_mapping['KASRA'] in tashkeel)
print('SUKUN in tashkeel: ', diacritics_mapping['SUKUN'] in tashkeel)
print('FATHATAN in tashkeel: ', diacritics_mapping['FATHATAN'] in tashkeel)
print('DAMMATAN in tashkeel: ', diacritics_mapping['DAMMATAN'] in tashkeel)
print('KASRATAN in tashkeel: ', diacritics_mapping['KASRATAN'] in tashkeel)
print('SHADDA in tashkeel: ', diacritics_mapping['SHADDA'] in tashkeel)
print('=============================')
print('FATHA in shadda: ', diacritics_mapping['FATHA'] in shadda)
print('DAMMA in shadda: ', diacritics_mapping['DAMMA'] in shadda)
print('KASRA in shadda: ', diacritics_mapping['KASRA'] in shadda)
print('SUKUN in shadda: ', diacritics_mapping['SUKUN'] in shadda)
print('FATHATAN in shadda: ', diacritics_mapping['FATHATAN'] in shadda)
print('DAMMATAN in shadda: ', diacritics_mapping['DAMMATAN'] in shadda)
print('KASRATAN in shadda: ', diacritics_mapping['KASRATAN'] in shadda)
print('SHADDA in shadda: ', diacritics_mapping['SHADDA'] in shadda)

print('testt', (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['FATHA'] not in tashkeel and diacritics_mapping['DAMMA'] not in tashkeel and diacritics_mapping['KASRA'] not in tashkeel))
print('yarab', (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda))

FATHA in tashkeel:  True
DAMMA in tashkeel:  False
KASRA in tashkeel:  False
SUKUN in tashkeel:  True
FATHATAN in tashkeel:  False
DAMMATAN in tashkeel:  False
KASRATAN in tashkeel:  False
SHADDA in tashkeel:  False
FATHA in shadda:  False
DAMMA in shadda:  False
KASRA in shadda:  False
SUKUN in shadda:  False
FATHATAN in shadda:  False
DAMMATAN in shadda:  False
KASRATAN in shadda:  False
SHADDA in shadda:  True
testt False
yarab False


In [106]:
# firstly, initialize an empty dictionary for the 'prior' feature
# the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA) as a binary vector

# then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
def get_prior(tokenized_input, gold_output):
    prior = {}  # this dictionary will hold a key of tuple of 3 elements (word, character, index of character in the word) and the value will be the 8 arabic marks
    for i in range(len(tokenized_input)):
        if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
            # get the indices of the word in the tokenized input array
            indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
            # print(indices)
            # get the words in the gold_output array with the same indices
            words = []
            maxi_len = 0
            for j in indices:
                if gold_output[j] not in words:
                    words.append(gold_output[j])
                    maxi_len = max(maxi_len, len(gold_output[j]))

            for t in range(len(tokenized_input[i])):
                prior[(tokenized_input[i], tokenized_input[i][t], t)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
            
            indx2 = 0
            for word in words:
                indx = 0
                while indx < maxi_len:
                    # extract the diacritics of word[indx]
                    for iter in range(indx+1, len(word)):
                        if is_not_arabic_diacritic(word[iter]):
                            # print(iter)
                            letter, tashkeel, shadda = araby.separate(word[indx: iter], extract_shadda=True) 
                            if diacritics_mapping['FATHA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][0] = 1 
                            if diacritics_mapping['DAMMA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][1] = 1
                            if diacritics_mapping['KASRA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][2] = 1
                            if diacritics_mapping['FATHATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][3] = 1
                            if diacritics_mapping['DAMMATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][4] = 1
                            if diacritics_mapping['KASRATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][5] = 1
                            if (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda):  
                                prior[(tokenized_input[i], word[indx], indx2)][6] = 1 # if the letter has SHADDA, araby.separate() will return SUKUN in tashkeel and SHADDA in shadda, so to avoid this mislabeling we check if SHADDA not in shadda and if SUKUN in tashkeel, then this is a true SUKUN
                            if diacritics_mapping['SHADDA'] in shadda:          prior[(tokenized_input[i], word[indx], indx2)][7] = 1
                            indx = iter - 1
                            indx2 += 1
                            break 
                    indx += 1
                indx2 = 0


                indx = len(word) - 1    # my assumption is that the last character in the not a diacritic
                if (not is_not_arabic_diacritic(word[len(word) - 1]) and is_not_arabic_diacritic(word[len(word) - 2])):  # if the last character is a diacritic and the one before it is not, then the index of the last character is len(word) - 2
                    indx = len(word) - 2
                elif (not is_not_arabic_diacritic(word[len(word) - 1]) and not is_not_arabic_diacritic(word[len(word) - 2])):  # if the last character is a diacritic and the one before it is also a diacritic (in shadda case), then the index of the last character is len(word) - 3
                    indx = len(word) - 3


                if (tokenized_input[i], word[indx], indx) not in prior:
                    letter, tashkeel, shadda = araby.separate(word[indx: len(word)], extract_shadda=True) 
                    if diacritics_mapping['FATHA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][0] = 1
                    if diacritics_mapping['DAMMA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][1] = 1
                    if diacritics_mapping['KASRA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][2] = 1 
                    if diacritics_mapping['FATHATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][3] = 1 
                    if diacritics_mapping['DAMMATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][4] = 1 
                    if diacritics_mapping['KASRATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][5] = 1
                    if (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda):
                        prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][6] = 1  # if the letter has SHADDA, araby.separate() will return SUKUN in tashkeel and SHADDA in shadda, so to avoid this mislabeling we check if SHADDA not in shadda and if SUKUN in tashkeel, then this is a true SUKUN
                    if diacritics_mapping['SHADDA'] in shadda:          prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][7] = 1
                    
    return prior

In [100]:
test_tokenized_input = ['كإنكار', 'كإنكار', 'بقذر','بقذر', 'أكثر', 'أكثر', 'الزركشي']
test_gold_output = ['كَإِنْكَارِ','كَإِنْكَارٍ', 'بِقَذَر', 'بِقَذَرٍ','أكْثَرَ', 'أَكْثَرُ', 'الزَّرْكَشِيُّ']
print (get_prior(test_tokenized_input, test_gold_output))

{('كإنكار', 'ك', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'إ', 1): [0, 0, 1, 0, 0, 0, 0, 0], ('كإنكار', 'ن', 2): [0, 0, 0, 0, 0, 0, 1, 0], ('كإنكار', 'ك', 3): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ا', 4): [0, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ر', 5): [0, 0, 1, 0, 0, 1, 0, 0], ('بقذر', 'ب', 0): [0, 0, 1, 0, 0, 0, 0, 0], ('بقذر', 'ق', 1): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ذ', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ر', 3): [0, 0, 0, 0, 0, 1, 0, 0], ('أكثر', 'أ', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ك', 1): [0, 0, 0, 0, 0, 0, 1, 0], ('أكثر', 'ث', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ر', 3): [1, 1, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ا', 0): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ل', 1): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ز', 2): [1, 0, 0, 0, 0, 0, 0, 1], ('الزركشي', 'ر', 3): [0, 0, 0, 0, 0, 0, 1, 0], ('الزركشي', 'ك', 4): [1, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ش', 5): [0, 0, 1, 0, 0, 0, 0, 0], ('الزركشي', 'ي', 6): [0, 1, 0, 0, 0, 0, 0, 1]}


In [107]:
# write in a file the prior feature
prior_feature = get_prior(tokenized_input, gold_output)
with open('./generatedFiles/prior_feature.txt', 'w', encoding='utf-8') as file:
    for key, value in prior_feature.items():
        file.write(f'{key}: {value}\n')

In [104]:
# read the prior feature file in a dictionary called prior_feature 
prior_feature = {}
with open('./generatedFiles/prior_feature.txt', 'r', encoding='utf-8') as file:
    for line in file:
        key, value = line.strip().split(':')
        key = key.strip()
        value = value.strip()
        key = key[1:-1].split(',')
        value = value[1:-1].split(',')
        key = (key[0][1:-1], key[1][2:-1], int(key[2]))
        value = [int(i) for i in value]
        prior_feature[key] = value


print(prior_feature)

{('كإنكار', 'ك', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'إ', 1): [0, 0, 1, 0, 0, 0, 0, 0], ('كإنكار', 'ن', 2): [0, 0, 0, 0, 0, 0, 1, 0], ('كإنكار', 'ك', 3): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ا', 4): [0, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ر', 5): [0, 0, 1, 0, 0, 1, 0, 0], ('بقذر', 'ب', 0): [0, 0, 1, 0, 0, 0, 0, 0], ('بقذر', 'ق', 1): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ذ', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ر', 3): [0, 0, 0, 0, 0, 1, 0, 0], ('أكثر', 'أ', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ك', 1): [0, 0, 0, 0, 0, 0, 1, 0], ('أكثر', 'ث', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ر', 3): [1, 1, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ا', 0): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ل', 1): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ز', 2): [1, 0, 0, 0, 0, 0, 0, 1], ('الزركشي', 'ر', 3): [0, 0, 0, 0, 0, 0, 1, 0], ('الزركشي', 'ك', 4): [1, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ش', 5): [0, 0, 1, 0, 0, 0, 0, 0], ('الزركشي', 'ي', 6): [0, 1, 0, 0, 0, 0, 0, 1]}
