In [1]:
import pickle
import numpy as np
import pandas as pd
import re
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import farasa
from farasa.segmenter import FarasaSegmenter 
import unicodedata
import torch




In [2]:
# # Run on GPU
# use_cuda = torch.cuda.is_available()
# device = torch.device("cuda" if use_cuda else "cpu")
# print (device)
# # print the cpu or gpu
# print(torch.cuda.get_device_name(0))
# # print the number of gpus you have
# print(torch.cuda.device_count())
# # print current gpu
# print(torch.cuda.current_device())

In [4]:
def read_data(file_path):
    """
    Read the contents of the file located at file_path 
    and append each line to the list data
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()

        # remove '\n' from each line
        data = [line.strip() for line in data]
    return data


In [15]:
data_before_preprocessing = read_data("./dataset/train.txt")
print(len(data_before_preprocessing))

50000


In [7]:
def read_pickle_file(file_path):
    """
    Read the contents of the pickle file located at file_path 
    and append each line to the list data
    """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

In [22]:
def save_words_in_file(path, words, permission='w'):
    """
    Save the words in the file located at path 
    """
    with open(path, permission, encoding='utf-8') as file:
            for word in words:
                file.write(word + '\n')

In [29]:
# set for arabic letters
arabic_letters = set(read_pickle_file("./Delivery/arabic_letters.pickle"))

print(len(arabic_letters))
print(arabic_letters)

36
{'ج', 'ؤ', 'ح', 'ة', 'ذ', 'ر', 'ش', 'ض', 'ف', 'غ', 'ب', 'س', 'ث', 'ظ', 'ئ', 'ت', 'ع', 'م', 'آ', 'ا', 'ص', 'ط', 'أ', 'ن', 'ل', 'ز', 'و', 'د', 'ق', 'ء', 'ى', 'ي', 'خ', 'ك', 'ه', 'إ'}


In [30]:
# set for arabic letters
diacritics = set(read_pickle_file("./Delivery/diacritics.pickle"))

print(len(diacritics))
print(diacritics)

8
{'ِ', 'ٌ', 'ُ', 'ً', 'ْ', 'ّ', 'َ', 'ٍ'}


In [31]:
# Remove diacritics
def remove_diacritics(text):
    text = araby.strip_tashkeel(text)
    return text

# Remove any letters not found in set arabic_letters and not found in set diacritics
def remove_non_arabic(text):
    text = re.sub(r'[^\s' + ''.join(arabic_letters) + ''.join(diacritics) + ']', '', text)
    return text

def input_preprocessing_text(text):
    # Correct most common errors on word like repetetion of harakats, or tanween before alef
    text = araby.autocorrect(text)

    # Remove any non-Arabic letters
    text = remove_non_arabic(text)

    # Remove diacritics
    text = remove_diacritics(text)

    # Tokenize
    text = araby.tokenize(text)

    return text

def save_tokenized_input(text,path="./generatedFiles/training/tokenized_input.txt", permission='w'):
    words = input_preprocessing_text(text)
    save_words_in_file(path, words, permission)
    

def save_gold_output(text,path="./generatedFiles/training/gold_output.txt", permission='w'):
    # Remove any non-Arabic letters and extra spaces
    text = remove_non_arabic(text)

    # Tokenize
    text = araby.tokenize(text)

    save_words_in_file(path, text, permission)


def is_not_arabic_diacritic(char):
   category = unicodedata.category(char)
   return not (category == 'Mn' or category == 'Mc')

In [12]:
# Example usage:
character = 'ذْ'
if is_not_arabic_diacritic(character[1]):
   print("The character is not an Arabic diacritic.")
else:
   print("The character is an Arabic diacritic.")


# Testing of is_not_arabic_diacritic() function with gettting the index of the first non diacritic character in the word
word = 'زَّراع'
 
for i in range(1, len(word)): # start from 1 because the first character is not a diacritic
    if is_not_arabic_diacritic(word[i]):
        print(i)
        break 

The character is an Arabic diacritic.
3


In [32]:
# # RUN ONE TIME ONLY THIS CODE AGAIN 
# # Generate Gold Input file
# for i in range(len(data_before_preprocessing)):
#     save_tokenized_input(data_before_preprocessing[i], permission='a')

In [33]:
# # RUN ONE TIME ONLY THIS CODE AGAIN
# # Generate Gold Output file
# for i in range(len(data_before_preprocessing)):
#     save_gold_output(data_before_preprocessing[i], permission='a')

In [25]:
# Important functions in PyArabic

# araby.tokenize(text) # Tokenize the sentence text into words
# araby.is_arabicrange(text) # Check if the text is Arabic
# araby.sentence_tokenize(text) # Tokenize the text into sentences
# araby.strip_tashkeel(text) # Remove diacritics (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN)
# araby.strip_diacritics(text) # Remove diacritics (Small Alef الألف الخنجرية, Harakat + Shadda, Quranic marks)
# araby.strip_tatweel(text) # Remove tatweel
# araby.strip_shadda(text) # Remove shadda
# araby.autocorrect(text) # Correct most common errors on word like repetetion of harakats,or tanwin befor alef
# araby.arabicrange() # Return a list of Arabic characters

# New Functions in PyArabic
# araby.vocalized_similarity(word1, word2) # if the two words has the same letters and the same harakats, this function return True. 
# The two words can be full vocalized, or partial vocalized

# araby.vocalizedlike(word1, word2) Same as vocalized_similarity but return True and False

# araby.joint(word1, word2) # joint the letters with the marks the length ot letters and marks must be equal return word



# Return the text, its tashkeel and shadda if extract_shadda is True
# text, marks, shada = araby.separate(text,extract_shadda=True) # Separate diacritics from the text
# print (text)
# for m in marks:
#     print (araby.name(m))

# for s in shada:
#     print (araby.name(s))

In [37]:
# Read tokenized_input file
tokenized_input = read_data("./generatedFiles/training/tokenized_input.txt")
print(tokenized_input[:10])
print(len(tokenized_input))

['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']
2101983


# **Core Word (CW) Diacritization**

## **Feature Extraction**

### 1 - Characters: 
Here we extract each character from all tokenized words and create a vector of size 50 for each character.

In [60]:
embedding_size = 40

In [61]:
tokenizer_char = Tokenizer(char_level=True)
tokenizer_char.fit_on_texts(tokenized_input)
sequences_char = tokenizer_char.texts_to_sequences(tokenized_input)
char_features = pad_sequences(sequences_char)   # padding the sequences to have the same length as the longest sequence (word)
char_embeddings = np.random.rand(len(tokenizer_char.word_index) + 1, embedding_size)

In [62]:
print(char_features.shape) # (number of words, max length of word in the dataset)


print(char_embeddings.shape)

# 38 rows: 37 unique characters identified by the tokenizer, 1 row for handling characters not seen in the training data
# 50 columns: Each character is encoded as a 50-dimensional vector

(2101983, 13)
(37, 40)


In [63]:
print(char_features[0]) 
# the number of non zero elements corresponds to the length of the word 
# and the value of each element corresponds to the index of the character in the tokenizer
# which means that every character now is encoded as a number and this number is the index of the character in the tokenizer

[ 0  0  0  0  0  0  0  0  0 13  5  1  7]


In [64]:
print(char_embeddings[0])
# this is the embedding of each character in the tokenizer

[0.0376407  0.74802505 0.93254551 0.48121817 0.84841083 0.43022197
 0.22941891 0.76855345 0.01788595 0.951768   0.71797562 0.77546778
 0.31634662 0.36002511 0.59748075 0.33367336 0.49021447 0.46607053
 0.3086154  0.00277977 0.01419312 0.77322404 0.16897078 0.89678494
 0.88448913 0.14431937 0.52945295 0.35772365 0.50441221 0.85165924
 0.26063738 0.21964733 0.08728334 0.37856344 0.29654234 0.63440095
 0.28947857 0.5985842  0.19602107 0.43650657]


In [65]:
print(char_embeddings[char_features[0]].shape)
# 13 is the word of characters and 50 is the embedding size of each character

print(char_embeddings[char_features[0]])
# this is the embedding of each character in the first tokenized word, this is the 1st feature and the input of the model

(13, 40)
[[0.0376407  0.74802505 0.93254551 0.48121817 0.84841083 0.43022197
  0.22941891 0.76855345 0.01788595 0.951768   0.71797562 0.77546778
  0.31634662 0.36002511 0.59748075 0.33367336 0.49021447 0.46607053
  0.3086154  0.00277977 0.01419312 0.77322404 0.16897078 0.89678494
  0.88448913 0.14431937 0.52945295 0.35772365 0.50441221 0.85165924
  0.26063738 0.21964733 0.08728334 0.37856344 0.29654234 0.63440095
  0.28947857 0.5985842  0.19602107 0.43650657]
 [0.0376407  0.74802505 0.93254551 0.48121817 0.84841083 0.43022197
  0.22941891 0.76855345 0.01788595 0.951768   0.71797562 0.77546778
  0.31634662 0.36002511 0.59748075 0.33367336 0.49021447 0.46607053
  0.3086154  0.00277977 0.01419312 0.77322404 0.16897078 0.89678494
  0.88448913 0.14431937 0.52945295 0.35772365 0.50441221 0.85165924
  0.26063738 0.21964733 0.08728334 0.37856344 0.29654234 0.63440095
  0.28947857 0.5985842  0.19602107 0.43650657]
 [0.0376407  0.74802505 0.93254551 0.48121817 0.84841083 0.43022197
  0.22941891 

### 2 - The position of the character in a word segment:
For example, given the word “wAlktAb” , which is composed of three segments “w+Al+ktAb”. Letters were marked as “B” if they begin a segment, “M” if they are in the middle of a segment, “E” if they end a segment, and “S” if they are single letter segments. So for “w+Al+ktAb”, the corresponding character positions are “S+BE+BMME.”

In [66]:
segmenter = FarasaSegmenter(interactive=True) # The default behaviour



In [67]:
def get_seg_tags(word):                 # word = "wAlktAb"
    segments = segmenter.segment(word)  # segments will be a list: ["w", "Al", "ktAb"]
    segments = segments.split('+')
    seg_tags = []
    for segment in segments:
        if len(segment) == 1:
            seg_tags.append("S")
        else:
            seg_tags.append("B")  # First letter
            seg_tags.extend("M" * (len(segment) - 2))  # Middle letters
            seg_tags.append("E")  # Last letter
    return segments, seg_tags

word = "كقلمه"
segments, seg_tags = get_seg_tags(word)
print("Segmented word:", segments)
print("SEG tags:", seg_tags)


Segmented word: ['ك', 'قلم', 'ه']
SEG tags: ['S', 'B', 'M', 'E', 'S']


In [45]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 25m 4.5s TO RUN
# # The Output of this code is the input_segments.txt file

# for i in range(len(tokenized_input)):
#     segments, seg_tags = get_seg_tags(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/training/input_segments.txt', 'a', encoding='utf-8') as file:
#         for tag in seg_tags:
#             file.write(tag)
#         file.write('\n')

In [68]:
input_segments = read_data("./generatedFiles/training/input_segments.txt")
print(len(input_segments))
print(input_segments[:10])

2101983
['BMES', 'BE', 'BME', 'BEBME', 'BES', 'BME', 'BME', 'BEBMMME', 'BME', 'BMES']


In [69]:
tokenizer_tags = Tokenizer(char_level=True)
tokenizer_tags.fit_on_texts(input_segments)
sequences_tags = tokenizer_tags.texts_to_sequences(input_segments)
tags_features = pad_sequences(sequences_tags)   
tags_embeddings = np.random.rand(len(tokenizer_tags.word_index) + 1, embedding_size)

In [70]:
print(tags_features.shape) 
print(tags_embeddings.shape)
tags_features[0]

(2101983, 13)
(5, 40)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 2, 4])

In [71]:
tags_embeddings[tags_features[0][0]]

array([0.80563705, 0.22085559, 0.39391021, 0.80762699, 0.65970431,
       0.95923876, 0.73655634, 0.66568013, 0.93521805, 0.24291673,
       0.94681661, 0.3884334 , 0.20181407, 0.76158965, 0.6947282 ,
       0.59655757, 0.18586715, 0.37187113, 0.54049574, 0.66541688,
       0.74685704, 0.65370439, 0.52893353, 0.93585392, 0.06437579,
       0.58098025, 0.85920853, 0.18683659, 0.95713172, 0.73926071,
       0.10130143, 0.17342781, 0.43268227, 0.86831371, 0.68545115,
       0.50133654, 0.4366274 , 0.56874657, 0.61716146, 0.81716216])

### 3 - PRIOR: 
diacritics seen in the training set per segment. Since we used a character-level model, this feature informed the model with word-level information. For example, the word “ktAb”  was observed to have two diacritized forms in the training set, namely “kitaAb” ( – book) and “kut∼aAb” ( – writers). The first letter in the word (“k”) accepted the diacritics “i” and “u.” Thus, given a binary vector representing whether a character is allowed to assume any of the eight primitive Arabic diacritic marks (a, i, u, o, K, N, F, and ∼ in order), the first letter would be given the following vector “01100000.” If a word segment was never observed during training, then the vector for all letters therein would be set to 11111111.

In [51]:
gold_output = read_data("./generatedFiles/training/gold_output.txt")
print(len(gold_output))
print(gold_output[:10])

2101983
['قَوْلُهُ', 'أَوْ', 'قَطَعَ', 'الْأَوَّلُ', 'يَدَهُ', 'إلَخْ', 'قَالَ', 'الزَّرْكَشِيُّ', 'ابْنُ', 'عَرَفَةَ']


In [52]:
# Map each diacrtics to its unicode
diacritics_mapping = {
    'FATHA': '\u064E',
    'DAMMA': '\u064F',
    'KASRA': '\u0650',
    'SHADDA': '\u0651',
    'SUKUN': '\u0652',
    'FATHATAN': '\u064B',
    'DAMMATAN': '\u064C',
    'KASRATAN': '\u064D'
}

In [118]:
# # Extract diacritics by returning a list containing a tuple of 3 elements: (letter, tashkeel, shadda)
# def extract_arabic_diacritics(word):
#     diacritics_list = []
#     extracted_word, tashkeel, shadda = araby.separate(word, extract_shadda=True)
#     for i in range(len(extracted_word)):
#         print(f'{araby.name(extracted_word[i])} {araby.name(tashkeel[i])} {araby.name(shadda[i])}')
#         diacritics_list.append((extracted_word[i], (tashkeel[i].encode("utf8")).decode(), (shadda[i].encode("utf8")).decode()))
#     return diacritics_list

In [119]:
# # firstly, initialize an empty dictionary for the 'prior' feature
# # the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA) as a binary vector

# # then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
# def get_prior(tokenized_input, gold_output):
#     prior = {} # this dictionary will hold a key of tuple of 3 elements (word, character, index of character in the word) and the value will be the 8 arabic marks
#     for i in range(len(tokenized_input)):
#         if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
#             # get the indices of the word in the tokenized input array
#             indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
#             print(indices)
#             # get the words in the gold_output array with the same indices
#             words = [gold_output[j] for j in indices]
#             extracted_diac_all_words = []
#             for word in words:
#                 extracted_diac_all_words.append(extract_arabic_diacritics(word))
#             for indx, charac in enumerate(tokenized_input[i]):
#                 for extracted_diac_per_word in extracted_diac_all_words:
#                     # extract the diacritics of word[indx]
#                     prior[(tokenized_input[i], charac, indx)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
#                     if diacritics_mapping['SHADDA'] in extracted_diac_per_word[indx]:
#                         prior[(tokenized_input[i], charac, indx)][4] = 1 if diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][5] = 1 if diacritics_mapping['DAMMA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][6] = 1 if diacritics_mapping['KASRA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][7] = 1 if not  diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] and not diacritics_mapping['DAMMA'] in word[indx: indx+2]  and not diacritics_mapping['KASRA'] in word[indx: indx+2] else 0
#                     else:
#                         prior[(tokenized_input[i], charac,indx)][0] = 1 if diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][1] = 1 if diacritics_mapping['DAMMA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][2] = 1 if diacritics_mapping['KASRA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][3] = 1 if diacritics_mapping['SUKUN'] in extracted_diac_per_word[indx] else 0
#     return prior

In [120]:
letter, tashkeel, shadda = araby.separate('زَّ', extract_shadda=True)   # SHADDA + FATHA Example
# letter, tashkeel, shadda = araby.separate('وَ', extract_shadda=True)   # FATHA Example
# letter, tashkeel, shadda = araby.separate('مً', extract_shadda=True)   # FATHATAN Example
# letter, tashkeel, shadda = araby.separate('عٌ', extract_shadda=True)   # DAMMATAN Example
# letter, tashkeel, shadda = araby.separate('يُّ', extract_shadda=True)   # SHADDA + DAMMA Example
# letter, tashkeel, shadda = araby.separate('ذْ', extract_shadda=True)   # SUKUN Example
enkar = 'كَإِنْكَارِ'
# print(enkar[4:6])
# print( diacritics_mapping['FATHA'] in enkar[0:1])
# print( diacritics_mapping['SHADDA'] in 'زَّ')
# print( diacritics_mapping['DAMMA'] in 'زَّ')

print('FATHA in tashkeel: ', diacritics_mapping['FATHA'] in tashkeel)
print('DAMMA in tashkeel: ', diacritics_mapping['DAMMA'] in tashkeel)
print('KASRA in tashkeel: ', diacritics_mapping['KASRA'] in tashkeel)
print('SUKUN in tashkeel: ', diacritics_mapping['SUKUN'] in tashkeel)
print('FATHATAN in tashkeel: ', diacritics_mapping['FATHATAN'] in tashkeel)
print('DAMMATAN in tashkeel: ', diacritics_mapping['DAMMATAN'] in tashkeel)
print('KASRATAN in tashkeel: ', diacritics_mapping['KASRATAN'] in tashkeel)
print('SHADDA in tashkeel: ', diacritics_mapping['SHADDA'] in tashkeel)
print('=============================')
print('FATHA in shadda: ', diacritics_mapping['FATHA'] in shadda)
print('DAMMA in shadda: ', diacritics_mapping['DAMMA'] in shadda)
print('KASRA in shadda: ', diacritics_mapping['KASRA'] in shadda)
print('SUKUN in shadda: ', diacritics_mapping['SUKUN'] in shadda)
print('FATHATAN in shadda: ', diacritics_mapping['FATHATAN'] in shadda)
print('DAMMATAN in shadda: ', diacritics_mapping['DAMMATAN'] in shadda)
print('KASRATAN in shadda: ', diacritics_mapping['KASRATAN'] in shadda)
print('SHADDA in shadda: ', diacritics_mapping['SHADDA'] in shadda)

print((diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['FATHA'] not in tashkeel and diacritics_mapping['DAMMA'] not in tashkeel and diacritics_mapping['KASRA'] not in tashkeel))
print((diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda))

FATHA in tashkeel:  True
DAMMA in tashkeel:  False
KASRA in tashkeel:  False
SUKUN in tashkeel:  True
FATHATAN in tashkeel:  False
DAMMATAN in tashkeel:  False
KASRATAN in tashkeel:  False
SHADDA in tashkeel:  False
FATHA in shadda:  False
DAMMA in shadda:  False
KASRA in shadda:  False
SUKUN in shadda:  False
FATHATAN in shadda:  False
DAMMATAN in shadda:  False
KASRATAN in shadda:  False
SHADDA in shadda:  True
testt False
yarab False


In [53]:
# firstly, initialize an empty dictionary for the 'prior' feature
# the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA) as a binary vector

# then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
def get_prior(tokenized_input, gold_output):
    prior = {}  # this dictionary will hold a key of tuple of 3 elements (word, character, index of character in the word) and the value will be the 8 arabic marks
    for i in range(len(tokenized_input)):
        if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
            # get the indices of the word in the tokenized input array
            indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
            # print(indices)
            # get the words in the gold_output array with the same indices
            words = []
            maxi_len = 0
            for j in indices:
                if gold_output[j] not in words:
                    words.append(gold_output[j])
                    maxi_len = max(maxi_len, len(gold_output[j]))

            for t in range(len(tokenized_input[i])):
                prior[(tokenized_input[i], tokenized_input[i][t], t)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
            
            indx2 = 0
            for word in words:
                indx = 0
                while indx < maxi_len:
                    # extract the diacritics of word[indx]
                    for iter in range(indx+1, len(word)):
                        if is_not_arabic_diacritic(word[iter]):
                            # print(iter)
                            letter, tashkeel, shadda = araby.separate(word[indx: iter], extract_shadda=True) 
                            if diacritics_mapping['FATHA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][0] = 1 
                            if diacritics_mapping['DAMMA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][1] = 1
                            if diacritics_mapping['KASRA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][2] = 1
                            if diacritics_mapping['FATHATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][3] = 1
                            if diacritics_mapping['DAMMATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][4] = 1
                            if diacritics_mapping['KASRATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][5] = 1
                            if (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda):  
                                prior[(tokenized_input[i], word[indx], indx2)][6] = 1 # if the letter has SHADDA, araby.separate() will return SUKUN in tashkeel and SHADDA in shadda, so to avoid this mislabeling we check if SHADDA not in shadda and if SUKUN in tashkeel, then this is a true SUKUN
                            if diacritics_mapping['SHADDA'] in shadda:          prior[(tokenized_input[i], word[indx], indx2)][7] = 1
                            indx = iter - 1
                            indx2 += 1
                            break 
                    indx += 1
                indx2 = 0


                indx = len(word) - 1    # my assumption is that the last character in the not a diacritic
                if (not is_not_arabic_diacritic(word[len(word) - 1]) and is_not_arabic_diacritic(word[len(word) - 2])):  # if the last character is a diacritic and the one before it is not, then the index of the last character is len(word) - 2
                    indx = len(word) - 2
                elif (not is_not_arabic_diacritic(word[len(word) - 1]) and not is_not_arabic_diacritic(word[len(word) - 2])):  # if the last character is a diacritic and the one before it is also a diacritic (in shadda case), then the index of the last character is len(word) - 3
                    indx = len(word) - 3


                if (tokenized_input[i], word[indx], indx) not in prior:
                    letter, tashkeel, shadda = araby.separate(word[indx: len(word)], extract_shadda=True) 
                    if diacritics_mapping['FATHA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][0] = 1
                    if diacritics_mapping['DAMMA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][1] = 1
                    if diacritics_mapping['KASRA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][2] = 1 
                    if diacritics_mapping['FATHATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][3] = 1 
                    if diacritics_mapping['DAMMATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][4] = 1 
                    if diacritics_mapping['KASRATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][5] = 1
                    if (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda):
                        prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][6] = 1  # if the letter has SHADDA, araby.separate() will return SUKUN in tashkeel and SHADDA in shadda, so to avoid this mislabeling we check if SHADDA not in shadda and if SUKUN in tashkeel, then this is a true SUKUN
                    if diacritics_mapping['SHADDA'] in shadda:          prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][7] = 1
                    
    return prior

In [54]:
test_tokenized_input = ['كإنكار', 'كإنكار', 'بقذر','بقذر', 'أكثر', 'أكثر', 'الزركشي']
test_gold_output = ['كَإِنْكَارِ','كَإِنْكَارٍ', 'بِقَذَر', 'بِقَذَرٍ','أكْثَرَ', 'أَكْثَرُ', 'الزَّرْكَشِيُّ']
print (get_prior(test_tokenized_input, test_gold_output))

{('كإنكار', 'ك', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'إ', 1): [0, 0, 1, 0, 0, 0, 0, 0], ('كإنكار', 'ن', 2): [0, 0, 0, 0, 0, 0, 1, 0], ('كإنكار', 'ك', 3): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ا', 4): [0, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ر', 5): [0, 0, 1, 0, 0, 1, 0, 0], ('بقذر', 'ب', 0): [0, 0, 1, 0, 0, 0, 0, 0], ('بقذر', 'ق', 1): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ذ', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ر', 3): [0, 0, 0, 0, 0, 1, 0, 0], ('أكثر', 'أ', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ك', 1): [0, 0, 0, 0, 0, 0, 1, 0], ('أكثر', 'ث', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ر', 3): [1, 1, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ا', 0): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ل', 1): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ز', 2): [1, 0, 0, 0, 0, 0, 0, 1], ('الزركشي', 'ر', 3): [0, 0, 0, 0, 0, 0, 1, 0], ('الزركشي', 'ك', 4): [1, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ش', 5): [0, 0, 1, 0, 0, 0, 0, 0], ('الزركشي', 'ي', 6): [0, 1, 0, 0, 0, 0, 0, 1]}


In [49]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 276 minutes TO RUN
# # write in a file the prior feature
# prior_feature = get_prior(tokenized_input, gold_output)
# with open('./generatedFiles/training/prior_feature.txt', 'w', encoding='utf-8') as file:
#     for key, value in prior_feature.items():
#         file.write(f'{key}: {value}\n')

In [56]:
def read_map(file_path, number_of_keys=2):
    """
    Read the contents of the file located at file_path 
    and append to the dictionary prior_feature
    """
    prior_feature = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            key, value = line.strip().split(':')
            key = key.strip()
            value = value.strip()
            key = key[1:-1].split(',')
            value = value[1:-1].split(',')
            if number_of_keys == 2:
                key = (key[0][1:-1], key[1][2:-1], int(key[2]))
            else:
                key = (key[0][1:-1], key[1][2:-1], int(key[2]), int(key[3]))
            
            value = [int(i) for i in value]
            prior_feature[key] = value
    return prior_feature

In [57]:
prior_feature = read_map('./generatedFiles/training/prior_feature.txt', 2)
print(prior_feature[('قوله', 'ق', 0)])

[1, 0, 0, 0, 0, 0, 0, 0]


### 4 - CASE Feature: 
whether the letter expects a core word diacritic or a case ending. Case endings are placed on only one letter in a word, which may or may not be the last letter in the word. This is a binary feature.

In [58]:
# from farasa.stemmer import FarasaStemmer

# def arabic_stemmer(text):
#     stemmer = FarasaStemmer(interactive=True)  # Set interactive to True for better performance

#     # Perform stemming
#     stemmed_text = stemmer.stem(text)

#     return stemmed_text

# # Example usage
# input_text = "الكتابة باللغة العربية"
# stemmed_text = arabic_stemmer(input_text)
# print("Original text:", input_text)
# print("Stemmed text:", stemmed_text)




Original text: الكتابة باللغة العربية
Stemmed text: كتابة لغة عربي


In [52]:
# for i in range(10):
#     stemmed_text = arabic_stemmer(tokenized_input[i])
#     print("Original text:", tokenized_input[i])
#     print("Stemmed text:", stemmed_text)

In [53]:
# for i in range(len(tokenized_input)):
#     stemmed_text = arabic_stemmer(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/stemmed_input.txt', 'a', encoding='utf-8') as file:
#         file.write(stemmed_text + '\n')
        

In [54]:
# stemmed_text = stemmed_text.split(' ')
# # write in a file the stemmed input
# with open('./generatedFiles/stemmed_input.txt', 'w', encoding='utf-8') as file:
#     for word in stemmed_text:
#         file.write(word + '\n')


### 5 - POS Tagging:
Marking up a word in a text as corresponding to a particular part of speech, based on both its definition and its context.

In [98]:
# from farasa.pos import FarasaPOSTagger

# tagger = FarasaPOSTagger(interactive=True)  # Download model if needed
# text = "قراءة يَحْتَاجُ الكتب مفيدة للعقل."
# tagged = tagger.tag(text)
# # Output: [['قراءة', 'NOUN'], ['الكتب', 'NOUN'], ['مفيدة', 'ADJ'], ['للعقل', 'NOUN'], ['.', 'PUNCT']]

# print(tagged)



S/S قراء +ة/NOUN+NSUFF-FS يحتاج/V ال+ كتب/DET+NOUN-MS مفيد +ة/ADJ+NSUFF-FP ل+/PREP ال+ عقل/DET+NOUN-MS ./PUNC E/E


## **Model Building**

In [20]:
# THIS CELL TOOK 12 minutes TO RUN
# input layer
char_features_vector=[]
tag_features_vector=[]
prior_features_vector=[]
embeddings = []

for i in range(len(tokenized_input)):
    for j in range(len(tokenized_input[i])):    
        char_index = tokenizer_char.word_index.get(tokenized_input[i][j])
        char_features_vector= char_embeddings[char_index]
        if (len(tokenized_input[i]) != len(input_segments[i])):
            input_segments[i] = "S" * (len(tokenized_input[i]) - len(input_segments[i])) + input_segments[i]
        tag_index = tokenizer_tags.word_index.get(input_segments[i][j].lower())
        tag_features_vector= tags_embeddings[tag_index]
        prior_features_vector= prior_feature[(tokenized_input[i], tokenized_input[i][j], j)]
        # pad the prior feature vector with zeros to have the same length as the other features
        prior_features_vector = np.pad(prior_features_vector, (0, embedding_size-8), 'constant')
        # concatenate the 3 features vectors to have a matrix of 3 columns
        # embeddings.append(np.vstack((char_features_vector, tag_features_vector, prior_features_vector)))
        embeddings.append(np.concatenate((char_features_vector, tag_features_vector, prior_features_vector)))

embeddings = np.array(embeddings)

# print(char_features_vector)
# print(tag_features_vector)
# print(prior_features_vector)
print(embeddings.shape)

(8353805, 150)


In [61]:
# save the embeddings in a pickle file
with open('./generatedFiles/embeddings.pickle', 'wb') as file:
    pickle.dump(embeddings, file)

In [3]:
# read the embeddings from the pickle file
with open('./generatedFiles/embeddings.pickle', 'rb') as file:
    embeddings = pickle.load(file)

In [21]:
output_map = {
    (1, 0, 0, 0, 0, 0, 0, 0) : 0, # FATHA
    (0, 0, 0, 1, 0, 0, 0, 0) : 1, # FATHATAN
    (0, 0, 1, 0, 0, 0, 0, 0) : 2, # KASRA
    (0, 0, 0, 0, 0, 1, 0, 0) : 3, # KASRATAN
    (0, 1, 0, 0, 0, 0, 0, 0) : 4, # DAMMA
    (0, 0, 0, 0, 1, 0, 0, 0) : 5, # DAMMATAN
    (0, 0, 0, 0, 0, 0, 1, 0) : 6, # SUKUN
    (0, 0, 0, 0, 0, 0, 0, 1) : 7,  # SHADDA
    (1, 0, 0, 0, 0, 0, 0, 1) : 8, # SHADDA FATHA
    (0, 0, 0, 1, 0, 0, 0, 1) : 9, # SHADDA FATHATAN
    (0, 0, 1, 0, 0, 0, 0, 1) : 10, # SHADDA KASRA
    (0, 0, 0, 0, 0, 1, 0, 1) : 11, # SHADDA KASRATAN
    (0, 1, 0, 0, 0, 0, 0, 1) : 12, # SHADDA DAMMA
    (0, 0, 0, 0, 1, 0, 0, 1) : 13, # SHADDA DAMMATAN
    (0, 0, 0, 0, 0, 0, 0, 0) : 14
}

In [24]:
# gold labels
with open('./generatedFiles/gold_output_dict.txt', 'w', encoding='utf-8') as file:
    for idx, word in enumerate(gold_output):
        gold_diacritics = get_prior([tokenized_input[idx]], [word])
        for key, value in gold_diacritics.items():
            key = key + (idx,)
            file.write(f'{key}: {value}\n')

NameError: name 'get_prior' is not defined

In [22]:
# read the prior feature file in a dictionary called prior_feature 
gold_output_dict = {}
with open('./generatedFiles/gold_output_dict.txt', 'r', encoding='utf-8') as file:
    for line in file:
        key, value = line.strip().split(':')
        key = key.strip()
        value = value.strip()
        key = key[1:-1].split(',')
        value = value[1:-1].split(',')
        key = (key[0][1:-1], key[1][2:-1], int(key[2]), int(key[3]))
        value = [int(i) for i in value]
        gold_output_dict[key] = value

In [45]:
gold_output_dict[('قوله', 'ق', 0, 0)]

KeyError: ('قوله', 'ق', 0, 0)

In [26]:
# Change gold_output_dict.values() to a list of tuples
for key, value in gold_output_dict.items():
    gold_output_dict[key] = tuple(value)
    
gold_output_dict_values = list(gold_output_dict.values())

In [28]:
with open('./generatedFiles/gold_output_id.txt', 'w', encoding='utf-8') as file:
    for value in gold_output_dict_values:
        file.write(f'{output_map[value]}\n')

In [32]:
# read the gold_output_id file
with open('./generatedFiles/gold_output_id.txt', 'r', encoding='utf-8') as file:
    gold_output_id = file.readlines()
    gold_output_id = [line.strip() for line in gold_output_id]

gold_output_id = np.array(gold_output_id)

In [33]:
print(gold_output_id.shape)
print(gold_output_id[:10])

(8353805,)
['0' '6' '4' '4' '0' '6' '0' '0' '0' '14']


In [34]:
print(len(embeddings))

8353805


In [35]:
# Truncate emdeddings to have the 8353000
embeddings_reshape = embeddings[:4500000]
gold_output_id = gold_output_id[:4500000]

# Make it np array 
embeddings_reshape = np.array(embeddings_reshape)
gold_output_id = np.array(gold_output_id)


In [36]:
print (embeddings_reshape.shape)
print (gold_output_id.shape)

(4500000, 150)
(4500000,)


In [37]:
# Reshape embeddings to have 3 dimensions 
embeddings_reshape = embeddings_reshape.reshape((-1, 1000, 150))
gold_output_id_reshape = gold_output_id.reshape(-1, 1000, 1)

In [38]:
import tensorflow as tf
# Print Shap
#e
print(embeddings_reshape.shape)
print(gold_output_id_reshape.shape)

# print the first 10 rows of the embeddings
# print(embeddings[:10])

# print the first 10 rows of the gold_output_id
print(gold_output_id_reshape[:][0])

# print the first 10 columns of the gold_output_id
print(tf.keras.utils.to_categorical(gold_output_id_reshape[:][0]))

(4500, 1000, 150)
(4500, 1000, 1)
[['0']
 ['6']
 ['4']
 ['4']
 ['0']
 ['6']
 ['0']
 ['0']
 ['0']
 ['14']
 ['6']
 ['0']
 ['8']
 ['4']
 ['0']
 ['0']
 ['4']
 ['14']
 ['0']
 ['6']
 ['0']
 ['14']
 ['0']
 ['14']
 ['14']
 ['8']
 ['6']
 ['0']
 ['2']
 ['12']
 ['14']
 ['6']
 ['4']
 ['0']
 ['0']
 ['0']
 ['0']
 ['0']
 ['6']
 ['4']
 ['4']
 ['2']
 ['0']
 ['6']
 ['3']
 ['0']
 ['6']
 ['0']
 ['2']
 ['14']
 ['14']
 ['0']
 ['2']
 ['6']
 ['0']
 ['14']
 ['2']
 ['0']
 ['6']
 ['2']
 ['0']
 ['2']
 ['14']
 ['3']
 ['2']
 ['14']
 ['6']
 ['2']
 ['6']
 ['0']
 ['14']
 ['2']
 ['4']
 ['4']
 ['14']
 ['0']
 ['0']
 ['14']
 ['4']
 ['2']
 ['0']
 ['4']
 ['4']
 ['14']
 ['4']
 ['4']
 ['2']
 ['6']
 ['14']
 ['14']
 ['10']
 ['14']
 ['2']
 ['0']
 ['4']
 ['14']
 ['0']
 ['1']
 ['0']
 ['2']
 ['6']
 ['0']
 ['14']
 ['2']
 ['4']
 ['6']
 ['0']
 ['3']
 ['2']
 ['0']
 ['0']
 ['3']
 ['0']
 ['0']
 ['10']
 ['4']
 ['8']
 ['14']
 ['3']
 ['14']
 ['6']
 ['4']
 ['0']
 ['0']
 ['0']
 ['0']
 ['0']
 ['6']
 ['4']
 ['14']
 ['6']
 ['2']
 ['0']
 ['14']
 

In [30]:
# build a training model, first we need input layer that take matrix "embeddings" as an input with dropout of 10%
# then we need a bidirectional LSTM layer with 100 units
# then we need a dense layer with 100 units and relu activation function
# then we need an output layer with 14 units and softmax activation function
# use early stopping with patience of five epochs, a learning rate of 0.001, a batch size of 256, and an Adamax optimizer

# define the model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.activations import relu,linear

input_shape = (1000, 150)
    # tf.keras.layers.Input(shape=150),
    # Dropout(0.1),
    # Bidirectional(LSTM(100)),
    # Dense(100, activation='relu'),
    # Dense(14, activation='softmax')

model = Sequential()
model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=input_shape))
model.add(TimeDistributed(Dense(100, activation='relu')))
model.add(TimeDistributed(Dense(15, activation='softmax')))

# model = Sequential()
# forward_layer = LSTM(50)
# backward_layer = LSTM(50, activation='relu', go_backwards=True)
# model.add(Bidirectional(forward_layer, backward_layer=backward_layer, input_shape=(1,150)))
# # model.add(Dropout(0.1))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(14, activation='softmax'))


# compile the model
model.compile(optimizer=Adamax(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# model.build(reshaped_matrix.shape)
# summarize the model
print(model.summary())

# early stopping
# early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 1000, 100)         80400     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 1000, 100)         10100     
 ributed)                                                        
                                                                 
 time_distributed_1 (TimeDi  (None, 1000, 15)          1515      
 stributed)                                                      
                                                                 
Total params: 92015 (359.43 KB)
Trainable params: 92015 (359.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [40]:
labels = []
for i in range(len(gold_output_id_reshape)):
    for j in range(1000):
        labels.append( tf.keras.utils.to_categorical(gold_output_id_reshape[i][j], num_classes=15))
        
labels = np.array(labels)

In [41]:
labels = labels.reshape(-1, 1000, 15)
print(labels.shape)

(4500, 1000, 15)


In [42]:
# fit the model on the training dataset and evaluate it on the validation dataset,
# use early stopping with patience of five epochs, a learning rate of 0.001, a batch size of 256
# before that, configure the model to use GPU

# fit the model with gpu

with tf.device('/GPU:0'):
    model.fit(embeddings_reshape, labels, epochs=50, batch_size=256)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [43]:
# save the model
model.save('./generatedFiles/model.h5')

  saving_api.save_model(


In [45]:
# Load the dataset
# Specify the file path
file_path = "./test/test_no_diacritics.txt"

# Read the contents of the file located at file_path 
# and append each line to the list data_before_preprocessing
with open(file_path, 'r', encoding='utf-8') as file:
    val_data_before_preprocessing = file.readlines()
    # remove '\n' from each line
    val_data_before_preprocessing = [line.strip() for line in val_data_before_preprocessing]
    
    

In [57]:
for i in range(len(val_data_before_preprocessing)):
    save_tokenized_input(val_data_before_preprocessing[i],"test_tokenized_input")

In [98]:
for i in range(len(val_data_before_preprocessing)):
    save_test_tokenized_input(val_data_before_preprocessing[i], i ,"test_line_numb_input")

In [58]:
# Read the tokenized input file
# read the tokenized input file
with open('./generatedFiles/test_tokenized_input.txt', 'r', encoding='utf-8') as file:
    val_tokenized_input = file.readlines()
    # Remove '\n' from each line
    val_tokenized_input = [line.strip() for line in val_tokenized_input]
    
print(len(val_tokenized_input))

104898


In [34]:
for i in range(len(val_data_before_preprocessing)):
    save_gold_output(val_data_before_preprocessing[i],"val_gold_output")
    

In [50]:
#read test_set_without_labels.csv file 
test_set_without_labels = pd.read_csv('./test/test_set_without_labels.csv', encoding='utf-8')
test_set_without_labels.head()

Unnamed: 0,id,line_number,letter
0,0,0,ل
1,1,0,ي
2,2,0,س
3,3,0,ل
4,4,0,ل


In [35]:
# Read from the gold_output file
with open('./generatedFiles/val_gold_output.txt', 'r', encoding='utf-8') as file:
    val_gold_output = file.readlines()
    # remove '\n' from each line
    val_gold_output = [line.strip() for line in val_gold_output]

In [51]:
for i in range(len(val_tokenized_input)):
    val_segments, val_seg_tags = get_seg_tags(val_tokenized_input[i])
    # Write and append on the tokenized input to a file
    with open('./generatedFiles/test_input_segments.txt', 'a', encoding='utf-8') as file:
        for tag in val_seg_tags:
            file.write(tag)
        file.write('\n')
        

In [59]:
# Read the val input_segments file
with open('./generatedFiles/test_input_segments.txt', 'r', encoding='utf-8') as file:
    val_input_segments = file.readlines()
    val_input_segments = [line.strip() for line in val_input_segments]

In [60]:
# THIS CELL TOOK 12 minutes TO RUN
# input layer
val_char_features_vector=[]
val_tag_features_vector=[]
val_prior_features_vector=[]
val_embeddings = []
# with open('./generatedFiles/i_j.txt', 'a', encoding='utf-8') as file:
for i in range(len(val_tokenized_input)):
    for j in range(len(val_tokenized_input[i])):    
        # write i and j in a file
        # file.write(f'{i} {j}\n')
        char_index = tokenizer_char.word_index.get(val_tokenized_input[i][j])
        char_features_vector= char_embeddings[char_index]
        if (len(val_tokenized_input[i]) != len(val_input_segments[i])):
            val_input_segments[i] = "S" * (len(val_tokenized_input[i]) - len(val_input_segments[i])) + val_input_segments[i]
        tag_index = tokenizer_tags.word_index.get(val_input_segments[i][j].lower())
        tag_features_vector= tags_embeddings[tag_index]
        prior_features_vector= (prior_feature[(val_tokenized_input[i], val_tokenized_input[i][j], j)]) if (val_tokenized_input[i], val_tokenized_input[i][j], j) in prior_feature else [1, 1, 1, 1, 1, 1, 1, 1]
        # pad the prior feature vector with zeros to have the same length as the other features
        prior_features_vector = np.pad(prior_features_vector, (0, 12), 'constant')
        # concatenate the 3 features vectors to have a matrix of 3 columns
        # embeddings.append(np.vstack((char_features_vector, tag_features_vector, prior_features_vector)))
        val_embeddings.append(np.concatenate((char_features_vector, tag_features_vector, prior_features_vector)))

val_embeddings = np.array(val_embeddings)

# print(char_features_vector)
# print(tag_features_vector)
# print(prior_features_vector)
print(val_embeddings.shape)

(417470, 60)


In [65]:
#pad the embeddings to be of size 418000
val_embeddings = np.pad(val_embeddings, ((0, 418000 - val_embeddings.shape[0]), (0, 0)), 'constant')

In [66]:
val_embeddings.shape

(418000, 60)

In [67]:
val_embeddings_reshape = val_embeddings[:418000]
val_embeddings_reshape = val_embeddings_reshape.reshape((-1, 1000, 60))
val_embeddings_reshape = np.array(val_embeddings_reshape)
print(val_embeddings_reshape.shape)


(418, 1000, 60)


In [112]:
print (val_gold_output[:10])

['قَوْلُهُ', 'وَلَا', 'تُكْرَهُ', 'ضِيَافَتُهُ', 'الْفَرْقُ', 'الثَّالِثُ', 'وَالثَّلَاثُونَ', 'بَيْنَ', 'قَاعِدَةِ', 'تَقَدُّمِ']


In [121]:
# gold labels
with open('./generatedFiles/val_gold_output_dict.txt', 'w', encoding='utf-8') as file:
    for idx, word in enumerate(val_gold_output):
        gold_diacritics = get_prior([val_tokenized_input[idx]], [word])
        for key, value in gold_diacritics.items():
            key = key + (idx,)
            file.write(f'{key}: {value}\n')

In [122]:
# read the prior feature file in a dictionary called prior_feature 
val_gold_output_dict = {}
with open('./generatedFiles/val_gold_output_dict.txt', 'r', encoding='utf-8') as file:
    for line in file:
        key, value = line.strip().split(':')
        key = key.strip()
        value = value.strip()
        key = key[1:-1].split(',')
        value = value[1:-1].split(',')
        key = (key[0][1:-1], key[1][2:-1], int(key[2]), int(key[3]))
        value = [int(i) for i in value]
        val_gold_output_dict[key] = value

In [123]:
# Change gold_output_dict.values() to a list of tuples
for key, value in val_gold_output_dict.items():
    val_gold_output_dict[key] = tuple(value)
    
val_gold_output_dict_values = list(val_gold_output_dict.values())

In [125]:
with open('./generatedFiles/val_gold_output_id.txt', 'w', encoding='utf-8') as file:
    for value in val_gold_output_dict_values:
        file.write(f'{output_map[value]}\n')

In [127]:
# read the gold_output_id file
with open('./generatedFiles/val_gold_output_id.txt', 'r', encoding='utf-8') as file:
    val_gold_output_id = file.readlines()
    val_gold_output_id = [line.strip() for line in val_gold_output_id]

val_gold_output_id = np.array(val_gold_output_id)

In [130]:
val_gold_output_id = val_gold_output_id[:421000]
val_gold_output_id = val_gold_output_id.reshape(-1, 1000, 1)

In [131]:
val_labels = []
for i in range(len(val_gold_output_id)):
    for j in range(1000):
        val_labels.append( tf.keras.utils.to_categorical(val_gold_output_id[i][j], num_classes=15))
        
val_labels = np.array(val_labels)

val_labels = val_labels.reshape(-1, 1000, 15)
print(val_labels.shape)

(4210, 100, 15)


In [133]:
# evaluate the model on the validation dataset

# evaluate the model

loss, accuracy = model.evaluate(val_embeddings_reshape, val_labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))



Accuracy: 66.312587


In [68]:
# load the model
model = tf.keras.models.load_model('./generatedFiles/model3.h5')

In [106]:
# predict the test dataset
predictions = model.predict(val_embeddings_reshape)



In [84]:
print(predictions.shape)

(418000, 15)


In [107]:
predictions = predictions.reshape(-1, 15)

In [72]:
print(predictions[:10])    

[[9.13796008e-01 1.33784284e-04 1.02331594e-03 1.20260862e-04
  7.52487744e-04 4.73528817e-05 2.49509321e-05 3.98757329e-05
  9.89871114e-05 2.71059946e-07 4.40470359e-07 3.43715897e-07
  1.15467415e-07 8.10991594e-07 8.39609057e-02]
 [1.68595999e-03 1.73978582e-10 6.16314574e-05 9.12179075e-06
  1.12038877e-04 3.35399818e-06 9.93750870e-01 3.53652990e-06
  6.19534194e-06 2.08193220e-11 1.22610118e-07 1.68911754e-07
  1.14870993e-07 6.30322063e-07 4.36629448e-03]
 [9.99693036e-01 1.03811519e-08 1.69698205e-06 2.95501906e-07
  5.01330715e-06 1.07018074e-07 3.29081340e-05 2.69331167e-07
  2.50303583e-06 8.56664652e-12 2.37258491e-10 2.17200924e-10
  1.12204739e-10 1.41583922e-09 2.64183967e-04]
 [4.60601914e-05 4.77857647e-08 9.98641670e-01 4.12220015e-06
  1.33615918e-04 7.17239814e-08 9.38295273e-07 5.87541820e-08
  6.54907506e-09 4.81892637e-10 3.57363729e-06 1.03452180e-09
  2.59281818e-09 1.78892859e-10 1.16989145e-03]
 [1.51824730e-04 2.25686331e-10 1.39221243e-04 4.33684448e-07
  

In [108]:
# get the index of the maximum value in each row
predictions = np.argmax(predictions, axis=1)


In [109]:
print(predictions.shape)
print(predictions[:10])

(418000,)
[ 0  6  0  2 14  0  2 14  2  2]


In [76]:
# make a dictionary to map the predictions to the corresponding unicode 

predictions_map = {
    0 : diacritics_mapping['FATHA'],
    1 : diacritics_mapping['FATHATAN'],
    2 : diacritics_mapping['KASRA'],
    3 : diacritics_mapping['KASRATAN'],
    4 : diacritics_mapping['DAMMA'],
    5 : diacritics_mapping['DAMMATAN'],
    6 : diacritics_mapping['SUKUN'],
    7 : diacritics_mapping['SHADDA'],
    8 : diacritics_mapping['SHADDA'] + diacritics_mapping['FATHA'],
    9 : diacritics_mapping['SHADDA'] + diacritics_mapping['FATHATAN'],
    10 : diacritics_mapping['SHADDA'] + diacritics_mapping['KASRA'],
    11 : diacritics_mapping['SHADDA'] + diacritics_mapping['KASRATAN'],
    12 : diacritics_mapping['SHADDA'] + diacritics_mapping['DAMMA'],
    13 : diacritics_mapping['SHADDA'] + diacritics_mapping['DAMMATAN'],
    14 : ''
}


In [110]:
# truncate the predictions to 417470
predictions = predictions[:417470]

In [111]:
# loop over the letters and concatenate it with the corresponding prediction
predicted_diacritized_text = []
count = 0
for i in range(len(val_tokenized_input)):
    for j in range(len(val_tokenized_input[i])):
        predicted_diacritized_text.append(val_tokenized_input[i][j] + predictions_map[predictions[count]])
        count += 1
        
print(len(predicted_diacritized_text))
print(predicted_diacritized_text[:10])

417470
['لَ', 'يْ', 'سَ', 'لِ', 'ل', 'وَ', 'كِ', 'ي', 'لِ', 'بِ']


In [99]:
# read test_lines_numb_input file
with open('./generatedFiles/test_line_numb_input.txt', 'r', encoding='utf-8') as file:
    test_line_numb_input = file.readlines()
    test_line_numb_input = [line.strip() for line in test_line_numb_input]

In [103]:
def is_arabic_letter(letter):
    # Unicode code points for Alef and Yeh in Arabic script
    alef_code_point = ord('ا')
    yeh_code_point = ord('ي')

    # Unicode code point for the given letter
    letter_code_point = ord(letter)

    # Check if the letter is an Arabic letter between Alef and Yeh
    return alef_code_point <= letter_code_point <= yeh_code_point

In [118]:
# write the predictions in a csv file 
# the file will have 3 columns, the first column is an incremented id from 0 and the second column is the line number and the third column is the diacritized letter
with open('./generatedFiles/predictions.csv', 'w', encoding='utf-8') as file:
    file.write('ID,line_number,letter\n')
    id = 0
    id2 = 0
    for i in range(len(val_tokenized_input)):
        for j in range(len(val_tokenized_input[i])):
            # check if val_tokenized_input[i][j] is an arabic letter
            if val_tokenized_input[i][j] in arabic_letters:
                file.write(f'{id2},{test_line_numb_input[i]},{predicted_diacritized_text[id]}\n')
                id2 += 1
            id += 1

In [122]:
with open('./generatedFiles/predictions.csv', 'w', encoding='utf-8') as file:
    file.write('ID,label\n')
    id = 0
    id2 = 0
    for i in range(len(val_tokenized_input)):
        for j in range(len(val_tokenized_input[i])):
            # check if val_tokenized_input[i][j] is an arabic letter
            if val_tokenized_input[i][j] in arabic_letters:
                file.write(f'{id2},{predictions[id]}\n')
                id2 += 1
            id += 1