In [6]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import farasa
from farasa.segmenter import FarasaSegmenter 
import unicodedata
import torch

In [7]:
# Run on GPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print (device)
# print the cpu or gpu
print(torch.cuda.get_device_name(0))
# print the number of gpus you have
print(torch.cuda.device_count())
# print current gpu
print(torch.cuda.current_device())

cuda
NVIDIA GeForce RTX 3060
1
0


In [46]:
# Load the dataset
# Specify the file path
file_path = "./dataset/train.txt"

# Read the contents of the file located at file_path 
# and append each line to the list data_before_preprocessing
with open(file_path, 'r', encoding='utf-8') as file:
    data_before_preprocessing = file.readlines()
    # remove '\n' from each line
    data_before_preprocessing = [line.strip() for line in data_before_preprocessing]
    
    

In [59]:
# Remove diacritics
def remove_diacritics(text):
    text = araby.strip_tashkeel(text)
    return text

# Remove any non-Arabic letters
def remove_non_arabic(text):
    text = re.sub(r'[^\u0600-\u06FF\s]|،|؛', '', text)
    return text

def input_preprocessing_text(text):
    # Correct most common errors on word like repetetion of harakats, or tanween before alef
    text = araby.autocorrect(text)

    # Remove any non-Arabic letters
    text = remove_non_arabic(text)

    # Remove diacritics
    text = remove_diacritics(text)

    # Tokenize
    text = araby.tokenize(text)

    return text

def save_tokenized_input(text,path):
    words = input_preprocessing_text(text)
    # Write and append on the tokenized input to a file
    with open(f'./generatedFiles/{path}.txt', 'a', encoding='utf-8') as file:
        for word in words:
            file.write(word + '\n')

def save_gold_output(text,path):
    # Remove any non-Arabic letters and extra spaces
    text = remove_non_arabic(text)

    # Tokenize
    text = araby.tokenize(text)

    # Write and append on the gold output to a file
    with open(f'./generatedFiles/{path}.txt', 'a', encoding='utf-8') as file:
        for word in text:
            # if last word in the text don't add '\n'
            file.write(word + '\n')


def is_not_arabic_diacritic(char):
   category = unicodedata.category(char)
   return not (category == 'Mn' or category == 'Mc')


In [48]:
# Example usage:
character = 'ذْ'
if is_not_arabic_diacritic(character[1]):
   print("The character is not an Arabic diacritic.")
else:
   print("The character is an Arabic diacritic.")


# Testing of is_not_arabic_diacritic() function with gettting the index of the first non diacritic character in the word
word = 'زَّراع'
 
for i in range(1, len(word)): # start from 1 because the first character is not a diacritic
    if is_not_arabic_diacritic(word[i]):
        print(i)
        break 

The character is an Arabic diacritic.
3


In [49]:
# # RUN ONE TIME ONLY THIS CODE AGAIN 
# # Generate Gold Input file
# for i in range(len(data_before_preprocessing)):
#     save_tokenized_input(data_before_preprocessing[i])

In [50]:
# #RUN ONE TIME ONLY THIS CODE AGAIN
# # Generate Gold Output file
# for i in range(len(data_before_preprocessing)):
#     test = data_before_preprocessing[i]
#     text1 = save_gold_output(test)

In [24]:
# For testing
test = "قَالَ ابْنُ الْقَاسِمِ : قَالَ مَالِكٌ فِي مَكِّيٍّ أَحْرَمَ بِحَجَّةٍ مِنْ الْحَرَمِ ثُمَّ أُحْصِرَ ، أَنَّهُ يَخْرُجُ إلَى الْحِلِّ فَيُلَبِّي مِنْ هُنَاكَ لِأَنَّهُ أَمَرَ مَنْ فَاتَهُ الْحَجُّ وَقَدْ أَحْرَمَ مِنْ مَكَّةَ ، أَنْ يَخْرُجَ إلَى الْحِلِّ فَيَعْمَلَ فِيمَا بَقِيَ عَلَيْهِ مَا يَعْمَلُ الْمُعْتَمِرُ وَيُحِلُّ .( 2 / 437 ) "
text2 = input_preprocessing_text(test)
print(text2)
text3 = remove_non_arabic(test)

['قال', 'ابن', 'القاسم', 'قال', 'مالك', 'في', 'مكي', 'أحرم', 'بحجة', 'من', 'الحرم', 'ثم', 'أحصر', 'أنه', 'يخرج', 'إلى', 'الحل', 'فيلبي', 'من', 'هناك', 'لأنه', 'أمر', 'من', 'فاته', 'الحج', 'وقد', 'أحرم', 'من', 'مكة', 'أن', 'يخرج', 'إلى', 'الحل', 'فيعمل', 'فيما', 'بقي', 'عليه', 'ما', 'يعمل', 'المعتمر', 'ويحل']


In [25]:
# Important functions in PyArabic

# araby.tokenize(text) # Tokenize the sentence text into words
# araby.is_arabicrange(text) # Check if the text is Arabic
# araby.sentence_tokenize(text) # Tokenize the text into sentences
# araby.strip_tashkeel(text) # Remove diacritics (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN)
# araby.strip_diacritics(text) # Remove diacritics (Small Alef الألف الخنجرية, Harakat + Shadda, Quranic marks)
# araby.strip_tatweel(text) # Remove tatweel
# araby.strip_shadda(text) # Remove shadda
# araby.autocorrect(text) # Correct most common errors on word like repetetion of harakats,or tanwin befor alef
# araby.arabicrange() # Return a list of Arabic characters

# New Functions in PyArabic
# araby.vocalized_similarity(word1, word2) # if the two words has the same letters and the same harakats, this function return True. 
# The two words can be full vocalized, or partial vocalized

# araby.vocalizedlike(word1, word2) Same as vocalized_similarity but return True and False

# araby.joint(word1, word2) # joint the letters with the marks the length ot letters and marks must be equal return word



# Return the text, its tashkeel and shadda if extract_shadda is True
# text, marks, shada = araby.separate(text,extract_shadda=True) # Separate diacritics from the text
# print (text)
# for m in marks:
#     print (araby.name(m))

# for s in shada:
#     print (araby.name(s))

In [72]:
# read the tokenized input file
with open('./generatedFiles/tokenized_input.txt', 'r', encoding='utf-8') as file:
    tokenized_input = file.readlines()
    print(len(tokenized_input))
    # Remove '\n' from each line
    tokenized_input = [line.strip() for line in tokenized_input]
    # Put the tokenized input of length 1 in tokenized_input list 
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]

print(tokenized_input[:10])



2104308
['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']


In [73]:
print(len(tokenized_input))

2104308


# **Core Word (CW) Diacritization**

## **Feature Extraction**

### 1 - Characters: 
Here we extract each character from all tokenized words and create a vector of size 50 for each character.

In [74]:
tokenizer_char = Tokenizer(char_level=True)
tokenizer_char.fit_on_texts(tokenized_input)
sequences_char = tokenizer_char.texts_to_sequences(tokenized_input)
char_features = pad_sequences(sequences_char)   # padding the sequences to have the same length as the longest sequence (word)
char_embeddings = np.random.rand(len(tokenizer_char.word_index) + 1, 50)

In [29]:
print(char_features.shape) # (number of words, max length of word in the dataset)


print(char_embeddings.shape)

# 38 rows: 37 unique characters identified by the tokenizer, 1 row for handling characters not seen in the training data
# 50 columns: Each character is encoded as a 50-dimensional vector

(2104308, 13)
(38, 50)


In [30]:
print(char_features[0]) 
# the number of non zero elements corresponds to the length of the word 
# and the value of each element corresponds to the index of the character in the tokenizer
# which means that every character now is encoded as a number and this number is the index of the character in the tokenizer

[ 0  0  0  0  0  0  0  0  0 13  5  1  7]


In [31]:
print(char_embeddings[0])
# this is the embedding of each character in the tokenizer

[0.82718437 0.4224459  0.16871503 0.62382641 0.32524436 0.87265325
 0.0577308  0.33543328 0.68879951 0.73667659 0.61688437 0.9953354
 0.17036873 0.9519963  0.86273352 0.46042732 0.44972585 0.33441949
 0.45360105 0.35258226 0.55844318 0.58011174 0.39465262 0.89635392
 0.33452824 0.13378085 0.23381511 0.92304843 0.74989829 0.94401159
 0.22424333 0.38812339 0.97364285 0.21564904 0.54669863 0.10668212
 0.39008205 0.38445956 0.71925058 0.58768456 0.76962771 0.78860936
 0.24047497 0.93822055 0.36737086 0.27888135 0.9513102  0.38241166
 0.47530879 0.37132545]


In [32]:
print(char_embeddings[char_features[0]].shape)
# 13 is the word of characters and 50 is the embedding size of each character

print(char_embeddings[char_features[0]])
# this is the embedding of each character in the first tokenized word, this is the 1st feature and the input of the model

(13, 50)
[[0.82718437 0.4224459  0.16871503 0.62382641 0.32524436 0.87265325
  0.0577308  0.33543328 0.68879951 0.73667659 0.61688437 0.9953354
  0.17036873 0.9519963  0.86273352 0.46042732 0.44972585 0.33441949
  0.45360105 0.35258226 0.55844318 0.58011174 0.39465262 0.89635392
  0.33452824 0.13378085 0.23381511 0.92304843 0.74989829 0.94401159
  0.22424333 0.38812339 0.97364285 0.21564904 0.54669863 0.10668212
  0.39008205 0.38445956 0.71925058 0.58768456 0.76962771 0.78860936
  0.24047497 0.93822055 0.36737086 0.27888135 0.9513102  0.38241166
  0.47530879 0.37132545]
 [0.82718437 0.4224459  0.16871503 0.62382641 0.32524436 0.87265325
  0.0577308  0.33543328 0.68879951 0.73667659 0.61688437 0.9953354
  0.17036873 0.9519963  0.86273352 0.46042732 0.44972585 0.33441949
  0.45360105 0.35258226 0.55844318 0.58011174 0.39465262 0.89635392
  0.33452824 0.13378085 0.23381511 0.92304843 0.74989829 0.94401159
  0.22424333 0.38812339 0.97364285 0.21564904 0.54669863 0.10668212
  0.39008205 0.3

### 2 - The position of the character in a word segment:
For example, given the word “wAlktAb” , which is composed of three segments “w+Al+ktAb”. Letters were marked as “B” if they begin a segment, “M” if they are in the middle of a segment, “E” if they end a segment, and “S” if they are single letter segments. So for “w+Al+ktAb”, the corresponding character positions are “S+BE+BMME.”

In [67]:
segmenter = FarasaSegmenter(interactive=True) # The default behaviour



In [68]:
def get_seg_tags(word):                 # word = "wAlktAb"
    segments = segmenter.segment(word)  # segments will be a list: ["w", "Al", "ktAb"]
    segments = segments.split('+')
    seg_tags = []
    for segment in segments:
        if len(segment) == 1:
            seg_tags.append("S")
        else:
            seg_tags.append("B")  # First letter
            seg_tags.extend("M" * (len(segment) - 2))  # Middle letters
            seg_tags.append("E")  # Last letter
    return segments, seg_tags

word = "كقلمه"
segments, seg_tags = get_seg_tags(word)
print("Segmented word:", segments)
print("SEG tags:", seg_tags)


Segmented word: ['ك', 'قلم', 'ه']
SEG tags: ['S', 'B', 'M', 'E', 'S']


In [35]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 25m 4.5s TO RUN
# # The Output of this code is the input_segments.txt file

# for i in range(len(tokenized_input)):
#     segments, seg_tags = get_seg_tags(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/input_segments.txt', 'a', encoding='utf-8') as file:
#         for tag in seg_tags:
#             file.write(tag)
#         file.write('\n')

In [76]:
# Read the input_segments file
with open('./generatedFiles/input_segments.txt', 'r', encoding='utf-8') as file:
    input_segments = file.readlines()
    print(len(input_segments))
    # Remove '\n' from each line
    input_segments = [line.strip() for line in input_segments]
    # Put the tokenized input of length 1 in the tokenized_input list
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]

print(input_segments[:10])

2104308
['BMES', 'BE', 'BME', 'BEBME', 'BES', 'BME', 'BME', 'BEBMMME', 'BME', 'BMES']


In [77]:
tokenizer_tags = Tokenizer(char_level=True)
tokenizer_tags.fit_on_texts(input_segments)
sequences_tags = tokenizer_tags.texts_to_sequences(input_segments)
tags_features = pad_sequences(sequences_tags)   
tags_embeddings = np.random.rand(len(tokenizer_tags.word_index) + 1, 50)

In [38]:
print(tags_features.shape) 
print(tags_embeddings.shape)

(2104308, 13)
(5, 50)


In [39]:
tags_features[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 2, 4])

In [40]:
tags_embeddings[tags_features[0][0]]

array([0.00352603, 0.69221287, 0.30273659, 0.11188011, 0.89065211,
       0.49201768, 0.68695663, 0.16368839, 0.56587932, 0.88104962,
       0.9849579 , 0.91881847, 0.15517539, 0.12924416, 0.56370847,
       0.61223982, 0.76536904, 0.62366   , 0.57627321, 0.42726112,
       0.54336608, 0.53620942, 0.19644801, 0.39810245, 0.60179897,
       0.27066639, 0.36974522, 0.31703842, 0.26838852, 0.2407861 ,
       0.05064811, 0.08938926, 0.75855506, 0.81586703, 0.98943101,
       0.85699995, 0.89626381, 0.93447942, 0.98596828, 0.60570575,
       0.88464929, 0.41572764, 0.55302554, 0.9256604 , 0.10286387,
       0.86617492, 0.44862383, 0.73284923, 0.49109846, 0.74762704])

### 3 - PRIOR: 
diacritics seen in the training set per segment. Since we used a character-level model, this feature informed the model with word-level information. For example, the word “ktAb”  was observed to have two diacritized forms in the training set, namely “kitaAb” ( – book) and “kut∼aAb” ( – writers). The first letter in the word (“k”) accepted the diacritics “i” and “u.” Thus, given a binary vector representing whether a character is allowed to assume any of the eight primitive Arabic diacritic marks (a, i, u, o, K, N, F, and ∼ in order), the first letter would be given the following vector “01100000.” If a word segment was never observed during training, then the vector for all letters therein would be set to 11111111.

In [41]:
# read the gold_output file
with open('./generatedFiles/gold_output.txt', 'r', encoding='utf-8') as file:
    gold_output = file.readlines()
    print(len(gold_output))
    # remove '\n' from each line
    gold_output = [line.strip() for line in gold_output]
    # put in tokenized_input list the tokenized input of length 1
    # tokenized_input = [(line.strip(), i) for i,line in enumerate(tokenized_input) if (len(line.strip())== 1 and line.strip() != '؟')]
    # get the inde

print(gold_output[:10])

2104308
['قَوْلُهُ', 'أَوْ', 'قَطَعَ', 'الْأَوَّلُ', 'يَدَهُ', 'إلَخْ', 'قَالَ', 'الزَّرْكَشِيُّ', 'ابْنُ', 'عَرَفَةَ']


In [42]:
print(len(tokenized_input))
print(tokenized_input[:10])

2104308
['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']


In [43]:
# Map each diacrtics to its unicode
diacritics_mapping = {
    'FATHA': '\u064E',
    'DAMMA': '\u064F',
    'KASRA': '\u0650',
    'SHADDA': '\u0651',
    'SUKUN': '\u0652',
    'FATHATAN': '\u064B',
    'DAMMATAN': '\u064C',
    'KASRATAN': '\u064D'
}

In [44]:
# # Extract diacritics by returning a list containing a tuple of 3 elements: (letter, tashkeel, shadda)
# def extract_arabic_diacritics(word):
#     diacritics_list = []
#     extracted_word, tashkeel, shadda = araby.separate(word, extract_shadda=True)
#     for i in range(len(extracted_word)):
#         print(f'{araby.name(extracted_word[i])} {araby.name(tashkeel[i])} {araby.name(shadda[i])}')
#         diacritics_list.append((extracted_word[i], (tashkeel[i].encode("utf8")).decode(), (shadda[i].encode("utf8")).decode()))
#     return diacritics_list

In [45]:
# # firstly, initialize an empty dictionary for the 'prior' feature
# # the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA) as a binary vector

# # then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
# def get_prior(tokenized_input, gold_output):
#     prior = {} # this dictionary will hold a key of tuple of 3 elements (word, character, index of character in the word) and the value will be the 8 arabic marks
#     for i in range(len(tokenized_input)):
#         if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
#             # get the indices of the word in the tokenized input array
#             indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
#             print(indices)
#             # get the words in the gold_output array with the same indices
#             words = [gold_output[j] for j in indices]
#             extracted_diac_all_words = []
#             for word in words:
#                 extracted_diac_all_words.append(extract_arabic_diacritics(word))
#             for indx, charac in enumerate(tokenized_input[i]):
#                 for extracted_diac_per_word in extracted_diac_all_words:
#                     # extract the diacritics of word[indx]
#                     prior[(tokenized_input[i], charac, indx)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
#                     if diacritics_mapping['SHADDA'] in extracted_diac_per_word[indx]:
#                         prior[(tokenized_input[i], charac, indx)][4] = 1 if diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][5] = 1 if diacritics_mapping['DAMMA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][6] = 1 if diacritics_mapping['KASRA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac, indx)][7] = 1 if not  diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] and not diacritics_mapping['DAMMA'] in word[indx: indx+2]  and not diacritics_mapping['KASRA'] in word[indx: indx+2] else 0
#                     else:
#                         prior[(tokenized_input[i], charac,indx)][0] = 1 if diacritics_mapping['FATHA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][1] = 1 if diacritics_mapping['DAMMA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][2] = 1 if diacritics_mapping['KASRA'] in extracted_diac_per_word[indx] else 0
#                         prior[(tokenized_input[i], charac,indx)][3] = 1 if diacritics_mapping['SUKUN'] in extracted_diac_per_word[indx] else 0
#     return prior

In [46]:
letter, tashkeel, shadda = araby.separate('زَّ', extract_shadda=True)   # SHADDA + FATHA Example
# letter, tashkeel, shadda = araby.separate('وَ', extract_shadda=True)   # FATHA Example
# letter, tashkeel, shadda = araby.separate('مً', extract_shadda=True)   # FATHATAN Example
# letter, tashkeel, shadda = araby.separate('عٌ', extract_shadda=True)   # DAMMATAN Example
# letter, tashkeel, shadda = araby.separate('يُّ', extract_shadda=True)   # SHADDA + DAMMA Example
# letter, tashkeel, shadda = araby.separate('ذْ', extract_shadda=True)   # SUKUN Example
enkar = 'كَإِنْكَارِ'
# print(enkar[4:6])
# print( diacritics_mapping['FATHA'] in enkar[0:1])
# print( diacritics_mapping['SHADDA'] in 'زَّ')
# print( diacritics_mapping['DAMMA'] in 'زَّ')

print('FATHA in tashkeel: ', diacritics_mapping['FATHA'] in tashkeel)
print('DAMMA in tashkeel: ', diacritics_mapping['DAMMA'] in tashkeel)
print('KASRA in tashkeel: ', diacritics_mapping['KASRA'] in tashkeel)
print('SUKUN in tashkeel: ', diacritics_mapping['SUKUN'] in tashkeel)
print('FATHATAN in tashkeel: ', diacritics_mapping['FATHATAN'] in tashkeel)
print('DAMMATAN in tashkeel: ', diacritics_mapping['DAMMATAN'] in tashkeel)
print('KASRATAN in tashkeel: ', diacritics_mapping['KASRATAN'] in tashkeel)
print('SHADDA in tashkeel: ', diacritics_mapping['SHADDA'] in tashkeel)
print('=============================')
print('FATHA in shadda: ', diacritics_mapping['FATHA'] in shadda)
print('DAMMA in shadda: ', diacritics_mapping['DAMMA'] in shadda)
print('KASRA in shadda: ', diacritics_mapping['KASRA'] in shadda)
print('SUKUN in shadda: ', diacritics_mapping['SUKUN'] in shadda)
print('FATHATAN in shadda: ', diacritics_mapping['FATHATAN'] in shadda)
print('DAMMATAN in shadda: ', diacritics_mapping['DAMMATAN'] in shadda)
print('KASRATAN in shadda: ', diacritics_mapping['KASRATAN'] in shadda)
print('SHADDA in shadda: ', diacritics_mapping['SHADDA'] in shadda)

print('testt', (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['FATHA'] not in tashkeel and diacritics_mapping['DAMMA'] not in tashkeel and diacritics_mapping['KASRA'] not in tashkeel))
print('yarab', (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda))

FATHA in tashkeel:  True
DAMMA in tashkeel:  False
KASRA in tashkeel:  False
SUKUN in tashkeel:  True
FATHATAN in tashkeel:  False
DAMMATAN in tashkeel:  False
KASRATAN in tashkeel:  False
SHADDA in tashkeel:  False
FATHA in shadda:  False
DAMMA in shadda:  False
KASRA in shadda:  False
SUKUN in shadda:  False
FATHATAN in shadda:  False
DAMMATAN in shadda:  False
KASRATAN in shadda:  False
SHADDA in shadda:  True
testt False
yarab False


In [47]:
# firstly, initialize an empty dictionary for the 'prior' feature
# the value will be the 8 arabic marks (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA) as a binary vector

# then, loop over the tokenized input and check if the each character and word pair is not in the dictionary, get the indices of this word and its duplicates in the tokenized input array
def get_prior(tokenized_input, gold_output):
    prior = {}  # this dictionary will hold a key of tuple of 3 elements (word, character, index of character in the word) and the value will be the 8 arabic marks
    for i in range(len(tokenized_input)):
        if (tokenized_input[i], tokenized_input[i][0], 0) not in prior:
            # get the indices of the word in the tokenized input array
            indices = [j for j, x in enumerate(tokenized_input) if x == tokenized_input[i]]
            # print(indices)
            # get the words in the gold_output array with the same indices
            words = []
            maxi_len = 0
            for j in indices:
                if gold_output[j] not in words:
                    words.append(gold_output[j])
                    maxi_len = max(maxi_len, len(gold_output[j]))

            for t in range(len(tokenized_input[i])):
                prior[(tokenized_input[i], tokenized_input[i][t], t)] = [0, 0, 0, 0, 0, 0, 0, 0] # initialize the value of the key with zeros
            
            indx2 = 0
            for word in words:
                indx = 0
                while indx < maxi_len:
                    # extract the diacritics of word[indx]
                    for iter in range(indx+1, len(word)):
                        if is_not_arabic_diacritic(word[iter]):
                            # print(iter)
                            letter, tashkeel, shadda = araby.separate(word[indx: iter], extract_shadda=True) 
                            if diacritics_mapping['FATHA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][0] = 1 
                            if diacritics_mapping['DAMMA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][1] = 1
                            if diacritics_mapping['KASRA'] in tashkeel:         prior[(tokenized_input[i], word[indx], indx2)][2] = 1
                            if diacritics_mapping['FATHATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][3] = 1
                            if diacritics_mapping['DAMMATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][4] = 1
                            if diacritics_mapping['KASRATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], indx2)][5] = 1
                            if (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda):  
                                prior[(tokenized_input[i], word[indx], indx2)][6] = 1 # if the letter has SHADDA, araby.separate() will return SUKUN in tashkeel and SHADDA in shadda, so to avoid this mislabeling we check if SHADDA not in shadda and if SUKUN in tashkeel, then this is a true SUKUN
                            if diacritics_mapping['SHADDA'] in shadda:          prior[(tokenized_input[i], word[indx], indx2)][7] = 1
                            indx = iter - 1
                            indx2 += 1
                            break 
                    indx += 1
                indx2 = 0


                indx = len(word) - 1    # my assumption is that the last character in the not a diacritic
                if (not is_not_arabic_diacritic(word[len(word) - 1]) and is_not_arabic_diacritic(word[len(word) - 2])):  # if the last character is a diacritic and the one before it is not, then the index of the last character is len(word) - 2
                    indx = len(word) - 2
                elif (not is_not_arabic_diacritic(word[len(word) - 1]) and not is_not_arabic_diacritic(word[len(word) - 2])):  # if the last character is a diacritic and the one before it is also a diacritic (in shadda case), then the index of the last character is len(word) - 3
                    indx = len(word) - 3


                if (tokenized_input[i], word[indx], indx) not in prior:
                    letter, tashkeel, shadda = araby.separate(word[indx: len(word)], extract_shadda=True) 
                    if diacritics_mapping['FATHA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][0] = 1
                    if diacritics_mapping['DAMMA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][1] = 1
                    if diacritics_mapping['KASRA'] in tashkeel:         prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][2] = 1 
                    if diacritics_mapping['FATHATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][3] = 1 
                    if diacritics_mapping['DAMMATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][4] = 1 
                    if diacritics_mapping['KASRATAN'] in tashkeel:      prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][5] = 1
                    if (diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda):
                        prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][6] = 1  # if the letter has SHADDA, araby.separate() will return SUKUN in tashkeel and SHADDA in shadda, so to avoid this mislabeling we check if SHADDA not in shadda and if SUKUN in tashkeel, then this is a true SUKUN
                    if diacritics_mapping['SHADDA'] in shadda:          prior[(tokenized_input[i], word[indx], len(tokenized_input[i])-1)][7] = 1
                    
    return prior

In [48]:
test_tokenized_input = ['كإنكار', 'كإنكار', 'بقذر','بقذر', 'أكثر', 'أكثر', 'الزركشي']
test_gold_output = ['كَإِنْكَارِ','كَإِنْكَارٍ', 'بِقَذَر', 'بِقَذَرٍ','أكْثَرَ', 'أَكْثَرُ', 'الزَّرْكَشِيُّ']
print (get_prior(test_tokenized_input, test_gold_output))

{('كإنكار', 'ك', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'إ', 1): [0, 0, 1, 0, 0, 0, 0, 0], ('كإنكار', 'ن', 2): [0, 0, 0, 0, 0, 0, 1, 0], ('كإنكار', 'ك', 3): [1, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ا', 4): [0, 0, 0, 0, 0, 0, 0, 0], ('كإنكار', 'ر', 5): [0, 0, 1, 0, 0, 1, 0, 0], ('بقذر', 'ب', 0): [0, 0, 1, 0, 0, 0, 0, 0], ('بقذر', 'ق', 1): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ذ', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('بقذر', 'ر', 3): [0, 0, 0, 0, 0, 1, 0, 0], ('أكثر', 'أ', 0): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ك', 1): [0, 0, 0, 0, 0, 0, 1, 0], ('أكثر', 'ث', 2): [1, 0, 0, 0, 0, 0, 0, 0], ('أكثر', 'ر', 3): [1, 1, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ا', 0): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ل', 1): [0, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ز', 2): [1, 0, 0, 0, 0, 0, 0, 1], ('الزركشي', 'ر', 3): [0, 0, 0, 0, 0, 0, 1, 0], ('الزركشي', 'ك', 4): [1, 0, 0, 0, 0, 0, 0, 0], ('الزركشي', 'ش', 5): [0, 0, 1, 0, 0, 0, 0, 0], ('الزركشي', 'ي', 6): [0, 1, 0, 0, 0, 0, 0, 1]}


In [49]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 276 minutes TO RUN
# # write in a file the prior feature
# prior_feature = get_prior(tokenized_input, gold_output)
# with open('./generatedFiles/prior_feature.txt', 'w', encoding='utf-8') as file:
#     for key, value in prior_feature.items():
#         file.write(f'{key}: {value}\n')

In [79]:
# read the prior feature file in a dictionary called prior_feature 
prior_feature = {}
with open('./generatedFiles/prior_feature.txt', 'r', encoding='utf-8') as file:
    for line in file:
        key, value = line.strip().split(':')
        key = key.strip()
        value = value.strip()
        key = key[1:-1].split(',')
        value = value[1:-1].split(',')
        key = (key[0][1:-1], key[1][2:-1], int(key[2]))
        value = [int(i) for i in value]
        prior_feature[key] = value

print(prior_feature[('قوله', 'ق', 0)])

[1, 0, 0, 0, 0, 0, 0, 0]


### 4 - CASE Feature: 
whether the letter expects a core word diacritic or a case ending. Case endings are placed on only one letter in a word, which may or may not be the last letter in the word. This is a binary feature.

In [51]:
from farasa.stemmer import FarasaStemmer

def arabic_stemmer(text):
    stemmer = FarasaStemmer(interactive=True)  # Set interactive to True for better performance

    # Perform stemming
    stemmed_text = stemmer.stem(text)

    return stemmed_text

# Example usage
input_text = "الكتابة باللغة العربية"
stemmed_text = arabic_stemmer(input_text)
print("Original text:", input_text)
print("Stemmed text:", stemmed_text)




Original text: الكتابة باللغة العربية
Stemmed text: كتابة لغة عربي


In [52]:
# for i in range(10):
#     stemmed_text = arabic_stemmer(tokenized_input[i])
#     print("Original text:", tokenized_input[i])
#     print("Stemmed text:", stemmed_text)

In [53]:
# for i in range(len(tokenized_input)):
#     stemmed_text = arabic_stemmer(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/stemmed_input.txt', 'a', encoding='utf-8') as file:
#         file.write(stemmed_text + '\n')
        

In [54]:
# stemmed_text = stemmed_text.split(' ')
# # write in a file the stemmed input
# with open('./generatedFiles/stemmed_input.txt', 'w', encoding='utf-8') as file:
#     for word in stemmed_text:
#         file.write(word + '\n')


## **Model Building**

In [155]:
# THIS CELL TOOK 12 minutes TO RUN
# input layer
char_features_vector=[]
tag_features_vector=[]
prior_features_vector=[]
embeddings = []
# with open('./generatedFiles/i_j.txt', 'a', encoding='utf-8') as file:
for i in range(len(tokenized_input)):
    for j in range(len(tokenized_input[i])):    
        # write i and j in a file
        # file.write(f'{i} {j}\n')
        char_index = tokenizer_char.word_index.get(tokenized_input[i][j])
        char_features_vector= char_embeddings[char_index]
        if (len(tokenized_input[i]) != len(input_segments[i])):
            input_segments[i] = "S" * (len(tokenized_input[i]) - len(input_segments[i])) + input_segments[i]
        tag_index = tokenizer_tags.word_index.get(input_segments[i][j].lower())
        tag_features_vector= tags_embeddings[tag_index]
        prior_features_vector= prior_feature[(tokenized_input[i], tokenized_input[i][j], j)]
        # pad the prior feature vector with zeros to have the same length as the other features
        prior_features_vector = np.pad(prior_features_vector, (0, 42), 'constant')
        # concatenate the 3 features vectors to have a matrix of 3 columns
        # embeddings.append(np.vstack((char_features_vector, tag_features_vector, prior_features_vector)))
        embeddings.append(np.concatenate((char_features_vector, tag_features_vector, prior_features_vector)))

embeddings = np.array(embeddings)

# print(char_features_vector)
# print(tag_features_vector)
# print(prior_features_vector)
print(embeddings.shape)

In [61]:
# save the embeddings in a pickle file
with open('./generatedFiles/embeddings.pickle', 'wb') as file:
    pickle.dump(embeddings, file)

In [3]:
# read the embeddings from the pickle file
with open('./generatedFiles/embeddings.pickle', 'rb') as file:
    embeddings = pickle.load(file)

In [4]:
output_map = {
    (1, 0, 0, 0, 0, 0, 0, 0) : 0, # FATHA
    (0, 0, 0, 1, 0, 0, 0, 0) : 1, # FATHATAN
    (0, 0, 1, 0, 0, 0, 0, 0) : 2, # KASRA
    (0, 0, 0, 0, 0, 1, 0, 0) : 3, # KASRATAN
    (0, 1, 0, 0, 0, 0, 0, 0) : 4, # DAMMA
    (0, 0, 0, 0, 1, 0, 0, 0) : 5, # DAMMATAN
    (0, 0, 0, 0, 0, 0, 1, 0) : 6, # SUKUN
    (0, 0, 0, 0, 0, 0, 0, 1) : 7,  # SHADDA
    (1, 0, 0, 0, 0, 0, 0, 1) : 8, # SHADDA FATHA
    (0, 0, 0, 1, 0, 0, 0, 1) : 9, # SHADDA FATHATAN
    (0, 0, 1, 0, 0, 0, 0, 1) : 10, # SHADDA KASRA
    (0, 0, 0, 0, 0, 1, 0, 1) : 11, # SHADDA KASRATAN
    (0, 1, 0, 0, 0, 0, 0, 1) : 12, # SHADDA DAMMA
    (0, 0, 0, 0, 1, 0, 0, 1) : 13, # SHADDA DAMMATAN
    (0, 0, 0, 0, 0, 0, 0, 0) : 14
}

In [5]:
# gold labels
with open('./generatedFiles/gold_output_dict.txt', 'w', encoding='utf-8') as file:
    for idx, word in enumerate(gold_output):
        gold_diacritics = get_prior([tokenized_input[idx]], [word])
        for key, value in gold_diacritics.items():
            key = key + (idx,)
            file.write(f'{key}: {value}\n')

NameError: name 'gold_output' is not defined

In [None]:
# read the prior feature file in a dictionary called prior_feature 
gold_output_dict = {}
with open('./generatedFiles/gold_output_dict.txt', 'r', encoding='utf-8') as file:
    for line in file:
        key, value = line.strip().split(':')
        key = key.strip()
        value = value.strip()
        key = key[1:-1].split(',')
        value = value[1:-1].split(',')
        key = (key[0][1:-1], key[1][2:-1], int(key[2]), int(key[3]))
        value = [int(i) for i in value]
        gold_output_dict[key] = value

In [None]:
gold_output_dict[('قوله', 'ق', 0, 0)]

[1, 0, 0, 0, 0, 0, 0, 0]

In [None]:
# Change gold_output_dict.values() to a list of tuples
for key, value in gold_output_dict.items():
    gold_output_dict[key] = tuple(value)
    
gold_output_dict_values = list(gold_output_dict.values())

In [68]:
with open('./generatedFiles/gold_output_id.txt', 'w', encoding='utf-8') as file:
    for value in gold_output_dict_values:
        file.write(f'{output_map[value]}\n')

In [27]:
# read the gold_output_id file
with open('./generatedFiles/gold_output_id.txt', 'r', encoding='utf-8') as file:
    gold_output_id = file.readlines()
    gold_output_id = [line.strip() for line in gold_output_id]

gold_output_id = np.array(gold_output_id)

In [28]:
print(gold_output_id.shape)
print(gold_output_id[:10])

(8353805,)
['0' '6' '4' '4' '0' '6' '0' '0' '0' '14']


In [29]:
print(len(embeddings))

8353805


In [30]:
# Truncate emdeddings to have the 8353000
embeddings_reshape = embeddings[:835300]
gold_output_id = gold_output_id[:835300]

# Make it np array 
embeddings_reshape = np.array(embeddings_reshape)
gold_output_id = np.array(gold_output_id)


In [31]:
print (embeddings_reshape.shape)
print (gold_output_id.shape)

(835300, 150)
(835300,)


In [32]:
# Reshape embeddings to have 3 dimensions 
embeddings_reshape = embeddings_reshape.reshape((-1, 100, 150))
gold_output_id_reshape = gold_output_id.reshape(-1, 100, 1)

In [37]:
import tensorflow as tf
# Print Shap
#e
print(embeddings_reshape.shape)
print(gold_output_id_reshape.shape)

# print the first 10 rows of the embeddings
# print(embeddings[:10])

# print the first 10 rows of the gold_output_id
print(gold_output_id_reshape[:][0])

# print the first 10 columns of the gold_output_id
print(tf.keras.utils.to_categorical(gold_output_id_reshape[:][0]))

(8353, 100, 150)
(8353, 100, 1)
[['0']
 ['6']
 ['4']
 ['4']
 ['0']
 ['6']
 ['0']
 ['0']
 ['0']
 ['14']
 ['6']
 ['0']
 ['8']
 ['4']
 ['0']
 ['0']
 ['4']
 ['14']
 ['0']
 ['6']
 ['0']
 ['14']
 ['0']
 ['14']
 ['14']
 ['8']
 ['6']
 ['0']
 ['2']
 ['12']
 ['14']
 ['6']
 ['4']
 ['0']
 ['0']
 ['0']
 ['0']
 ['0']
 ['6']
 ['4']
 ['4']
 ['2']
 ['0']
 ['6']
 ['3']
 ['0']
 ['6']
 ['0']
 ['2']
 ['14']
 ['14']
 ['0']
 ['2']
 ['6']
 ['0']
 ['14']
 ['2']
 ['0']
 ['6']
 ['2']
 ['0']
 ['2']
 ['14']
 ['3']
 ['2']
 ['14']
 ['6']
 ['2']
 ['6']
 ['0']
 ['14']
 ['2']
 ['4']
 ['4']
 ['14']
 ['0']
 ['0']
 ['14']
 ['4']
 ['2']
 ['0']
 ['4']
 ['4']
 ['14']
 ['4']
 ['4']
 ['2']
 ['6']
 ['14']
 ['14']
 ['10']
 ['14']
 ['2']
 ['0']
 ['4']
 ['14']
 ['0']
 ['1']
 ['0']
 ['2']]
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


In [34]:
# build a training model, first we need input layer that take matrix "embeddings" as an input with dropout of 10%
# then we need a bidirectional LSTM layer with 100 units
# then we need a dense layer with 100 units and relu activation function
# then we need an output layer with 14 units and softmax activation function
# use early stopping with patience of five epochs, a learning rate of 0.001, a batch size of 256, and an Adamax optimizer

# define the model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, TimeDistributed
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.activations import relu,linear

input_shape = (100, 150)
    # tf.keras.layers.Input(shape=150),
    # Dropout(0.1),
    # Bidirectional(LSTM(100)),
    # Dense(100, activation='relu'),
    # Dense(14, activation='softmax')

model = Sequential()
model.add(Bidirectional(LSTM(50, return_sequences=True), input_shape=input_shape))
model.add(TimeDistributed(Dense(100, activation='relu')))
model.add(TimeDistributed(Dense(15, activation='softmax')))

# model = Sequential()
# forward_layer = LSTM(50)
# backward_layer = LSTM(50, activation='relu', go_backwards=True)
# model.add(Bidirectional(forward_layer, backward_layer=backward_layer, input_shape=(1,150)))
# # model.add(Dropout(0.1))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(14, activation='softmax'))


# compile the model
model.compile(optimizer=Adamax(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# model.build(reshaped_matrix.shape)
# summarize the model
print(model.summary())

# early stopping
# early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 100, 100)          80400     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 100, 100)          10100     
 ributed)                                                        
                                                                 
 time_distributed_1 (TimeDi  (None, 100, 15)           1515      
 stributed)                                                      
                                                                 
Total params: 92015 (359.43 KB)
Trainable params: 92015 (359.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [39]:
labels = []
for i in range(len(gold_output_id_reshape)):
    for j in range(100):
        labels.append( tf.keras.utils.to_categorical(gold_output_id_reshape[i][j], num_classes=15))
        
labels = np.array(labels)

In [40]:
labels = labels.reshape(-1, 100, 15)
print(labels.shape)

(8353, 100, 15)


In [42]:
# fit the model on the training dataset and evaluate it on the validation dataset,
# use early stopping with patience of five epochs, a learning rate of 0.001, a batch size of 256
# before that, configure the model to use GPU

# fit the model with gpu

with tf.device('/GPU:0'):
    model.fit(embeddings_reshape, labels, epochs=100, batch_size=256)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100

KeyboardInterrupt: 

In [56]:
# Load the dataset
# Specify the file path
file_path = "./dataset/val.txt"

# Read the contents of the file located at file_path 
# and append each line to the list data_before_preprocessing
with open(file_path, 'r', encoding='utf-8') as file:
    val_data_before_preprocessing = file.readlines()
    # remove '\n' from each line
    val_data_before_preprocessing = [line.strip() for line in val_data_before_preprocessing]
    
    

In [57]:
for i in range(len(val_data_before_preprocessing)):
    save_tokenized_input(val_data_before_preprocessing[i],"val_tokenized_input")

In [64]:
# Read the tokenized input file
# read the tokenized input file
with open('./generatedFiles/val_tokenized_input.txt', 'r', encoding='utf-8') as file:
    val_tokenized_input = file.readlines()
    # Remove '\n' from each line
    val_tokenized_input = [line.strip() for line in val_tokenized_input]
    
print(len(val_tokenized_input))

106184


In [63]:
for i in range(len(val_data_before_preprocessing)):
    save_gold_output(val_data_before_preprocessing[i],"val_gold_output")
    

2500


In [69]:
for i in range(len(val_tokenized_input)):
    val_segments, val_seg_tags = get_seg_tags(val_tokenized_input[i])
    # Write and append on the tokenized input to a file
    with open('./generatedFiles/val_input_segments.txt', 'a', encoding='utf-8') as file:
        for tag in val_seg_tags:
            file.write(tag)
        file.write('\n')

In [81]:
# THIS CELL TOOK 12 minutes TO RUN
# input layer
val_char_features_vector=[]
val_tag_features_vector=[]
val_prior_features_vector=[]
val_embeddings = []
# with open('./generatedFiles/i_j.txt', 'a', encoding='utf-8') as file:
for i in range(len(val_tokenized_input)):
    for j in range(len(val_tokenized_input[i])):    
        # write i and j in a file
        # file.write(f'{i} {j}\n')
        char_index = tokenizer_char.word_index.get(val_tokenized_input[i][j])
        char_features_vector= char_embeddings[char_index]
        if (len(val_tokenized_input[i]) != len(val_segments[i])):
            val_segments[i] = "S" * (len(val_tokenized_input[i]) - len(val_segments[i])) + val_segments[i]
        tag_index = tokenizer_tags.word_index.get(val_segments[i][j].lower())
        tag_features_vector= tags_embeddings[tag_index]
        prior_features_vector= (prior_feature[(val_tokenized_input[i], val_tokenized_input[i][j], j)]) if (val_tokenized_input[i], val_tokenized_input[i][j], j) in prior_feature else [1, 1, 1, 1, 1, 1, 1, 1]
        # pad the prior feature vector with zeros to have the same length as the other features
        prior_features_vector = np.pad(prior_features_vector, (0, 42), 'constant')
        # concatenate the 3 features vectors to have a matrix of 3 columns
        # embeddings.append(np.vstack((char_features_vector, tag_features_vector, prior_features_vector)))
        print (char_features_vector.shape)
        print (tag_features_vector.shape)
        print (prior_features_vector.shape)
        val_embeddings.append(np.concatenate((char_features_vector, tag_features_vector, prior_features_vector)))

val_embeddings = np.array(val_embeddings)

# print(char_features_vector)
# print(tag_features_vector)
# print(prior_features_vector)
print(val_embeddings.shape)

(50,)
(50,)
(50,)
(50,)
(1, 5, 50)
(50,)


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 3 dimension(s)

In [None]:
# evaluate the model on the validation dataset

# evaluate the model
loss, accuracy = model.evaluate(embeddings_reshape, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

