In [2]:
import pickle
import numpy as np
import pandas as pd
import re
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import farasa
from farasa.segmenter import FarasaSegmenter 
import unicodedata
import torch

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, TimeDistributed, BatchNormalization
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.activations import relu,linear

from gensim.models.word2vec import LineSentence
from gensim.models import FastText




In [None]:
# # Run on GPU
# use_cuda = torch.cuda.is_available()
# device = torch.device("cuda" if use_cuda else "cpu")
# print (device)
# # print the cpu or gpu
# print(torch.cuda.get_device_name(0))
# # print the number of gpus you have
# print(torch.cuda.device_count())
# # print current gpu
# print(torch.cuda.current_device())

In [4]:
def read_data(file_path):
    """
    Read the UTF-8 text file located at file_path and return its lines as a list,
    each with leading/trailing whitespace (including the newline) stripped
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return [raw_line.strip() for raw_line in file]


In [None]:
data_before_preprocessing = read_data("./dataset/train.txt")
print(len(data_before_preprocessing))

In [5]:
def read_pickle_file(file_path):
    """
    Deserialize and return the object stored in the pickle file located at file_path.
    Only use on trusted files: unpickling can execute arbitrary code
    """
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [6]:
def save_words_in_file(path, words, permission='w'):
    """
    Write every word of words on its own line in the file located at path,
    opened with mode permission ('w' overwrites, 'a' appends)
    """
    with open(path, permission, encoding='utf-8') as out:
        out.writelines(word + '\n' for word in words)

In [8]:
# set for arabic letters
arabic_letters = set(read_pickle_file("./Delivery/arabic_letters.pickle"))

print(len(arabic_letters))
print(arabic_letters)

36
{'ت', 'ن', 'س', 'ى', 'غ', 'ق', 'ح', 'ث', 'ظ', 'د', 'ش', 'ة', 'ؤ', 'ص', 'ب', 'ط', 'ذ', 'أ', 'إ', 'ك', 'ا', 'ز', 'خ', 'ع', 'ر', 'ي', 'ئ', 'ه', 'م', 'ل', 'ء', 'و', 'ض', 'ج', 'ف', 'آ'}


In [9]:
# set of Arabic diacritic marks (loaded from the pickle file)
diacritics = set(read_pickle_file("./Delivery/diacritics.pickle"))

print(len(diacritics))
print(diacritics)

8
{'ِ', 'ً', 'ٌ', 'ُ', 'َ', 'ْ', 'ّ', 'ٍ'}


In [7]:
# Strip the tashkeel (diacritics) from text
def remove_diacritics(text):
    """Return text with its diacritics removed by araby.strip_tashkeel."""
    return araby.strip_tashkeel(text)

# Keep only whitespace and the characters found in the sets arabic_letters and diacritics
def remove_non_arabic(text):
    """Delete every character of text that is neither whitespace, an Arabic letter nor a diacritic."""
    allowed_chars = ''.join(arabic_letters) + ''.join(diacritics)
    disallowed_pattern = r'[^\s' + allowed_chars + ']'
    return re.sub(disallowed_pattern, '', text)

def input_preprocessing_text(text):
    """
    Clean a raw line and return the result of araby.tokenize on it.
    Steps, in order: autocorrect -> remove non-Arabic characters -> remove diacritics -> tokenize
    """
    # autocorrect fixes the most common errors, like repeated harakat or tanween before alef
    corrected = araby.autocorrect(text)

    # drop everything that is not Arabic, then drop the diacritics themselves
    undiacritized = remove_diacritics(remove_non_arabic(corrected))

    return araby.tokenize(undiacritized)

def save_tokenized_input(text,path="./generatedFiles/training/tokenized_input.txt", permission='w'):
    """Preprocess text with input_preprocessing_text and write the resulting words, one per line, to path."""
    save_words_in_file(path, input_preprocessing_text(text), permission)
    

def save_gold_output(text,path="./generatedFiles/training/gold_output.txt", permission='w'):
    """
    Write the diacritized (gold) words of text, one per line, to path.
    Non-Arabic characters are removed first; the diacritics are kept
    """
    gold_words = araby.tokenize(remove_non_arabic(text))
    save_words_in_file(path, gold_words, permission)


def is_not_arabic_diacritic(char):
    """
    Return True when char is NOT a combining mark.
    The check is on the Unicode category only ('Mn' or 'Mc'), so it is not restricted to Arabic marks
    """
    return unicodedata.category(char) not in ('Mn', 'Mc')

In [10]:
def save_sentence_in_file(path, words, permission='w'):
    """
    Write the string words followed by a newline to the file located at path,
    opened with mode permission ('w' overwrites, 'a' appends)
    """
    with open(path, permission, encoding='utf-8') as out:
        sentence_line = words + '\n'
        out.write(sentence_line)

In [11]:
def save_new_input_sentence(text,path="./generatedFiles/training/new_input_sentence.txt", permission='w'):
    """
    Write text as one undiacritized, single-spaced sentence line to path.
    Non-Arabic characters and diacritics are removed before the whitespace is collapsed
    """
    undiacritized = remove_diacritics(remove_non_arabic(text))

    # collapse every run of whitespace into a single space
    single_spaced = re.sub(r'\s+', ' ', undiacritized)

    save_sentence_in_file(path, single_spaced, permission)

In [None]:
# # Example usage:
# character = 'ذْ'
# if is_not_arabic_diacritic(character[1]):
#    print("The character is not an Arabic diacritic.")
# else:
#    print("The character is an Arabic diacritic.")


# # Testing of is_not_arabic_diacritic() function by getting the index of the first non-diacritic character in the word
# word = 'زَّراع'
 
# for i in range(1, len(word)): # start from 1 because the first character is not a diacritic
#     if is_not_arabic_diacritic(word[i]):
#         print(i)
#         break 

In [None]:
# # RUN ONE TIME ONLY THIS CODE AGAIN 
# # Generate Gold Input file
# for i in range(len(data_before_preprocessing)):
#     save_tokenized_input(data_before_preprocessing[i], permission='a')

In [None]:
# # RUN ONE TIME ONLY THIS CODE AGAIN
# # Generate Gold Output file
# for i in range(len(data_before_preprocessing)):
#     save_gold_output(data_before_preprocessing[i], permission='a')

In [None]:
# # RUN ONE TIME ONLY THIS CODE AGAIN
# # Generate input Sentence file
# for i in range(len(data_before_preprocessing)):
#     save_new_input_sentence(data_before_preprocessing[i], permission='a')

In [None]:
# Important functions in PyArabic

# araby.tokenize(text) # Tokenize the sentence text into words
# araby.is_arabicrange(text) # Check if the text is Arabic
# araby.sentence_tokenize(text) # Tokenize the text into sentences
# araby.strip_tashkeel(text) # Remove diacritics (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN)
# araby.strip_diacritics(text) # Remove diacritics (Small Alef الألف الخنجرية, Harakat + Shadda, Quranic marks)
# araby.strip_tatweel(text) # Remove tatweel
# araby.strip_shadda(text) # Remove shadda
# araby.autocorrect(text) # Correct most common errors on word like repetition of harakat, or tanwin before alef
# araby.arabicrange() # Return a list of Arabic characters

# New Functions in PyArabic
# araby.vocalized_similarity(word1, word2) # if the two words has the same letters and the same harakats, this function return True. 
# The two words can be full vocalized, or partial vocalized

# araby.vocalizedlike(word1, word2) Same as vocalized_similarity but return True and False

# araby.joint(word1, word2) # joint the letters with the marks the length ot letters and marks must be equal return word



# Return the text, its tashkeel and shadda if extract_shadda is True
# text, marks, shada = araby.separate(text,extract_shadda=True) # Separate diacritics from the text
# print (text)
# for m in marks:
#     print (araby.name(m))

# for s in shada:
#     print (araby.name(s))

In [27]:
# Read tokenized_input file
tokenized_input = read_data("./generatedFiles/training/tokenized_input.txt")
print(tokenized_input[:10])
print(len(tokenized_input))

['قوله', 'أو', 'قطع', 'الأول', 'يده', 'إلخ', 'قال', 'الزركشي', 'ابن', 'عرفة']
2101983


# **Core Word (CW) Diacritization**

## **Feature Extraction**

### 1 - Characters: 
Here we extract each character from all tokenized words and create a vector of size 40 (`embedding_size`) for each character.

In [29]:
# Width of every per-character feature block (character embedding, tag embedding and zero-padded prior vector)
embedding_size = 40

In [30]:
# Character-level tokenizer fitted on the training words: every distinct character gets an integer index
tokenizer_char = Tokenizer(char_level=True)
tokenizer_char.fit_on_texts(tokenized_input)
sequences_char = tokenizer_char.texts_to_sequences(tokenized_input)
char_features = pad_sequences(sequences_char)   # padding the sequences to have the same length as the longest sequence (word)
# One random embedding_size-dimensional row per tokenizer index, plus one extra row (the "+ 1")
# NOTE(review): np.random.rand is unseeded, so a re-run produces different vectors; presumably the pickled copy is the reference — confirm
char_embeddings = np.random.rand(len(tokenizer_char.word_index) + 1, embedding_size)


In [90]:
# Save the tokenizer_char model
with open('./generatedFiles/training/tokenizer_char.pickle', 'wb') as handle:
    pickle.dump(tokenizer_char, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the sequences_char model
with open('./generatedFiles/training/sequences_char.pickle', 'wb') as handle:
    pickle.dump(sequences_char, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the char_embeddings array in a pickle file
with open('./generatedFiles/training/char_embeddings.pickle', 'wb') as handle:
    pickle.dump(char_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the char_features array in a pickle file
with open('./generatedFiles/training/char_features.pickle', 'wb') as handle:
    pickle.dump(char_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [93]:
print(char_features.shape) # (number of words, max length of word in the dataset)


print(char_embeddings.shape)

# rows: len(tokenizer_char.word_index) + 1, i.e. one row per character known to the tokenizer plus one extra row
# columns: embedding_size (40), so each character is encoded as a 40-dimensional vector

(2101983, 13)
(37, 40)


In [94]:
print(char_features[0]) 
# the number of non zero elements corresponds to the length of the word 
# and the value of each element corresponds to the index of the character in the tokenizer
# which means that every character now is encoded as a number and this number is the index of the character in the tokenizer

[ 0  0  0  0  0  0  0  0  0 13  5  1  7]


In [101]:
print(char_embeddings[0])
# this is row 0 of the embedding table only, i.e. the vector looked up for the value 0 that pads char_features

[0.80871371 0.12394689 0.87569474 0.31353818 0.33009045 0.11812771
 0.38956887 0.62473864 0.43774451 0.05983604 0.79880199 0.98974258
 0.23386662 0.25450462 0.49377536 0.6882848  0.73133686 0.62197055
 0.92977178 0.82167678 0.91133812 0.64268308 0.64488677 0.14072622
 0.54441718 0.31470806 0.62167814 0.8099589  0.47332709 0.37057716
 0.6861456  0.411859   0.58265002 0.92869714 0.36860802 0.49624959
 0.08634157 0.4526868  0.5898067  0.44434528]


In [96]:
print(char_embeddings[char_features[0]].shape)
# 13 is the padded word length in characters and 40 is the embedding size of each character

print(char_embeddings[char_features[0]])
# this is the embedding of each (padded) character position of the first tokenized word, this is the 1st feature and the input of the model

(13, 40)
[[0.80871371 0.12394689 0.87569474 0.31353818 0.33009045 0.11812771
  0.38956887 0.62473864 0.43774451 0.05983604 0.79880199 0.98974258
  0.23386662 0.25450462 0.49377536 0.6882848  0.73133686 0.62197055
  0.92977178 0.82167678 0.91133812 0.64268308 0.64488677 0.14072622
  0.54441718 0.31470806 0.62167814 0.8099589  0.47332709 0.37057716
  0.6861456  0.411859   0.58265002 0.92869714 0.36860802 0.49624959
  0.08634157 0.4526868  0.5898067  0.44434528]
 [0.80871371 0.12394689 0.87569474 0.31353818 0.33009045 0.11812771
  0.38956887 0.62473864 0.43774451 0.05983604 0.79880199 0.98974258
  0.23386662 0.25450462 0.49377536 0.6882848  0.73133686 0.62197055
  0.92977178 0.82167678 0.91133812 0.64268308 0.64488677 0.14072622
  0.54441718 0.31470806 0.62167814 0.8099589  0.47332709 0.37057716
  0.6861456  0.411859   0.58265002 0.92869714 0.36860802 0.49624959
  0.08634157 0.4526868  0.5898067  0.44434528]
 [0.80871371 0.12394689 0.87569474 0.31353818 0.33009045 0.11812771
  0.38956887 

In [99]:
print(char_embeddings[1])
print(char_embeddings[5])


[0.89564725 0.25195525 0.07590479 0.24319147 0.71416636 0.35021229
 0.91162322 0.55128277 0.72244724 0.67865803 0.5007849  0.74818395
 0.39703012 0.60525603 0.30358701 0.65201726 0.27057574 0.68505244
 0.12685006 0.44511192 0.98633195 0.93323719 0.31900607 0.77598429
 0.93693782 0.78101963 0.1549614  0.37614095 0.99988996 0.88039479
 0.29993726 0.45198818 0.59049471 0.35870078 0.49778056 0.05134592
 0.63903589 0.58398972 0.8917426  0.44850799]
[0.58786353 0.36178631 0.23029876 0.75367712 0.78879611 0.54356207
 0.30758601 0.83894193 0.12935863 0.42913136 0.57190114 0.39638332
 0.56109088 0.93151887 0.95497757 0.19307364 0.03617345 0.17160821
 0.58669999 0.14381791 0.98064519 0.40775706 0.26062596 0.14192332
 0.53891232 0.99480954 0.84515416 0.62273199 0.06131796 0.9881755
 0.4708029  0.59122424 0.53232905 0.00407433 0.21474428 0.78097428
 0.06029918 0.75889144 0.11160706 0.51833428]


### 2 - The position of the character in a word segment:
For example, given the word “wAlktAb” , which is composed of three segments “w+Al+ktAb”. Letters were marked as “B” if they begin a segment, “M” if they are in the middle of a segment, “E” if they end a segment, and “S” if they are single letter segments. So for “w+Al+ktAb”, the corresponding character positions are “S+BE+BMME.”

In [22]:
# Module-level segmenter read by get_seg_tags; this cell must run before get_seg_tags is called
# NOTE(review): the trailing comment calls interactive=True "the default behaviour" — confirm against the farasapy docs
segmenter = FarasaSegmenter(interactive=True) # The default behaviour



In [12]:
def get_seg_tags(word):                 # word = "wAlktAb"
    """
    Segment word with the module-level segmenter and tag every character by its
    position inside its segment.

    Tags: "S" for a single-letter segment; "B", "M", "E" for the first, middle and
    last letters of a longer segment. Example: "w+Al+ktAb" -> S, B E, B M M E.

    Returns a tuple (segments, seg_tags): the list obtained by splitting the
    segmenter output on '+', and a flat list holding one tag per character.
    """
    # the segmenter output is a single '+'-joined string such as "w+Al+ktAb"
    segments = segmenter.segment(word).split('+')
    seg_tags = []
    for segment in segments:
        if not segment:
            # An empty segment holds no characters, so it must not contribute any tag
            # (it used to fall into the else branch and emit a spurious "B", "E" pair)
            continue
        if len(segment) == 1:
            seg_tags.append("S")
        else:
            seg_tags.append("B")  # First letter
            seg_tags.extend("M" * (len(segment) - 2))  # Middle letters
            seg_tags.append("E")  # Last letter
    return segments, seg_tags

In [None]:
# word = "كقلمه"
# segments, seg_tags = get_seg_tags(word)
# print("Segmented word:", segments)
# print("SEG tags:", seg_tags)

In [None]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 25m 4.5s TO RUN
# # The Output of this code is the input_segments.txt file

# for i in range(len(tokenized_input)):
#     segments, seg_tags = get_seg_tags(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/training/input_segments.txt', 'a', encoding='utf-8') as file:
#         for tag in seg_tags:
#             file.write(tag)
#         file.write('\n')

In [33]:
input_segments = read_data("./generatedFiles/training/input_segments.txt")
print(len(input_segments))
print(input_segments[:10])

2101983
['BMES', 'BE', 'BME', 'BEBME', 'BES', 'BME', 'BME', 'BEBMMME', 'BME', 'BMES']


In [34]:
# Same treatment as the characters, applied to the segment-position tag strings (B/M/E/S)
tokenizer_tags = Tokenizer(char_level=True)
tokenizer_tags.fit_on_texts(input_segments)
sequences_tags = tokenizer_tags.texts_to_sequences(input_segments)
tags_features = pad_sequences(sequences_tags)   
# One random embedding_size-dimensional row per tag index, plus one extra row (the "+ 1")
# NOTE(review): unseeded np.random.rand — values change on every run unless loaded from the saved pickle
tags_embeddings = np.random.rand(len(tokenizer_tags.word_index) + 1, embedding_size)

In [105]:
# Save the tokenizer_tags model
with open('./generatedFiles/training/tokenizer_tags.pickle', 'wb') as handle:
    pickle.dump(tokenizer_tags, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the sequences_char model
with open('./generatedFiles/training/sequences_tags.pickle', 'wb') as handle:
    pickle.dump(sequences_tags, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the tags_embeddings array in a pickle file
with open('./generatedFiles/training/tags_embeddings.pickle', 'wb') as handle:
    pickle.dump(tags_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the tags_features array in a pickle file
with open('./generatedFiles/training/tags_features.pickle', 'wb') as handle:
    pickle.dump(tags_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
print(tags_features.shape) 
print(tags_embeddings.shape)
tags_features[0]

In [None]:
tags_embeddings[tags_features[0][0]]

### 3 - PRIOR: 
Diacritics seen in the training set per segment. Since we used a character-level model, this feature informed the model with word-level information. For example, the word “ktAb” (كتاب) was observed to have two diacritized forms in the training set, namely “kitaAb” (كِتَاب – book) and “kut∼aAb” (كُتَّاب – writers). The first letter in the word (“k”) accepted the diacritics “i” and “u.” Thus, given a binary vector representing whether a character is allowed to assume any of the eight primitive Arabic diacritic marks (a, i, u, o, K, N, F, and ∼ in order), the first letter would be given the following vector “01100000.” If a word segment was never observed during training, then the vector for all letters therein would be set to 11111111.

In [None]:
gold_output = read_data("./generatedFiles/training/gold_output.txt")
print(len(gold_output))
print(gold_output[:10])

In [35]:
# Map each diacrtics to its unicode
diacritics_mapping = {
    'FATHA': '\u064E',
    'DAMMA': '\u064F',
    'KASRA': '\u0650',
    'SHADDA': '\u0651',
    'SUKUN': '\u0652',
    'FATHATAN': '\u064B',
    'DAMMATAN': '\u064C',
    'KASRATAN': '\u064D'
}

In [None]:
# # Extract diacritics by returning a list containing a tuple of 3 elements: (letter, tashkeel, shadda)
# def extract_arabic_diacritics(word):
#     diacritics_list = []
#     extracted_word, tashkeel, shadda = araby.separate(word, extract_shadda=True)
#     for i in range(len(extracted_word)):
#         print(f'{araby.name(extracted_word[i])} {araby.name(tashkeel[i])} {araby.name(shadda[i])}')
#         diacritics_list.append((extracted_word[i], (tashkeel[i].encode("utf8")).decode(), (shadda[i].encode("utf8")).decode()))
#     return diacritics_list

In [None]:
# letter, tashkeel, shadda = araby.separate('زَّ', extract_shadda=True)   # SHADDA + FATHA Example

# enkar = 'كَإِنْكَارِ'

# print('FATHA in tashkeel: ', diacritics_mapping['FATHA'] in tashkeel)
# print('DAMMA in tashkeel: ', diacritics_mapping['DAMMA'] in tashkeel)
# print('KASRA in tashkeel: ', diacritics_mapping['KASRA'] in tashkeel)
# print('SUKUN in tashkeel: ', diacritics_mapping['SUKUN'] in tashkeel)
# print('FATHATAN in tashkeel: ', diacritics_mapping['FATHATAN'] in tashkeel)
# print('DAMMATAN in tashkeel: ', diacritics_mapping['DAMMATAN'] in tashkeel)
# print('KASRATAN in tashkeel: ', diacritics_mapping['KASRATAN'] in tashkeel)
# print('SHADDA in tashkeel: ', diacritics_mapping['SHADDA'] in tashkeel)
# print('=============================')
# print('FATHA in shadda: ', diacritics_mapping['FATHA'] in shadda)
# print('DAMMA in shadda: ', diacritics_mapping['DAMMA'] in shadda)
# print('KASRA in shadda: ', diacritics_mapping['KASRA'] in shadda)
# print('SUKUN in shadda: ', diacritics_mapping['SUKUN'] in shadda)
# print('FATHATAN in shadda: ', diacritics_mapping['FATHATAN'] in shadda)
# print('DAMMATAN in shadda: ', diacritics_mapping['DAMMATAN'] in shadda)
# print('KASRATAN in shadda: ', diacritics_mapping['KASRATAN'] in shadda)
# print('SHADDA in shadda: ', diacritics_mapping['SHADDA'] in shadda)

# print((diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['FATHA'] not in tashkeel and diacritics_mapping['DAMMA'] not in tashkeel and diacritics_mapping['KASRA'] not in tashkeel))
# print((diacritics_mapping['SUKUN'] in tashkeel and diacritics_mapping['SHADDA'] not in shadda))

In [13]:
# 'prior' feature: for every (word, character, position) triple, an 8-element binary vector flagging
# which Arabic marks were observed on that character in the diacritized (gold) forms of the word.
# Bit order: FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA

def _iter_letter_clusters(word):
    """Yield (letter, cluster) for each base letter of word; cluster is the letter plus the diacritics following it."""
    start = None
    for pos, char in enumerate(word):
        if is_not_arabic_diacritic(char):
            if start is not None:
                yield word[start], word[start:pos]
            start = pos
    if start is not None:
        # the last letter keeps everything up to the end of the word
        yield word[start], word[start:]


def _diacritic_flags(cluster):
    """Return the 8 mark flags (bit order documented above get_prior) carried by one letter cluster."""
    if len(cluster) == 1:
        return [0, 0, 0, 0, 0, 0, 0, 0]  # a bare letter carries no mark
    _, tashkeel, shadda = araby.separate(cluster, extract_shadda=True)
    has_shadda = diacritics_mapping['SHADDA'] in shadda
    flags = [
        int(diacritics_mapping[name] in tashkeel)
        for name in ('FATHA', 'DAMMA', 'KASRA', 'FATHATAN', 'DAMMATAN', 'KASRATAN')
    ]
    # if the letter has SHADDA, araby.separate() returns SUKUN in tashkeel and SHADDA in shadda,
    # so a SUKUN is only a true SUKUN when no SHADDA was reported
    flags.append(int(diacritics_mapping['SUKUN'] in tashkeel and not has_shadda))
    flags.append(int(has_shadda))
    return flags


def get_prior(tokenized_input, gold_output):
    """
    Build the prior-diacritics dictionary.

    tokenized_input: undiacritized words; gold_output: the diacritized word at the same index.
    Returns a dict whose keys are (word, character, index of the character in the word) and whose
    values are lists of 8 ints (0/1), with a 1 for every mark seen on that character in any gold
    form of the word. Keys are inserted in order of first appearance of each word.

    Raises IndexError when gold_output is shorter than tokenized_input, and KeyError when a gold
    word carries a mark on a letter that does not match the undiacritized word at that position.
    """
    if len(gold_output) < len(tokenized_input):
        raise IndexError("gold_output must hold one diacritized word per tokenized_input word")

    # Single pass over the corpus: collect the distinct gold forms of every word
    # (previously the whole corpus was rescanned for each distinct word)
    variants_by_token = {}
    for token, gold_word in zip(tokenized_input, gold_output):
        variants_by_token.setdefault(token, {})[gold_word] = None

    prior = {}
    for token, variants in variants_by_token.items():
        for position, char in enumerate(token):
            prior[(token, char, position)] = [0, 0, 0, 0, 0, 0, 0, 0]

        for gold_word in variants:
            # Every letter, including the last one, is handled the same way. The old code treated the
            # last letter separately and skipped it whenever no earlier letter carried a diacritic.
            for position, (letter, cluster) in enumerate(_iter_letter_clusters(gold_word)):
                for bit, flag in enumerate(_diacritic_flags(cluster)):
                    if flag:
                        prior[(token, letter, position)][bit] = 1

    return prior

In [None]:
# test_tokenized_input = ['كإنكار', 'كإنكار', 'بقذر','بقذر', 'أكثر', 'أكثر', 'الزركشي']
# test_gold_output = ['كَإِنْكَارِ','كَإِنْكَارٍ', 'بِقَذَر', 'بِقَذَرٍ','أكْثَرَ', 'أَكْثَرُ', 'الزَّرْكَشِيُّ']
# print (get_prior(test_tokenized_input, test_gold_output))

In [None]:
# # DON'T RUN THIS CODE AGAIN, THIS CELL TOOK 276 minutes TO RUN
# # write in a file the prior feature
# prior_feature = get_prior(tokenized_input, gold_output)
# with open('./generatedFiles/training/prior_feature.txt', 'w', encoding='utf-8') as file:
#     for key, value in prior_feature.items():
#         file.write(f'{key}: {value}\n')

In [14]:
def read_map(file_path, number_of_keys=2):
    """
    Parse a file of "(key fields): [values]" lines back into a dictionary.

    With number_of_keys == 2 every key is rebuilt as (word, character, index);
    with any other value a fourth integer field is read as well, giving
    (word, character, index, extra index). Values are returned as lists of ints.
    """
    mapping = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            raw_key, raw_value = line.strip().split(':')
            # drop the surrounding parentheses, then split the key fields
            key_fields = raw_key.strip()[1:-1].split(',')
            # the first field is quoted; the second is quoted and preceded by a space
            key = (key_fields[0][1:-1], key_fields[1][2:-1], int(key_fields[2]))
            if number_of_keys != 2:
                key += (int(key_fields[3]),)

            # drop the surrounding brackets, then convert every entry to int
            mapping[key] = list(map(int, raw_value.strip()[1:-1].split(',')))
    return mapping

In [37]:
prior_feature = read_map('./generatedFiles/training/prior_feature.txt', 2)
print(prior_feature[('قوله', 'ق', 0)])

[1, 0, 0, 0, 0, 0, 0, 0]


### 4 - CASE Feature: 
Whether the letter expects a core word diacritic or a case ending. Case endings are placed on only one letter in a word, which may or may not be the last letter in the word. This is a binary feature.

In [None]:
# from farasa.stemmer import FarasaStemmer

# def arabic_stemmer(text):
#     stemmer = FarasaStemmer(interactive=True)  # Set interactive to True for better performance

#     # Perform stemming
#     stemmed_text = stemmer.stem(text)

#     return stemmed_text

# # Example usage
# input_text = "الكتابة باللغة العربية"
# stemmed_text = arabic_stemmer(input_text)
# print("Original text:", input_text)
# print("Stemmed text:", stemmed_text)


In [None]:
# for i in range(10):
#     stemmed_text = arabic_stemmer(tokenized_input[i])
#     print("Original text:", tokenized_input[i])
#     print("Stemmed text:", stemmed_text)

In [None]:
# for i in range(len(tokenized_input)):
#     stemmed_text = arabic_stemmer(tokenized_input[i])
#     # Write and append on the tokenized input to a file
#     with open('./generatedFiles/stemmed_input.txt', 'a', encoding='utf-8') as file:
#         file.write(stemmed_text + '\n')
        

In [None]:
# stemmed_text = stemmed_text.split(' ')
# # write in a file the stemmed input
# with open('./generatedFiles/stemmed_input.txt', 'w', encoding='utf-8') as file:
#     for word in stemmed_text:
#         file.write(word + '\n')


### 5 - POS Tagging:
Marking up a word in a text as corresponding to a particular part of speech, based on both its definition and its context.

In [None]:
# from farasa.pos import FarasaPOSTagger

# tagger = FarasaPOSTagger(interactive=True)  # Download model if needed
# text = "قراءة يَحْتَاجُ الكتب مفيدة للعقل."
# tagged = tagger.tag(text)
# # Output: [['قراءة', 'NOUN'], ['الكتب', 'NOUN'], ['مفيدة', 'ADJ'], ['للعقل', 'NOUN'], ['.', 'PUNCT']]

# print(tagged)

### 6 - Word2Vec
Understanding context

In [72]:
# sentences = LineSentence('./generatedFiles/training/new_input_sentence.txt')

# Train Word2Vec model
# model1 = FastText(sentences, vector_size=50, window=5, workers=4)
# model1.save("word2vec.model")

In [None]:
# # Get embeddings for two words
# word1_embedding = model.wv["قال"]
# word2_embedding = model.wv["التفرغ"]

# print(word1_embedding)
# print(word2_embedding)

## **Model Building**

In [None]:
# Build the model input: one row per character of every word, made of three blocks of
# embedding_size values each -> [char embedding | segment-tag embedding | zero-padded prior vector]
total_chars = sum(len(word) for word in tokenized_input)
# preallocated result matrix: avoids holding millions of small arrays in a Python list before stacking
embeddings = np.empty((total_chars, 3 * embedding_size))

row = 0
for i, word in enumerate(tokenized_input):
    # A tag string shorter than its word is left-padded with "S"; this does not depend on the
    # character, so it is done once per word instead of once per character.
    # NOTE(review): a tag string LONGER than the word is left untouched and may be misaligned
    if len(word) != len(input_segments[i]):
        input_segments[i] = "S" * (len(word) - len(input_segments[i])) + input_segments[i]

    for j, char in enumerate(word):
        # Index word_index directly: an unknown character raises a clear KeyError, whereas
        # .get() would return None and numpy would treat that index as a new axis
        char_features_vector = char_embeddings[tokenizer_char.word_index[char]]
        tag_features_vector = tags_embeddings[tokenizer_tags.word_index[input_segments[i][j].lower()]]
        # pad the 8 prior flags with zeros to have the same length as the other features
        prior_features_vector = np.pad(prior_feature[(word, char, j)], (0, embedding_size - 8), 'constant')
        embeddings[row] = np.concatenate((char_features_vector, tag_features_vector, prior_features_vector))
        row += 1

In [None]:
# print(char_features_vector)
# print(tag_features_vector)
# print(prior_features_vector)
print(len(embeddings))
print(embeddings.shape)

In [None]:
# save the embeddings in a pickle file
# with open('./generatedFiles/embeddings.pickle', 'wb') as file:
#     pickle.dump(embeddings, file)

In [None]:
# read the embeddings from the pickle file
# with open('./generatedFiles/embeddings.pickle', 'rb') as file:
#     embeddings = pickle.load(file)

In [15]:
# Map every allowed 8-flag diacritic vector to its class id.
# Flag order matches get_prior: (FATHA, DAMMA, KASRA, FATHATAN, DAMMATAN, KASRATAN, SUKUN, SHADDA)
output_map = {
    (1, 0, 0, 0, 0, 0, 0, 0) : 0, # FATHA
    (0, 0, 0, 1, 0, 0, 0, 0) : 1, # FATHATAN
    (0, 0, 1, 0, 0, 0, 0, 0) : 2, # KASRA
    (0, 0, 0, 0, 0, 1, 0, 0) : 3, # KASRATAN
    (0, 1, 0, 0, 0, 0, 0, 0) : 4, # DAMMA
    (0, 0, 0, 0, 1, 0, 0, 0) : 5, # DAMMATAN
    (0, 0, 0, 0, 0, 0, 1, 0) : 6, # SUKUN
    (0, 0, 0, 0, 0, 0, 0, 1) : 7,  # SHADDA
    (1, 0, 0, 0, 0, 0, 0, 1) : 8, # SHADDA FATHA
    (0, 0, 0, 1, 0, 0, 0, 1) : 9, # SHADDA FATHATAN
    (0, 0, 1, 0, 0, 0, 0, 1) : 10, # SHADDA KASRA
    (0, 0, 0, 0, 0, 1, 0, 1) : 11, # SHADDA KASRATAN
    (0, 1, 0, 0, 0, 0, 0, 1) : 12, # SHADDA DAMMA
    (0, 0, 0, 0, 1, 0, 0, 1) : 13, # SHADDA DAMMATAN
    (0, 0, 0, 0, 0, 0, 0, 0) : 14 # no diacritic at all
}

In [None]:
# For every word, compute the diacritic flags of each of its characters by calling get_prior on that
# single (undiacritized, diacritized) pair, then write one "key: value" line per character.
# The word index idx is appended to the key so repeated words stay distinct
with open('./generatedFiles/training/gold_output_dict.txt', 'w', encoding='utf-8') as file:
    for idx, word in enumerate(gold_output):
        gold_diacritics = get_prior([tokenized_input[idx]], [word])
        for key, value in gold_diacritics.items():
            key = key + (idx,)
            file.write(f'{key}: {value}\n')

In [None]:
# Passing 3 (any value other than 2) makes read_map parse 4-field keys: (word, character, index, word idx)
gold_output_dict = read_map('./generatedFiles/training/gold_output_dict.txt', 3)

In [None]:
gold_output_dict[('قوله', 'ق', 0, 0)]

In [None]:
# Change gold_output_dict.values() to a list of tuples
for key, value in gold_output_dict.items():
    gold_output_dict[key] = tuple(value)
    
gold_output_dict_values = list(gold_output_dict.values())

In [None]:
# with open('./generatedFiles/training/gold_output_id.txt', 'w', encoding='utf-8') as file:
#     for value in gold_output_dict_values:
#         file.write(f'{output_map[value]}\n')

In [None]:
# read the gold_output_id file
gold_output_id = read_data('./generatedFiles/training/gold_output_id.txt')
gold_output_id = np.array(gold_output_id)

In [None]:
print(gold_output_id.shape)
print(gold_output_id[:10])

In [None]:
# Author's note: length of embeddings = 8351478
# Keep only the first 6,000,000 rows of the embeddings and of the labels
# (a multiple of 1000, the sequence length used when reshaping below)
embeddings_reshape = embeddings[:6000000]
gold_output_id = gold_output_id[:6000000]

# Make it np array 
embeddings_reshape = np.array(embeddings_reshape)
gold_output_id = np.array(gold_output_id)

In [None]:
# Shapes of the truncated features and labels.
print(embeddings_reshape.shape, gold_output_id.shape, sep='\n')

In [None]:
# Group the flat rows into sequences of 1000 timesteps:
# features become (sequences, 1000, 120), labels become (sequences, 1000, 1).
embeddings_reshape = np.reshape(embeddings_reshape, (-1, 1000, 120))
gold_output_id_reshape = np.reshape(gold_output_id, (-1, 1000, 1))

In [None]:
# Shapes after grouping into sequences.
print(embeddings_reshape.shape, gold_output_id_reshape.shape, sep='\n')

# Label ids of the first sequence (all of its timesteps).
first_sequence_ids = gold_output_id_reshape[0]
print(first_sequence_ids)

# The same sequence one-hot encoded. No num_classes is given here, so the
# width is inferred from the ids present in this sequence.
print(tf.keras.utils.to_categorical(first_sequence_ids))

In [None]:
# Sequence tagger: three stacked bidirectional LSTMs (64 units per direction),
# each followed by batch normalisation, then a per-timestep 128-unit ReLU layer
# and a per-timestep 15-way softmax (one output per class id 0-14).
input_shape = (1000, 120)  # (timesteps per sequence, features per timestep)

model = Sequential([
    Bidirectional(LSTM(64, return_sequences=True), input_shape=input_shape),
    BatchNormalization(),
    Bidirectional(LSTM(64, return_sequences=True)),
    BatchNormalization(),
    Bidirectional(LSTM(64, return_sequences=True)),
    BatchNormalization(),
    TimeDistributed(Dense(128, activation='relu')),
    TimeDistributed(Dense(15, activation='softmax')),
])

# Adamax with learning rate 0.001; the targets are one-hot vectors, hence
# categorical cross-entropy.
model.compile(optimizer=Adamax(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# summary() prints the layer table itself; print() then echoes its return value.
print(model.summary())

# Early stopping is currently disabled.
# NOTE(review): 'test_accuracy' is not among the metrics compiled above — confirm
# the monitor name before enabling.
# early_stopping = EarlyStopping(monitor='test_accuracy', patience=5)

In [None]:
# One-hot encode every label id with a single vectorised call instead of one
# to_categorical call per timestep (millions of tiny Python-level calls).
# gold_output_id_reshape is (sequences, 1000, 1); to_categorical drops the
# trailing length-1 axis and appends the class axis, giving (sequences, 1000, 15).
labels = tf.keras.utils.to_categorical(gold_output_id_reshape, num_classes=15)

In [None]:
# Arrange the one-hot labels as (sequences, 1000 timesteps, 15 classes).
labels = np.reshape(labels, (-1, 1000, 15))
print(labels.shape)

In [None]:
# Train for 30 epochs with a batch size of 256, requesting placement on the
# first GPU.
# NOTE(review): no validation_data and no callbacks are passed to fit(), so no
# validation metrics are computed and early stopping is not active in this call.

# fit the model with gpu
with tf.device('/GPU:0'):
    model.fit(embeddings_reshape, labels, epochs=30, batch_size=256)


In [None]:
# Persist the trained model to disk; the validation and prediction sections
# reload it from this path.
model.save('./generatedFiles/model_shakkala.h5')

## Validation:

In [None]:
# Load the raw validation split as a list of whitespace-stripped lines.
validation_data_before_preprocessing = read_data("./dataset/val.txt")

In [None]:
# Regenerate the tokenized validation file from scratch. The helper is called
# in append mode, so the file is emptied first; otherwise every re-run of this
# cell would append a second copy of the whole split.
validation_tokenized_path = "./generatedFiles/validation/validation_tokenized_input.txt"
open(validation_tokenized_path, 'w', encoding='utf-8').close()

for i in range(len(validation_data_before_preprocessing)):
    save_tokenized_input(validation_data_before_preprocessing[i], path=validation_tokenized_path, permission='a')

In [None]:
# Read the tokenized validation input back as a list of stripped lines.
validation_tokenized_input = read_data("./generatedFiles/validation/validation_tokenized_input.txt")

In [None]:
# Regenerate the validation gold-output file from scratch. The helper is called
# in append mode, so the file is emptied first; otherwise every re-run of this
# cell would append a second copy of the whole split.
validation_gold_path = "./generatedFiles/validation/validation_gold_output.txt"
open(validation_gold_path, 'w', encoding='utf-8').close()

for i in range(len(validation_data_before_preprocessing)):
    save_gold_output(validation_data_before_preprocessing[i], validation_gold_path, permission='a')

In [None]:
# Read the validation gold output back as a list of stripped lines.
validation_gold_output = read_data("./generatedFiles/validation/validation_gold_output.txt")

In [None]:
# Write one line of segment tags per tokenized validation entry.
# The file is opened once in write mode: reopening it in append mode for every
# entry was slow and made re-running the cell duplicate its contents.
with open('./generatedFiles/validation/validation_input_segments.txt', 'w', encoding='utf-8') as file:
    for i in range(len(validation_tokenized_input)):
        validation_segments, validation_seg_tags = get_seg_tags(validation_tokenized_input[i])
        # Tags are written back to back with no separator, then a newline.
        file.write(''.join(validation_seg_tags))
        file.write('\n')

In [None]:
# Read the segment-tag strings back, one stripped line per tokenized entry.
validation_input_segments = read_data("./generatedFiles/validation/validation_input_segments.txt")

In [None]:
# Build one feature row per character of every validation token:
# [character embedding | segment-tag embedding | zero-padded prior vector].
validation_embeddings = []
default_prior = [1, 1, 1, 1, 1, 1, 1, 1]  # used when (token, char, position) has no stored prior

for i, token in enumerate(validation_tokenized_input):
    # If the tag string and the token differ in length, left-pad the tags with "S".
    # This does not depend on the character position, so it runs once per token.
    # When the tag string is the longer one the multiplier is negative and
    # nothing is added.
    if len(token) != len(validation_input_segments[i]):
        validation_input_segments[i] = "S" * (len(token) - len(validation_input_segments[i])) + validation_input_segments[i]
    token_tags = validation_input_segments[i]

    for j, char in enumerate(token):
        # NOTE(review): word_index.get returns None for a character/tag the
        # tokenizer has not seen; the lookups below do not guard against that.
        char_vector = char_embeddings[tokenizer_char.word_index.get(char)]
        tag_vector = tags_embeddings[tokenizer_tags.word_index.get(token_tags[j].lower())]
        prior_vector = prior_feature.get((token, char, j), default_prior)
        # Append 32 zeros to the prior vector before concatenating.
        prior_vector = np.pad(prior_vector, (0, 32), 'constant')
        validation_embeddings.append(np.concatenate((char_vector, tag_vector, prior_vector)))

validation_embeddings = np.array(validation_embeddings)

print(validation_embeddings.shape)

In [None]:
# Keep the first 421000 feature rows and group them into sequences of
# 1000 timesteps with 120 features each.
validation_embeddings_reshape = np.array(
    validation_embeddings[:421000].reshape((-1, 1000, 120))
)
print(validation_embeddings_reshape.shape)

In [None]:
# Gold labels: dump the per-word result of get_prior to a text file, one
# "key: value" line per entry, with the word's index appended to every key.
with open('./generatedFiles/validation/validation_gold_output_dict.txt', 'w', encoding='utf-8') as file:
    for idx, word in enumerate(validation_gold_output):
        gold_diacritics = get_prior([validation_tokenized_input[idx]], [word])
        file.writelines(
            f'{entry_key + (idx,)}: {entry_value}\n'
            for entry_key, entry_value in gold_diacritics.items()
        )

In [None]:
# Load the mapping written by the previous cell back into memory.
# NOTE(review): the meaning of the second argument (3) is defined by read_map — confirm there.
validation_gold_output_dict = read_map('./generatedFiles/validation/validation_gold_output_dict.txt', 3)

In [None]:
# Convert every stored value to a tuple (in place, key order unchanged) so the
# values can be used as output_map keys, then collect them in insertion order.
validation_gold_output_dict.update(
    {entry_key: tuple(entry_value) for entry_key, entry_value in validation_gold_output_dict.items()}
)

validation_gold_output_dict_values = list(validation_gold_output_dict.values())

In [None]:
# Translate every indicator tuple to its class id via output_map and write one
# id per line. A tuple missing from output_map raises KeyError.
with open('./generatedFiles/validation/validation_gold_output_id.txt', 'w', encoding='utf-8') as file:
    file.writelines(f'{output_map[flags]}\n' for flags in validation_gold_output_dict_values)

In [None]:
# Load the validation class ids, one per line. read_data returns stripped
# strings, so the resulting array holds the ids as text.
validation_gold_output_id = np.array(read_data('./generatedFiles/validation/validation_gold_output_id.txt'))

In [None]:
# Number of validation labels loaded.
print(validation_gold_output_id.shape)

In [None]:
# Keep the first 421000 labels (the same count used for the validation
# features) and group them into sequences of 1000 timesteps.
validation_gold_output_id = validation_gold_output_id[:421000].reshape(-1, 1000, 1)

In [None]:
# One-hot encode every validation label id with a single vectorised call
# instead of one to_categorical call per timestep.
# validation_gold_output_id is (sequences, 1000, 1); to_categorical drops the
# trailing length-1 axis and appends the class axis.
validation_labels = tf.keras.utils.to_categorical(validation_gold_output_id, num_classes=15)

# Arrange as (sequences, 1000 timesteps, 15 classes).
validation_labels = validation_labels.reshape(-1, 1000, 15)
print(validation_labels.shape)

In [None]:
# Reload the model from the path the training section saved it to; this
# rebinds the notebook-level `model`.
model = tf.keras.models.load_model('./generatedFiles/model_shakkala.h5')

In [None]:
# Evaluate the loaded model on the validation sequences.
# evaluate() returns the loss followed by the compiled metric (accuracy).

loss, accuracy = model.evaluate(validation_embeddings_reshape, validation_labels, verbose=0)
# Report accuracy as a percentage.
print('Accuracy: %f' % (accuracy*100))

## Test Set:

In [18]:
# Load the raw test split as a list of whitespace-stripped lines.
test_data_before_preprocessing = read_data("./dataset/test.txt")

In [19]:
# Regenerate the tokenized test file from scratch. The helper is called in
# append mode, so the file is emptied first; otherwise every re-run of this
# cell would append a second copy of the whole split.
test_tokenized_path = "./generatedFiles/test/test_tokenized_input.txt"
open(test_tokenized_path, 'w', encoding='utf-8').close()

for i in range(len(test_data_before_preprocessing)):
    save_tokenized_input(test_data_before_preprocessing[i], path=test_tokenized_path, permission='a')

In [20]:
# Read the tokenized test input back as a list of stripped lines.
test_tokenized_input = read_data("./generatedFiles/test/test_tokenized_input.txt")

In [23]:
# Write one line of segment tags per tokenized test entry.
# The file is opened once in write mode: reopening it in append mode for every
# entry was slow and made re-running the cell duplicate its contents.
with open('./generatedFiles/test/test_input_segments.txt', 'w', encoding='utf-8') as file:
    for i in range(len(test_tokenized_input)):
        test_segments, test_seg_tags = get_seg_tags(test_tokenized_input[i])
        # Tags are written back to back with no separator, then a newline.
        file.write(''.join(test_seg_tags))
        file.write('\n')

In [24]:
# Read the segment-tag strings back, one stripped line per tokenized entry.
test_input_segments = read_data("./generatedFiles/test/test_input_segments.txt")

In [73]:
# Build one feature row per character of every test token:
# [character embedding | segment-tag embedding | zero-padded prior vector].
test_embeddings = []
default_prior = [1, 1, 1, 1, 1, 1, 1, 1]  # used when (token, char, position) has no stored prior

for i, token in enumerate(test_tokenized_input):
    # If the tag string and the token differ in length, left-pad the tags with "S".
    # This does not depend on the character position, so it runs once per token.
    # When the tag string is the longer one the multiplier is negative and
    # nothing is added.
    if len(token) != len(test_input_segments[i]):
        test_input_segments[i] = "S" * (len(token) - len(test_input_segments[i])) + test_input_segments[i]
    token_tags = test_input_segments[i]

    for j, char in enumerate(token):
        # NOTE(review): word_index.get returns None for a character/tag the
        # tokenizer has not seen; the lookups below do not guard against that.
        char_vector = char_embeddings[tokenizer_char.word_index.get(char)]
        tag_vector = tags_embeddings[tokenizer_tags.word_index.get(token_tags[j].lower())]
        prior_vector = prior_feature.get((token, char, j), default_prior)
        # Append 32 zeros to the prior vector before concatenating.
        prior_vector = np.pad(prior_vector, (0, 32), 'constant')
        test_embeddings.append(np.concatenate((char_vector, tag_vector, prior_vector)))

test_embeddings = np.array(test_embeddings)

print(test_embeddings.shape)

(417359, 170)


In [74]:
# Zero-pad the rows up to the next multiple of the sequence length so they can
# be reshaped into whole 1000-step sequences. The amount is derived from the
# data instead of a fixed 418000 target, which would raise for a larger test
# set; for the 417359 rows shown above the result is still 418000 rows.
# Re-running the cell adds nothing once the count is already a multiple.
sequence_length = 1000
rows_to_add = -test_embeddings.shape[0] % sequence_length
test_embeddings = np.pad(test_embeddings, ((0, rows_to_add), (0, 0)), 'constant')

In [75]:
# Confirm the padded row count (shown as the cell's output).
test_embeddings.shape

(418000, 170)

In [76]:
# Group the padded rows into sequences of 1000 timesteps.
# NOTE(review): the feature width here is 170, while the training/validation
# reshapes and the model's input_shape in this notebook use 120 — confirm the
# saved model was trained on 170-wide features.
test_embeddings_reshape = np.array(
    test_embeddings[:418000].reshape((-1, 1000, 170))
)
print(test_embeddings_reshape.shape)


(418, 1000, 170)


## Prediction:

In [77]:
# Reload the saved model from disk for inference; this rebinds the
# notebook-level `model`.
model = tf.keras.models.load_model('./generatedFiles/model_shakkala.h5')

In [78]:
# Run the model over every test sequence; the result holds one score per class
# for each timestep of each sequence.
predictions = model.predict(test_embeddings_reshape)



In [79]:
# Expect (sequences, 1000, 15).
print(predictions.shape)

(418, 1000, 15)


In [80]:
# Flatten the sequence axis: one row of 15 class scores per timestep.
predictions = predictions.reshape(-1, 15)

In [81]:
# Show the class-score rows of the first ten timesteps.
print(predictions[:10])    

[[4.95688571e-03 6.73070531e-07 4.57599754e-06 4.32224078e-06
  4.24501650e-06 1.12170767e-06 7.62303546e-03 8.01798701e-03
  9.74423826e-01 2.39276287e-05 1.46317121e-03 4.60539413e-05
  3.28106899e-03 5.04063719e-05 9.87033127e-05]
 [4.01821919e-03 1.61821354e-05 1.67270682e-05 1.83824552e-04
  1.31599617e-03 4.30663567e-05 7.47109354e-01 1.33366091e-02
  6.38496950e-02 4.72218890e-05 3.19705047e-02 4.36150614e-04
  1.27261266e-01 7.16300390e-04 9.67886578e-03]
 [4.38582450e-01 4.64903351e-05 7.65887453e-05 1.51948771e-04
  1.89074099e-01 1.93466505e-04 3.06609110e-03 1.60435890e-03
  8.15489516e-03 1.97290974e-05 7.06350571e-03 2.82681081e-04
  3.07692349e-01 4.31169290e-04 4.35602106e-02]
 [1.66252153e-06 1.27697968e-08 9.99973893e-01 1.06274257e-07
  8.62993499e-09 5.22037213e-09 2.16680185e-09 1.33216123e-08
  3.96678104e-08 1.90017890e-10 2.40693716e-05 3.76307234e-08
  6.13984863e-09 1.34798519e-08 1.40929416e-07]
 [7.26842245e-06 1.26454552e-07 5.43859414e-07 1.24323080e-07
  

In [82]:
# Collapse each row of class scores to the id of its highest-scoring class.
# NOTE: `predictions` is rebound to the 1-D result, so this cell cannot be
# re-run on its own output (axis=1 no longer exists).
predictions = np.argmax(predictions, axis=1)

In [83]:
# Number of predicted class ids, followed by the first ten.
print(predictions.shape, predictions[:10], sep='\n')

(418000,)
[ 8  6  0  2  6  0  2 14  2  2]


In [84]:
# Map every predicted class id to the diacritic string appended to a letter.
# Ids 0-5 are the six vowel marks, 6 is SUKUN, 7 is a bare SHADDA, ids 8-13 are
# SHADDA followed by the same six vowel marks, and 14 adds nothing.
_vowel_marks = ['FATHA', 'FATHATAN', 'KASRA', 'KASRATAN', 'DAMMA', 'DAMMATAN']

predictions_map = {}
for class_id, mark in enumerate(_vowel_marks):
    predictions_map[class_id] = diacritics_mapping[mark]
predictions_map[6] = diacritics_mapping['SUKUN']
predictions_map[7] = diacritics_mapping['SHADDA']
for class_id, mark in enumerate(_vowel_marks, start=8):
    predictions_map[class_id] = diacritics_mapping['SHADDA'] + diacritics_mapping[mark]
predictions_map[14] = ''


In [85]:
# Drop the predictions produced for the zero-padding rows: keep exactly one
# prediction per character of the test tokens. The count is derived from the
# data; the previous hard-coded 417470 did not match the number of characters
# actually consumed downstream.
test_char_count = sum(len(token) for token in test_tokenized_input)
predictions = predictions[:test_char_count]

In [86]:
# Attach to every character of the test tokens the diacritic string of its
# predicted class; predictions are consumed in the same flat character order.
flat_test_chars = [char for token in test_tokenized_input for char in token]
predicted_diacritized_text = [
    char + predictions_map[predictions[position]]
    for position, char in enumerate(flat_test_chars)
]
count = len(predicted_diacritized_text)  # number of predictions consumed

print(len(predicted_diacritized_text))
print(predicted_diacritized_text[:10])

417359
['لَّ', 'يْ', 'سَ', 'لِ', 'لْ', 'وَ', 'كِ', 'ي', 'لِ', 'بِ']


In [87]:
# Write one "ID,label" row for every character that is in arabic_letters.
# Two counters are needed: predictions hold an entry for every character, but
# only the ones in arabic_letters get a CSV row.
# The counters no longer use the name `id`, which rebound the builtin id() for
# the rest of the notebook session.
with open('./generatedFiles/test/predictions.csv', 'w', encoding='utf-8') as file:
    file.write('ID,label\n')
    char_position = 0  # index into predictions; advances for every character
    row_id = 0         # CSV ID; advances only when a row is written
    for token in test_tokenized_input:
        for char in token:
            if char in arabic_letters:
                file.write(f'{row_id},{predictions[char_position]}\n')
                row_id += 1
            char_position += 1

In [88]:
# Remapping of class ids applied to the CSV labels in the next cell.
# Every id maps to itself except that the KASRA/KASRATAN ids (2, 3) are
# exchanged with the DAMMA/DAMMATAN ids (4, 5), and likewise their SHADDA
# variants (10, 11) with (12, 13).
# NOTE(review): presumably this converts to the label order expected by the
# consumer of predictions_updated.csv — confirm.
x = {class_id: class_id for class_id in range(15)}
for first, second in ((2, 4), (3, 5), (10, 12), (11, 13)):
    x[first], x[second] = second, first

In [89]:
# Load predictions.csv, replace every label (second column) with its remapped
# id from x, and write the result to predictions_updated.csv.
import csv

with open('./generatedFiles/test/predictions.csv', 'r') as file:
    data = list(csv.reader(file))

# The first row is the header and is left untouched.
for row in data[1:]:
    row[1] = x[int(row[1])]

# newline='' lets the csv writer control line endings itself.
with open("./generatedFiles/test/predictions_updated.csv", mode='w', newline='') as file:
    csv.writer(file).writerows(data)