In [16]:
import pickle
import numpy as np
import pandas as pd
import re
import pyarabic.araby as araby
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import farasa
from farasa.segmenter import FarasaSegmenter 
import unicodedata
import torch

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, TimeDistributed, BatchNormalization
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.activations import relu,linear

from gensim.models.word2vec import LineSentence
from gensim.models import FastText

In [17]:
def read_data(file_path):
    """
    Load the UTF-8 text file at file_path and return its lines as a list.

    Each line has its leading and trailing whitespace (including the
    newline) stripped; blank lines are kept as empty strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

In [18]:
def read_pickle_file(file_path):
    """
    Deserialize and return the object stored in the pickle file at file_path.

    NOTE: unpickling can execute arbitrary code, so this should only be used
    on files produced by this project.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def save_words_in_file(path, words, permission='w'):
    """
    Write every string in words to the file at path, one per line.

    permission is the mode passed to open(): 'w' overwrites the file,
    'a' appends to it. The file is written as UTF-8.
    """
    with open(path, permission, encoding='utf-8') as handle:
        handle.writelines(word + '\n' for word in words)

In [19]:
# Set of Arabic letters, loaded from the project's pickle file.
# Used below to build the character filter and to select which characters
# get a row in the predictions CSV.
arabic_letters = set(read_pickle_file("./Delivery/arabic_letters.pickle"))

# Sanity check: how many letters were loaded, and which ones
print(len(arabic_letters))
print(arabic_letters)

36
{'ح', 'ى', 'و', 'ا', 'ظ', 'ن', 'ج', 'د', 'خ', 'ث', 'ر', 'س', 'ئ', 'ع', 'ه', 'آ', 'ط', 'ز', 'م', 'ذ', 'ق', 'غ', 'ف', 'ي', 'ص', 'ض', 'ت', 'ب', 'ل', 'ك', 'إ', 'ء', 'ؤ', 'أ', 'ش', 'ة'}


In [20]:
# Set of Arabic diacritic marks, loaded from the project's pickle file.
# Used below to build the character filter in remove_non_arabic.
diacritics = set(read_pickle_file("./Delivery/diacritics.pickle"))

# Sanity check: how many diacritics were loaded, and which ones
print(len(diacritics))
print(diacritics)

8
{'َ', 'ٍ', 'ٌ', 'ُ', 'ْ', 'ً', 'ِ', 'ّ'}


In [21]:
def remove_diacritics(text):
    """Return text with its tashkeel stripped by pyarabic's strip_tashkeel."""
    return araby.strip_tashkeel(text)

def remove_non_arabic(text):
    """
    Delete every character of text that is neither whitespace, nor a member
    of the module-level set arabic_letters, nor a member of the module-level
    set diacritics.

    Returns the filtered string; whitespace is left untouched.
    """
    # Escape the allowed characters so the character class stays valid even
    # if one of the sets ever contains a regex metacharacter (']', '^', '-').
    allowed_chars = re.escape(''.join(arabic_letters) + ''.join(diacritics))
    return re.sub(r'[^\s' + allowed_chars + ']', '', text)

def input_preprocessing_text(text):
    """
    Turn a raw line of text into a list of undiacritized Arabic tokens.

    Steps, in order:
      1. araby.autocorrect - fixes the most common word errors, such as
         repeated harakat or tanween placed before alef.
      2. remove_non_arabic - drops characters outside the allowed sets.
      3. remove_diacritics - strips the diacritics.
      4. araby.tokenize    - splits the text into tokens.
    """
    corrected = araby.autocorrect(text)
    arabic_only = remove_non_arabic(corrected)
    undiacritized = remove_diacritics(arabic_only)
    return araby.tokenize(undiacritized)

def save_tokenized_input(text,path="./generatedFiles/training/tokenized_input.txt", permission='w'):
    """
    Preprocess text with input_preprocessing_text and write the resulting
    tokens to path, one per line, opening the file with mode permission.
    """
    save_words_in_file(path, input_preprocessing_text(text), permission)
    

def save_gold_output(text,path="./generatedFiles/training/gold_output.txt", permission='w'):
    """
    Write the tokens of text to path, one per line, with diacritics kept.

    Only characters outside the allowed Arabic sets are removed before
    tokenizing; the file is opened with mode permission.
    """
    gold_tokens = araby.tokenize(remove_non_arabic(text))
    save_words_in_file(path, gold_tokens, permission)


def is_not_arabic_diacritic(char):
    """
    Return True unless char is a Unicode combining mark.

    The check is on the Unicode general category only: 'Mn' (nonspacing
    mark) and 'Mc' (spacing combining mark) give False, anything else True.
    """
    return unicodedata.category(char) not in ('Mn', 'Mc')

In [22]:
# Single Farasa segmenter instance shared by every get_seg_tags call.
# NOTE(review): interactive=True is passed explicitly; whether this matches the library default should be confirmed against the farasapy docs.
segmenter = FarasaSegmenter(interactive=True)

def get_seg_tags(word):                 # word = "wAlktAb"
    """
    Segment word with the module-level Farasa segmenter and tag every
    character by its position inside its segment.

    Tags: "S" for a single-character segment; otherwise "B" for the first
    character, "M" for each middle character and "E" for the last one.

    Returns (segments, seg_tags): the list of segment strings and a flat
    list holding one tag per character of the non-empty segments.
    """
    # segmenter.segment returns one string with the segments joined by '+',
    # e.g. "w+Al+ktAb", which is split into ["w", "Al", "ktAb"].
    segments = segmenter.segment(word).split('+')
    seg_tags = []
    for segment in segments:
        if not segment:
            # An empty piece has no characters, so it must not produce tags
            # (previously it yielded "B", "E" for zero characters).
            continue
        if len(segment) == 1:
            seg_tags.append("S")
        else:
            seg_tags.append("B")  # First letter
            seg_tags.extend("M" * (len(segment) - 2))  # Middle letters
            seg_tags.append("E")  # Last letter
    return segments, seg_tags



In [23]:
# Width of one feature vector.
# NOTE(review): not referenced by the other visible cells; the feature-building
# cell pads the 8-value prior vector with 32 zeros, i.e. to this width — confirm
# that this is the intended link.
embedding_size = 40

In [32]:
# Load the character-level artifacts through the shared read_pickle_file
# helper instead of repeating the open/load boilerplate for each file.

# Tokenizer whose word_index maps a character to its index
tokenizer_char = read_pickle_file('./generatedFiles/test/tokenizer_char.pickle')

# Character sequences saved alongside the tokenizer
sequences_char = read_pickle_file('./generatedFiles/test/sequences_char.pickle')

# Character embeddings, indexed by the tokenizer's character index
char_embeddings = read_pickle_file('./generatedFiles/test/char_embeddings.pickle')

# Character features array
char_features = read_pickle_file('./generatedFiles/test/char_features.pickle')

In [28]:
# Load the segmentation-tag artifacts through the shared read_pickle_file
# helper instead of repeating the open/load boilerplate for each file.

# Tokenizer whose word_index maps a (lower-cased) tag to its index
tokenizer_tags = read_pickle_file('./generatedFiles/test/tokenizer_tags.pickle')

# Tag embeddings, indexed by the tokenizer's tag index
tags_embeddings = read_pickle_file('./generatedFiles/test/tags_embeddings.pickle')

# Tag sequences saved alongside the tokenizer
sequences_tags = read_pickle_file('./generatedFiles/test/sequences_tags.pickle')

# Tag features array
tags_features = read_pickle_file('./generatedFiles/test/tags_features.pickle')

In [36]:
# Raw test set: one whitespace-stripped string per line of the file
test_data_before_preprocessing = read_data("./dataset/test.txt")

In [37]:
# Tokenize every test line and write all tokens, one per line, in one pass.
# Writing once with mode 'w' (instead of appending line by line) keeps the
# cell idempotent: re-running it no longer duplicates the file contents,
# and the file is opened only once.
test_tokenized_words = [
    word
    for line in test_data_before_preprocessing
    for word in input_preprocessing_text(line)
]
save_words_in_file("./generatedFiles/test/test_tokenized_input.txt", test_tokenized_words, 'w')

In [38]:
# Tokenized test set read back from disk: one token (word) per list entry
test_tokenized_input = read_data("./generatedFiles/test/test_tokenized_input.txt")

In [39]:
# Write the segmentation tags of every token, one token per line.
# The file is opened once in 'w' mode, so re-running the cell rewrites it
# instead of appending a second copy of every line.
with open('./generatedFiles/test/test_input_segments.txt', 'w', encoding='utf-8') as file:
    for word in test_tokenized_input:
        # Only the tags are needed here; the segment strings are discarded
        test_seg_tags = get_seg_tags(word)[1]
        file.write(''.join(test_seg_tags) + '\n')

In [40]:
# Segmentation tag strings read back from disk: one string of tags per token
test_input_segments = read_data("./generatedFiles/test/test_input_segments.txt")

In [41]:
def read_map(file_path, number_of_keys=2):
    """
    Parse the text file at file_path into a dictionary of prior features.

    Each non-blank line is expected to look like "(key): [values]". As
    implied by the slicing below, the key is written as
    "('word', 'c', 0)" when number_of_keys == 2 and as
    "('word', 'c', 0, 1)" otherwise, and the value as a bracketed,
    comma-separated list of integers.

    Returns a dict mapping (str, str, int) or (str, str, int, int) tuples
    to lists of ints. Blank lines are skipped.
    """
    prior_feature = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline) used to crash the
                # tuple unpacking below.
                continue
            # The value list never contains ':', so split on the last one
            key, value = line.rsplit(':', 1)
            # Drop the surrounding "(...)" / "[...]" and split the fields
            key_parts = key.strip()[1:-1].split(',')
            value_parts = value.strip()[1:-1].split(',')
            # Field 0 is "'word'"; field 1 is " 'c'" (leading space + quote)
            if number_of_keys == 2:
                key = (key_parts[0][1:-1], key_parts[1][2:-1], int(key_parts[2]))
            else:
                key = (key_parts[0][1:-1], key_parts[1][2:-1], int(key_parts[2]), int(key_parts[3]))

            prior_feature[key] = [int(i) for i in value_parts]
    return prior_feature

In [42]:
# Prior features produced during training, keyed by (word, char, position)
prior_feature = read_map('./generatedFiles/training/prior_feature.txt', 2)

In [43]:
# Build one feature row per character of the test set by concatenating
#   [char embedding | segmentation-tag embedding | zero-padded prior vector].

# Prior used when a (word, char, position) key is absent from prior_feature
default_prior_vector = [1, 1, 1, 1, 1, 1, 1, 1]

test_embeddings = []
for i, word in enumerate(test_tokenized_input):
    # Left-pad the tag string with "S" when there are fewer tags than
    # characters. This depends only on the word, so it is done once per word
    # rather than once per character.
    if len(word) != len(test_input_segments[i]):
        test_input_segments[i] = "S" * (len(word) - len(test_input_segments[i])) + test_input_segments[i]
    word_tags = test_input_segments[i]

    for j, char in enumerate(word):
        # Index word_index directly: an unknown character or tag raises a
        # KeyError naming it, instead of .get() returning None and the
        # embedding lookup failing later in a confusing way.
        test_char_features_vector = char_embeddings[tokenizer_char.word_index[char]]
        test_tag_features_vector = tags_embeddings[tokenizer_tags.word_index[word_tags[j].lower()]]

        test_prior_features_vector = prior_feature.get((word, char, j), default_prior_vector)
        # Zero-pad the prior vector up to embedding_size so it has the same
        # length as the other two feature vectors.
        test_prior_features_vector = np.pad(
            test_prior_features_vector,
            (0, embedding_size - len(test_prior_features_vector)),
            'constant',
        )

        test_embeddings.append(
            np.concatenate((test_char_features_vector, test_tag_features_vector, test_prior_features_vector))
        )

test_embeddings = np.array(test_embeddings)

print(test_embeddings.shape)

(417359, 120)


In [44]:
# Pad with all-zero rows up to the next multiple of the sequence length used
# by the reshape below (1000 rows per sequence). The padded size is derived
# from the data instead of being hard-coded (417359 rows -> 418000), and
# re-running the cell adds nothing once the length is already a multiple.
sequence_length = 1000
rows_to_add = -test_embeddings.shape[0] % sequence_length
test_embeddings = np.pad(test_embeddings, ((0, rows_to_add), (0, 0)), 'constant')

In [45]:
# Check the shape after padding
test_embeddings.shape

(418000, 120)

In [46]:
# Keep the first 418000 rows, group them into sequences of 1000 rows with
# 120 features each, and store the result as a new array.
test_embeddings_reshape = np.array(
    test_embeddings[:418000].reshape((-1, 1000, 120))
)
print(test_embeddings_reshape.shape)


(418, 1000, 120)


In [47]:
# Load the saved Keras model from its HDF5 file
model = tf.keras.models.load_model('./generatedFiles/model_shakkala.h5')




In [48]:
# Run the model on the reshaped test sequences.
# The output holds one score vector per row of every sequence.
predictions = model.predict(test_embeddings_reshape)



In [49]:
# Show the raw output shape, then flatten the sequence dimension so that
# each row holds the 15 class scores of a single character position.
print(predictions.shape)
predictions = predictions.reshape(-1, 15)

(418, 1000, 15)


In [50]:
# Preview the class scores of the first 10 positions
print(predictions[:10])    

[[9.96443689e-01 3.66844215e-05 7.48699880e-04 4.39207543e-05
  9.42222076e-04 5.09472593e-05 6.08946779e-04 1.76945454e-04
  5.69241936e-04 2.79082233e-05 3.04720288e-05 3.11689000e-05
  9.55952928e-05 2.91507749e-05 1.64438941e-04]
 [1.26982059e-05 6.93961738e-07 2.64693226e-05 3.46447860e-06
  9.11959869e-05 3.71064857e-06 9.99723494e-01 1.67853466e-06
  1.16646879e-06 2.62588799e-07 5.43247097e-06 1.24023450e-06
  5.20431854e-07 2.14748070e-06 1.25671853e-04]
 [9.97247159e-01 7.11207176e-05 2.60321773e-04 2.22980161e-04
  3.29572911e-04 9.70155525e-05 8.79902916e-04 3.03097295e-05
  6.32464362e-05 1.31812403e-05 3.28825154e-05 2.45149677e-05
  2.82575129e-05 4.41194134e-05 6.55433338e-04]
 [1.24083990e-05 7.47032402e-07 9.99879241e-01 1.53134752e-05
  1.20202358e-05 1.62061710e-07 2.28520548e-05 1.05064305e-06
  3.51975757e-07 4.44376064e-06 2.08391812e-05 1.80776078e-06
  5.67607003e-07 3.37026449e-07 2.77936433e-05]
 [3.92807488e-05 1.54813472e-06 2.93963039e-05 6.49212379e-05
  

In [51]:
# Replace each row of class scores by the index of its highest score.
# NOTE: this overwrites `predictions` (2-D scores -> 1-D class indices), so
# re-running this cell without re-running the ones above it will fail.
predictions = np.argmax(predictions, axis=1)

In [52]:
# Check the flattened shape and preview the first 10 predicted class indices
print(predictions.shape)
print(predictions[:10])

(418000,)
[ 0  6  0  2  6  0  2 14  2  2]


In [53]:
# Unicode character of each Arabic diacritic, keyed by the diacritic's name
diacritics_mapping = dict(
    FATHA='\u064E',
    DAMMA='\u064F',
    KASRA='\u0650',
    SHADDA='\u0651',
    SUKUN='\u0652',
    FATHATAN='\u064B',
    DAMMATAN='\u064C',
    KASRATAN='\u064D',
)

In [54]:
# Predicted class index -> diacritic string to append to the letter.
#   0-5  : plain vowel marks, in the order listed in _vowel_names
#   6    : sukun
#   7    : shadda alone
#   8-13 : shadda followed by the same six marks, in the same order
#   14   : no diacritic
_vowel_names = ['FATHA', 'FATHATAN', 'KASRA', 'KASRATAN', 'DAMMA', 'DAMMATAN']
_shadda = diacritics_mapping['SHADDA']

predictions_map = {
    index: diacritics_mapping[name] for index, name in enumerate(_vowel_names)
}
predictions_map[6] = diacritics_mapping['SUKUN']
predictions_map[7] = _shadda
predictions_map.update(
    (index, _shadda + diacritics_mapping[name])
    for index, name in enumerate(_vowel_names, start=8)
)
predictions_map[14] = ''


In [55]:
# Drop the predictions made for the zero-padding rows: keep exactly one
# prediction per character of the tokenized test input. The count is derived
# from the data rather than hard-coded; the previous constant (417470) did not
# match the actual number of characters (417359).
total_characters = sum(len(word) for word in test_tokenized_input)
predictions = predictions[:total_characters]

In [56]:
# Pair every character of the test set, in order, with the diacritic string
# of its predicted class.
flat_characters = [char for word in test_tokenized_input for char in word]
predicted_diacritized_text = [
    char + predictions_map[predictions[position]]
    for position, char in enumerate(flat_characters)
]
# Total number of characters consumed, as the running counter used to hold
count = len(flat_characters)

print(len(predicted_diacritized_text))
print(predicted_diacritized_text[:10])

417359
['لَ', 'يْ', 'سَ', 'لِ', 'لْ', 'وَ', 'كِ', 'ي', 'لِ', 'بِ']


In [57]:
# Write one "ID,label" row per Arabic letter of the test set.
# char_position walks over every character (it indexes `predictions`), while
# row_id only advances for characters that are written to the CSV.
# The counters are no longer called `id`, which shadowed the builtin id()
# for the rest of the kernel session.
with open('./generatedFiles/test/predictions.csv', 'w', encoding='utf-8') as file:
    file.write('ID,label\n')
    char_position = 0
    row_id = 0
    for word in test_tokenized_input:
        for char in word:
            # Only Arabic letters get a row in the output
            if char in arabic_letters:
                file.write(f'{row_id},{predictions[char_position]}\n')
                row_id += 1
            char_position += 1

In [58]:
# Remapping applied to the label column of predictions.csv by the next cell.
# It swaps class ids 2<->4, 3<->5, 10<->12 and 11<->13 and leaves the rest
# unchanged; with predictions_map above, that exchanges the KASRA/KASRATAN
# ids with the DAMMA/DAMMATAN ids (plain and with shadda).
# NOTE(review): presumably this converts the model's class order to the id
# order expected by the consumer of the CSV — confirm.
x ={
    0 : 0,
    1 : 1,
    2 : 4,
    3 : 5,
    4 : 2,
    5 : 3,
    6 : 6,
    7 : 7,
    8 : 8,
    9 : 9,
    10 : 12,
    11 : 13,
    12 : 10,
    13 : 11,
    14 : 14
    
}

In [59]:
# Read predictions.csv, replace every label (second column) by its
# counterpart in the remapping table x, and write the result to
# predictions_updated.csv.
import csv

# The csv module expects its files to be opened with newline=''; the encoding
# is given explicitly to match the UTF-8 file written by the previous cell.
with open('./generatedFiles/test/predictions.csv', 'r', newline='', encoding='utf-8') as file:
    data = list(csv.reader(file))

# Row 0 is the "ID,label" header, so remapping starts at row 1
for row in data[1:]:
    row[1] = x[int(row[1])]

# write the new data to the csv file
with open("./generatedFiles/test/predictions_updated.csv", mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)