# Named Entity Recognition Assignment
NER is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc. In this assignment, you will train a named entity recognition system and test it on a test data. \
Let's get started

In [1]:
import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from utils import get_params, get_vocab
import random as rnd
import pickle
import csv

In [None]:
def get_params(vocab, tag_map, sentences_file, labels_file):
    sentences = []
    labels = []

    with open(sentences_file, encoding="utf-8") as f:
        for sentence in f.read().splitlines():
            # replace each token by its index if it is in vocab
            # else use index of UNK_WORD
            s = [vocab[token] if token in vocab 
                 else vocab['UNK']
                 for token in sentence.split(' ')]
            sentences.append(s)

    with open(labels_file, encoding="utf-8") as f:
        for sentence in f.read().splitlines():
            # replace each label by its index
            l = [tag_map[label] for label in sentence.split(' ')] # I added plus 1 here
            labels.append(l) 
    return sentences, labels, len(sentences)

In [None]:
# def get_vocab(vocab_path, tags_path):
#     vocab = {}
#     with open(vocab_path, encoding="utf-8") as f:
#         for i, l in enumerate(f.read().splitlines()):
#             vocab[l] = i  # to avoid the 0
#         # loading tags (we require this to map tags to their indices)
#     vocab['<PAD>'] = len(vocab) # 35180
#     tag_map = {}
#     with open(tags_path, encoding="utf-8") as f:
#         for i, t in enumerate(f.read().splitlines()):
#             tag_map[t] = i 
    
#     return vocab, tag_map

In [2]:
def read_pickle_file(file_path):
    """
    Read the contents of the pickle file located at file_path 
    and append each line to the list data
    """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

diacritics = set(read_pickle_file("../Delivery/diacritics.pickle"))

In [3]:
arabic_letters = set(read_pickle_file("../Delivery/arabic_letters.pickle"))

# save this arabic_letters as a dict with index starting from 0
vocab = dict()
for i, char in enumerate(arabic_letters):
    vocab[i] = char
# vocab[36] = '<PAD>'
vocab[36] = ' '

In [4]:
reverse_vocab = dict()
for key, value in vocab.items():
    reverse_vocab[value] = key

print(reverse_vocab)

{'أ': 0, 'ص': 1, 'ا': 2, 'ى': 3, 'ذ': 4, 'ف': 5, 'ظ': 6, 'ؤ': 7, 'ت': 8, 'ء': 9, 'ه': 10, 'ي': 11, 'ز': 12, 'ط': 13, 'ل': 14, 'إ': 15, 'د': 16, 'ئ': 17, 'و': 18, 'ج': 19, 'ن': 20, 'ح': 21, 'ر': 22, 'س': 23, 'ش': 24, 'ك': 25, 'ع': 26, 'خ': 27, 'ق': 28, 'غ': 29, 'آ': 30, 'م': 31, 'ب': 32, 'ة': 33, 'ث': 34, 'ض': 35, ' ': 36}


In [5]:
# Read the tokenized_input and then append to tokenized_input_char as list of lists for each line, each list contains the characters of the line encoded as the dict vocab
def read_tokenized_input(reverse_vocab, input_file):
    with open(input_file, 'r', encoding="utf-8") as file:
        lines = file.readlines()
        lines = [line.strip() for line in lines]
    tokenized_input_char = []
    for line in lines:
        line = list(line)
        line = [reverse_vocab[char] for char in line]
        tokenized_input_char.append(line)
    return tokenized_input_char

tokenized_input_char = read_tokenized_input(reverse_vocab, '../generatedFiles/training/new_input_sentence.txt')

In [6]:
print(tokenized_input_char[0])
print(len(tokenized_input_char))

[28, 18, 14, 10, 36, 0, 18, 36, 28, 13, 26, 36, 2, 14, 0, 18, 14, 36, 11, 16, 10, 36, 15, 14, 27, 36, 28, 2, 14, 36, 2, 14, 12, 22, 25, 24, 11]
50000


In [7]:
# get the length of the max array in tokenized_input_char
max_len = max([len(line) for line in tokenized_input_char])
print(max_len)

7095


In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_sequences_padded = pad_sequences(tokenized_input_char, padding='post', value=37, maxlen=max_len)


2024-01-01 23:38:28.485707: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-01 23:38:28.637266: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/mic/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/mic/lib
2024-01-01 23:38:28.637307: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do n

In [9]:
print(word_sequences_padded[0])

[28 18 14 ... 37 37 37]


# Importing and discovering the data

In [10]:
# Map each diacrtics to its unicode
diacritics_mapping = {
    'FATHA': '\u064E',
    'DAMMA': '\u064F',
    'KASRA': '\u0650',
    'SHADDA': '\u0651',
    'SUKUN': '\u0652',
    'FATHATAN': '\u064B',
    'DAMMATAN': '\u064C',
    'KASRATAN': '\u064D'
}

In [11]:
predictions_map = {
    0 : diacritics_mapping['FATHA'],
    1 : diacritics_mapping['FATHATAN'],
    2 : diacritics_mapping['KASRA'],
    3 : diacritics_mapping['KASRATAN'],
    4 : diacritics_mapping['DAMMA'],
    5 : diacritics_mapping['DAMMATAN'],
    6 : diacritics_mapping['SUKUN'],
    7 : diacritics_mapping['SHADDA'],
    8 : diacritics_mapping['SHADDA'] + diacritics_mapping['FATHA'],
    9 : diacritics_mapping['SHADDA'] + diacritics_mapping['FATHATAN'],
    10 : diacritics_mapping['SHADDA'] + diacritics_mapping['KASRA'],
    11 : diacritics_mapping['SHADDA'] + diacritics_mapping['KASRATAN'],
    12 : diacritics_mapping['SHADDA'] + diacritics_mapping['DAMMA'],
    13 : diacritics_mapping['SHADDA'] + diacritics_mapping['DAMMATAN'],
    14 : ''
}

# reverse predictions_map
# predictions_map = {v: k for k, v in predictions_map.items()}

print(predictions_map)


{0: 'َ', 1: 'ً', 2: 'ِ', 3: 'ٍ', 4: 'ُ', 5: 'ٌ', 6: 'ْ', 7: 'ّ', 8: 'َّ', 9: 'ًّ', 10: 'ِّ', 11: 'ٍّ', 12: 'ُّ', 13: 'ٌّ', 14: ''}


In [12]:

print(len(diacritics))
print(diacritics)

print(len(arabic_letters))
print(arabic_letters)

8
{'ً', 'َ', 'ّ', 'ِ', 'ٌ', 'ُ', 'ْ', 'ٍ'}
36
{'أ', 'ص', 'ا', 'ى', 'ذ', 'ف', 'ظ', 'ؤ', 'ت', 'ء', 'ه', 'ي', 'ز', 'ط', 'ل', 'إ', 'د', 'ئ', 'و', 'ج', 'ن', 'ح', 'ر', 'س', 'ش', 'ك', 'ع', 'خ', 'ق', 'غ', 'آ', 'م', 'ب', 'ة', 'ث', 'ض'}


In [13]:
import pyarabic.araby as araby
import re

# Remove diacritics
def remove_diacritics(text):
    text = araby.strip_tashkeel(text)
    return text

# Remove any letters not found in set arabic_letters and not found in set diacritics
def remove_non_arabic(text):
    text = re.sub(r'[^\s' + ''.join(arabic_letters) + ''.join(diacritics) + ']', '', text)
    return text


def save_sentence_in_file(path, words, permission='w'):
    """
    Save the words in the file located at path 
    """
    with open(path, permission, encoding='utf-8') as file:
            file.write(words + '\n')

def save_new_input_sentence(text,path="./OutputFiles/new_input_sentence.txt", permission='w'):
    # Remove any non-Arabic letters and extra spaces
    text = remove_non_arabic(text)

    #remove extra spaces between words
    text = re.sub(r'\s+', ' ', text)

    save_sentence_in_file(path, text, permission)


def read_data(file_path):
    """
    Read the contents of the file located at file_path 
    and append each line to the list data
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()

        # remove '\n' from each line
        data = [line.strip() for line in data]
    return data

data_before_preprocessing = read_data("./dataset/train.txt")



In [14]:

for i in range(len(data_before_preprocessing)):
    save_new_input_sentence(data_before_preprocessing[i], path="./OutputFiles/gold_input_sentence.txt", permission='a')

In [15]:
tashkeel_list = read_pickle_file("./tashkeel_list.pkl")

In [16]:
print(tashkeel_list[0])

# reverse predictions_map
predictions_map_reversed = {v: k for k, v in predictions_map.items()}

print(predictions_map_reversed)

['َ', 'ْ', 'ُ', 'ُ', '', 'َ', 'ْ', '', 'َ', 'َ', 'َ', '', '', 'ْ', 'َ', '٤', 'ُ', '', 'َ', 'َ', 'ُ', '', '', 'َ', 'ْ', '', 'َ', '', 'َ', '', '', '', '٤', 'ْ', 'َ', 'ِ', '٥']
{'َ': 0, 'ً': 1, 'ِ': 2, 'ٍ': 3, 'ُ': 4, 'ٌ': 5, 'ْ': 6, 'ّ': 7, 'َّ': 8, 'ًّ': 9, 'ِّ': 10, 'ٍّ': 11, 'ُّ': 12, 'ٌّ': 13, '': 14}


In [17]:
new_map = {
    diacritics_mapping['FATHA'] : 0,
    diacritics_mapping['FATHATAN'] : 1,
    diacritics_mapping['KASRA'] : 2,
    diacritics_mapping['KASRATAN'] : 3,
    diacritics_mapping['DAMMA'] : 4,
    diacritics_mapping['DAMMATAN'] : 5,
    diacritics_mapping['SUKUN'] : 6,
    diacritics_mapping['SHADDA'] : 7,
    '٤' : 8,
    '١' : 9,
   '٦' : 10,
   '٣' : 11,
    '٥' : 12,
    '٢' : 13,
    '' : 14
}

In [18]:
# loop over tashkeel_list and replace each tashkeel with predictions_map_reversed
tashkeel_list_updated = [[]]
for i in range(len(tashkeel_list)):
    tashkeel_list_updated.append([])
    for j in range(len(tashkeel_list[i])):
        tashkeel_list_updated[i].append(new_map[tashkeel_list[i][j]])



In [19]:
print(tashkeel_list_updated[0])

[0, 6, 4, 4, 14, 0, 6, 14, 0, 0, 0, 14, 14, 6, 0, 8, 4, 14, 0, 0, 4, 14, 14, 0, 6, 14, 0, 14, 0, 14, 14, 14, 8, 6, 0, 2, 12]


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

tashkeel_list_sequences_padded = pad_sequences(tashkeel_list_updated, padding='post', value=14, maxlen=max_len)


In [21]:
print(len(tashkeel_list_sequences_padded[2]))
print(len(word_sequences_padded[2]))

7095
7095


In [22]:
# convert the tokenized_input_char.txt to index using reverse_vocab and save it in tokenized_input_char_id list
tokenized_input_char_id = []
with open('./generatedFiles/training/tokenized_input_char.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    lines = [line.strip() for line in lines]
    for line in lines:
        tokenized_input_char_id.append(reverse_vocab[line])

t_sentences = tokenized_input_char_id
t_size = len(t_sentences)

`vocab` is a dictionary that translates a word string to a unique number. Given a sentence, you can represent it as an array of numbers translating with this dictionary. The dictionary contains a `<PAD>` token. 

When training an LSTM using batches, all your input sentences must be the same size. To accomplish this, you set the length of your sentences to a certain number and add the generic `<PAD>` token to fill all the empty spaces. 

In [None]:
# vocab translates from a word to a unique number
# print('padded token:', reverse_vocab['<PAD>'])

In [24]:
# The possible tags
print(predictions_map)

{0: 'َ', 1: 'ً', 2: 'ِ', 3: 'ٍ', 4: 'ُ', 5: 'ٌ', 6: 'ْ', 7: 'ّ', 8: 'َّ', 9: 'ًّ', 10: 'ِّ', 11: 'ٍّ', 12: 'ُّ', 13: 'ٌّ', 14: ''}


# NERDataset
The class that impelements the dataset for NER

In [25]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### TODO: create two tensors one for x and the other for labels ###############################

    # Convert to tensors
    self.x = torch.tensor(x)
    self.y = torch.tensor(y)   

    
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### TODO: return the length of the dataset #############################
    return len(self.x)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### TODO: return a tuple of x and y ###################################
    return self.x[idx], self.y[idx]
    ##########################################################################################

In [26]:
print(word_sequences_padded[0].max())

37


In [27]:
batch_size = 5
mini_dataset = NERDataset(word_sequences_padded, tashkeel_list_sequences_padded)
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

torch.Size([5, 7095]) torch.Size([5, 7095]) torch.Size([5, 7095]) torch.Size([5, 7095])
tensor([28, 18, 14,  ..., 37, 37, 37], dtype=torch.int32) 
 tensor([ 0,  6,  4,  ..., 14, 14, 14], dtype=torch.int32)


#### Expected output
torch.Size([5, 30]) torch.Size([5, 30]) torch.Size([3, 30]) torch.Size([3, 30])\
tensor([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
           10,    11,    12,    13,    14,     9,    15,     1,    16,    17,
           18,    19,    20,    21, 35180, 35180, 35180, 35180, 35180, 35180]) \
tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

# NER
The class that implementss the pytorch model for NER

In [28]:
class NER(nn.Module):
  def __init__(self, vocab_size=39, embedding_dim=50, hidden_size=50, n_classes=16):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(vocab_size, embedding_dim)

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    final_output = None
    ######################### TODO: implement the forward pass ####################################
    output = self.embedding(sentences)
    output, _ = self.lstm(output)
    final_output = self.linear(output)
    
    ###############################################################################################
    return final_output

In [29]:
model = NER()
print(model)

NER(
  (embedding): Embedding(39, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=16, bias=True)
)


#### Expected output
NER( \
  (embedding): Embedding(35181, 50) \
  (lstm): LSTM(50, 50, batch_first=True) \
  (linear): Linear(in_features=50, out_features=17, bias=True) \
)

# Training

In [30]:
def train(model, train_dataset, batch_size=256, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      # train_label = train_label.to(device)
      train_label = train_label.long().to(device)

      # (5) move the train label to the device
      train_input = train_input.long().to(device)


      # (6) do the forward pass
      # print(train_input.shape)
      output = model(train_input)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      # batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))
      batch_loss = criterion(output.reshape(-1, 16), train_label.reshape(-1))

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (output.argmax(dim=2) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()
      # To avoid the exploding gradient problem, we clip the gradients
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * train_dataset[0][0].shape[0])

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [31]:
train_dataset = NERDataset(word_sequences_padded, tashkeel_list_sequences_padded)

In [32]:
train(model, train_dataset)

  4%|██████████████████████████████████▎                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               | 7/196 [01:17<34:43, 11.03s/it]

In [None]:
# Save the model
# torch.save(model.state_dict(), "./model.pth")

In [None]:
test_before_preprocessing = read_data("../dataset/test.txt")

In [None]:
for i in range(len(test_before_preprocessing)):
    save_new_input_sentence(test_before_preprocessing[i], permission='a')

In [None]:
tokenized_input_char_test = read_tokenized_input(reverse_vocab, './OutputFiles/test_input_sentence.txt')

In [None]:
max_len_test = max([len(line) for line in tokenized_input_char_test])
print(max_len_test)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_sequences_padded_test = pad_sequences(tokenized_input_char_test, padding='post', value=37, maxlen=max_len_test)

# Evaluation

In [None]:
def evaluate(model, test_dataset, batch_size=512):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device)

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model(test_input)
      
      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = (output.argmax(dim=2) == test_label).sum().item()
      total_acc_test += acc
    
    # (6) calculate the over all accuracy
    total_acc_test /= (len(test_dataset) * test_dataset[0][0].shape[0])
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [None]:
# evaluate(model, test_dataset)

In [None]:
output = model(torch.tensor(word_sequences_padded_test))

print(output.shape)

In [None]:
print(output[0][0])

softmax_output = nn.functional.softmax(output, dim=-1)
print(softmax_output[0][0])

# Find the index of the maximum value along the last axis
max_arg = torch.argmax(softmax_output, dim=-1)

# Add a new dimension at the end to make the shape (2000, 1904, 1)
new_tensor = torch.unsqueeze(max_arg, dim=-1)

print(new_tensor.shape)
print(new_tensor[0][0])

In [None]:
# Generate csv file
with open('./OutputFiles/test_input_sentence.txt', 'r', encoding='utf-8') as file:
    test_txt = file.readlines()
        
list_of_sentences = []
for sentence in test_txt:
    list_of_sentences.append(sentence.strip())

# Create a list of lists with an added ID column and a single label column
csv_data = [['id', 'label']]

row = 0
column = 0
id = 0
for sentence in list_of_sentences:
    for char in sentence:
        if char == " " or char == ".":
            column += 1
            continue

        csv_data.append([id, new_tensor[row][column].item()])

        id += 1
        column += 1

    row += 1
    column = 0

with open("./Answer.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print(f'CSV file has been created.')

# Thank you