<h3> We used Google Colab to train our data, so this step is necessery.</h3>
<h3> If you use this notebook in local, you can pass this cell.</h3>


In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/YZV405E_NLP/project

ModuleNotFoundError: No module named 'google.colab'

<h1><b>Reading Files</b></h1>


In [None]:
#Reading the training files
with open('data/wrong.txt', 'r', encoding='utf-8') as file:
    wrong_sentences = file.readlines()

with open('data/correct.txt', 'r', encoding='utf-8') as file:
    correct_sentences = file.readlines()

#Getting sentences
wrong_sentences = [sentence.strip() for sentence in wrong_sentences]
correct_sentences = [sentence.strip() for sentence in correct_sentences]

train_x = wrong_sentences  #Wrong sentences (without Turkish characters) as train_x
train_y = correct_sentences  #Correct sentences as train_y

<h1><b>Tokenization</b></h1>


In [None]:
MAX_LENGTH = 500  #Limiting the maximum sequence length

#Sacrificing some sequences to minimize the required computational power, we have still over 99% of thr training set remaining!
train_x_filtered = [sentence for sentence in train_x if len(sentence) <= MAX_LENGTH]
train_y_filtered = [sentence for sentence in train_y if len(sentence) <= MAX_LENGTH]

In [None]:
#Lists to store tokenized sequences
tokenized_train_x = []
tokenized_train_y = []

for sentence in train_x_filtered:
    tokenized_sentence = list(sentence)  #Tokenizing into individual characters
    tokenized_sentence.append('<EOS>')   #Adding <EOS> token to mark end of sequences
    tokenized_train_x.append(tokenized_sentence)

for label in train_y_filtered:
    tokenized_label = list(label)
    tokenized_label.append('<EOS>')
    tokenized_train_y.append(tokenized_label)

<h1><b>Padding</b></h1>


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

desired_length = 501 #500 + 1, adding 1 because of the <EOS> token.

PAD_TOKEN = '<PAD>'  #Padding token

#Padding sequences
padded_train_x = pad_sequences(tokenized_train_x, maxlen=desired_length, padding='post', truncating='post', value=PAD_TOKEN, dtype=object)
padded_train_y = pad_sequences(tokenized_train_y, maxlen=desired_length, padding='post', truncating='post', value=PAD_TOKEN, dtype=object)

#Converting padding sequences back to lists
padded_train_x = [list(seq) for seq in padded_train_x]
padded_train_y = [list(seq) for seq in padded_train_y]

In [None]:
#Flattening the list of lists in padded_train_x
flattened_padded_train_x = [char for sequence in padded_train_x for char in sequence]

unique_characters = set(flattened_padded_train_x) #Getting unique characters from padded_train_x
turkish_characters = ["ı", "ö", "ş", "ğ", "ü", "ç"]
unique_characters.update(turkish_characters) #Adding Turkish unique characters

#Creating dictionaries to use in embedding, each token will be represented by its index number
index_to_char = {index: char for index, char in enumerate(unique_characters)}
char_to_index = {char: index for index, char in enumerate(unique_characters)}

In [None]:
index_to_char.keys() #Shows the number of unique tokens

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131])

In [None]:
#Embedding train_x and train_y
numerical_train_x = [[char_to_index[char] for char in seq] for seq in padded_train_x]
numerical_train_y = [[char_to_index[char] for char in seq] for seq in padded_train_y]

<h1><b>Model Design</b></h1>
<h2>Parameters setting</h2>



In [56]:
import torch
import torch.nn as nn
from torchsummary import summary

vocab_size = len(unique_characters) #Number of unique tokens
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_units_forward, hidden_units_backward, num_layers, dropout_rate):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #Embedding layer
        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_units_forward, num_layers=num_layers, batch_first=True, dropout=dropout_rate, bidirectional=True) #Forward LSTM layer that processes inputs in forward direction
        self.bilstm_backward = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_units_backward, num_layers=num_layers, batch_first=True, dropout=dropout_rate, bidirectional=True) #Backward LSTM layer
        self.dropout = nn.Dropout(dropout_rate) #Dropout layer
        self.fc = nn.Linear((hidden_units_forward + hidden_units_backward) * 2, vocab_size)  #Fully connected layer, *2 is for bidirectional

    def forward(self, x):
        x = x.long().to(self.embedding.weight.device)  #Ensuring input tensor is on the same device as the model
        embedded = self.embedding(x) #Converting input to dense embeddings
        lstm_output, _ = self.bilstm(embedded) #Forward output
        embedded_reversed = torch.flip(embedded, dims=[1]) #Reversing the embedded input sequences
        lstm_output_backward, _ = self.bilstm_backward(embedded_reversed) #Output of the backward LSTM
        lstm_output = self.dropout(lstm_output) #Applying dropout
        lstm_output_backward = self.dropout(lstm_output_backward)
        output = self.fc(torch.cat([lstm_output, lstm_output_backward], dim=-1)) #Getting the final output
        return output

#Model parameters
embedding_dim = 128
hidden_units_forward = 128
hidden_units_backward = 128
num_layers = 2
dropout_rate = 0.5

model = BiLSTMModel(vocab_size, embedding_dim, hidden_units_forward, hidden_units_backward, num_layers, dropout_rate).to(device)

In [45]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Number of parameters: {total_params}")

Number of parameters: 1403524


<h1><b>Training</b></h1>


In [57]:
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Getting the data ready for PyTorch model
# We used 16 batch because of computitation resources limitations.
train_dataset = TensorDataset(torch.tensor(numerical_train_x), torch.tensor(numerical_train_y))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Defining the loss function and optimizer
# We chose the CrossEntropyLoss because it gave us the best result
# We many different optimizers, and their parameters and this is the best combination that gives the best result.
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.001, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=True)

#Start of training
num_epochs = 50
for epoch in range(num_epochs):
    model.train() 
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()  
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs)                             # Creation of the output using input
        outputs = outputs.view(-1, outputs.shape[-1])       # Reshape outputs
        targets = targets.view(-1)                          # Flatten targets
        loss = criterion(outputs, targets)                  # Compute the loss using our output and labels(targets)
        loss.backward()                                     # Backward pass
        optimizer.step()                                    # Update weights using newly found loss
        running_loss += loss.item() * inputs.size(0)        # Loss of current input loss 
    epoch_loss = running_loss / len(train_dataset)          # Total epoch loss
    # We printed the losses every epoch because we prefered to see the changes per epoch.
    # This could be printed in wider range such as in every (2,3,5,10) epoch, so that it wont spam messages.
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")


Epoch [1/50], Loss: 0.0149
Epoch [2/50], Loss: 0.0049
Epoch [3/50], Loss: 0.0040
Epoch [4/50], Loss: 0.0035
Epoch [5/50], Loss: 0.0032
Epoch [6/50], Loss: 0.0030
Epoch [7/50], Loss: 0.0028
Epoch [8/50], Loss: 0.0027
Epoch [9/50], Loss: 0.0025
Epoch [10/50], Loss: 0.0024
Epoch [11/50], Loss: 0.0024
Epoch [12/50], Loss: 0.0023
Epoch [13/50], Loss: 0.0022
Epoch [14/50], Loss: 0.0022
Epoch [15/50], Loss: 0.0021
Epoch [16/50], Loss: 0.0021
Epoch [17/50], Loss: 0.0020
Epoch [18/50], Loss: 0.0020
Epoch [19/50], Loss: 0.0019
Epoch [20/50], Loss: 0.0019
Epoch [21/50], Loss: 0.0018
Epoch [22/50], Loss: 0.0018
Epoch [23/50], Loss: 0.0018
Epoch [24/50], Loss: 0.0017
Epoch [25/50], Loss: 0.0017
Epoch [26/50], Loss: 0.0017
Epoch [27/50], Loss: 0.0016
Epoch [28/50], Loss: 0.0016
Epoch [29/50], Loss: 0.0016
Epoch [30/50], Loss: 0.0016
Epoch [31/50], Loss: 0.0015
Epoch [32/50], Loss: 0.0015
Epoch [33/50], Loss: 0.0015
Epoch [34/50], Loss: 0.0015
Epoch [35/50], Loss: 0.0015
Epoch [36/50], Loss: 0.0015
E

<h1><b>Saving and Loading Weights of the Model</b></h1>


In [58]:
torch.save(model.state_dict(), "weights/model_weights3.pth") #Saving the model weights

In [70]:
model = BiLSTMModel(vocab_size, embedding_dim, hidden_units_forward, hidden_units_backward, num_layers, dropout_rate).to(device)

#Loading the weights
model.load_state_dict(torch.load("weights/model_weights3.pth"))

<All keys matched successfully>

<h1><b>Evaluation</b></h1>


In [55]:
import random

model.eval()

num_examples = 5  #To display 5 random samples from the training set
random_indices = random.sample(range(len(numerical_train_x)), num_examples) #Selecting random sentences

for i in random_indices:
    #Wrong and correct sequences
    input_seq = padded_train_x[i]
    correct_seq = padded_train_y[i]

    #Converting characters to their embeddings
    input_indices = [char_to_index[char] for char in input_seq]
    correct_indices = [char_to_index[char] for char in correct_seq]

    #Removing padding token
    input_indices = [index for index in input_indices if index != char_to_index['<PAD>']]
    correct_indices = [index for index in correct_indices if index != char_to_index['<PAD>']]

    #Decoding numerical indices back into characters by excluding the <EOS> token
    input_text = ''.join([index_to_char[index] for index in input_indices if index_to_char[index] != '<EOS>'])
    correct_text = ''.join([index_to_char[index] for index in correct_indices if index_to_char[index] != '<EOS>'])

    #Preparing input for prediction
    input_for_prediction = torch.tensor(numerical_train_x[i]).unsqueeze(0).to(device)

    #Predicting the input sequence
    with torch.no_grad():
        predicted_numerical = model(input_for_prediction) #Output of the model
    predicted_seq = [index_to_char[index.item()] for index in predicted_numerical.argmax(dim=-1)[0]] #Converting the output back into the characters

    #Exclude everything after the <EOS> token while printing
    predicted_text = ''
    for char in predicted_seq:
        if char == '<EOS>':
            break
        predicted_text += char

    print("Input   : ", input_text)
    print("Correct : ", correct_text)
    print("Predicted: ", predicted_text)
    print("-" * 50)

Input   :  tarhanerdemyasa yapabilir  idare yetkisiz diyor hukumet bir davranisi yasalarda bir hukum bulup haklari iptal etmekte ve o iptal butun ulkede emsal sayilmaktadir
Correct :  tarhanerdemyasa yapabilir  idare yetkisiz diyor hükümet bir davranışı yasalarda bir hüküm bulup hakları iptal etmekte ve o iptal bütün ülkede emsal sayılmaktadır
Predicted:  tarhanerdemyasa yapabilir  idare yetkisiz diyor hükümet bir davranışı yasalarda bir hüküm bulup hakları iptal etmekte ve o iptal bütün ülkede emsal sayılmaktadır
--------------------------------------------------
Input   :  gul  zirve oncesi sunlari soyledi :
Correct :  gül  zirve öncesi şunları söyledi :
Predicted:  gül  zirve öncesi şunları söyledi :
--------------------------------------------------
Input   :  fransiz gazetesi  abd nin irak a mudahaleyle ilgili planlarini gozden gecirmek zorunda kaldigini belirterek  bu konuda abd basininda cikan haberlere de atifta bulundu
Correct :  fransız gazetesi  abd nin ırak a müdahaleyle il

<h1><b>Testing Model</b></h1>


<h2>Reading test file</h2>


In [59]:
import pandas as pd

test_data = pd.read_csv("data/test.csv") #Test data

test_sentences = test_data["Sentence"]

#Tokenizing test sequences
tokenized_test_x = []
for sentence in test_sentences:
    tokenized_sentence = list(sentence.strip())
    tokenized_sentence.append('<EOS>')
    tokenized_test_x.append(tokenized_sentence)

<h2>Padding</h2>


In [60]:
max_length_test = max(len(seq) for seq in tokenized_test_x) #Finding the maximum length of sequences in the test data

padded_test_x = pad_sequences(tokenized_test_x, maxlen=max_length_test, padding='post', truncating='post', value=PAD_TOKEN, dtype=object) #Padding test sequences until all of them have the length of "max_length_test"

padded_test_x = [list(seq) for seq in padded_test_x] #Converting back to list

In [61]:
#Our training set did not have capitalized letters, but the test set does.
#Therefore, we need to make all the capitalized letters lower case first,
#And then convert them back to their original form while printing the output file.

changed_indices = [] #List to store indices with upper case letters
for i, seq in enumerate(padded_test_x):
    for j, char in enumerate(seq):
        if char.isupper() and char not in ['<EOS>', '<PAD>']: #Excluding the upper case letters in <EOS> and <PAD> tokens
            changed_indices.append((i, j))
            padded_test_x[i][j] = char.lower() #Converting upper case letters into lower case

numerical_test_x = [[char_to_index[char] for char in seq] for seq in padded_test_x]

<h2>Predicting - Saving the Output</h2>


In [71]:
import csv
import os

output_csv_file = "eval/results7.csv" #File to write our final output
os.makedirs(os.path.dirname(output_csv_file), exist_ok=True)

predictions = [] #List to store predicted sentences

for index, row in enumerate(test_data["Sentence"]):
    input_sequence = numerical_test_x[index]
    input_tensor = torch.tensor(input_sequence).unsqueeze(0).to(device)
    with torch.no_grad():
        predicted_numerical = model(input_tensor)
    predicted_sequence = [index_to_char[index.item()] for index in predicted_numerical.argmax(dim=-1)[0]]
    if '<EOS>' in predicted_sequence:
        eos_index = predicted_sequence.index('<EOS>')
        predicted_sequence = predicted_sequence[:eos_index] #Removing anything after the <EOS> token
    prediction = ''.join([char for char in predicted_sequence if char != '<EOS>'])
    predictions.append(prediction)

#Writing predictions to the output file
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ID', 'Prediction'])
    for index, prediction in enumerate(predictions):
        # Checking the indices if they corresponds to a changed uppercase letter and if they are, re-capitalize accordingly
        for i, j in changed_indices:
            if index == i:
                prediction = prediction[:j] + prediction[j].upper() + prediction[j+1:]
        writer.writerow([index, prediction])