# I210503 Muhammad Zian Ahmed
# I212562 Humayun Malik
# NLP - D
# Project

In [2]:
import pandas as pd
import random

# Load Urdu and English datasets
with open('/kaggle/input/nlpproject/urd_Arab.dev', 'r', encoding='utf-8') as f:
    urd_dev = f.readlines()

with open('/kaggle/input/nlpproject/urd_Arab.devtest', 'r', encoding='utf-8') as f:
    urd_devtest = f.readlines()

with open('/kaggle/input/nlpproject/eng_Latn.dev', 'r', encoding='utf-8') as f:
    eng_dev = f.readlines()

with open('/kaggle/input/nlpproject/eng_Latn.devtest', 'r', encoding='utf-8') as f:
    eng_devtest = f.readlines()

# Combine datasets
urdu_sentences = urd_dev + urd_devtest
english_sentences = eng_dev + eng_devtest

# Ensure both datasets have the same number of sentences
assert len(urdu_sentences) == len(english_sentences), "Datasets are not aligned!"

# Create a DataFrame
data = pd.DataFrame({
    'urdu': [sentence.strip() for sentence in urdu_sentences],
    'english': [sentence.strip() for sentence in english_sentences]
})

# Shuffle the data
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into training, validation, and test sets
train_size = int(0.7 * len(data))
val_size = int(0.15 * len(data))

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]

# Save splits to files
train_data.to_csv('train.csv', index=False)
val_data.to_csv('val.csv', index=False)
test_data.to_csv('test.csv', index=False)

print("Data preprocessing complete. Files saved as 'train.csv', 'val.csv', and 'test.csv'.")


# Print dataset lengths and first 5 samples
print(f"Number of Urdu sentences: {len(urdu_sentences)}")
print(f"Number of English sentences: {len(english_sentences)}")
print("\nSample Urdu sentences:", urdu_sentences[:5])
print("\nSample English sentences:", english_sentences[:5])

Data preprocessing complete. Files saved as 'train.csv', 'val.csv', and 'test.csv'.
Number of Urdu sentences: 2009
Number of English sentences: 2009

Sample Urdu sentences: ['پیر کے روز، سٹینفورڈ اسکول آف میڈیسن کے سائنسدانوں نے ایک جدید تشخیصی آلہ دریافت کرنے کا اعلان کیا ہے جو خلیوں کو اس کی اقسام کے لحاظ سے ترتیب دے سکتا ہے: یہ ایک چھوٹی سی پرنٹیبل چپ ہے جو غالباً ایک امریکی سنٹ میں معیاری انک جیٹ پرنٹرز کا استعمال کر کے تیار کی جا سکتی ہے-\n', 'سرکردہ محققین کہتے ہیں کہ اس سے کم آمدنی والے ممالک کے مریضوں میں کینسر، تپ و دق، ایچ آئی وی اور ملیریا کا جلد پتہ چل سکتا ہے، جہاں چھاتی کے کینسر جیسی بیماریوں سے بچنے کی شرح امیر ممالک کی مقابلے میں نصف ہو سکتی ہے۔\n', 'مقامی وقت کے مطابق تقریبا صبح 9:30 بجے (0230 UTC) 39C JASگریپین رن وے پر ٹکرا کر دھماکے کے ساتھ پھٹ کر تباہ ہو گیا، جس کے باعث ہوائی اڈے کی تجارتی پروازیں بند کرنی پڑیں-\n', 'پائلٹ کو اسکواڈرن لیڈر ڈیلوکرٹ پٹاوی کے نام سے شناخت کیا گیا۔\n', 'مقامی میڈیا نے بتایا کہ ہوائی اڈے کی ایک آگ بھجانے والی گاڑی کاروائی کرتے وقت الٹ گ

In [3]:
import re

# Tokenizer for Urdu
def tokenize_urdu(sentence):
    return re.findall(r'\w+|[^\s\w]', sentence, re.UNICODE)

# Tokenizer for English
def tokenize_english(sentence):
    return re.findall(r'\w+|[^\s\w]', sentence)

# Tokenize training, validation, and test data
def tokenize_dataset(dataset):
    urdu_tokenized = dataset['urdu'].apply(tokenize_urdu)
    english_tokenized = dataset['english'].apply(tokenize_english)
    return urdu_tokenized, english_tokenized

# Tokenize each split
train_urdu, train_english = tokenize_dataset(train_data)
val_urdu, val_english = tokenize_dataset(val_data)
test_urdu, test_english = tokenize_dataset(test_data)

# Save tokenized data
train_tokenized = pd.DataFrame({'urdu': train_urdu, 'english': train_english})
val_tokenized = pd.DataFrame({'urdu': val_urdu, 'english': val_english})
test_tokenized = pd.DataFrame({'urdu': test_urdu, 'english': test_english})

train_tokenized.to_csv('train_tokenized.csv', index=False)
val_tokenized.to_csv('val_tokenized.csv', index=False)
test_tokenized.to_csv('test_tokenized.csv', index=False)

print("Tokenized data saved as 'train_tokenized.csv', 'val_tokenized.csv', and 'test_tokenized.csv'.")

# Print first 5 samples of each split
print("\nTraining samples:\n", train_data.head())
print("\nValidation samples:\n", val_data.head())
print("\nTest samples:\n", test_data.head())


# Print first 5 tokenized samples
print("\nTokenized Training Urdu Sentences:\n", train_urdu.head())
print("\nTokenized Training English Sentences:\n", train_english.head())


Tokenized data saved as 'train_tokenized.csv', 'val_tokenized.csv', and 'test_tokenized.csv'.

Training samples:
                                                 urdu  \
0  بہت سے لوگوں کا مشاہدہ ہے کہ فعلیات اور برتاؤ ...   
1  باردیا کے مشرق میں کمین لگا کر برٹش نے اطالوی ...   
2  جرمنی نے اس حملے کو "آپریشن سی لاین" نام دیا۔ ...   
3  ڈی- ڈے لینڈنگ اور اس کے بعد کی لڑائیوں کے نتیج...   
4  بیلجیئم کے آج کے دور کے حصے ماضی میں لگزمبرگ ک...   

                                             english  
0  Many observed rhythms in physiology and behavi...  
1  In an ambush east of Bardia, the British captu...  
2  Germany code-named the attack “Operation Seali...  
3  The D-Day landings and the following battles h...  
4  Present-day parts of Belgium were part of Luxe...  

Validation samples:
                                                    urdu  \
1406       کینیڈا کا خلاف نڈال کے رو برو ریکارڈ 7–2 ہے۔   
1407  آو ہم اٹلی کے پلانس کی تشریح سے بات شروع کریں۔...   
1408  پچھلے ماہ پولی

In [4]:
from collections import Counter

def build_vocab(sentences):
    word_counts = Counter([word for sentence in sentences for word in sentence])
    vocab = {word: idx for idx, (word, _) in enumerate(word_counts.most_common(), start=4)}
    vocab['<pad>'] = 0
    vocab['<sos>'] = 1
    vocab['<eos>'] = 2
    vocab['<unk>'] = 3
    return vocab

# Build vocabularies
urdu_vocab = build_vocab(train_urdu)
english_vocab = build_vocab(train_english)

# Save vocabularies
import json

with open('urdu_vocab.json', 'w') as f:
    json.dump(urdu_vocab, f)

with open('english_vocab.json', 'w') as f:
    json.dump(english_vocab, f)

print("Vocabularies saved as 'urdu_vocab.json' and 'english_vocab.json'.")


# Print vocabulary sizes and first 10 words
print(f"Urdu Vocabulary Size: {len(urdu_vocab)}")
print(f"English Vocabulary Size: {len(english_vocab)}")

print("\nSample Urdu Vocabulary:\n", dict(list(urdu_vocab.items())[:10]))
print("\nSample English Vocabulary:\n", dict(list(english_vocab.items())[:10]))



Vocabularies saved as 'urdu_vocab.json' and 'english_vocab.json'.
Urdu Vocabulary Size: 7266
English Vocabulary Size: 7498

Sample Urdu Vocabulary:
 {'کے': 4, '۔': 5, 'کی': 6, 'ہے': 7, 'میں': 8, '،': 9, 'سے': 10, 'اور': 11, 'کو': 12, 'ہیں': 13}

Sample English Vocabulary:
 {'the': 4, '.': 5, ',': 6, 'of': 7, 'and': 8, 'to': 9, 'a': 10, 'in': 11, 'is': 12, 'The': 13}


In [5]:
def sentence_to_indices(sentence, vocab, max_len=50):
    indices = [vocab.get(word, vocab['<unk>']) for word in sentence]
    indices = [vocab['<sos>']] + indices[:max_len-2] + [vocab['<eos>']]
    return indices + [vocab['<pad>']] * (max_len - len(indices))

def dataset_to_indices(urdu_sentences, english_sentences, urdu_vocab, english_vocab, max_len=50):
    urdu_indices = urdu_sentences.apply(lambda x: sentence_to_indices(x, urdu_vocab, max_len))
    english_indices = english_sentences.apply(lambda x: sentence_to_indices(x, english_vocab, max_len))
    return urdu_indices, english_indices

# Convert datasets to indices
train_urdu_indices, train_english_indices = dataset_to_indices(train_urdu, train_english, urdu_vocab, english_vocab)
val_urdu_indices, val_english_indices = dataset_to_indices(val_urdu, val_english, urdu_vocab, english_vocab)
test_urdu_indices, test_english_indices = dataset_to_indices(test_urdu, test_english, urdu_vocab, english_vocab)

# Save preprocessed data
train_preprocessed = pd.DataFrame({'urdu': train_urdu_indices, 'english': train_english_indices})
val_preprocessed = pd.DataFrame({'urdu': val_urdu_indices, 'english': val_english_indices})
test_preprocessed = pd.DataFrame({'urdu': test_urdu_indices, 'english': test_english_indices})

train_preprocessed.to_csv('train_preprocessed.csv', index=False)
val_preprocessed.to_csv('val_preprocessed.csv', index=False)
test_preprocessed.to_csv('test_preprocessed.csv', index=False)

print("Preprocessed data saved as 'train_preprocessed.csv', 'val_preprocessed.csv', and 'test_preprocessed.csv'.")



# Print first 5 samples of indexed data
print("\nIndexed Urdu Sentences:\n", train_urdu_indices.head())
print("\nIndexed English Sentences:\n", train_english_indices.head())

Preprocessed data saved as 'train_preprocessed.csv', 'val_preprocessed.csv', and 'test_preprocessed.csv'.

Indexed Urdu Sentences:
 0    [1, 42, 10, 82, 14, 777, 7, 19, 3151, 11, 1506...
1    [1, 3154, 4, 331, 8, 3155, 332, 33, 3156, 18, ...
2    [1, 398, 18, 16, 482, 12, 38, 1511, 154, 3159,...
3    [1, 679, 30, 954, 2042, 11, 16, 4, 49, 6, 2043...
4    [1, 1512, 4, 166, 4, 155, 4, 485, 778, 8, 2044...
Name: urdu, dtype: object

Indexed English Sentences:
 0    [1, 320, 1307, 1901, 11, 3171, 8, 373, 63, 317...
1    [1, 53, 40, 3177, 374, 7, 3178, 6, 4, 430, 190...
2    [1, 321, 3183, 16, 508, 4, 322, 285, 1904, 318...
3    [1, 13, 1310, 16, 1311, 3187, 8, 4, 623, 1312,...
4    [1, 3188, 16, 114, 774, 7, 1907, 36, 141, 7, 9...
Name: english, dtype: object


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

import pandas as pd

# Load preprocessed data

val_data = pd.read_csv('val_preprocessed.csv')

# Extract the src (Urdu) and trg (English) sequences
train_data = pd.read_csv('train_preprocessed.csv')
train_src = train_data['urdu'].apply(eval).tolist()
train_trg = train_data['english'].apply(eval).tolist()

val_src = val_data['urdu'].apply(eval).tolist()
val_trg = val_data['english'].apply(eval).tolist()




# Custom LSTM Cell
class LSTMCell(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LSTMCell, self).__init__()
        self.hidden_dim = hidden_dim

        self.W_i = nn.Linear(input_dim, hidden_dim)  # Input gate
        self.W_f = nn.Linear(input_dim, hidden_dim)  # Forget gate
        self.W_o = nn.Linear(input_dim, hidden_dim)  # Output gate
        self.W_c = nn.Linear(input_dim, hidden_dim)  # Cell state gate

    def forward(self, input, hidden, cell):
        # Hidden state and cell state
        h, c = hidden, cell

        # Gates
        i = torch.sigmoid(self.W_i(input))
        f = torch.sigmoid(self.W_f(input))
        o = torch.sigmoid(self.W_o(input))
        c_tilde = torch.tanh(self.W_c(input))

        # Update cell state and hidden state
        c = f * c + i * c_tilde
        h = o * torch.tanh(c)

        return h, c

# Encoder with Custom LSTM Cell
class CustomLSTMEncoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim):
        super(CustomLSTMEncoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.hidden_dim = hidden_dim
        self.lstm_cell = LSTMCell(embed_dim, hidden_dim)

    def forward(self, src):
        embedded = self.embedding(src)  # Shape: (batch_size, seq_len, embed_dim)
        batch_size = embedded.size(0)
        seq_len = embedded.size(1)

        hidden = torch.zeros(batch_size, self.hidden_dim).to(src.device)
        cell = torch.zeros(batch_size, self.hidden_dim).to(src.device)

        encoder_outputs = []

        for t in range(seq_len):
            hidden, cell = self.lstm_cell(embedded[:, t, :], hidden, cell)
            encoder_outputs.append(hidden)

        encoder_outputs = torch.stack(encoder_outputs, dim=1)  # Shape: (batch_size, seq_len, hidden_dim)
        return encoder_outputs, (hidden, cell)

# Attention Mechanism
# Attention Mechanism
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.W_a = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1)

    def forward(self, encoder_outputs, decoder_hidden):
        # Adjust the shape of decoder_hidden to match encoder_outputs
        decoder_hidden = decoder_hidden.unsqueeze(1)  # Shape: (batch_size, 1, hidden_dim)

        # Compute the attention scores
        scores = self.v(torch.tanh(self.W_a(encoder_outputs) + decoder_hidden)).squeeze(-1)

        # Apply softmax to get attention weights
        attention_weights = F.softmax(scores, dim=1)  # Shape: (batch_size, seq_len)

        # Compute the context vector as the weighted sum of encoder outputs
        context_vector = torch.sum(attention_weights.unsqueeze(2) * encoder_outputs, dim=1)  # Shape: (batch_size, hidden_dim)

        return context_vector, attention_weights

# Decoder with Custom LSTM Cell
# Decoder with Custom LSTM Cell
class CustomLSTMDecoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim):
        super(CustomLSTMDecoder, self).__init__()
        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.hidden_dim = hidden_dim
        self.lstm_cell = LSTMCell(embed_dim + hidden_dim, hidden_dim)  # LSTM with combined input and context vector
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, decoder_hidden, context_vector):
        embedded = self.embedding(input).unsqueeze(1)  # Shape: (batch_size, 1, embed_dim)
        combined = torch.cat((embedded, context_vector.unsqueeze(1)), dim=2)  # Shape: (batch_size, 1, embed_dim + hidden_dim)
        output, hidden = self.lstm_cell(combined.squeeze(1), decoder_hidden[0], decoder_hidden[1])  # Corrected unpacking
        output = self.fc(output)  # Shape: (batch_size, output_dim)
        return output, (hidden, hidden)  # Return hidden and hidden for consistency


# Seq2Seq Model with Custom LSTM Cells and Attention
class CustomLSTMSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder, attention):
        super(CustomLSTMSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.attention = attention

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src.device)

        # Encoder outputs
        encoder_outputs, (hidden, cell) = self.encoder(src)

        # Decoder input (starting with <sos> token)
        input = trg[:, 0]  # The first token for each batch is <sos>

        # Initial decoder hidden state
        decoder_hidden = (hidden, cell)

        for t in range(1, trg_len):
            # Attention
            context_vector, _ = self.attention(encoder_outputs, decoder_hidden[0])

            # Decoder forward pass
            output, decoder_hidden = self.decoder(input, decoder_hidden, context_vector)

            # Store the output
            outputs[:, t, :] = output

            # Get the most probable word and use as input for next timestep (teacher forcing)
            top1 = output.argmax(1)
            input = top1 if random.random() < teacher_forcing_ratio else trg[:, t]

        return outputs




# Example Hyperparameters
INPUT_DIM = len(urdu_vocab)  # Example vocab size for source language (e.g. Urdu)
OUTPUT_DIM = len(english_vocab)  # Example vocab size for target language (e.g. English)
EMBED_DIM = 256
HIDDEN_DIM = 512

# Create Encoder, Decoder, Attention, and Seq2Seq Model
encoder = CustomLSTMEncoder(INPUT_DIM, EMBED_DIM, HIDDEN_DIM)
decoder = CustomLSTMDecoder(OUTPUT_DIM, EMBED_DIM, HIDDEN_DIM)
attention = Attention(HIDDEN_DIM)

model = CustomLSTMSeq2Seq(encoder, decoder, attention)

# Specify device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Define the device

# Move the model to the device
model = model.to(device)

# Optimizer and Loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding index (0)

# Training loop
# Training loop
# Adjust the train and evaluate function for correct data format
def train(model, train_src, train_trg, val_src, val_trg, epochs=10, batch_size=32, teacher_forcing_ratio=0.5):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for i in range(0, len(train_src), batch_size):  # Loop over training data
            batch_src = torch.tensor(train_src[i:i+batch_size]).to(device)
            batch_trg = torch.tensor(train_trg[i:i+batch_size]).to(device)

            optimizer.zero_grad()

            output = model(batch_src, batch_trg, teacher_forcing_ratio)
            output_dim = output.shape[-1]
            output = output[:, 1:, :].contiguous().view(-1, output_dim)  # Skip <sos> (1st token)

            # Adjust target shape to match the output
            trg = batch_trg[:, 1:].contiguous().view(-1)  # Skip <sos> (1st token)

            # Check if the shapes match
            assert output.shape[0] == trg.shape[0], f"Output shape {output.shape[0]} does not match target shape {trg.shape[0]}"

            loss = criterion(output, trg)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_src)}')

        # Evaluate on validation data after each epoch
        evaluate(model, val_src, val_trg, batch_size)


def evaluate(model, val_src, val_trg, batch_size):
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for i in range(0, len(val_src), batch_size):
            batch_src = torch.tensor(val_src[i:i+batch_size]).to(device)
            batch_trg = torch.tensor(val_trg[i:i+batch_size]).to(device)

            output = model(batch_src, batch_trg, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:, 1:, :].contiguous().view(-1, output_dim)  # Skip <sos> (1st token)

            # Skip <sos> token from target sequence
            trg = batch_trg[:, 1:].contiguous().view(-1)  # Skip <sos> (1st token)

            # Ensure the target and output sizes are the same
            assert output.shape[0] == trg.shape[0], f"Output shape {output.shape[0]} does not match target shape {trg.shape[0]}"

            loss = criterion(output, trg)
            val_loss += loss.item()

    print(f'Validation Loss: {val_loss / len(val_src)}')


# Example: Start training
train(model, train_src, train_trg, val_src, val_trg, epochs=5, batch_size=32, teacher_forcing_ratio=0.5)



Epoch 1/5, Loss: 0.22592131443756233
Validation Loss: 0.22539847712976196
Epoch 2/5, Loss: 0.20869831611557332
Validation Loss: 0.22872842427504023
Epoch 3/5, Loss: 0.20325961051930067
Validation Loss: 0.23185210212124543
Epoch 4/5, Loss: 0.19725058021111305
Validation Loss: 0.2336385368904798
Epoch 5/5, Loss: 0.19204806365804009
Validation Loss: 0.23464615400447403


In [9]:
import torch
import random

def translate(model, src, trg_vocab, max_len=50):
    model.eval()
    with torch.no_grad():
        # The input to the decoder should be <sos> token (start of sequence)
        input = src[0]  # Assume src is a single sentence here (batch size 1)
        output_sentence = []

        # Initial hidden and cell states of the decoder
        encoder_outputs, (hidden, cell) = model.encoder(src.unsqueeze(0))  # Add batch dimension
        decoder_hidden = (hidden, cell)

        # Start with the <sos> token
        input = torch.tensor([trg_vocab['<sos>']]).to(device)

        for t in range(max_len):
            context_vector, _ = model.attention(encoder_outputs, decoder_hidden[0])
            output, decoder_hidden = model.decoder(input, decoder_hidden, context_vector)

            # Get the most probable token
            top1 = output.argmax(1)

            # Append to the generated sequence
            output_sentence.append(top1.item())

            # Stop if we generate the <eos> token
            if top1.item() == trg_vocab['<eos>']:
                break

            input = top1  # Use the generated word as the input for the next timestep

        return output_sentence

# Example: Translate the test set
test_data = pd.read_csv('test_preprocessed.csv')
test_src = test_data['urdu'].apply(eval).tolist()
test_trg = test_data['english'].apply(eval).tolist()

test_translations = []
for src_sentence in test_src:
    translation = translate(model, torch.tensor(src_sentence).to(device), english_vocab)
    test_translations.append(translation)


In [12]:
from nltk.translate.bleu_score import corpus_bleu

# Prepare references for BLEU evaluation (each reference is a list of words)
references = [[trg] for trg in test_trg]  # Wrap each target translation in a list

# Compute BLEU score
bleu_score = corpus_bleu(references, test_translations)
print(f'BLEU Score: {bleu_score}')


BLEU Score: 0.012018732047514769
