In [None]:
!pip install LughaatNLP

Collecting LughaatNLP
  Downloading LughaatNLP-1.0.6-py3-none-any.whl.metadata (39 kB)
Collecting python-Levenshtein (from LughaatNLP)
  Downloading python_Levenshtein-0.26.0-py3-none-any.whl.metadata (3.7 kB)
Collecting gtts (from LughaatNLP)
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Collecting SpeechRecognition (from LughaatNLP)
  Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl.metadata (28 kB)
Collecting pydub (from LughaatNLP)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting Levenshtein==0.26.0 (from python-Levenshtein->LughaatNLP)
  Downloading levenshtein-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.2 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.26.0->python-Levenshtein->LughaatNLP)
  Downloading rapidfuzz-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading LughaatNLP-1.0.6-py3-none-any.whl (69.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

# Libraries

In [None]:
import torch
import pandas as pd
import string
import re
from LughaatNLP import LughaatNLP
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn

In [None]:

# Turn off pandas' SettingWithCopyWarning for the rest of the session.
pd.set_option('mode.chained_assignment', None)


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


# Loading the dataset

In [None]:
# Mount Google Drive at /content/drive so the dataset path used below resolves.
# NOTE(review): google.colab is a Colab-specific module, so this cell is expected to
# work only inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the English-Urdu parallel corpus from the mounted Drive folder.
file_path = '/content/drive/MyDrive/datasets/english-urdu-dataset/parallel-corpus.xlsx'
df = pd.read_excel(file_path)

# Keep every column whose header does not begin with "Unnamed".
original_data = df.loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]

original_data.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [None]:
# Work on an independent (deep) copy so `original_data` stays untouched and reusable.
data = original_data.copy(deep=True)

In [None]:
# Give the columns self-explanatory names: 'SENTENCES' -> 'English', 'MEANING' -> 'Urdu'.
# The lookup key used to be 'SENTENCES ' (with a trailing space). DataFrame.rename
# silently skips keys that do not match a header, so the headers are whitespace-stripped
# first; the rename then works whether or not the spreadsheet header carries stray spaces.
data = data.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)
data = data.rename(columns={'SENTENCES': 'English', 'MEANING': 'Urdu'})

# Fail here, rather than several cells later, if the expected headers were not found.
assert {'English', 'Urdu'} <= set(data.columns), f"unexpected columns: {list(data.columns)}"

# Display the updated DataFrame to confirm
data.head()


Unnamed: 0,English,Urdu
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


# Preprocessing

In [None]:
# Row-level cleaning, applied as one chain:
#   1. discard rows that have a NaN in any column,
#   2. discard rows whose 'English' text is empty or a single character,
#   3. discard rows whose 'Urdu' text is empty or a single character.
data = (
    data.dropna()
    .loc[lambda frame: (frame['English'].str.len() > 1) & (frame['English'] != '')]
    .loc[lambda frame: (frame['Urdu'].str.len() > 1) & (frame['Urdu'] != '')]
)

### English Preprocessing

In [None]:
# Function to preprocess sentences
def preprocess_english_data(data):
    """Normalise one English sentence for tokenization.

    The substitutions run in order: URLs are deleted, every character that is
    not an ASCII letter or whitespace is deleted, and runs of whitespace are
    collapsed to a single space. The result is then trimmed and lower-cased.

    Args:
        data: The raw sentence as a string.

    Returns:
        The cleaned, lower-case sentence.
    """
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # drop URLs
        (r'[^a-zA-Z\s]', '', 0),                         # keep letters and whitespace only
        (r'\s+', ' ', 0),                                # collapse whitespace runs
    )
    cleaned = data
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip().lower()


In [None]:
# Store the cleaned English text in its own column, then preview raw vs. cleaned.
data['Clean_English'] = data['English'].map(preprocess_english_data)

data[['English', 'Clean_English']].head()

Unnamed: 0,English,Clean_English
0,How can I communicate with my parents?,how can i communicate with my parents
1,How can I make friends?’,how can i make friends
2,Why do I get so sad?’,why do i get so sad
3,"If you’ve asked yourself such questions, you’r...",if youve asked yourself such questions youre n...
4,"Depending on where you’ve turned for guidance,...",depending on where youve turned for guidance y...


### Urdu Preprocessing

In [None]:
# Create one shared LughaatNLP instance; the Urdu cleaning function below calls its
# remove_diacritics and remove_urls methods.
urdu_text_processing = LughaatNLP()


In [None]:
# Function to clean Urdu sentences

def preprocess_urdu_data(data):
    """Normalise one Urdu sentence for tokenization.

    Steps, in order:
      1. Remove URLs. This has to happen before any character stripping,
         because once Latin letters and punctuation are gone a URL can no
         longer be recognised.
      2. Remove diacritics.
      3. Delete every character outside U+0621..U+06D2 (hamza through
         yeh barree) except whitespace.
      4. Collapse whitespace runs to a single space and trim.

    The kept range deliberately starts at U+0621 instead of U+0627 (alef).
    The narrower range silently deleted the hamza and hamza/madda carrier
    letters (U+0621..U+0626, e.g. alef-madda and yeh-hamza), which corrupted
    common words: the pronoun "aap" lost its leading alef-madda.

    Args:
        data: The raw Urdu sentence as a string.

    Returns:
        The cleaned sentence.
    """
    # NOTE(review): remove_urls / remove_diacritics are LughaatNLP helpers; their exact
    # matching rules are assumed from their names -- confirm against the library docs.
    data = urdu_text_processing.remove_urls(data)
    data = urdu_text_processing.remove_diacritics(data)

    # Keep Arabic-script characters U+0621..U+06D2 and whitespace. Code-point escapes
    # are used so the range boundaries are unambiguous regardless of bidi rendering.
    data = re.sub(r'[^\u0621-\u06D2\s]', '', data)

    # Collapse whitespace last so gaps left by the removals above are normalised too.
    data = re.sub(r'\s+', ' ', data).strip()

    return data


In [None]:
# Store the cleaned Urdu text in its own column, then preview raw vs. cleaned.
data['Clean_Urdu'] = data['Urdu'].map(preprocess_urdu_data)

data[['Urdu', 'Clean_Urdu']].head()

Unnamed: 0,Urdu,Clean_Urdu
0,میں اپنے والدین سے کیسے بات کروں ؟,میں اپنے والدین سے کیسے بات کروں
1,میں دوست کیسے بنائوں ؟,میں دوست کیسے بناوں
2,میں اتنا اداس کیوں ہوں؟.,میں اتنا اداس کیوں ہوں
3,اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...,اگر پ نے اپنے پ سے ایسے سوالات کیے ہیں تو پ اک...
4,اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...,اس بات پر منحصر ہے کہ پ رہنمای کے لیے کہاں گے ...


In [None]:
# Pull the cleaned columns out of the DataFrame as plain Python lists (row order kept).
english_sentences, urdu_sentences = (
    data[column].to_list() for column in ('Clean_English', 'Clean_Urdu')
)


## Tokenization

In [None]:
def tokenize(sentences):
    """Split each sentence into whitespace-separated tokens.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one token list per input sentence, in the same order.
    """
    token_lists = []
    for text in sentences:
        token_lists.append(text.split())
    return token_lists

In [None]:
# Tokenize both English and Urdu sentences (whitespace split); the two lists stay
# index-aligned, so position i in each refers to the same sentence pair.
tokenized_english = tokenize(english_sentences)
tokenized_urdu = tokenize(urdu_sentences)



In [None]:
# Preview the first five tokenized English sentences.
tokenized_english[:5]

[['how', 'can', 'i', 'communicate', 'with', 'my', 'parents'],
 ['how', 'can', 'i', 'make', 'friends'],
 ['why', 'do', 'i', 'get', 'so', 'sad'],
 ['if',
  'youve',
  'asked',
  'yourself',
  'such',
  'questions',
  'youre',
  'not',
  'alone'],
 ['depending',
  'on',
  'where',
  'youve',
  'turned',
  'for',
  'guidance',
  'you',
  'may',
  'have',
  'been',
  'given',
  'conflicting',
  'answers']]

## Building the vocabulary

In [None]:
# Combined function to build vocab and numerize sentences
def build_vocab_and_numerize(tokenized_sentences):
    """Build a token -> id vocabulary and encode every sentence with it.

    Ids 0-3 are reserved for the special tokens (<pad>=0, <unk>=1, <bos>=2,
    <eos>=3). Every other token receives the next free id, starting at 4, the
    first time it is seen. Each encoded sentence is wrapped as
    [<bos>, ...token ids..., <eos>].

    Args:
        tokenized_sentences: Iterable of token lists.

    Returns:
        A (vocab, numerized_sentences) tuple: the token -> id dict and a list
        of id lists, one per input sentence.
    """
    vocab = {'<unk>': 1, '<pad>': 0, '<bos>': 2, '<eos>': 3}  # Special tokens
    bos_id, eos_id = vocab['<bos>'], vocab['<eos>']
    numerized_sentences = []

    for tokens in tokenized_sentences:
        ids = [bos_id]
        for token in tokens:
            # Ids are handed out consecutively, so the next free id always equals the
            # current vocabulary size; setdefault only inserts for unseen tokens.
            ids.append(vocab.setdefault(token, len(vocab)))
        ids.append(eos_id)
        numerized_sentences.append(ids)

    return vocab, numerized_sentences




In [None]:
# Build vocabularies and numerize for English and Urdu.
# Each language gets its own independent vocabulary, so the same id means different
# tokens in the two languages (only the four special-token ids coincide).
english_vocab, numerized_english = build_vocab_and_numerize(tokenized_english)
urdu_vocab, numerized_urdu = build_vocab_and_numerize(tokenized_urdu)

#print("English Vocabulary:", english_vocab)
#print("Numerized English Sentences:", numerized_english)
#print("Urdu Vocabulary:", urdu_vocab)
#print("Numerized Urdu Sentences:", numerized_urdu)

In [None]:
# Convert dictionary items to a list and print the first 11 entries
# (the 4 special tokens followed by the first 7 real words), despite the variable name.
first_five = list(english_vocab.items())[:11]
print(first_five)

[('<unk>', 1), ('<pad>', 0), ('<bos>', 2), ('<eos>', 3), ('how', 4), ('can', 5), ('i', 6), ('communicate', 7), ('with', 8), ('my', 9), ('parents', 10)]


In [None]:
# Vocabulary sizes as (English, Urdu); both counts include the four special tokens.
len(english_vocab),len(urdu_vocab)

(15760, 12558)

In [None]:
# Preview the first five encoded English sentences (each starts with 2=<bos>, ends with 3=<eos>).
numerized_english[:5]

[[2, 4, 5, 6, 7, 8, 9, 10, 3],
 [2, 4, 5, 6, 11, 12, 3],
 [2, 13, 14, 6, 15, 16, 17, 3],
 [2, 18, 19, 20, 21, 22, 23, 24, 25, 26, 3],
 [2, 27, 28, 29, 19, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 3]]

### Converting to tensors

In [None]:
# Step 1: Convert numerized sentences to tensor format
def sentences_to_tensors(numerized_sentences, target_device=None):
    """Turn each id list into a 1-D integer tensor.

    Tensors are created directly on the target device with an explicit
    ``torch.long`` dtype. Without the explicit dtype an empty id list would
    become a float32 tensor, which an embedding layer cannot index with.

    Args:
        numerized_sentences: Iterable of lists of token ids.
        target_device: Device to create the tensors on. When omitted, the
            notebook-level ``device`` variable is used.

    Returns:
        A list of 1-D ``torch.long`` tensors, one per input sentence.
    """
    if target_device is None:
        target_device = device  # notebook-level default set in the CUDA check cell
    return [
        torch.tensor(sentence, dtype=torch.long, device=target_device)
        for sentence in numerized_sentences
    ]

In [None]:
# Convert to tensors and move to device: one variable-length 1-D tensor per sentence,
# still index-aligned between the two languages.
tensor_english = sentences_to_tensors(numerized_english)
tensor_urdu = sentences_to_tensors(numerized_urdu)


## Padding

In [None]:
# Using the <pad> index from the vocab for padding.
# pad_sequence stacks the sentences into one (num_sentences, longest_sentence) tensor per
# language, filling the tail of every shorter sentence with the <pad> id.
padded_english = pad_sequence(tensor_english, batch_first=True, padding_value=english_vocab['<pad>'])
padded_urdu = pad_sequence(tensor_urdu, batch_first=True, padding_value=urdu_vocab['<pad>'])

In [None]:
# Show the first padded English sentence: its real tokens followed by trailing <pad> (0) ids.
padded_english[0]

tensor([ 2,  4,  5,  6,  7,  8,  9, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 

## Splitting the dataset into train, validation, and test sets

In [None]:
# Split the tokenized padded tensors into train, val, test sets.
# First hold out 40% of the pairs, then halve that hold-out: 60% train / 20% val / 20% test.
# Passing both tensors to the same call keeps English and Urdu rows paired, and the fixed
# random_state makes the split repeatable.
train_eng, temp_eng, train_urdu, temp_urdu = train_test_split(padded_english, padded_urdu, test_size=0.4, random_state=42)
val_eng, test_eng, val_urdu, test_urdu = train_test_split(temp_eng, temp_urdu, test_size=0.5, random_state=42)

## Creating TensorDataset objects

In [None]:
# Create TensorDataset objects: each item is an (english_ids, urdu_ids) pair.
train_data = TensorDataset(train_eng, train_urdu)
val_data = TensorDataset(val_eng, val_urdu)
test_data = TensorDataset(test_eng, test_urdu)

## Creating DataLoaders

In [None]:
# Create DataLoader objects.
# Only the training loader shuffles; validation and test keep a fixed order.
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# Number of batches per split, as (train, val, test).
len(train_loader),len(val_loader),len(test_loader)

(556, 186, 186)

## Building the model

In [None]:
# Ask PyTorch to release cached GPU memory that is not currently in use.
# NOTE(review): this does not free tensors that are still referenced by notebook variables.
torch.cuda.empty_cache()

In [None]:
# Take the vocabulary sizes from the vocabularies themselves instead of hard-coding them.
# Ids are assigned consecutively from 0, so len(vocab) is exactly one more than the
# largest id; a hard-coded number goes out of sync (and causes out-of-range embedding
# lookups) as soon as the data or the preprocessing changes.
eng_vocab_size = len(english_vocab)  # size of English vocab
urdu_vocab_size = len(urdu_vocab)  # size of Urdu vocab
embed_size = 128  # embedding dimension

eng_embedding = nn.Embedding(eng_vocab_size, embed_size)  # English embedding layer
urdu_embedding = nn.Embedding(urdu_vocab_size, embed_size)  # Urdu embedding layer

In [None]:
# Display the two stand-alone embedding layers to confirm their (vocab size, dimension).
eng_embedding,urdu_embedding

(Embedding(15760, 128), Embedding(12558, 128))

In [None]:
class Seq2Seq(nn.Module):
    """GRU encoder-decoder that maps English token ids to Urdu token logits.

    The encoder reads the English sentence and its final hidden state is used
    as the initial hidden state of the decoder. The decoder consumes the
    (teacher-forced) Urdu input tokens and a linear layer projects every
    decoder step onto the Urdu vocabulary.

    Padding handling: the English batch is packed before it is fed to the
    encoder, so the encoder's final hidden state is taken at each sentence's
    last real token. Without packing, the hidden state is computed after
    running over all trailing <pad> positions, so it depends on how much
    padding the batch happens to carry. The embedding layers also use
    ``padding_idx`` so the <pad> vector stays zero and receives no gradient.

    Args:
        eng_vocab_size: Number of entries in the English vocabulary.
        urdu_vocab_size: Number of entries in the Urdu vocabulary.
        embed_size: Dimension of both embedding tables.
        hidden_size: Hidden size of both GRUs.
        pad_idx: Id of the padding token in both vocabularies (default 0).
    """

    def __init__(self, eng_vocab_size, urdu_vocab_size, embed_size, hidden_size, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx

        # Embedding layers
        self.eng_embedding = nn.Embedding(eng_vocab_size, embed_size, padding_idx=pad_idx)
        self.urdu_embedding = nn.Embedding(urdu_vocab_size, embed_size, padding_idx=pad_idx)

        # Encoder and decoder are single-layer GRUs operating on (batch, seq, feature) input
        self.encoder = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.decoder = nn.GRU(embed_size, hidden_size, batch_first=True)

        # Final layer to predict the Urdu word
        self.fc = nn.Linear(hidden_size, urdu_vocab_size)

    def forward(self, eng_input, urdu_input):
        """Return Urdu logits for every decoder position.

        Args:
            eng_input: (batch, src_len) tensor of English token ids, padded
                at the end with ``pad_idx``.
            urdu_input: (batch, tgt_len) tensor of Urdu token ids fed to the
                decoder (typically the target without its last token).

        Returns:
            (batch, tgt_len, urdu_vocab_size) tensor of unnormalised scores.
        """
        eng_embedded = self.eng_embedding(eng_input)

        # Real length of every source sentence. This assumes padding only occurs at the
        # end of a sentence (as produced by pad_sequence). clamp keeps an all-pad row
        # from producing a zero length, which packing rejects; lengths must live on CPU.
        lengths = (eng_input != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            eng_embedded, lengths, batch_first=True, enforce_sorted=False
        )

        # Encode the English sentence; only the final hidden state is needed
        _, hidden = self.encoder(packed)

        # Decode the Urdu sentence starting from the encoder's final hidden state
        urdu_embedded = self.urdu_embedding(urdu_input)
        output, _ = self.decoder(urdu_embedded, hidden)

        # Project every decoder step onto the Urdu vocabulary
        predictions = self.fc(output)
        return predictions


In [None]:
# Instantiate the translation model with 128 hidden units, place its parameters on
# `device`, and display the module summary.
model = Seq2Seq(eng_vocab_size, urdu_vocab_size, embed_size, hidden_size=128).to(device)
model

Seq2Seq(
  (eng_embedding): Embedding(15760, 128)
  (urdu_embedding): Embedding(12558, 128)
  (encoder): GRU(128, 128, batch_first=True)
  (decoder): GRU(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=12558, bias=True)
)

In [None]:
# Cross-entropy over the Urdu vocabulary. Targets equal to <pad> are ignored: the batches
# are padded to a common length, so without ignore_index the averaged loss (and its
# gradient) would be dominated by trivially predictable padding positions.
criterion  = nn.CrossEntropyLoss(ignore_index=urdu_vocab['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 10
eng_pad_idx = english_vocab['<pad>']
urdu_pad_idx = urdu_vocab['<pad>']
model.train()  # Set model to training mode

for epoch in range(num_epochs):
    total_loss = 0
    for eng_batch, urdu_batch in train_loader:
        # Every sentence is padded to the longest sentence in the whole corpus, so an
        # untrimmed batch yields a logits tensor of 32 x 848 x 12558 float32 values: the
        # 1.27 GiB allocation that triggered the CUDA out-of-memory error. Padding is
        # only ever at the end of a row, so cutting each batch at its own longest
        # sentence removes all-<pad> columns without touching any real token.
        eng_len = int((eng_batch != eng_pad_idx).sum(dim=1).max())
        urdu_len = int((urdu_batch != urdu_pad_idx).sum(dim=1).max())
        eng_batch = eng_batch[:, :max(eng_len, 1)].to(device)
        # Keep at least two columns so both the decoder input and the target are non-empty.
        urdu_batch = urdu_batch[:, :max(urdu_len, 2)].to(device)
        # NOTE(review): a batch containing one of the very longest sentences is still
        # large; if memory remains tight, cap the sentence length or lower batch_size.

        optimizer.zero_grad()  # Zero out gradients

        # Forward pass with teacher forcing: the decoder sees tokens [0, T-1) and is
        # trained to predict tokens [1, T).
        output = model(eng_batch, urdu_batch[:, :-1])
        loss = criterion(
            output.reshape(-1, output.size(-1)),
            urdu_batch[:, 1:].reshape(-1),
        )

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.27 GiB. GPU 0 has a total capacity of 14.75 GiB of which 793.06 MiB is free. Process 6096 has 13.97 GiB memory in use. Of the allocated memory 11.76 GiB is allocated by PyTorch, and 2.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Setting an optimizer and loss function

## Train, validation, and test loop