In [1]:
%%capture
!pip install indic-nlp-library

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import torchtext
torchtext.disable_torchtext_deprecation_warning()
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.nn.functional import one_hot
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

import os
import re
import unicodedata
import pandas as pd
import matplotlib.pyplot as plt
from indicnlp.tokenize import indic_tokenize

In [4]:
en_filepath = "/content/drive/MyDrive/IITB.en-hi.en"
hi_filepath = "/content/drive/MyDrive/IITB.en-hi.hi"

with open(en_filepath, "r", encoding='utf-8') as f:
  english_data = f.readlines()

with open(hi_filepath, "r", encoding='utf-8') as f:
  hindi_data = f.readlines()

In [60]:
req_hindi = hindi_data[212929:267939]
req_english = english_data[212929:267939]
data = {"english_txt":req_english,"hindi_txt":req_hindi}

df = pd.DataFrame(data)

In [61]:
df.sample(8)

Unnamed: 0,english_txt,hindi_txt
8894,Hide process para _ meters\n,छुपाएँ process अनुच्छेद मीटर्स\n
99246,N _ one\n,कुछनहीं (_ o) \n
74559,Reply\n,जवाब\n
195764,Download Manager\n,डाउनलोड संख्याः trust level\n
193870,% 1 -% 2\n,% 1 से% 2\n
121283,Scope\n,घर\n
217939,CNY\n,CNY\n
67198,STARTTLS not supported\n,STARTTLS समर्थित नहीं. \n


In [62]:
# Handling the encoding issue by removing the rows.
def is_hindi_corrupted(text):
    if re.search(r'[^\u0900-\u097F\s,.?!-]', text):
        return True
    return False

In [63]:
# Apply the function to the 'Hindi' column to create a mask
df['Corrupted'] = df['hindi_txt'].apply(is_hindi_corrupted)

# Filter out corrupted rows
df_clean = df[~df['Corrupted']]

# Drop the 'Corrupted' column as it's no longer needed
df_clean = df_clean.drop(columns=['Corrupted'])

In [64]:
df_clean.shape

(134454, 2)

In [65]:
def clean_text(text, language="english"):
    # Normalize unicode characters
    text = unicodedata.normalize('NFKC', text)
    # Convert to lowercase if the text is in English
    if language == "english":
        text = text.lower()
    # Remove any English words present in Hindi text.
    if language == "hindi":
        text = re.sub('[a-zA-Z]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [66]:
df_clean["english_txt"] = df_clean["english_txt"].apply(clean_text)
df_clean["hindi_txt"] = df_clean['hindi_txt'].apply(clean_text, args=("hindi",))

In [67]:
df_clean.sample(10)

Unnamed: 0,english_txt,hindi_txt
47370,start a slideshow view of the images,छव क सलइड श दशय आरभ कर
224830,false,गलत
135666,temperature tone color picker,तपकरम टन रग चयनक
10750,type,कसम
38843,whether to play a sound when logging out of a ...,कय सजल म लग ऑफ क दरन धवन बजन ह
65903,error expunging message,सनदश क हटन म तरट
151089,xlyap,एकसलयप
108783,export in asynchronous mode,अतलयकलक सथत म नरयत
14159,save all files,सभ फइल सहज
232121,o believers if you follow the path shown by go...,ऐ ईमन लनवल यद तम अललह क डर रखग त वह तमह एक वशष...


In [68]:
df_clean['English_Words'] = df_clean['english_txt'].apply(lambda x: len(x.split()))
df_clean['Hindi_Words'] = df_clean['hindi_txt'].apply(lambda x: len(x.split()))

In [69]:
# Calculate averages
average_english_words = df_clean['English_Words'].quantile(.99)
average_hindi_words = df_clean['Hindi_Words'].quantile(.99)

# Data for plotting
averages = [average_english_words, average_hindi_words]
languages = ['English', 'Hindi']

# Creating the plot
plt.figure(figsize=(8, 6))
plt.bar(languages, averages, color=['blue', 'red'])
plt.xlabel('Language')
plt.ylabel('Number of Words per Sentence')
plt.title('Comparison of Number of Words Counts in English and Hindi')
plt.ylim(0, max(averages) + 1)  # Adjust y-axis for better visualization
plt.show()

# Torchtext
Torchtext is a library within the PyTorch ecosystem designed to facilitate the preprocessing of textual data.

## get_tokenizer
. The get_tokenizer function is one of the core utilities provided by torchtext for tokenizing text data.

. get_tokenizer retrieves a tokenizer function based on the method specified. This tokenizer can then be used to convert strings of text into lists of tokens.
##Parameters
  
  tokenizer: This argument specifies the type of tokenizer to use. You can specify built-in tokenizers such as "basic_english", "spacy", "moses", or even provide a custom tokenizer function.

  language: Some tokenizers, like those based on the Moses or Spacy libraries, might require you to specify the language of the text, which influences how the text is tokenized (e.g., handling language-specific punctuation and splitting rules).

In [70]:
tokenizer_eng = get_tokenizer('basic_english')
tokenizer_hin = indic_tokenize.trivial_tokenize  # This is the Hindi tokenizer from Indic NLP

tokenized_english_txt = [tokenizer_eng(english_sen) for english_sen in df_clean['english_txt'] ]
tokenized_hindi_txt = [tokenizer_hin(hindi_sen) for hindi_sen in df_clean['hindi_txt'] ]

In [72]:
print(tokenized_english_txt[7])
print(tokenized_hindi_txt[7])

['name', 'of', 'the', 'value', 'to', 'watch']
['नम', 'क', 'मन', 'क']


## build_vocab_from_iterator
`build_vocab_from_iterator` function in the torchtext.vocab module is used to create a vocabulary from an iterable of tokenized data. This vocabulary is essential for converting textual data into numerical form.

#Parameters:
##tokenized_conv (iterator):
This is the main data input to the function. It should be an iterator (like a `generator` or a `list`) that yields sequences of tokens. Each sequence represents a document or an example in your dataset.
##min_freq (int, optional):
 This parameter specifies the minimum frequency a token must have to be included in the vocabulary. Tokens that appear fewer than min_freq times are excluded from the vocabulary. This is useful for removing rare words which might be typos or irrelevant to most analyses.
##specials (list of str, optional):
 This is a list of special tokens that you want to add to the vocabulary. Common special tokens include:
'<pad>': A padding token used to equalize the lengths of sequences.
'<oov>' (or '<unk>' for "unknown"): A token used to represent out-of-vocabulary words during inference, or when a word appears that is not in the training vocabulary.
##special_first (bool, optional):
 Determines the ordering of special tokens in the vocabulary. If True, special tokens are added at the beginning of the vocabulary. This can be helpful for certain models where token indices are significant (e.g., models using embedding layers might have specific handling for lower indices).

In [73]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# Step 3: Building Vocabulary
features_vocab = build_vocab_from_iterator(tokenized_english_txt, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
target_vocab = build_vocab_from_iterator(tokenized_hindi_txt, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

features_vocab.set_default_index(features_vocab['<unk>'])
target_vocab.set_default_index(target_vocab['<unk>'])

In [74]:
features_vocab_total_words = len(features_vocab)
target_vocab_total_words = len(target_vocab)

In [75]:
print(features_vocab_total_words)  #English
print(target_vocab_total_words)    #Hindi

16558
13587


In [76]:
features_vocab['<bos>']

2

In [77]:
print(features_vocab_total_words)
print(target_vocab_total_words)

16558
13587


In [78]:
def tokens_to_indices(tokenized_texts, vocab):
    indices_texts = []
    for sentence in tokenized_texts:
        indices_texts.append([vocab[token] for token in sentence if token in vocab])
    return indices_texts

In [79]:
english_indices = tokens_to_indices(tokenized_english_txt, features_vocab)
hindi_indices = tokens_to_indices(tokenized_hindi_txt, target_vocab)

In [80]:
english_indices[0:5]

[[330, 17, 13, 4, 138],
 [71, 3167, 3513, 667, 17],
 [138, 53, 2190, 31, 6, 1747],
 [58, 4, 138, 53, 2190, 31, 1747],
 [593, 5, 1032]]

In [104]:
class TranslationDataset(Dataset):
    def __init__(self, english_data, hindi_data):
        self.english_data = english_data
        self.hindi_data = hindi_data

    def __len__(self):
        return len(self.english_data)

    def __getitem__(self, idx):
        english = torch.tensor(self.english_data[idx], dtype=torch.long)
        hindi = torch.tensor(self.hindi_data[idx], dtype=torch.long)
        return english, hindi

# Create the custom dataset
dataset = TranslationDataset(english_indices, hindi_indices)
FIXED_LENGTH = 60  # or any appropriate length based on your data or model requirements


##Purpose of collate_fn
The primary purpose of `collate_fn` is to dynamically decide how to combine multiple data samples into a single batch. Data samples can be anything from images, texts, or other forms of data, and they might not naturally fit together in a straightforward way (e.g., texts of varying lengths).

In [105]:
def collate_batch(batch):
    english_batch, hindi_batch = zip(*batch)

    # Pad or truncate English batch
    english_batch = [torch.tensor(seq[:FIXED_LENGTH], dtype=torch.long) if len(seq) > FIXED_LENGTH else torch.cat([torch.tensor(seq, dtype=torch.long), torch.full((FIXED_LENGTH - len(seq),), features_vocab['<pad>'], dtype=torch.long)]) for seq in english_batch]

    # Pad or truncate Hindi batch
    hindi_batch = [torch.tensor(seq[:FIXED_LENGTH], dtype=torch.long) if len(seq) > FIXED_LENGTH else torch.cat([torch.tensor(seq, dtype=torch.long), torch.full((FIXED_LENGTH - len(seq),), target_vocab['<pad>'], dtype=torch.long)]) for seq in hindi_batch]

    # Convert lists to tensors
    english_batch = torch.stack(english_batch)
    hindi_batch = torch.stack(hindi_batch)

    return english_batch, hindi_batch


In [106]:
batch_size = 32  # Adjust the batch size as needed
train_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_batch, shuffle=True)

In [107]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [85]:
print(device)

cuda


In [108]:
sample = next(iter(train_dataloader))

  english_batch = [torch.tensor(seq[:FIXED_LENGTH], dtype=torch.long) if len(seq) > FIXED_LENGTH else torch.cat([torch.tensor(seq, dtype=torch.long), torch.full((FIXED_LENGTH - len(seq),), features_vocab['<pad>'], dtype=torch.long)]) for seq in english_batch]
  hindi_batch = [torch.tensor(seq[:FIXED_LENGTH], dtype=torch.long) if len(seq) > FIXED_LENGTH else torch.cat([torch.tensor(seq, dtype=torch.long), torch.full((FIXED_LENGTH - len(seq),), target_vocab['<pad>'], dtype=torch.long)]) for seq in hindi_batch]


In [109]:
sample[0].size()

torch.Size([32, 60])

`get_itos`: stands for "index-to-string". The method returns a list where the indices in the list correspond to the numerical indices used in your model, and the values at those indices are the actual string representations (tokens).

In [110]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):    #[4,8,9,15,12]
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [111]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()

        self.Wa = nn.Linear(hidden_size, hidden_size)   # Transform for query
        self.Ua = nn.Linear(hidden_size, hidden_size)   # Transform for keys
        self.Va = nn.Linear(hidden_size, 1)   # Compute the attention score

    def forward(self, query, keys):
        # Expand query to match keys' batch and sequence dimension
        # Encoder Output i.e key Shape (batch, seq_len, hidden_dim) when batch_first = True
        # Hidden State of Decoder i.e query shape (num_dir*num_layers, batch, hidden_dim)

        # Since Decoder Queries about the information to encoder on which token to focus when generating the current token
        # we need to replecate the decoder Hidden State along the encoder output to get score for each output.
        key_shape = keys.size()

        query = query.repeat(1, key_shape[1], 1)
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))  # attn_score = VT.(tanh(Wa*s|encoder + Ua|decoder + bias))
        scores = scores.squeeze(-1)

        weights = torch.softmax(scores, dim=1)
        weights = weights.unsqueeze(1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)

        bos_token_index = features_vocab['<bos>']
        decoder_input = torch.full((batch_size, 1), bos_token_index, dtype=torch.long, device=device)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []
        # fixed length set to 60
        for i in range(60):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)    # this is the shape of [num_layers * num_directions, batch_size, hidden_size]
                                            # encoder output [batch_size, seq_len, hidden_size]
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [112]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [118]:
# Load pre-trained weights
encoder_path = "/content/encoder_epoch_30.pth"
decoder_path = "/content/decoder_epoch_30.pth"
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))

# Define training function with additional arguments for loading weights
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
          print_every=5, plot_every=5, save_every=5, save_path="./", load_weights=False):

    if load_weights:
        encoder.load_state_dict(torch.load(encoder_path))
        decoder.load_state_dict(torch.load(decoder_path))
        print("Loaded pre-trained weights from epoch 30.")

    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(31, n_epochs + 31):  # Start from epoch 31 (after loaded weights)
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, (epoch - 30) / n_epochs),
                                        epoch, (epoch - 30) / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

        if epoch % save_every == 0:
            encoder_save_path = os.path.join(save_path, f'encoder_epoch_{epoch}.pth')
            decoder_save_path = os.path.join(save_path, f'decoder_epoch_{epoch}.pth')
            torch.save(encoder.state_dict(), encoder_save_path)
            torch.save(decoder.state_dict(), decoder_save_path)
            print(f'Model saved at epoch {epoch}')

    showPlot(plot_losses)


In [119]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [120]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data
        input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [121]:
hidden_size = 128
batch_size = 32
features_vocab_total_words = len(features_vocab)
target_vocab_total_words = len(target_vocab)

encoder = EncoderRNN(features_vocab_total_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, target_vocab_total_words).to(device)

In [None]:
# Train for additional 20 epochs, loading pre-trained weights
train(train_dataloader, encoder, decoder, n_epochs=25, print_every=5, plot_every=5, load_weights=True)

In [None]:
def evaluate(encoder, decoder, sentence, feature_vocab, target_vocab):
    with torch.no_grad():
        tokenized_english_txt_test = tokenizer_eng(sentence)
        print(f"Tokenized sentence: {tokenized_english_txt_test}")

        english_indices_test = tokens_to_indices([tokenized_english_txt_test], features_vocab)[0]
        print(f"Indices: {english_indices_test}")

        # Ensure we have exactly 60 tokens (FIXED_LENGTH)
        if len(english_indices_test) < FIXED_LENGTH:
            english_indices_test += [features_vocab['<pad>']] * (FIXED_LENGTH - len(english_indices_test))
        elif len(english_indices_test) > FIXED_LENGTH:
            english_indices_test = english_indices_test[:FIXED_LENGTH]

        print(f"Padded/Truncated indices: {english_indices_test}")

        try:
            input_tensor = torch.LongTensor(english_indices_test).to(device).unsqueeze(0)
            print(f"Input tensor shape: {input_tensor.shape}")
        except ValueError as e:
            print(f"Error creating input tensor: {e}")
            return [], None

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()
        print(f"Decoded ids shape: {decoded_ids.shape}")

        EOS_token = feature_vocab['<eos>']
        decoded_words = []

        # Handle both 1D and 0D tensors
        if decoded_ids.dim() == 0:
            decoded_ids = decoded_ids.unsqueeze(0)

        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<eos>')
                break
            decoded_words.append(target_vocab.get_itos()[idx.item()] if idx.item() < len(target_vocab) else '<unk>')

    return decoded_words, decoder_attn

# Ensure <pad> token is in the vocabulary
if '<pad>' not in features_vocab:
    features_vocab['<pad>'] = len(features_vocab)

# Try evaluation again
sentence = "here is my pet."
decoder_output, attn_weights = evaluate(encoder, decoder, sentence, features_vocab, target_vocab)
if decoder_output:
    print("Translation: " + " ".join(decoder_output))
else:
    print("Evaluation failed. Check the printed debug information.")

In [44]:
# def indices_to_words(indices, vocab):
#     return [vocab.get_itos()[index] if index < len(vocab) else '<unk>' for index in indices]