In [1]:
import pandas as pd
import numpy as np
from indicnlp.tokenize import indic_tokenize
from transformers import AutoTokenizer
import torch

In [2]:
tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base")

In [4]:
data = pd.read_csv("project_root/Sentence pairs in English-Hindi - 2025-02-11.tsv",sep="\t",header=None,
                   names=["SrcSentenceID","SrcSentence","DstSentenceID","DstSentence"])

In [5]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [6]:
data.shape

(13182, 4)

In [5]:
data["DstSentence"] = data["DstSentence"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [6]:
data["SrcSentence"] = data["SrcSentence"].apply(tokenizer.tokenize)

In [7]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",485968,"[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,1282,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",2060319,"[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,1294,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",485564,"[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,1302,"[▁That, ▁won, ', t, ▁happen, .]",2060320,"[वैसा, नहीं, होगा, ।]"
4,1308,"[▁I, ▁miss, ▁you, .]",2060321,"[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [8]:
data["SrcSentence"] = data["SrcSentence"].apply(tokenizer.convert_tokens_to_ids)

In [9]:
Vs = tokenizer.get_vocab()

In [10]:
Vd = set()
for tokenized_hindi_sentence in data["DstSentence"]:
    Vd.update(tokenized_hindi_sentence)

hindi_vocab = dict()
for idx, token in enumerate(Vd):
    hindi_vocab[token] = idx + 3

hindi_vocab["<PAD>"] = 0
hindi_vocab["<SOS>"] = 1
hindi_vocab["<EOS>"] = 2

Vd = hindi_vocab

In [11]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[वैसा, नहीं, होगा, ।]"
4,1308,"[27, 3041, 25, 5]",2060321,"[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [12]:
def convert_hindi_tokens_to_ids(tokenized_hindi_sentence):
    return [Vd[token] for token in tokenized_hindi_sentence]

In [13]:
data["DstSentence"] = data["DstSentence"].apply(convert_hindi_tokens_to_ids)

In [14]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[2408, 255, 3275, 6162, 6057, 162, 2378, 3313,..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[2408, 255, 3275, 6162, 6057, 3313, 6899]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[1836, 5469, 2644, 2041, 2310, 6559, 1134, 626..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[7044, 3685, 2962, 6899]"
4,1308,"[27, 3041, 25, 5]",2060321,"[2688, 5411, 3867, 4586, 2142, 3313, 6899]"


In [15]:
def insert_sos_token_id(hindi_sentence_token_ids_list):
    return [1] + hindi_sentence_token_ids_list

In [16]:
def insert_eos_token_id(hindi_sentence_token_ids_list):
    return hindi_sentence_token_ids_list + [2]

In [17]:
data["DstSentenceInput"] = data["DstSentence"].apply(insert_sos_token_id)
data["DstSentenceLabel"] = data["DstSentence"].apply(insert_eos_token_id)

In [18]:
data.head()

Unnamed: 0,SrcSentenceID,SrcSentence,DstSentenceID,DstSentence,DstSentenceInput,DstSentenceLabel
0,1282,"[4159, 23, 14018, 19, 460, 230, 5]",485968,"[2408, 255, 3275, 6162, 6057, 162, 2378, 3313,...","[1, 2408, 255, 3275, 6162, 6057, 162, 2378, 33...","[2408, 255, 3275, 6162, 6057, 162, 2378, 3313,..."
1,1282,"[4159, 23, 14018, 19, 460, 230, 5]",2060319,"[2408, 255, 3275, 6162, 6057, 3313, 6899]","[1, 2408, 255, 3275, 6162, 6057, 3313, 6899]","[2408, 255, 3275, 6162, 6057, 3313, 6899, 2]"
2,1294,"[2855, 16, 48, 296, 26963, 7, 140, 5]",485564,"[1836, 5469, 2644, 2041, 2310, 6559, 1134, 626...","[1, 1836, 5469, 2644, 2041, 2310, 6559, 1134, ...","[1836, 5469, 2644, 2041, 2310, 6559, 1134, 626..."
3,1302,"[466, 751, 31, 17, 1837, 5]",2060320,"[7044, 3685, 2962, 6899]","[1, 7044, 3685, 2962, 6899]","[7044, 3685, 2962, 6899, 2]"
4,1308,"[27, 3041, 25, 5]",2060321,"[2688, 5411, 3867, 4586, 2142, 3313, 6899]","[1, 2688, 5411, 3867, 4586, 2142, 3313, 6899]","[2688, 5411, 3867, 4586, 2142, 3313, 6899, 2]"


In [19]:
data.drop(labels=["SrcSentenceID","DstSentenceID","DstSentence"],axis=1,inplace=True)

In [20]:
data.head()

Unnamed: 0,SrcSentence,DstSentenceInput,DstSentenceLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[1, 2408, 255, 3275, 6162, 6057, 162, 2378, 33...","[2408, 255, 3275, 6162, 6057, 162, 2378, 3313,..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[1, 2408, 255, 3275, 6162, 6057, 3313, 6899]","[2408, 255, 3275, 6162, 6057, 3313, 6899, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[1, 1836, 5469, 2644, 2041, 2310, 6559, 1134, ...","[1836, 5469, 2644, 2041, 2310, 6559, 1134, 626..."
3,"[466, 751, 31, 17, 1837, 5]","[1, 7044, 3685, 2962, 6899]","[7044, 3685, 2962, 6899, 2]"
4,"[27, 3041, 25, 5]","[1, 2688, 5411, 3867, 4586, 2142, 3313, 6899]","[2688, 5411, 3867, 4586, 2142, 3313, 6899, 2]"


In [21]:
X = list(data["SrcSentence"])
Y_input = list(data["DstSentenceInput"])
Y_label = list(data["DstSentenceLabel"])

X_tensor = [torch.tensor(eng_tokenized_ids) for eng_tokenized_ids in X]
Y_input_tensor = [torch.tensor(hin_tokenized_ids) for hin_tokenized_ids in Y_input]
Y_label_tensor = [torch.tensor(hin_tokenized_ids) for hin_tokenized_ids in Y_label]

X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_input_padded = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_label_padded = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [22]:
Ns = X_padded.shape[1]
Nd = Y_label_padded.shape[1]

In [23]:
class Attention(torch.nn.Module):
    def __init__(self):
        super(Attention,self).__init__()
        self.attention_probabilities = torch.nn.Softmax(dim=1)

    def forward(self,encoder_outputs,decoder_lstm_layer_outputs):
        
        decoder_lstm_layer_outputs = torch.transpose(decoder_lstm_layer_outputs,dim0=1,dim1=2)
        alignment_scores = torch.bmm(encoder_outputs,decoder_lstm_layer_outputs)
        attention_weights = self.attention_probabilities(alignment_scores)
        attention_weights = torch.transpose(attention_weights,dim0=1,dim1=2)
        context_vectors = torch.bmm(attention_weights,encoder_outputs)

        return context_vectors

In [24]:
class Encoder(torch.nn.Module):
    def __init__(self,src_lang_vocab_size,topic_vector_dim):
        super(Encoder,self).__init__()
        self.first_emebdding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_size,
                                                        embedding_dim=topic_vector_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=topic_vector_dim,hidden_size=topic_vector_dim,
                                               batch_first=True)
        
    def forward(self,X_padded_mini_batch):

        first_embedding_layer_out = self.first_emebdding_layer(X_padded_mini_batch)
        encoder_outputs,(final_encoder_output,final_candidate_cell_state) = self.second_lstm_layer(first_embedding_layer_out)

        return encoder_outputs,(final_encoder_output,final_candidate_cell_state)

In [25]:
class Decoder(torch.nn.Module):
    def __init__(self,dst_lang_vocab_size,topic_vector_dim):
        super(Decoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=dst_lang_vocab_size,
                                                        embedding_dim=topic_vector_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=topic_vector_dim,hidden_size=topic_vector_dim,
                                               batch_first=True)
        self.attention_layer = Attention()
        self.output_layer = torch.nn.Linear(in_features=topic_vector_dim*2,out_features=dst_lang_vocab_size)
        self.output_layer_activation = torch.nn.Softmax(dim=2)

    def forward(self,encoder_outputs,initial_hidden_state,initial_candidate_cell_state,
                Y_padded_mini_batch):

        first_embedding_layer_out = self.first_embedding_layer(Y_padded_mini_batch)
        decoder_lstm_layer_outputs,final_cell_hidden_states = self.second_lstm_layer(first_embedding_layer_out,
                                                                                    (initial_hidden_state,
                                                                                    initial_candidate_cell_state))
        context_vectors = self.attention_layer(encoder_outputs,decoder_lstm_layer_outputs)
        concatenated_lstm_layer_output = torch.concatenate(tensors=(decoder_lstm_layer_outputs,context_vectors),dim=2)
        affine_transformed_output = self.output_layer(concatenated_lstm_layer_output)
        decoder_outputs = self.output_layer_activation(affine_transformed_output)

        return decoder_outputs, final_cell_hidden_states

In [29]:
class Seq2SeqEncDecWithAttn(torch.nn.Module):
    def __init__(self,src_lang_vocab_size,dst_lang_vocab_size,topic_vector_dim):
        super(Seq2SeqEncDecWithAttn,self).__init__()
        self.encoder = Encoder(src_lang_vocab_size,topic_vector_dim)
        self.decoder = Decoder(dst_lang_vocab_size,topic_vector_dim)

    def forward(self,X_padded_mini_batch,Y_padded_mini_batch_input):

        encoder_outputs,(final_encoder_output,final_candidate_cell_state) = self.encoder(X_padded_mini_batch)
        Y_hat_mini_batch, final_cell_hidden_states = self.decoder(encoder_outputs,final_encoder_output,
                                                                  final_candidate_cell_state,Y_padded_mini_batch_input)
        
        return Y_hat_mini_batch

In [30]:
X_padded_train = X_padded[0:13000]
Y_input_padded_train = Y_input_padded[0:13000]
Y_label_padded_train = Y_label_padded[0:13000]

X_padded_test = X_padded[13000:]
Y_input_padded_test = Y_input_padded[13000:]
Y_label_padded_test = Y_label_padded[13000:]

In [31]:
nw = Seq2SeqEncDecWithAttn(src_lang_vocab_size=len(Vs),dst_lang_vocab_size=len(Vd),topic_vector_dim=32)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(params=nw.parameters())
epochs = 5
mb_size = 26

for epoch in range(epochs):
    for i in range(X_padded_train.shape[0]//mb_size):

        X_train_mb = X_padded_train[i*mb_size:(i+1)*mb_size]
        Y_input_train_mb = Y_input_padded_train[i*mb_size:(i+1)*mb_size]
        Y_label_train_mb = Y_label_padded_train[i*mb_size:(i+1)*mb_size]

        Y_label_train_mb = Y_label_train_mb.reshape(Y_label_train_mb.shape[0]*Y_label_train_mb.shape[1],)

        Y_pred_train_mb = nw(X_train_mb,Y_input_train_mb)
        Y_pred_train_mb = Y_pred_train_mb.reshape(Y_pred_train_mb.shape[0]*Y_pred_train_mb.shape[1],
                                                  Y_pred_train_mb.shape[2])
        

        loss_fn_value = loss_fn(Y_pred_train_mb,Y_label_train_mb)

        loss_fn_value.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 25 == 0:
            print("Epoch #{}, Mini Batch #{}, CCE Loss = {}".format(epoch,i,loss_fn_value))

Epoch #0, Mini Batch #0, CCE Loss = 8.863903999328613
Epoch #0, Mini Batch #25, CCE Loss = 7.980758190155029
Epoch #0, Mini Batch #50, CCE Loss = 7.991759300231934
Epoch #0, Mini Batch #75, CCE Loss = 7.990304470062256
Epoch #0, Mini Batch #100, CCE Loss = 7.992072105407715
Epoch #0, Mini Batch #125, CCE Loss = 7.977436542510986
Epoch #0, Mini Batch #150, CCE Loss = 7.957982540130615
Epoch #0, Mini Batch #175, CCE Loss = 7.971670627593994
Epoch #0, Mini Batch #200, CCE Loss = 7.947995662689209
Epoch #0, Mini Batch #225, CCE Loss = 7.947859287261963
Epoch #0, Mini Batch #250, CCE Loss = 7.953736305236816
Epoch #0, Mini Batch #275, CCE Loss = 7.9552321434021
Epoch #0, Mini Batch #300, CCE Loss = 7.935338497161865
Epoch #0, Mini Batch #325, CCE Loss = 7.9902825355529785
Epoch #0, Mini Batch #350, CCE Loss = 7.956906795501709
Epoch #0, Mini Batch #375, CCE Loss = 7.943914413452148
Epoch #0, Mini Batch #400, CCE Loss = 7.958964824676514
Epoch #0, Mini Batch #425, CCE Loss = 7.95852613449096

In [32]:
Vd_idx2vocab = dict(zip(Vd.values(),Vd.keys()))

In [49]:
def generate_translation(eng_sentence):

    tokenized_eng_sentence = tokenizer.tokenize(eng_sentence)
    print(tokenized_eng_sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokenized_eng_sentence)
    token_ids_tensor = torch.tensor(token_ids)
    token_ids_tensor = torch.unsqueeze(token_ids_tensor,0)
    padded_token_ids = torch.nn.utils.rnn.pad_sequence(token_ids_tensor)

    encoder_outputs,(final_encoder_output,final_candidate_cell_state) = nw.encoder(padded_token_ids)
    decoder_first_time_step_input = torch.tensor([hindi_vocab["<SOS>"]]*mb_size)
    #decoder_first_time_step_input = torch.unsqueeze(decoder_first_time_step_input,1)
    final_encoder_output = torch.squeeze(final_encoder_output,0)
    final_candidate_cell_state = torch.squeeze(final_candidate_cell_state,0)
    decoder_first_time_step_output, hidden_cell_states = nw.decoder(encoder_outputs,
                                                                          final_encoder_output,
                                                                          final_candidate_cell_state,
                                                                          decoder_first_time_step_input)
    
    generated_token_id = torch.argmax(decoder_first_time_step_output,1)
    generated_token_id = torch.unsqueeze(generated_token_id,1)

    print(Vd_idx2vocab[generated_token_id])

    for i in range(Nd-1):

        generated_softmax_probabilities,hidden_cell_states = nw.decoder(encoder_outputs,
                                                                        hidden_cell_states[0],hidden_cell_states[1],
                                                                        generated_token_id)
        generated_token_id = torch.argmax(generated_softmax_probabilities,1)

        if generated_token_id == Vd["<EOS"]:
            break

        print(Vd_idx2vocab[generated_token_id])

In [50]:
generate_translation("The semester is going to end")

['▁The', '▁semester', '▁is', '▁going', '▁to', '▁end']


RuntimeError: Expected hidden[0] size (1, 1, 32), got [6, 1, 32]