# Neural Machine Translation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random 
from tqdm import tqdm 

In [None]:
!pip install indic_nlp_library

Reading train data from train.csv file

In [None]:
import csv

# Index 0 of each list is a placeholder header row; the preprocessing loops
# start at index 1 and the vocab cell slices with [1:].
data_hindi=[['hindi']]
data_english=[['english']]
# newline='' is what the csv module expects for correct handling of quoted
# fields containing line breaks; the explicit UTF-8 encoding keeps the
# Devanagari text intact regardless of the platform's default locale.
with open('/kaggle/input/cs779-nmt-competition/eng_Hindi_data_train.csv','r',encoding='utf-8',newline='') as file:
  my_file=csv.reader(file,delimiter=',')
  for row in my_file:
    # column 1 goes to the Hindi list, column 0 to the English list;
    # each sentence is stored as a one-element list
    data_hindi.append([row[1]])
    data_english.append([row[0]])

Printing the first 10 Hindi and English sentences

In [None]:
# Show the first 10 entries of each corpus (entry 0 is the placeholder header row).
print(data_hindi[:10])
print(data_english[:10])

Preprocessing the Data

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
import nltk
# Fetch the WordNet corpus used by the lemmatizer.
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
# Single shared lemmatizer instance; the English preprocessing loop calls lem.lemmatize on each token.
lem=WordNetLemmatizer()

In [None]:
# Lookup table: English contraction (lower-case) -> expanded form.
# Insertion order is kept as-is because multiple_replace builds its regex
# alternation from the keys in dictionary order.
abbr = {
  "aren't": "are not",
  "ain't": "am not",
  "could've": "could have",
  "couldn't": "could not",
  "can't": "cannot",
  "'cause": "because",
  "doesn't": "does not",
  "didn't": "did not",
  "don't": "do not",
  "hadn't": "had not",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'll": "he will",
  "he's": "he is",
  "how'd": "how did",
  "how'll": "how will",
  "how's": "how is",
  "here's": "here is",
  "i'd": "i would",
  "i'll": "i will",
  "i'm": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it would",
  "it'll": "it will",
  "it's": "it is",
  "let's": "let us",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "must've": "must have",
  "mustn't": "must not",
  "needn't": "need not",
  "she'd": "she would",
  "she'll": "she will",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "that'd": "that would",
  "that's": "that is",
  "there'd": "there had",
  "there's": "there is",
  "they'd": "they would",
  "they'll": "they will",
  "they're": "they are",
  "they've": "they have",
  "wasn't": "was not",
  "we'd": "we would",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "where'd": "where did",
  "where's": "where is",
  "who'll": "who will",
  "we'll": "we will",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "who's": "who is",
  "won't": "will not",
  "you're": "you are",
  "you've": "you have",
  "y'all": "you all",
  "wouldn't": "would not",
  "you'd": "you would",
  "you'll": "you will",
}

In [None]:
import re
import string
def multiple_replace(adict, text, ignore_case=False):
  """Replace every occurrence of a key of ``adict`` in ``text`` by its value.

  All keys are combined into one regular expression, so the text is rewritten
  in a single pass and replacement text is never re-scanned.

  Args:
    adict: mapping of substring -> replacement string.
    text: the string to rewrite.
    ignore_case: when True, ASCII letters in the keys match either case
      (keys are compared lower-cased); the replacement value is inserted as is.

  Returns:
    The rewritten string, or ``text`` unchanged when ``adict`` is empty.
  """
  if not adict:
    # An empty alternation matches the empty string everywhere and the
    # dictionary lookup would then raise KeyError.
    return text

  if ignore_case:
    lookup = {key.lower(): value for key, value in adict.items()}
  else:
    lookup = dict(adict)

  # Longest keys first, so a key that is a prefix of another cannot shadow it
  # (sorted() is stable: equally long keys keep their dictionary order).
  keys = sorted(lookup, key=len, reverse=True)
  # re.ASCII restricts case-insensitive matching to ASCII letters, so that
  # match.lower() below always maps back onto a key of ``lookup``.
  flags = (re.IGNORECASE | re.ASCII) if ignore_case else 0
  regex = re.compile("|".join(map(re.escape, keys)), flags)

  def _replacement(match):
    found = match.group(0)
    if ignore_case:
      return lookup.get(found.lower(), found)
    return lookup[found]

  return regex.sub(_replacement, text)


def decontract(sentence):
  """Expand English contractions using the module-level ``abbr`` table.

  Args:
    sentence: a string, or an iterable of string pieces (joined with single
      spaces first). The notebook stores each sentence as a one-element list,
      so both forms are accepted.

  Returns:
    The expanded sentence split on single spaces (``['']`` for empty input).
    Expanded contractions come out lower-case because the ``abbr`` values are
    lower-case; the rest of the sentence keeps its original case.
  """
  if not isinstance(sentence, str):
    sentence = ' '.join(sentence)
  # Typographic apostrophes would otherwise hide contractions from the table.
  sentence = sentence.replace('\u2019', "'").replace('\u2018', "'")
  # Collapse every run of whitespace into a single space.
  sentence = ' '.join(sentence.split())
  # Matching ignores case because this runs before the text is lower-cased.
  sentence = multiple_replace(abbr, sentence, ignore_case=True)
  return sentence.split(' ')
  
def cleanEng(x):
  """Lower-case an English sentence and reduce it to alphanumeric tokens.

  Args:
    x: list of string pieces (joined with spaces) or a single string.

  Returns:
    List of tokens made only of ``a-z0-9``. The cleaned text is capped at
    200 characters, so the last token may be cut mid-word. Input with no
    alphanumeric content yields ``['']``.
  """
  text = x if isinstance(x, str) else ' '.join(x)
  text = str(text).lower()
  # Every run of non-alphanumeric characters becomes exactly one space.
  text = re.sub(r'[^a-z0-9]+', ' ', text)
  # Strip both ends: a leading space (e.g. from an opening quote) would
  # otherwise turn into an empty first token after split(' ').
  text = text.strip(' ')
  # Cap the length, then drop a space the cut may have left at the end.
  text = text[:200].rstrip(' ')
  return text.split(' ')

def cleanHindi(x):
  """Strip Latin letters and punctuation from a Hindi sentence and tokenize it.

  Args:
    x: list of string pieces (joined with spaces) or a single string.

  Returns:
    List of whitespace-separated tokens. The cleaned text is capped at 200
    characters, so the last token may be cut. Input with nothing left after
    cleaning yields ``['']``.
  """
  text = x if isinstance(x, str) else ' '.join(x)
  text = str(text)
  text = re.sub('[a-zA-Z]', '', text)
  # string.punctuation only covers ASCII; the danda (U+0964) and double danda
  # (U+0965) are removed explicitly so they do not stay glued to the last word.
  translator = str.maketrans('', '', string.punctuation + '\u0964\u0965')
  text = text.translate(translator)
  # Collapse whitespace runs and strip both ends: removing a leading Latin
  # word would otherwise leave a space that becomes an empty first token.
  text = re.sub(r'\s+', ' ', text).strip(' ')
  # Cap the length, then drop a space the cut may have left at the end.
  text = text[:200].rstrip(' ')
  return text.split(' ')

In [None]:
# Clean every English sentence in place; index 0 is the placeholder header row.
for i in range(1,len(data_english)):
  # Each row is a list of string pieces, while decontract() splits a single
  # string, so the row is joined first (passing the list raised AttributeError).
  sentence=' '.join(data_english[i])
  tokens=cleanEng(list(decontract(sentence)))
  # Reduce each token to its WordNet lemma.
  data_english[i]=[lem.lemmatize(token) for token in tokens]

In [None]:
from indicnlp.normalize.indic_normalize import BaseNormalizer

# Nuktas are kept as they are.
remove_nuktas=False
normalizer=BaseNormalizer("hi",remove_nuktas=False)

# Clean, normalise and re-tokenize every Hindi sentence in place
# (index 0 is the placeholder header row and is skipped).
for i in range(1,len(data_hindi)):
  cleaned_tokens=cleanHindi(data_hindi[i])
  normalised=normalizer.normalize(str(' '.join(cleaned_tokens)))
  data_hindi[i]=normalised.split(' ')

In [None]:
# Dump the first 1000 cleaned Hindi rows, then print every row that still
# contains a bare '[' token.
print(data_hindi[:1000])
for row in data_hindi:
  if '[' not in row:
    continue
  print(row)

Analyzing some insights in the Train Dataset

In [None]:
# Hand-rolled count vectorizer: word -> number of occurrences per language.

# These are aliases of the cleaned lists, not copies: later in-place edits
# of clean_data_* are visible through data_* as well.
clean_data_english=data_english
clean_data_hindi=data_hindi
unique_hin=set()
unique_eng=set()
dict_eng={}
dict_hin={}

for sentence in clean_data_hindi:
  for word in sentence:
    unique_hin.add(word)
    dict_hin[word]=dict_hin.get(word,0)+1

for sentence in clean_data_english:
  for word in sentence:
    unique_eng.add(word)
    dict_eng[word]=dict_eng.get(word,0)+1

Sorting Dictionary by frequency in decreasing order

In [None]:
# Replace each word->count dict by a list of (word, count) pairs, most
# frequent first; ties keep their original relative order.
dict_eng=sorted(dict_eng.items(),key=lambda pair:-pair[1])
dict_hin=sorted(dict_hin.items(),key=lambda pair:-pair[1])

Word count of sentences in Train Data

In [None]:
# Number of tokens per sentence; both lists are indexed over the Hindi row range.
word_count_hin=[len(sentence) for sentence in clean_data_hindi]
word_count_eng=[len(clean_data_english[row]) for row in range(len(clean_data_hindi))]

Add start and end token

In [None]:
def addTokens(x,start=False):
  """Return a copy of token list ``x`` wrapped with sentence markers.

  Args:
    x: list of tokens; it is not modified.
    start: when True, ``'<START>'`` is also placed in front. The notebook
      passes True for the English sentences and False for the Hindi ones.

  Returns:
    A new list ending in ``'<END>'`` (and starting with ``'<START>'`` when
    ``start`` is True).
  """
  # Work on a copy so the caller's list is left untouched.
  tokens=list(x)
  tokens.append('<END>')
  if start:
    tokens.insert(0,'<START>')
  return tokens

In [None]:
# English rows get <START> and <END>; Hindi rows get <END> only.
# Re-running this cell adds the markers a second time.
for row in range(len(clean_data_english)):
  clean_data_english[row]=addTokens(clean_data_english[row],True)
  clean_data_hindi[row]=addTokens(clean_data_hindi[row])

In [None]:
# Rows from index 81858 onward are kept aside as validation sentences.
# NOTE(review): the vocab/dataset cells are built from clean_data_*[1:], which
# still contains these rows, so they do not look held out from training — confirm intent.
validate_eng=clean_data_english[81858:]
validate_hin=clean_data_hindi[81858:]

Vocab Class

In [None]:
#Vocab Class
class vocab:
  """Word/index vocabulary plus the padded index form of every sentence.

  Attributes:
    data: the tokenized sentences passed in.
    word2idx / idx2word: the two directions of the mapping; 0 is '<PAD>'.
    idx: highest index assigned so far.
    x: one float tensor of word indices per sentence, right-padded with 0
       to (longest sentence length + 1).
    vocab_size: number of indices in use (idx + 1).
  """

  def __init__(self,data,token=True):
    """Build the vocabulary for ``data``.

    Args:
      data: iterable of token lists.
      token: True reserves '<START>' (1) and '<END>' (2); False reserves
        only '<END>' (1). '<PAD>' is always 0.
    """
    self.data=data
    if token:
      self.word2idx={'<START>':1,'<END>':2,'<PAD>':0}
      self.idx2word={1:'<START>',2:'<END>',0:'<PAD>'}
    else:
      self.word2idx={'<PAD>':0,'<END>':1}
      self.idx2word={0:'<PAD>',1:'<END>'}
    # Highest index handed out so far (the reserved ones are contiguous from 0).
    self.idx=len(self.word2idx)-1

    self.x=[]
    self.create()
    self.vocab_size=self.idx+1

  def create(self):
    """Assign an index to every unseen word, then encode and pad all sentences."""
    longest=0
    # Pass 1: longest sentence length and the word <-> index tables.
    for sentence in self.data:
      longest=max(longest,len(sentence))
      for word in sentence:
        if word in self.word2idx:
          continue
        self.idx+=1
        self.word2idx[word]=self.idx
        self.idx2word[self.idx]=word

    # Pass 2: every sentence becomes a tensor of length longest+1, so even
    # the longest sentence ends with at least one padding 0.
    padded_len=longest+1
    for sentence in self.data:
      ids=[self.word2idx[word] for word in sentence]
      ids.extend([0]*(padded_len-len(sentence)))
      self.x.append(torch.Tensor(ids))

In [None]:
# Build both vocabularies from every row except the placeholder header at index 0.
# token=True reserves <START>/<END> for English; token=False reserves only <END> for Hindi.
English_vocab=vocab(clean_data_english[1:],token=True)
Hindi_vocab=vocab(clean_data_hindi[1:],token=False)

In [None]:
from torch.utils.data import Dataset, DataLoader

class parallelData(Dataset):
  """Pairs the encoded Hindi sentences (x) with the encoded English ones (y)."""

  def __init__(self):
    # Both lists come from the module-level vocab objects.
    self.x=Hindi_vocab.x
    self.y=English_vocab.x

  def __getitem__(self,i):
    """Return the (Hindi, English) pair stored at position ``i``."""
    hindi_item=self.x[i]
    english_item=self.y[i]
    return hindi_item,english_item

  def __len__(self):
    """Number of pairs, taken from the Hindi side."""
    return len(self.x)

In [None]:
# Dataset of (Hindi, English) index-tensor pairs taken from Hindi_vocab.x / English_vocab.x.
dataset=parallelData() #dataset

In [None]:
class encoder(nn.Module):

  def __init__(self,input_size,embedding_size,hidden_size,layers,bidirectional,p):
    super().__init__()
    self.embed=nn.Embedding(num_embeddings=input_size,embedding_dim=embedding_size)
    self.lstm=nn.LSTM(input_size=embedding_size,hidden_size= hidden_size,num_layers=layers,batch_first=True,bidirectional=bidirectional)
    self.bidirectional=bidirectional
    self.dropout=nn.Dropout(p)
    self.fc_hidden=nn.Linear(hidden_size*2,hidden_size)
    self.fc_cell=nn.Linear(hidden_size*2,hidden_size)

  def forward(self,x):
    x=self.dropout(self.embed(x))
    output,(hidden_state,cell_state)=self.lstm(x)

    if self.bidirectional:
      hidden=torch.cat((hidden_state[0:1],hidden_state[1:2]),dim=2)
      cell=torch.cat((cell_state[0:1],cell_state[1:2]),dim=2)
      hidden_state = self.fc_hidden(hidden)
      cell_state = self.fc_cell(cell)

    return output,hidden_state,cell_state

class decoder(nn.Module):
  """Plain (attention-free) LSTM decoder that consumes one token per call."""

  def __init__(self,input_size,embedding_size,hidden_size,layers,p):
    """
    Args:
      input_size: vocabulary size; used for the embedding table and the output layer.
      embedding_size: embedding dimension.
      hidden_size: LSTM hidden size.
      layers: number of stacked LSTM layers.
      p: dropout probability applied to the embeddings.
    """
    super().__init__()
    self.embed=nn.Embedding(num_embeddings=input_size,embedding_dim=embedding_size)
    self.dropout=nn.Dropout(p)
    self.lstm=nn.LSTM(
      input_size=embedding_size,
      hidden_size=hidden_size,
      num_layers=layers,
      batch_first=True,
    )
    self.fc=nn.Linear(in_features=hidden_size,out_features=input_size)

  def forward(self,x,hidden_state, cell_state):
    """Run one decoding step and return ``(scores, hidden_state, cell_state)``."""
    # Every token id becomes its own length-1 sequence.
    step_input=self.dropout(self.embed(x.reshape(-1,1)))
    lstm_out,(hidden_state,cell_state)=self.lstm(step_input,(hidden_state,cell_state))
    # Drop the length-1 sequence axis after projecting to vocabulary scores.
    scores=self.fc(lstm_out).squeeze(dim=1)
    return scores,hidden_state,cell_state
    
class AttnDecoder(nn.Module):
  """LSTM decoder that attends over the encoder outputs at every step."""

  def __init__(self, input_size, embedding_size, hidden_size, layers):
    """
    Args:
      input_size: vocabulary size; used for the embedding table and the output layer.
      embedding_size: embedding dimension.
      hidden_size: LSTM hidden size; encoder outputs are expected to carry
        2*hidden_size features (see the LSTM and energy layer sizes).
      layers: number of stacked LSTM layers.
    """
    super().__init__()
    self.embed=nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    # The LSTM input is the context vector (2*hidden_size) joined with the embedding.
    self.lstm=nn.LSTM(
      input_size=hidden_size*2+embedding_size,
      hidden_size=hidden_size,
      num_layers=layers,
      batch_first=True,
    )
    self.fc=nn.Linear(in_features=hidden_size,out_features=input_size)
    # One score per encoder position from [decoder hidden ; encoder output].
    self.energy=nn.Linear(hidden_size*3,1)
    self.softmax=nn.Softmax(dim=1)

  def forward(self,x,hidden_state,cell_state,encoder_states):
    """Run one attention-decoding step.

    Returns:
      ``(scores, hidden_state, cell_state)`` where ``scores`` is the output
      layer applied to the LSTM output with the length-1 axis squeezed out.
    """
    seq_len=encoder_states.shape[1]

    # Tile the hidden state seq_len times along dim 0, then swap the first two axes.
    tiled_hidden=hidden_state.repeat(seq_len,1,1).permute(1,0,2)

    # Scores are normalised over dim 1 and transposed so that bmm yields a
    # weighted sum of the encoder states.
    scores=self.energy(torch.cat((tiled_hidden,encoder_states),dim=2))
    att_weights=self.softmax(scores).permute(0,2,1)
    context=torch.bmm(att_weights,encoder_states)

    embedded=self.embed(x.reshape(-1,1))
    lstm_input=torch.cat((context,embedded),dim=2)

    lstm_out,(hidden_state,cell_state)=self.lstm(lstm_input,(hidden_state,cell_state))
    output=self.fc(lstm_out).squeeze(dim=1)

    return output,hidden_state,cell_state

class Attnseq2seq(nn.Module):
  """Encoder + attention decoder trained with partial teacher forcing."""

  def __init__(self, encoder, att_decoder):
    """
    Args:
      encoder: module whose call returns (encoder_states, hidden, cell).
      att_decoder: module called as decoder(x, hidden, cell, encoder_states);
        its ``fc`` output layer defines the target vocabulary size.
    """
    super().__init__()
    self.encoder=encoder
    self.decoder=att_decoder

  def forward(self, input, target, teaching_force=0.6):
    """Decode ``target.shape[1]`` steps for a batch.

    Args:
      input: source batch, batch on dim 0.
      target: target index batch of shape (batch, seq_len); column 0 is fed
        to the decoder as the first input token.
      teaching_force: probability, per step, of feeding the ground-truth
        token instead of the decoder's own prediction as the next input.

    Returns:
      Tensor of shape (seq_len, batch, vocab) with the decoder scores;
      position 0 is left as zeros because no prediction is made for it.
    """
    batch_size=input.shape[0]
    seq_len=target.shape[1]
    # Sizes and device come from the modules/tensors themselves instead of
    # notebook globals, so the module also works outside this notebook.
    vocab_size=self.decoder.fc.out_features

    output=torch.zeros((seq_len,batch_size,vocab_size),device=input.device)
    encoder_states,hidden,cell=self.encoder(input)
    target=target.permute(1,0)

    # The first decoder input is the first target column; previously `x` was
    # read before any assignment, which raised UnboundLocalError.
    x=target[0]

    for i in range(1,seq_len):
      out,hidden,cell=self.decoder(x, hidden, cell, encoder_states)
      output[i]=out
      decoder_guess=out.argmax(1)
      # Teacher forcing: sometimes feed the ground truth, sometimes the guess.
      if random.random()<teaching_force:
        x=target[i]
      else:
        x=decoder_guess

    return output

In [None]:
#Hyperparameters

# Use the GPU when one is available.
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs=20        # passes over the training loader
learning_rate=0.001  # Adam learning rate
batch_size=128       # sentence pairs per batch
embedding_size=512   # embedding dimension for encoder and decoder
hidden_size=512      # LSTM hidden size for encoder and decoder
layers=1             # number of encoder LSTM layers
bidirectional=True   # encoder reads the source in both directions

In [None]:
# Shuffled mini-batches of (Hindi, English) tensor pairs.
loader=DataLoader(dataset,batch_size=batch_size,shuffle=True)  

In [None]:
# Initializing the model and optimizer 

# Encoder over the Hindi vocabulary, with dropout 0.2 on its embeddings.
ENC=encoder(Hindi_vocab.vocab_size,embedding_size,hidden_size,layers,bidirectional,0.2).to(device) 
# Attention decoder over the English vocabulary; its layer count is fixed to 1 here rather than taken from `layers`.
DE=AttnDecoder(English_vocab.vocab_size,embedding_size,hidden_size,1).to(device) 
model=Attnseq2seq(ENC,DE)
model.to(device)
optimizer=optim.Adam(model.parameters(),lr=learning_rate) 
criterion=nn.CrossEntropyLoss(ignore_index=0)  # ignore_index=0 because we have padded the sentences with 0

Train the Model

In [None]:
import os

# One entry per recorded epoch (plus the checkpointed loss when resuming).
train_loss = []

# Loading the trained model
MODEL_PATH = '/kaggle/working/model.pt'
# Epochs completed in earlier runs; keeps the saved epoch counter cumulative.
epochs_done = 0
if os.path.exists(MODEL_PATH):
  print("Loading the trained model ...")
  # map_location lets a checkpoint written on GPU be restored on a CPU-only session.
  checkpoint = torch.load(MODEL_PATH, map_location=device)
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  epoch = checkpoint['epoch']
  loss = checkpoint['loss']
  print(f"The model was trained till #{epoch} iterations last time")
  # Only record the previous loss when a checkpoint exists; doing this
  # unconditionally raised NameError on a fresh run.
  # float() accepts both a 0-dim tensor (older checkpoints) and a plain float.
  train_loss.append(float(loss))
  epochs_done = epoch + 1

# Training the model
model.train()  # make sure dropout is active even if the model was put in eval mode
for epoch in range(num_epochs):
  running_loss = 0.0
  num_batches = 0
  for x, y in tqdm(loader):
    x=x.long().to(device)
    y=y.long().to(device)
    output=model(x,y)
    # Drop position 0 (never predicted) and flatten to (steps*batch, vocab).
    output=output[1:].reshape(-1,output.shape[2])
    y=y.permute(1,0)
    y=y[1:].reshape(-1)
    optimizer.zero_grad()
    loss=criterion(output,y)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(),max_norm=1)
    optimizer.step()
    running_loss += loss.item()
    num_batches += 1

  # Report the mean over the epoch rather than only the last batch's loss.
  epoch_loss = running_loss / max(num_batches, 1)
  print(f'[{epoch+1}/{num_epochs}] loss={epoch_loss}')
  train_loss.append(epoch_loss)  # Saving the loss for plotting the graph

  # Saving the model
  torch.save({
            'epoch': epochs_done + epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
            }, MODEL_PATH)

In [None]:
# Line plot of the recorded training losses.
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(train_loss)
plt.show()

Validation Phase

In [None]:
def translate(input):
  """Greedy-decode one source sentence with the module-level ``model``.

  Args:
    input: tensor of source word indices with a batch dimension of 1,
      already on the model's device.

  Returns:
    List of predicted English word indices. Decoding stops right after the
    '<END>' index is produced (it is included in the list) or after 101
    tokens, whichever comes first.
  """
  start_idx=English_vocab.word2idx['<START>']
  end_idx=English_vocab.word2idx['<END>']
  max_tokens=101

  guess=[]
  # Dropout must be off at inference; the previous mode is restored afterwards
  # so a later training run is not silently left in eval mode.
  was_training=model.training
  model.eval()
  try:
    with torch.no_grad():
      encoder_states,hidden,cell=model.encoder(input)
      x=torch.full((1,),start_idx,dtype=torch.long,device=input.device)
      for _ in range(max_tokens):
        out,hidden,cell=model.decoder(x,hidden,cell,encoder_states)
        x=out.argmax(1)
        token=int(x[0])
        guess.append(token)
        #End of the sentence
        if token==end_idx:
          break
  finally:
    model.train(was_training)
  return guess

In [None]:
def prediction(x):
  """Translate one encoded Hindi sentence into a list of English words.

  ``x`` is cast to long, reshaped to a single-row batch and moved to
  ``device`` before being handed to ``translate``.
  """
  source=x.long().reshape(1,-1).to(device)
  # Map each predicted index back to its English word.
  return [English_vocab.idx2word[token_id] for token_id in translate(source)]

In [None]:
def get(sent):
  """Translate a tokenized Hindi sentence into a list of English words.

  Args:
    sent: list of Hindi tokens, with or without a trailing '<END>'.

  Returns:
    The list of English words produced by ``prediction``.
  """
  word2idx=Hindi_vocab.word2idx
  token=[]
  for word in sent:
    idx=word2idx.get(word)
    if idx is None:
      # Out-of-vocabulary words fall back to the index of 'है'.
      idx=word2idx['है']
    token.append(idx)

  # Training sources always end with '<END>'; add it when the caller did not,
  # so inference sees the same input format (and never an empty sequence).
  end_idx=word2idx['<END>']
  if not token or token[-1]!=end_idx:
    token.append(end_idx)

  sent=torch.tensor(token,dtype=torch.long)
  res=prediction(sent)
  return res

Test Phase

In [None]:
# Testing: read the Hindi test sentences; every CSV row is kept as a list of its columns.
test_hindi=[]
# newline='' is what the csv module expects; the explicit UTF-8 encoding keeps
# the Devanagari text intact regardless of the platform's default locale.
with open('/kaggle/input/cs779-nmt-test/eng_Hindi_data_test_X.csv','r',encoding='utf-8',newline='') as file:
  my_file=csv.reader(file,delimiter=',')
  for row in my_file:
    test_hindi.append(row)

In [None]:
print(test_hindi[:10]) # first 10 raw test rows (each row is a list of CSV columns)

In [None]:
print(len(test_hindi))  # total number of rows read from the test CSV

In [None]:
# Clean, normalise and re-tokenize every test sentence in place
# (same steps as for the Hindi training data).
for i in range(len(test_hindi)):
  cleaned_tokens=cleanHindi(test_hindi[i])
  normalised=normalizer.normalize(str(' '.join(cleaned_tokens)))
  test_hindi[i]=normalised.split(' ')

In [None]:
# Hand-rolled count vectorizer for the test data.
unique_test=set()     # test words that are absent from unique_hin
dict_test=dict()      # word -> number of occurrences in the test data
word_count_test=[]    # number of tokens per test sentence
for sentence in test_hindi:
  word_count_test.append(len(sentence))
  for word in sentence:
    dict_test[word]=dict_test.get(word,0)+1
    if word not in unique_hin:
      unique_test.add(word)

In [None]:
# Replace the word->count dict by a list of (word, count) pairs, most frequent first.
dict_test=sorted(dict_test.items(),key=lambda pair:-pair[1])

In [None]:
# Top 10 most frequent (word, count) pairs in the test data;
# dict_test is a list sorted by count at this point.
print(dict_test[:10])

In [None]:
# Builds a separate vocab object from the test sentences alone.
# NOTE(review): inference (`get`) looks words up in Hindi_vocab, and test_dataset
# is not referenced anywhere else in the visible notebook — confirm it is still needed.
test_dataset=vocab(test_hindi,token=False)

Inference Phase

In [None]:
pred=[]   # predicted English tokens per test sentence
actu=[]   # the corresponding Hindi source tokens
for i in tqdm(range(len(test_hindi))):
  k=get(test_hindi[i])
  # Drop the end marker only when it is really there: decoding can also stop
  # at the length cap, in which case the last token is a real word.
  if k and k[-1]=='<END>':
    k=k[:-1]
  # Test sentences carry no '<END>' unless a caller added one, so a blanket
  # [:-1] would cut off the last real source word.
  q=list(test_hindi[i])
  if q and q[-1]=='<END>':
    q=q[:-1]
  actu.append(q)
  pred.append(k)

Printing the output of the model

In [None]:
# Show each predicted translation next to its source tokens.
for predicted,source in zip(pred,actu):
  print(predicted,source)

Writing predicted translations to a text file

In [None]:
# One predicted sentence per line. The context manager closes the file even
# if a write fails, and the encoding is fixed instead of locale-dependent.
with open("/kaggle/working/answer.txt","w",encoding="utf-8") as f:
  for tokens in pred:
    f.write(' '.join(tokens)+'\n')