# importing required libraries

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

# downloading dataset

In [None]:
%%capture
# Download the Dakshina v1.0 archive and unpack it into the current working directory.
# The %%capture magic above hides the output of every command in this cell.
!curl https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar --output dakshana.tar
!tar -xvf  'dakshana.tar'
# Weights & Biases client; the sweep cells further down call wandb.login/sweep/agent.
!pip install wandb
import wandb

# setting path variables

In [6]:
# Hindi ("hi") sampled transliteration lexicons inside the extracted Dakshina archive:
# one TSV file each for the train, dev (validation) and test splits.
train_path, val_path, test_path = (
    "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",
    "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv",
    "/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv",
)

# Data preprocessing of downloaded data and loading

In [7]:
class DataProcessing():
  """Loads the transliteration lexicons and converts them to padded index tensors.

  After `process()` has run, `tokenizer` holds the character tokenizer fitted on the
  training inputs and `output_tokenizer` the one fitted on the training targets.
  """

  def __init__(self,train, val, test):
    """Store the paths of the train, validation and test TSV files."""
    self.train_path = train
    self.val_path = val
    self.test_path = test
    self.tokenizer=None
    self.output_tokenizer = None

  def load(self, path):
    """Read one lexicon file and return `(inputs, targets)` as lists of strings.

    The file is read as a header-less TSV with the columns target, input, count.
    Every target is wrapped as "\\t" + target + "\\n" so that tab marks the start
    of a sequence and newline marks its end.
    """
    # dtype=str + keep_default_na=False: every field stays exactly the text that is
    # in the file. With pandas' default NA parsing, words such as "null", "NA" or
    # "None" are read as missing values and would come out as the string "nan".
    df = pd.read_csv(
        path,
        sep="\t",
        names=["target", "input", "count"],
        dtype=str,
        keep_default_na=False,
    )
    # Rows with fewer than three fields still produce missing cells; keep them as
    # empty strings so the string concatenation below cannot fail.
    df = df.fillna("")

    inputs = df["input"].tolist()
    targets = ["\t" + target + "\n" for target in df["target"]]
    return inputs, targets

  def process(self):
    """Fit the tokenizers on the training split and encode all three splits.

    Returns:
      train_input_tensor, train_output_tensor, val_input_tensor, val_output_tensor,
      test_input_tensor, test_output_tensor -- post-padded integer arrays. The
      validation and test arrays are padded to the same widths as the training arrays.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences

    # Both tokenizers are fitted on the training split only; validation and test
    # reuse them.
    train_input, train_output = self.load(self.train_path)
    input_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    input_tokenizer.fit_on_texts(train_input)
    target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    target_tokenizer.fit_on_texts(train_output)
    self.tokenizer = input_tokenizer
    self.output_tokenizer = target_tokenizer

    train_input_tensor = pad(input_tokenizer.texts_to_sequences(train_input), padding='post')
    train_output_tensor = pad(target_tokenizer.texts_to_sequences(train_output), padding='post')
    input_width = train_input_tensor.shape[1]
    output_width = train_output_tensor.shape[1]

    def encode(path):
      # Encode a non-training split with the training tokenizers and widths.
      texts_in, texts_out = self.load(path)
      encoded_in = pad(input_tokenizer.texts_to_sequences(texts_in), padding='post', maxlen=input_width)
      encoded_out = pad(target_tokenizer.texts_to_sequences(texts_out), padding='post', maxlen=output_width)
      return encoded_in, encoded_out

    val_input_tensor, val_output_tensor = encode(self.val_path)
    test_input_tensor, test_output_tensor = encode(self.test_path)

    return train_input_tensor, train_output_tensor, val_input_tensor, val_output_tensor, test_input_tensor, test_output_tensor


# loading data

In [8]:
# One preprocessor for all three splits.
data = DataProcessing(train_path, val_path, test_path)

# Raw (input, target) strings of every split; the evaluation code compares decoded
# words against these.
(train_input, train_output), (val_input, val_output), (test_input, test_output) = [
    data.load(split_path) for split_path in (train_path, val_path, test_path)
]

# Padded integer tensors for the model; this call also fits both tokenizers.
(
    train_input_tensor,
    train_output_tensor,
    val_input_tensor,
    val_output_tensor,
    test_input_tensor,
    test_output_tensor,
) = data.process()

In [10]:
# Vocabulary sizes: one more than the number of known characters, which leaves
# room for index 0 (the value the padded tensors are filled with).
num_encoder_tokens = 1 + len(data.tokenizer.word_index)
num_decoder_tokens = 1 + len(data.output_tokenizer.word_index)

# Padded sequence lengths, taken from the training tensors.
max_encoder_seq_length, max_decoder_seq_length = (
    train_input_tensor.shape[1],
    train_output_tensor.shape[1],
)

# Reverse lookups: token index -> character.
index_to_char_input = {index: char for char, index in data.tokenizer.word_index.items()}
index_to_char_target = {index: char for char, index in data.output_tokenizer.word_index.items()}

# Model class to build a model

In [11]:
class Model():
  """Builds the teacher-forced encoder-decoder training model.

  `build` reads the notebook-level globals `max_encoder_seq_length`,
  `max_decoder_seq_length`, `num_encoder_tokens` and `num_decoder_tokens`.
  """

  def __init__(self, rnn, latent, dropout, embedding):
    """
    Args:
      rnn: cell type, "LSTM" or "GRU"; any other value selects SimpleRNN.
      latent: number of units of every recurrent layer.
      dropout: dropout rate passed to every recurrent layer.
      embedding: output size of both embedding layers.
    """
    self.rnn = rnn
    self.latent=latent
    self.dropout=dropout
    self.embedding = embedding

  def layer(self):
    """Return a new recurrent layer of the configured type.

    The layer returns both the full output sequence and its final state(s).
    """
    cell_types = {"LSTM": keras.layers.LSTM, "GRU": keras.layers.GRU}
    cell = cell_types.get(self.rnn, keras.layers.SimpleRNN)
    return cell(self.latent, return_state=True, return_sequences=True, dropout=self.dropout)

  def _stack(self, inputs, depth, initial_state=None):
    """Chain recurrent layers and return `(outputs, states)` of the last one.

    At least one layer is always created. When `initial_state` is given, it is
    passed to every layer of the stack.
    """
    outputs, states = inputs, None
    for _ in range(max(depth, 1)):
      if initial_state is None:
        result = self.layer()(outputs)
      else:
        result = self.layer()(outputs, initial_state=initial_state)
      # result is [sequence, state] for GRU/SimpleRNN and [sequence, h, c] for LSTM.
      outputs, states = result[0], list(result[1:])
    return outputs, states

  def build(self, encoderlayers, decoderlayers):
    """Create the training model.

    Args:
      encoderlayers: number of stacked encoder layers.
      decoderlayers: number of stacked decoder layers.

    Returns:
      A `keras.Model` taking `[encoder_inputs, decoder_inputs]` (token indices) and
      producing a softmax over the target characters for every decoder position.
    """
    # Encoder: token indices -> embedding -> stacked recurrent layers.
    encoder_inputs = keras.Input(shape=(max_encoder_seq_length,))
    embedded = keras.layers.Embedding(num_encoder_tokens, self.embedding)(encoder_inputs)
    _, encoder_states = self._stack(embedded, encoderlayers)

    # Decoder: every layer starts from the final encoder state. This holds for all
    # cell types; without it the decoder would not depend on the input word at all.
    decoder_inputs = keras.Input(shape=(max_decoder_seq_length,))
    embed = keras.layers.Embedding(num_decoder_tokens, self.embedding)(decoder_inputs)
    decoder_outputs, _ = self._stack(embed, decoderlayers, initial_state=encoder_states)

    # Final projection onto the target vocabulary. The layer is named so that it
    # can be retrieved again with model.get_layer('final').
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax",name='final')
    decoder_outputs = decoder_dense(decoder_outputs)

    # encoder_inputs -> input to the encoder
    # decoder_inputs -> input to the decoder for teacher forcing
    # decoder_outputs -> per-position character probabilities
    model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [12]:
import copy
# Build the stand-alone encoder and decoder models used for inference
def build_inference(model,encoder_layers,decoder_layers):
    """Split a trained model into separate encoder and decoder inference models.

    Layers are looked up by position in `model.layers`. The code assumes this
    layout (TODO: confirm if the training graph changes):
      - `model.layers[encoder_layers + 3]` is the last encoder recurrent layer,
      - `model.layers[encoder_layers + 2]` is the decoder embedding,
      - `model.layers[encoder_layers + 4 + i]` is the i-th decoder recurrent layer,
      - the output projection is the layer named 'final'.

    Args:
      model: the trained training model.
      encoder_layers: number of encoder recurrent layers in `model`.
      decoder_layers: number of decoder recurrent layers in `model`.

    Returns:
      (encoder_model, decoder_model). The encoder maps input tokens to the final
      encoder state(s). The decoder takes one token per word plus one state input
      per decoder layer (two for LSTM: h then c) and returns the character
      probabilities followed by the updated states, in the same order.
    """
    encoder_inputs = model.input[0]
    last_encoder = model.layers[encoder_layers+3]
    is_lstm = isinstance(last_encoder, keras.layers.LSTM)
    # output is [sequence, h, c] for LSTM and [sequence, state] otherwise.
    encoder_states = list(last_encoder.output[1:])
    encoder_model = keras.Model(encoder_inputs, encoder_states)

    # The decoder is run one character at a time.
    decoder_inputs = keras.Input(shape=(1,))
    decoder_states_inputs = []  # one Input per state tensor of every decoder layer
    decoder_states = []         # the matching updated states
    last = None
    for i in range(decoder_layers):
        decoder = model.layers[i+encoder_layers+4]
        # The state size comes from the layer itself rather than a notebook global.
        units = decoder.units
        state_inputs = [keras.Input(shape=(units,), name='inp3_'+str(i))]
        if is_lstm:
            state_inputs.append(keras.Input(shape=(units,), name='inp4_'+str(i)))

        if i == 0:
            layer_input = model.layers[i+encoder_layers+2](decoder_inputs)
        else:
            layer_input = last
        result = decoder(layer_input, initial_state=state_inputs)

        last = result[0]
        decoder_states_inputs.extend(state_inputs)
        decoder_states.extend(result[1:])

    decoder_dense = model.get_layer('final')
    decoder_outputs = decoder_dense(last)
    decoder_model = keras.Model(
        [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
    )
    return encoder_model,decoder_model

# evaluation class used for validating and accuracy calculation

In [32]:
class evaluation():
  """Greedy decoding and word-level accuracy for a pair of inference models.

  Uses the notebook-level globals `data`, `index_to_char_target`,
  `max_decoder_seq_length`, `val_input_tensor`, `val_output`, `test_input_tensor`,
  `test_input` and `test_output`.
  """

  def __init__(self, encoder,decoder,encoder_layers,decoder_layers):
    """Store the inference encoder/decoder and the layer counts of the model."""
    self.encoder = encoder
    self.decoder = decoder
    self.decoder_layers = decoder_layers
    self.encoder_layers= encoder_layers

  def decodebatch(self, input, batch_size):
    """Greedily decode every row of `input` and return the predicted words.

    Args:
      input: encoder input tensor with `batch_size` rows.
      batch_size: number of words to decode.

    Returns:
      A list of `batch_size` strings, without start and end markers.
    """
    # The encoder returns a list of states (LSTM: h and c) or a single array.
    # Normalise to a flat list, whatever the cell type is.
    states_value = self.encoder.predict(input)
    if isinstance(states_value, (list, tuple)):
      states_value = list(states_value)
    else:
      states_value = [states_value]
    # Every decoder layer starts from the same encoder state(s).
    states_value = states_value * max(self.decoder_layers, 1)

    # Index of the previously predicted character of every word; all words start
    # with the start marker "\t".
    char_index = np.zeros((batch_size, 1))
    char_index[:, 0] = data.output_tokenizer.word_index['\t']

    predicted = ["" for _ in range(batch_size)]
    done = [False] * batch_size
    for _ in range(max_decoder_seq_length):
      out = self.decoder.predict(tuple([char_index] + states_value))
      output_prob = out[0]
      states_value = list(out[1:])
      best = np.argmax(output_prob[:, -1, :], axis=-1)
      for j in range(batch_size):
        if done[j]:
          continue
        sampled_token_index = int(best[j])
        # Index 0 is the padding value; treat it like the end marker.
        if sampled_token_index == 0:
          sampled_char = '\n'
        else:
          sampled_char = index_to_char_target[sampled_token_index]
        if sampled_char == '\n':
          done[j] = True
          continue
        predicted[j] += sampled_char
        # Feed the predicted character back in at the next step.
        char_index[j, 0] = data.output_tokenizer.word_index[sampled_char]
      # Nothing left to decode: skip the remaining decoder calls.
      if all(done):
        break
    return predicted

  def testaccuracy(self):
    """Return the exact-match word accuracy on the test set.

    Every test word is also appended as "input target prediction" to
    success.txt (exact match) or failure.txt (mismatch).
    """
    total = test_input_tensor.shape[0]
    pred = self.decodebatch(test_input_tensor, total)
    hits, misses = [], []
    for index in range(total):
      predicted = pred[index]
      # Strip the start ("\t") and end ("\n") markers from the target.
      target_word = test_output[index][1:-1]
      line = test_input[index]+' '+target_word+' '+predicted+'\n'
      if target_word == predicted:
        hits.append(line)
      else:
        misses.append(line)
    # Each file is opened once, and only when there is something to append.
    for path, lines in (("success.txt", hits), ("failure.txt", misses)):
      if lines:
        with open(path, "a", encoding="utf-8") as f:
          f.writelines(lines)
    return float(len(hits))/float(total)

  def batchvalidate(self):
    """Return the exact-match word accuracy on the validation set."""
    total = val_input_tensor.shape[0]
    pred = self.decodebatch(val_input_tensor, total)
    # Targets are compared without their start ("\t") and end ("\n") markers.
    success = sum(1 for index in range(total) if pred[index] == val_output[index][1:-1])
    return float(success)/float(total)

# train function

In [None]:
# Hyper-parameters of the most recent run, published as notebook-level globals.
rnn_type=None
embedding_dim=None
model= None
latent_dim = None
enc_layers=None
dec_layers=None
# The training functions declare these two names as globals, so initialise them too.
enc_layer=None
dec_layer=None
def train(wandb=True):
  """Train one model, evaluate it on the validation set and optionally log to W&B.

  Args:
    wandb: when truthy, hyper-parameters are read from the active Weights & Biases
      run (sweep mode) and results are logged to it; otherwise a fixed default
      configuration is used.

  Side effects:
    Sets the globals `rnn_type`, `embedding_dim`, `latent_dim`, `enc_layer`,
    `dec_layer` and `model` (the trained Keras model) and prints the validation
    accuracy.
  """
  global rnn_type
  global embedding_dim
  global model
  global latent_dim
  global enc_layer
  global dec_layer

  use_wandb = bool(wandb)
  if use_wandb:
    # The `wandb` parameter shadows the wandb module inside this function, so the
    # module is imported under a different local name.
    import wandb as wandb_api
    wandb_api.init()
    cfg = wandb_api.config
    rnn_type=cfg.cell
    embedding_dim=cfg.Embedding
    latent_dim=cfg.Latent
    enc_layer=cfg.Encoder_layer
    dec_layer=cfg.Decoder_layer
    dropout=cfg.dropout
    epochs=cfg.epochs
    bs=cfg.Batch_size
    wandb_api.run.name = 'epochs_'+str(epochs)+'_bs_'+str(bs)+'_rnn_type_'+str(rnn_type)+'_em_'+str(embedding_dim)+'_latd_'+str(latent_dim)+'_encs_'+str(enc_layer)+'_decs_'+str(dec_layer)+'_dr_'+str(dropout)
  else:
    rnn_type='LSTM'
    embedding_dim=32
    latent_dim=256
    enc_layer=4
    dec_layer=4
    dropout=0.3
    epochs=20
    bs=64

  model = Model(rnn_type, latent_dim, dropout, embedding_dim).build(enc_layer, dec_layer)

  model.compile(
      optimizer="adam",
      loss=keras.losses.SparseCategoricalCrossentropy(reduction='none'),
      metrics=["accuracy"],
  )
  # Teacher forcing: the target is the decoder input shifted left by one step,
  # with a zero appended so the shape stays the same.
  decoder_targets = tf.concat([train_output_tensor[:,1:],tf.zeros((train_output_tensor[:,:].shape[0],1))], axis=1)
  hist=model.fit(
        [train_input_tensor, train_output_tensor],
        decoder_targets,
        batch_size=bs,
        epochs=epochs,shuffle=True
  )

  # Build the inference models once and score the validation set.
  encoder_model,decoder_model=build_inference(model,encoder_layers=enc_layer,decoder_layers=dec_layer)
  evaluator = evaluation(encoder_model, decoder_model, enc_layer, dec_layer)
  val_acc = evaluator.batchvalidate()
  print("Validation Accuracy",val_acc)
  if use_wandb:
    # Log the training loss of the final epoch and the validation accuracy.
    wandb_api.log({"train_loss": hist.history['loss'][-1]})
    wandb_api.log({"val_acc":val_acc})

# sweep config

In [None]:
# Bayesian search that maximises the logged "val_acc" metric. Every entry of the
# search space below becomes {"values": [...]} under "parameters".
sweep_config = dict(
    name="Bayesian Sweep without attention",
    method="bayes",
    metric=dict(name="val_acc", goal="maximize"),
    parameters={
        option: {"values": choices}
        for option, choices in (
            ("cell", ["RNN", "GRU", "LSTM"]),
            ("Embedding", [32, 15, 10]),
            ("Latent", [512, 256]),
            ("Encoder_layer", [3, 5]),
            ("Decoder_layer", [2, 3, 4]),
            ("dropout", [0, 0.2, 0.3]),
            ("epochs", [10, 15, 20]),
            ("Batch_size", [32, 64, 100]),
        )
    },
)

# run this cell to launch the sweep

In [None]:
# Log in to Weights & Biases before creating the sweep.
wandb.login()
# Register the search space from `sweep_config` under the project and keep its id.
sweep_id = wandb.sweep(sweep_config, project="CS6910-Assignment-3")
# Start an agent for that sweep; it calls `train` (with no arguments) for its runs.
wandb.agent(sweep_id, function=train)

# manual training

In [36]:
# Notebook-level slots, reset before a manual run.
rnn_type = embedding_dim = model = latent_dim = enc_layers = dec_layers = None

# Training entry point for a hand-picked configuration (no W&B involved).
def manual_train(config):
  """Train with the settings in `config`, then print validation and test accuracy.

  `config` must provide the attributes rnn_type, embedding_dim, latent_dim,
  enc_layer, dec_layer, dropout, epochs and bs. The first five are also stored in
  the globals of the same name, and the trained Keras model is stored in the
  global `model`. A diagram of the model is written to model.png.
  """
  global rnn_type, embedding_dim, model, latent_dim, enc_layer, dec_layer

  rnn_type = config.rnn_type
  embedding_dim = config.embedding_dim
  latent_dim = config.latent_dim
  enc_layer = config.enc_layer
  dec_layer = config.dec_layer
  dropout, epochs, bs = config.dropout, config.epochs, config.bs

  model = Model(rnn_type, latent_dim, dropout, embedding_dim)
  model = model.build(enc_layer, dec_layer)

  loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction='none')
  model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
  tf.keras.utils.plot_model(
      model,
      to_file='model.png',
      show_shapes=True,
      show_dtype=True,
      show_layer_names=True,
      dpi=96,
  )

  # Target = decoder input shifted left by one step, with a zero appended.
  zero_column = tf.zeros((train_output_tensor[:,:].shape[0],1))
  decoder_targets = tf.concat([train_output_tensor[:,1:], zero_column], axis=1)
  model.fit(
      [train_input_tensor, train_output_tensor],
      decoder_targets,
      batch_size=bs,
      epochs=epochs,
      shuffle=True,
  )

  # Score the trained model with the separate inference encoder/decoder.
  encoder_model, decoder_model = build_inference(
      model, encoder_layers=enc_layer, decoder_layers=dec_layer
  )
  scorer = evaluation(encoder_model, decoder_model, enc_layer, dec_layer)
  val_acc = scorer.batchvalidate()
  print("Validation Accuracy",val_acc)
  print("Test Accuracy",scorer.testaccuracy())

In [34]:
class configuration:
  """Plain container for the hyper-parameters consumed by `manual_train`."""

  def __init__(self, rnn_type, embedding_dim,latent_dim,enc_layer,dec_layer,dropout,epochs,bs):
    # Cell type and layer sizes.
    self.rnn_type, self.embedding_dim, self.latent_dim = rnn_type, embedding_dim, latent_dim
    # Number of encoder / decoder layers.
    self.enc_layer, self.dec_layer = enc_layer, dec_layer
    # Dropout rate, number of epochs and batch size.
    self.dropout, self.epochs, self.bs = dropout, epochs, bs

# below cell gives test accuracy for our best model 

In [None]:
# Hyper-parameters of the best model, spelled out by name.
config = configuration(
    rnn_type='LSTM',
    embedding_dim=32,
    latent_dim=256,
    enc_layer=4,
    dec_layer=4,
    dropout=.3,
    epochs=20,
    bs=64,
)
manual_train(config)