In [25]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.16-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 5.4 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 46.3 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.11-py2.py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 47.0 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 952 kB/s 
[?25hCollecting smm

In [26]:
import wandb
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [6]:
# Download the Dakshina dataset v1.0 archive (about 1.9 GB per the transfer log
# below) and store it as daksh.tar in the current working directory.
!curl https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar --output daksh.tar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1915M  100 1915M    0     0   106M      0  0:00:18  0:00:18 --:--:--  103M


In [3]:
%%capture
# Unpack the downloaded archive; %%capture (which must stay on the first line
# of the cell) hides the per-file listing that tar's -v flag prints.
!tar -xvf  'daksh.tar'

In [4]:
def data(path,input_tokenizer=None,target_tokenizer=None,input_length=None,target_length=None):
  """Read a tab-separated lexicon file and turn it into padded character-index arrays.

  The file is read without a header into columns "1", "2" and "3" (all cast
  to str). Column "2" is used as the input text and column "1" as the target
  text; rows where either one equals '</s>' are skipped. Every target is
  wrapped as "\\t" + text + "\\n" so the decoder has explicit start and end
  markers.

  Args:
    path: location of the TSV file.
    input_tokenizer: tokenizer fitted on the training inputs. When None, the
      call is treated as the training split: the rows are shuffled and a new
      character-level tokenizer is fitted on the inputs.
    target_tokenizer: tokenizer fitted on the training targets; a new one is
      fitted when None.
    input_length: width to pad the input sequences to (normally the width of
      the training tensor). None pads to the longest sequence in this file.
    target_length: same as input_length, for the target sequences.

  Returns:
    (input_texts, input_tensor, input_tokenizer,
     target_texts, target_tensor, target_tokenizer)
    The tensors are the arrays produced by pad_sequences, padded with zeros
    at the end of each sequence.
  """
  df = pd.read_csv(path,sep="\t",names=["1", "2","3"]).astype(str)
  # A missing input tokenizer marks the training split, the only one shuffled.
  if input_tokenizer is None:
    df = df.sample(frac=1)

  input_texts = []
  target_texts = []
  for target_text, input_text in zip(df["1"], df["2"]):
    if target_text == '</s>' or input_text == '</s>':
      continue
    input_texts.append(input_text)
    # start-of-sequence and end-of-sequence markers for the decoder
    target_texts.append("\t" + target_text + "\n")

  # Validation and test reuse the tokenizers fitted on the training split.
  if input_tokenizer is None:
    input_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    input_tokenizer.fit_on_texts(input_texts)
  if target_tokenizer is None:
    target_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True)
    target_tokenizer.fit_on_texts(target_texts)

  # Let pad_sequences pad straight to the requested width. This keeps every
  # split as the same kind of integer array (concatenating float tf.zeros
  # turned the eval splits into float tensors) and truncates, instead of
  # failing, when an eval sequence is longer than the training width.
  input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
      input_tokenizer.texts_to_sequences(input_texts),
      maxlen=input_length, padding='post', truncating='post')
  target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
      target_tokenizer.texts_to_sequences(target_texts),
      maxlen=target_length, padding='post', truncating='post')
  return input_texts,input_tensor,input_tokenizer,target_texts,target_tensor,target_tokenizer

In [7]:
# Training split: fits both tokenizers and fixes the padded sequence widths.
(input_texts, input_tensor, input_tokenizer,
 target_texts, target_tensor, target_tokenizer) = data(
    "/content/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.train.tsv"
)

# Dev split: encoded with the training tokenizers and padded to the training widths.
(val_input_texts, val_input_tensor, val_input_tokenizer,
 val_target_texts, val_target_tensor, val_target_tokenizer) = data(
    "/content/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.dev.tsv",
    input_tokenizer,
    target_tokenizer,
    input_tensor.shape[1],
    target_tensor.shape[1],
)

# Test split: same treatment as the dev split.
(test_input_texts, test_input_tensor, test_input_tokenizer,
 test_target_texts, test_target_tensor, test_target_tokenizer) = data(
    "/content/dakshina_dataset_v1.0/ta/lexicons/ta.translit.sampled.test.tsv",
    input_tokenizer,
    target_tokenizer,
    input_tensor.shape[1],
    target_tensor.shape[1],
)

In [10]:


# Vocabulary sizes: one more than the number of known characters, because the
# zero-padded positions occupy index 0.
num_encoder_tokens = 1 + len(input_tokenizer.word_index)
num_decoder_tokens = 1 + len(target_tokenizer.word_index)

# Padded width of the training tensors; the models are built around these.
max_encoder_seq_length = input_tensor.shape[1]
max_decoder_seq_length = target_tensor.shape[1]

# Reverse lookups: character index -> character.
index_to_char_input = {index: char for char, index in input_tokenizer.word_index.items()}
index_to_char_target = {index: char for char, index in target_tokenizer.word_index.items()}



In [12]:
#Build the model
def build_model(rnn_type,embedding_dim,encoder_layers,decoder_layers,dropout,units=None):
  """Build the teacher-forced encoder-decoder model used for training.

  The encoder is an embedding followed by `encoder_layers` recurrent layers;
  only the final state(s) of the last encoder layer are kept. The decoder is
  an embedding followed by `decoder_layers` recurrent layers, every one of
  them initialised with those encoder states, and a softmax Dense layer
  named 'final'.

  The layers are created and wired in exactly the same order for every
  `rnn_type`, because build_inference() retrieves them from `model.layers`
  by position.

  Reads the notebook-level names num_encoder_tokens, num_decoder_tokens,
  max_encoder_seq_length and max_decoder_seq_length.

  Args:
    rnn_type: 'LSTM', 'GRU' or 'RNN' (SimpleRNN).
    embedding_dim: output size of both embedding layers.
    encoder_layers: number of stacked encoder recurrent layers (>= 1).
    decoder_layers: number of stacked decoder recurrent layers (>= 1).
    dropout: input dropout rate passed to every recurrent layer.
    units: hidden size of every recurrent layer. When None, the notebook-level
      `latent_dim` (set by train()) is used, as before.

  Returns:
    keras.Model mapping [encoder_inputs, decoder_inputs] to per-step
    probabilities over the target vocabulary.

  Raises:
    ValueError: for an unknown rnn_type or a layer count below 1.
  """
  rnn_classes = {
      'LSTM': keras.layers.LSTM,
      'GRU': keras.layers.GRU,
      'RNN': keras.layers.SimpleRNN,
  }
  if rnn_type not in rnn_classes:
    raise ValueError("rnn_type must be one of %s, got %r" % (sorted(rnn_classes), rnn_type))
  if encoder_layers < 1 or decoder_layers < 1:
    raise ValueError("encoder_layers and decoder_layers must both be at least 1")
  rnn_layer = rnn_classes[rnn_type]
  # Fall back to the global hidden size when the caller does not pass one.
  hidden_units = latent_dim if units is None else units

  # ----- encoder -----
  # input layer; takes in the tokenized input
  encoder_inputs = keras.Input(shape=(max_encoder_seq_length,))
  encoder_sequence = keras.layers.Embedding(num_encoder_tokens, embedding_dim)(encoder_inputs)
  # every layer except the last one passes its full output sequence on
  for _ in range(encoder_layers-1):
    encoder_sequence = rnn_layer(hidden_units, return_sequences=True,dropout=dropout)(encoder_sequence)
  # the last layer returns [output, state...]; only the states are kept
  # (h and c for LSTM, a single state for GRU / SimpleRNN)
  encoder_result = rnn_layer(hidden_units, return_state=True,dropout=dropout)(encoder_sequence)
  encoder_states = list(encoder_result[1:])

  # ----- decoder -----
  decoder_inputs = keras.Input(shape=(max_decoder_seq_length,))
  decoder_sequence = keras.layers.Embedding(num_decoder_tokens, embedding_dim)(decoder_inputs)
  for _ in range(decoder_layers):
    decoder_rnn = rnn_layer(hidden_units, return_sequences=True, return_state=True,dropout=dropout)
    # element 0 is the output sequence; the returned states are not needed here
    decoder_sequence = decoder_rnn(decoder_sequence, initial_state=encoder_states)[0]
  # dense layer at the end
  decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax",name='final')
  decoder_outputs = decoder_dense(decoder_sequence)

  # encoder_inputs -> input to the encoder
  # decoder_inputs -> input to the decoder for teacher forcing
  # decoder_outputs -> output
  model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
  return model


In [13]:
import copy
def build_inference(model,encoder_layers,decoder_layers):
    """Split a trained seq2seq model into stand-alone encoder and decoder models.

    Layers are looked up by position in `model.layers`, following the layout
    produced by build_model():
      * index encoder_layers+3     -> last encoder recurrent layer
      * index encoder_layers+2     -> decoder embedding
      * index encoder_layers+4+i   -> i-th decoder recurrent layer
      * layer named 'final'        -> output Dense layer

    Args:
      model: trained model (or a reloaded copy of it).
      encoder_layers: number of encoder recurrent layers in `model`.
      decoder_layers: number of decoder recurrent layers in `model` (>= 1).

    Returns:
      (encoder_model, decoder_model)
      encoder_model maps the encoder input to the final encoder state(s).
      decoder_model takes [token] + one group of state inputs per decoder
      layer and returns [probabilities] + the updated states in the same
      order. State inputs are named 'inp3_<i>' (and 'inp4_<i>' for the LSTM
      cell state).

    Raises:
      ValueError: if decoder_layers < 1 or the layer found at the expected
        position is not a recurrent layer.
    """
    if decoder_layers < 1:
      raise ValueError("decoder_layers must be at least 1")
    last_encoder = model.layers[encoder_layers+3]
    if not isinstance(last_encoder, keras.layers.RNN):
      raise ValueError(
          "expected a recurrent layer at model.layers[%d], found %s; "
          "check encoder_layers" % (encoder_layers+3, type(last_encoder).__name__))

    # ----- encoder -----
    encoder_inputs = model.input[0]
    # output is [sequence_output, state...]: two states for LSTM, one otherwise
    encoder_states = list(last_encoder.output[1:])
    encoder_model = keras.Model(encoder_inputs, encoder_states)

    # ----- decoder: one time step per call -----
    decoder_inputs = keras.Input(shape=(1,))
    decoder_embedding = model.layers[encoder_layers+2]
    # one name prefix per state tensor of a layer
    state_prefixes = ('inp3_', 'inp4_')[:len(encoder_states)]
    decoder_states_inputs = []  # input layers feeding each layer's state(s)
    decoder_states = []         # updated state(s) returned by each layer
    decoder_sequence = decoder_embedding(decoder_inputs)
    for i in range(decoder_layers):
      decoder_rnn = model.layers[i+encoder_layers+4]
      # every layer gets inputs through which its own state can be supplied;
      # the width comes from the layer itself rather than a global
      layer_state_inputs = [
          keras.Input(shape=(decoder_rnn.units,), name=prefix+str(i))
          for prefix in state_prefixes
      ]
      result = decoder_rnn(decoder_sequence, initial_state=layer_state_inputs)
      decoder_sequence = result[0]
      decoder_states_inputs.extend(layer_state_inputs)
      decoder_states.extend(result[1:])

    decoder_dense = model.get_layer('final')
    decoder_outputs = decoder_dense(decoder_sequence)
    decoder_model = keras.Model(
        [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
    )
    return encoder_model,decoder_model


In [14]:
def decode_batch(rnn_type,input_seq,encoder_model,decoder_model,batch_size,encoder_layers,decoder_layers):
    """Greedily decode a whole batch of input sequences into words.

    Reads the notebook-level names target_tokenizer, index_to_char_target
    and max_decoder_seq_length.

    Args:
      rnn_type: kept for backward compatibility. The number of encoder
        states is now taken from what encoder_model.predict returns, because
        the previous check (`rnn_type=='GRU' or 'RNN'`) was always true and
        wrapped LSTM states incorrectly.
      input_seq: encoded inputs, one row per word.
      encoder_model: model returning the final encoder state(s).
      decoder_model: single-step decoder taking [token] + states and
        returning [probabilities] + states.
      batch_size: number of rows in input_seq.
      encoder_layers: unused; kept for interface compatibility.
      decoder_layers: number of decoder recurrent layers; the encoder states
        are repeated once per layer.

    Returns:
      List of batch_size predicted strings (without start/end markers).
    """
    # Encoder output: a single array for one state, a list for several (LSTM).
    encoder_states = encoder_model.predict(input_seq)
    if not isinstance(encoder_states, (list, tuple)):
      encoder_states = [encoder_states]
    # Every decoder layer starts from the same encoder state(s).
    states_value = list(encoder_states) * max(decoder_layers, 1)

    # Index of the previously predicted character for every word in the
    # batch; each word starts from the '\t' start marker.
    prev_char_index = np.full((batch_size, 1), target_tokenizer.word_index['\t'], dtype=float)

    predicted_words = ["" for _ in range(batch_size)]
    done = [False] * batch_size
    for _ in range(max_decoder_seq_length):
        out = decoder_model.predict([prev_char_index] + states_value)
        states_value = list(out[1:])
        # most likely next character for every word at once
        next_indices = np.argmax(out[0][:, -1, :], axis=-1)
        for j in range(batch_size):
          if done[j]:
            continue
          token = int(next_indices[j])
          # index 0 is the padding slot and is treated like the end marker
          sampled_char = '\n' if token == 0 else index_to_char_target[token]
          if sampled_char == '\n':
            done[j] = True
            continue
          predicted_words[j] += sampled_char
          # feed the predicted character back in on the next step
          prev_char_index[j, 0] = token
        # nothing left to decode: skip the remaining decoder calls
        if all(done):
          break
    return predicted_words


In [15]:
def test_accuracy(encoder_model,decoder_model,encoder_layers,decoder_layers):
  success=0
  #Get all the predicted words
  pred=decode_batch("GRU",test_input_tensor,encoder_model,decoder_model,test_input_tensor.shape[0],encoder_layers,decoder_layers)
  for seq_index in range(test_input_tensor.shape[0]):
      predicted_word = pred[seq_index]
      target_word=test_target_texts[seq_index][1:-1]
      #test the word one by one and write to files
      if target_word == predicted_word:
        success+=1
        f = open("success.txt", "a")
        f.write(test_input_texts[seq_index]+' '+target_word+' '+predicted_word+'\n')
        f.close()
      else:
        f = open("failure.txt", "a")
        f.write(test_input_texts[seq_index]+' '+target_word+' '+predicted_word+'\n')
        f.close()
  return float(success)/float(test_input_tensor.shape[0])

In [16]:
def batch_validate(encoder_model,decoder_model,encoder_layers,decoder_layers):
  """Exact-match word accuracy on the validation split.

  Decodes the notebook-level val_input_tensor in a single batch and counts
  the predictions equal to the corresponding entry of val_target_texts with
  its first and last character (start/end markers) removed.

  Returns:
    Number of exact matches divided by the number of validation words.
  """
  total = val_input_tensor.shape[0]
  # all predicted words, in the same order as the validation inputs
  pred = decode_batch("GRU", val_input_tensor, encoder_model, decoder_model,
                      total, encoder_layers, decoder_layers)
  hits = sum(
      1 for k in range(total)
      if pred[k] == val_target_texts[k][1:-1]
  )
  return float(hits) / float(total)

In [17]:
def train():
  """Train the GRU seq2seq model, validate it and save all artefacts.

  Sets the notebook-level `latent_dim` and `epochs` (read by the model
  building helpers), trains on the training split with teacher forcing,
  saves the model, rebuilds the inference encoder/decoder from the saved
  copy, prints the validation accuracy and saves the inference models.

  Files written in the working directory: "s2s.keras", "enc_model",
  "dec_model" and "seq2seq".

  Returns:
    The trained keras model (previously nothing was returned).
  """
  global latent_dim, epochs
  latent_dim = 256
  epochs = 10
  encoder_layers = 3
  decoder_layers = 3
  model_path = "s2s.keras"

  model=build_model(rnn_type="GRU",embedding_dim=64,encoder_layers=encoder_layers,decoder_layers=decoder_layers,dropout=0.3)

  model.compile(
      optimizer="adam", loss=keras.losses.SparseCategoricalCrossentropy(
                                                              reduction='none'), metrics=["accuracy"]
  )
  # Teacher forcing: the expected output at step t is the decoder input at
  # step t+1, so the targets are the decoder inputs shifted left by one with
  # a zero appended at the end.
  decoder_targets = tf.concat([target_tensor[:,1:],tf.zeros((target_tensor.shape[0],1))], axis=1)
  model.fit(
        [input_tensor, target_tensor],
        decoder_targets,
        batch_size=100,
        epochs=epochs,shuffle=True
  )
  # Save the model and restore it from the same path (it used to be reloaded
  # from "/content/s2s.keras", which only matched when run from /content).
  model.save(model_path)
  inf = keras.models.load_model(model_path)
  encoder_model,decoder_model=build_inference(inf,encoder_layers=encoder_layers,decoder_layers=decoder_layers)
  # Report the validation accuracy instead of silently discarding it.
  val_acc=batch_validate(encoder_model,decoder_model,encoder_layers,decoder_layers)
  print("Validation accuracy:", val_acc)
  encoder_model.save("enc_model")
  decoder_model.save("dec_model")
  model.save("seq2seq")
  return model


In [None]:
# Run the whole training pipeline defined in train() and keep whatever it returns.
Train_model = train()

In [None]:
# Reload the trained model from disk, split it into inference encoder and
# decoder (3 encoder / 3 decoder layers, matching train()), and save both.
# NOTE(review): the absolute /content path assumes the model was saved while
# the working directory was /content — confirm when running elsewhere.
inf = keras.models.load_model("/content/s2s.keras")
encoder_model,decoder_model=build_inference(inf,encoder_layers=3,decoder_layers=3)
encoder_model.save("enc_model")
decoder_model.save("dec_model")


In [None]:


# Restore the saved inference models and measure exact-match accuracy on the
# validation split (3 encoder / 3 decoder layers).
encoder_model, decoder_model = (
    keras.models.load_model("/content/enc_model"),
    keras.models.load_model("/content/dec_model"),
)
val_acc = batch_validate(encoder_model, decoder_model, 3, 3)


In [22]:
# Exact-match accuracy on the test split; test_accuracy() also appends every
# word to success.txt or failure.txt.
Test_accuracy=test_accuracy(encoder_model,decoder_model,3,3)

In [23]:
# Show the test accuracy as a percentage.
print(Test_accuracy*100)

46.27039627039627


In [32]:
# Start a Weights & Biases run in the CS6910_Assignment3 project (under the
# hard-coded entity) and record the test accuracy computed above.
wandb.init(project="CS6910_Assignment3", entity="swe-rana")
wandb.log({"Test accuracy": Test_accuracy})

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…