# Imports

In [2]:
!pip install -q wandb

[K     |████████████████████████████████| 2.1MB 23.0MB/s 
[K     |████████████████████████████████| 163kB 48.0MB/s 
[K     |████████████████████████████████| 133kB 44.1MB/s 
[K     |████████████████████████████████| 102kB 8.6MB/s 
[K     |████████████████████████████████| 71kB 6.7MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [3]:
import numpy as np
import pandas as pd

import tensorflow as tf
ks = tf.keras

import wandb
from wandb.keras import WandbCallback as WandbCallback

import datetime
def timestr(fmt):
  """Format the current local time with the strftime pattern `fmt` and return the string."""
  return datetime.datetime.now().strftime(fmt)

In [4]:
# Colab only: mount Google Drive so that the paths under /content/drive used
# later in this notebook (dataset, sweep log, saved models) are reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Dataset Loading, Characteristic Extraction and Reshaping

In [5]:
# Directory on the mounted Drive that holds the Hindi lexicon TSV files.
basepath = "/content/drive/MyDrive/Sem_8/dl-datasets/dakshina_dataset_v1.0/hi/lexicons"

# Labels applied to the three columns of the header-less TSV files
# (presumably Devanagari word, romanization, attestation count - TODO confirm).
col_names = ['Dev.','Roman','att.']
# Sentinels wrapped around every target ('Dev.') word: start and end of sequence.
STARTCHAR = '\t'
ENDCHAR   = '\n'

def read_as_array(path):
  """Load one lexicon TSV as an object array of (target, input) string pairs.

  Rows are de-duplicated on the 'Dev.' column (only the first romanization of
  each word is kept) and rows with an empty field are dropped. Every 'Dev.'
  word is then wrapped in STARTCHAR ... ENDCHAR.

  Args:
    path: path of a tab-separated file with the three columns in `col_names`.

  Returns:
    Array of shape (n_rows, 2): column 0 is the wrapped 'Dev.' word,
    column 1 is the 'Roman' word.
  """
  data = (
    pd.read_csv(
      path, sep='\t', names=col_names,
      # Only a truly empty field counts as missing. pandas' default NA tokens
      # ("nan", "null", "NA", "None", ...) are legitimate romanizations here and
      # would otherwise be parsed as NaN and silently removed by dropna().
      keep_default_na=False, na_values=[''],
    )
    .drop_duplicates(subset="Dev.")
    .dropna()
  )
  data['Dev.'] = STARTCHAR + data['Dev.'] + ENDCHAR
  return np.array(data)[:,:2]

# Read the train / dev / test lexicons, in that order, from `basepath`.
train_data, val_data, test_data = [
  read_as_array(basepath+split_file)
  for split_file in (
    "/hi.translit.sampled.train.tsv",
    "/hi.translit.sampled.dev.tsv",
    "/hi.translit.sampled.test.tsv",
  )
]

In [6]:
# Character inventories are collected from the training split only.
# Index 0 of each vocabulary is the empty string (the padding slot).
input_vocab  = [''] + sorted({symbol for word in train_data[:,1] for symbol in word})
target_vocab = [''] + sorted({symbol for word in train_data[:,0] for symbol in word})

len_input_vocab, len_target_vocab = len(input_vocab), len(target_vocab)

# character -> integer index lookups
input_dict  = {symbol: idx for idx, symbol in enumerate(input_vocab)}
target_dict = {symbol: idx for idx, symbol in enumerate(target_vocab)}

# Longest word over all three splits; used as the fixed padded sequence length.
max_len_input  = max(len(word) for split in (train_data, val_data, test_data) for word in split[:,1])
max_len_target = max(len(word) for split in (train_data, val_data, test_data) for word in split[:,0])

In [7]:
def str_to_numarray(strs, output_size, charmap):
  """Encode strings as a zero-padded float32 matrix of character indices.

  Args:
    strs: sized iterable of strings, one per output row.
    output_size: number of columns; every string must fit within it.
    charmap: mapping from character to integer index.

  Returns:
    float32 array of shape (len(strs), output_size); unused trailing cells stay 0.
  """
  encoded = np.zeros((len(strs), output_size), dtype="float32")
  # Each `row` is a view into `encoded`, so writes land in the result matrix.
  for row, word in zip(encoded, strs):
    for col, symbol in enumerate(word):
      row[col] = charmap[symbol]
  return encoded

def vectorize_dataset(data):
  """Turn an array of (target, input) word pairs into model-ready tensors.

  Args:
    data: array whose column 0 holds target words and column 1 input words.

  Returns:
    Tuple (enc_inp, dec_inp, dec_tgt_onehot):
      enc_inp        - input-word indices, padded to max_len_input
      dec_inp        - target-word indices, padded to max_len_target
      dec_tgt_onehot - one-hot encoding of dec_inp shifted one step to the left,
                       shape (len(data), max_len_target, len_target_vocab)
  """
  # inputs
  source_ids = str_to_numarray(data[:,1], max_len_input, input_dict)
  target_ids = str_to_numarray(data[:,0], max_len_target, target_dict)

  # targets: drop the first column and zero-pad at the end, so position t
  # holds the symbol the decoder should emit after reading position t
  shifted_ids = np.pad(target_ids[:,1:], ((0,0),(0,1)))
  class_ids = np.arange(len_target_vocab)
  target_onehot = (shifted_ids[:,:,np.newaxis] == class_ids).astype("float32")

  return source_ids, target_ids, target_onehot

# Vectorize each split into (encoder input, decoder input, one-hot decoder target).
enc_inp_train, dec_inp_train, dec_tgt_train_onehot = vectorize_dataset(train_data)
enc_inp_val,   dec_inp_val,   dec_tgt_val_onehot   = vectorize_dataset(val_data)
enc_inp_test,  dec_inp_test,  dec_tgt_test_onehot  = vectorize_dataset(test_data)

# Functions

## Model Handling

In [15]:
# Baseline network configuration; run-specific settings are layered on top of a copy.
DEFAULT_NETPARAMS = dict(
    embed_size=16,
    latent_dim=256,
    enc_layers=1,
    dec_layers=1,
    recurrent_cell="SimpleRNN",
    dec_attention=False,
    dropout=0,
    beam_size=1,
    enc_state_dep="first",
)

In [58]:
def fresh_training_model(netparams):
  """Build an uncompiled, teacher-forced encoder-decoder model.

  Args:
    netparams: dict with the keys
      embed_size     - embedding width for both encoder and decoder
      latent_dim     - units of every recurrent layer
      enc_layers     - number of stacked encoder recurrent layers
      dec_layers     - number of stacked decoder recurrent layers
      recurrent_cell - name of a class in ks.layers (e.g. "SimpleRNN", "GRU", "LSTM")
      dec_attention  - if true, add additive attention over the encoder outputs
      dropout        - rate used inside the recurrent layers and before the output layer
      enc_state_dep  - 'first': only the first decoder layer is initialised with the
                       final state of the last encoder layer; 'all': every decoder
                       layer is; anything else: no decoder layer is
      ("beam_size" is not read here.)

  Returns:
    ks.Model named "training_model" mapping [encoder_input, decoder_input] to
    per-step softmax outputs over the target vocabulary. The dict is attached
    as `model.netparams`.

  Raises:
    AttributeError: if `recurrent_cell` does not name an attribute of ks.layers.
  """
  # unpack network parameters
  embed_size = netparams["embed_size"]
  latent_dim = netparams["latent_dim"]
  enc_layers = netparams["enc_layers"]
  dec_layers = netparams["dec_layers"]
  # Plain attribute lookup instead of exec() + locals(): the exec form evaluated an
  # arbitrary string and depended on exec() writes being visible through a later
  # locals() call, which is not guaranteed (the snapshot semantics changed in Python 3.13).
  recurrent_cell = getattr(ks.layers, netparams["recurrent_cell"])
  dec_attention = netparams["dec_attention"]
  dropout    = netparams["dropout"]
  ## add beam search
  enc_state_dep = netparams["enc_state_dep"]

  # encoder layers
  encoder_input = ks.Input(shape=(None,), name="encoder_input")
  encoder_embedding = ks.layers.Embedding(len_input_vocab+1, embed_size, mask_zero=True, name="encoder_embedding")
  encoder_rnns = [recurrent_cell(latent_dim, return_sequences=True, return_state=True, dropout=dropout, name="encoder_rnn_"+str(i)) for i in range(1,enc_layers+1)]

  # encoder feedforward path
  encoder_output = encoder_embedding(encoder_input)
  for encoder_rnn in encoder_rnns:
    encoder_ret = encoder_rnn(encoder_output)
    # after the loop these hold the output sequence and final state(s) of the LAST encoder layer
    encoder_output, encoder_state = encoder_ret[0], list(encoder_ret[1:])

  # decoder layers
  decoder_input = ks.Input(shape=(None,), name="decoder_input")
  decoder_embedding = ks.layers.Embedding(len_target_vocab+1, embed_size, mask_zero=True, name="decoder_embedding")
  decoder_rnns = [recurrent_cell(latent_dim, return_sequences=True, return_state=True, dropout=dropout, name="decoder_rnn_"+str(i)) for i in range(1,dec_layers+1)]
  if dec_attention:
    decoder_attention = ks.layers.AdditiveAttention(name="decoder_attention")
    decoder_concat    = ks.layers.Concatenate(name="decoder_concat")
  decoder_dropout = ks.layers.Dropout(dropout, name="decoder_dropout")
  decoder_dense = ks.layers.Dense(len_target_vocab, activation="softmax", name="decoder_dense")

  # decoder feedforward path
  decoder_rnn_out = decoder_embedding(decoder_input)
  for i,decoder_rnn in enumerate(decoder_rnns):
    if enc_state_dep=='first':
      decoder_state_input = encoder_state if i==0 else None
    elif enc_state_dep=='all':
      decoder_state_input = encoder_state
    else:
      decoder_state_input = None
    decoder_ret = decoder_rnn(decoder_rnn_out, initial_state=decoder_state_input)
    decoder_rnn_out = decoder_ret[0]
  if dec_attention:
    # query = decoder outputs, value = encoder outputs; the attention scores are not used here
    context_vec, attn_weights = decoder_attention([decoder_rnn_out, encoder_output], return_attention_scores=True)
    decoder_dense_input = decoder_concat([decoder_rnn_out, context_vec])
  else:
    decoder_dense_input = decoder_rnn_out
  # No hard-coded training=True: Keras then enables dropout during fit() only and
  # disables it for validation, evaluate() and predict().
  decoder_dense_input = decoder_dropout(decoder_dense_input)
  decoder_output = decoder_dense(decoder_dense_input)

  model = ks.Model([encoder_input, decoder_input], decoder_output, name="training_model")
  model.netparams = netparams
  return model

def gen_enc_dec_models(model):
  """Derive step-wise inference models from a trained training model.

  Layers are looked up by the names assigned in fresh_training_model, so the
  returned models share their weights with `model`.

  Args:
    model: model built by fresh_training_model; must carry a `netparams` attribute.

  Returns:
    (encoder_model, decoder_model):
      encoder_model - encoder_input -> [output sequence of the last encoder layer,
                      final state(s) of that layer]
      decoder_model - inputs [decoder tokens, (encoder outputs, only with attention),
                      per-layer initial states]; outputs [token probabilities,
                      per-layer final states, (attention weights, only with attention)]
    Both carry the same `netparams` dict as `model`.
  """
  # unpack network parameters
  netparams  = model.netparams
  latent_dim = netparams["latent_dim"]
  enc_layers = netparams["enc_layers"]
  dec_layers = netparams["dec_layers"]
  dec_attention = netparams["dec_attention"]
  recurrent_cell_name = netparams["recurrent_cell"]

  # reconstructing encoder model
    # inputs
  encoder_model_input = model.get_layer(name="encoder_input").input
    # outputs
  encoder_model_ret = model.get_layer(name="encoder_rnn_"+str(enc_layers)).output
  encoder_model_output, encoder_model_state_output = encoder_model_ret[0], list(encoder_model_ret[1:])
    # model reconstruction
  encoder_model = ks.Model(
    encoder_model_input, 
    [encoder_model_output, encoder_model_state_output], 
    name="encoder_model"
  )

  # reconstructing decoder model
    # inputs
  decoder_model_input  = model.get_layer(name="decoder_input").input
  # NOTE(review): this is a plain Input, so no padding mask from the encoder embedding
  # reaches the attention layer at inference time - confirm this matches training.
  decoder_model_encoder_output = ks.Input(shape=(max_len_input,latent_dim,), name="decoder_encoder_output")
  decoder_model_state_input  = []
    # outputs
  decoder_model_output = model.get_layer(name="decoder_embedding")(decoder_model_input)
  decoder_model_state_output = []
    # model reconstruction
  for dec_layer in range(1,dec_layers+1):
    # LSTM layers carry two state tensors (h and c); the other cells carry one
    decoder_model_state_input.append(
        [ks.Input(shape=(latent_dim,), name="decoder_state_h_"+str(dec_layer)), ks.Input(shape=(latent_dim,), name="decoder_state_c_"+str(dec_layer))] 
        if recurrent_cell_name=="LSTM" else [ks.Input(shape=(latent_dim,), name="decoder_state_"+str(dec_layer))]
    )
    decoder_ret = model.get_layer(name="decoder_rnn_"+str(dec_layer))(decoder_model_output, initial_state = decoder_model_state_input[-1])
    decoder_model_output, decoder_model_state = decoder_ret[0], list(decoder_ret[1:])
    decoder_model_state_output.append(decoder_model_state)
  if dec_attention:
    decoder_model_context_vec, decoder_model_attention_weights = model.get_layer(name="decoder_attention")([decoder_model_output, decoder_model_encoder_output], return_attention_scores=True)
    decoder_model_output = model.get_layer(name="decoder_concat")([decoder_model_output, decoder_model_context_vec])
  # dropout is explicitly disabled in the inference graph
  decoder_model_output = model.get_layer(name="decoder_dropout")(decoder_model_output, training=False)
  decoder_model_output = model.get_layer(name="decoder_dense")(decoder_model_output)
  if dec_attention:
    decoder_model = ks.Model(
      [decoder_model_input, decoder_model_encoder_output, decoder_model_state_input], 
      [decoder_model_output, decoder_model_state_output, decoder_model_attention_weights], 
      name="decoder_model"
    )
  else:
    decoder_model = ks.Model(
      [decoder_model_input, decoder_model_state_input], 
      [decoder_model_output, decoder_model_state_output], 
      name="decoder_model"
    )

  encoder_model.netparams = netparams
  decoder_model.netparams = netparams

  return encoder_model, decoder_model

## Decoding Input String

In [41]:
def decode_sequence(input_seq, models):
  """Greedily decode the transliteration of a single input string.

  Args:
    input_seq: input word; it must fit in max_len_input characters and use only
      characters present in input_dict.
    models: pair (encoder_model, decoder_model) as returned by gen_enc_dec_models.

  Returns:
    The decoded string. It ends with ENDCHAR when the decoder emitted it;
    otherwise decoding stops after max_len_target+1 decoder steps.
  """
  # Convert string input to numerical array
  encoded_input_seq = str_to_numarray([input_seq], max_len_input, input_dict)

  # Unpack testing model and its network parameters
  encoder_model, decoder_model = models
  netparams = decoder_model.netparams
  latent_dim = netparams["latent_dim"]
  recurrent_cell_name = netparams["recurrent_cell"]
  dec_attention = netparams["dec_attention"]
  dec_layers = netparams["dec_layers"]
  enc_state_dep = netparams["enc_state_dep"]
  
  # Run encoder model and create initial state for decoder
  encoder_layer_out, encoder_states_out = encoder_model.predict(encoded_input_seq)
  # all-zero state for decoder layers that are not seeded by the encoder (two tensors for LSTM)
  default_initial_state = [np.zeros((1,latent_dim))]*(2 if recurrent_cell_name=="LSTM" else 1)
  if enc_state_dep=='first':
    decoder_state = [encoder_states_out] + [default_initial_state]*(dec_layers-1)
  elif enc_state_dep=='all':
    decoder_state = [encoder_states_out] * dec_layers
  else:
    decoder_state = [default_initial_state] * dec_layers

  # Target sequence of length 1 holding the start character.
  target_seq = np.array(target_dict[STARTCHAR],ndmin=2)

  # Sampling loop for a batch of sequences
  # (to simplify, here we assume a batch of size 1).
  decoded_sentence = ""
  steps_taken = 0
  while True:
    if dec_attention:
      # the attention weights are returned by the model but not used yet
      output_tokens, decoder_state, attn_weights = decoder_model.predict([target_seq, encoder_layer_out, decoder_state])
    else:
      output_tokens, decoder_state = decoder_model.predict([target_seq, decoder_state])
    steps_taken += 1

    # Pick the most probable token (greedy decoding)
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = target_vocab[sampled_token_index]
    decoded_sentence += sampled_char

    # Exit condition: stop character found or step budget exhausted.
    # The budget counts decoder steps, not emitted characters: vocabulary index 0
    # is the empty string, so a decoder that keeps predicting index 0 would never
    # lengthen the output and a length-based guard would loop forever.
    if sampled_char == ENDCHAR or steps_taken > max_len_target:
      break

    # Update the target sequence (of length 1).
    target_seq[0,0] = sampled_token_index
  return decoded_sentence

# Sample Runs

## Training a model for a particular network configuration

In [None]:
# initialize training model: start from the defaults and override per-run settings
netparams = dict(DEFAULT_NETPARAMS)
netparams.update(
  embed_size=16,
  latent_dim=256,
  enc_layers=3,
  dec_layers=2,
  recurrent_cell="LSTM",
  dec_attention=True,
  dropout=0.05,
  beam_size=1,
  enc_state_dep="first",
)
training_model = fresh_training_model(netparams) # ks.models.load_model(savedModelPath)
training_model.summary()

In [None]:
# training parameters
epochs = 30
batch_size = 32

# compile training model
training_model.compile(
    loss="categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

#wandb.init(project="Dakshina HI Test 1")
# train seq2seq model with teacher forcing; the dev split is used for validation
training_model.fit(
    x=[enc_inp_train, dec_inp_train],
    y=dec_tgt_train_onehot,
    validation_data=( [enc_inp_val, dec_inp_val], dec_tgt_val_onehot ),
    epochs=epochs,
    batch_size=batch_size
    #,callbacks=[WandbCallback(monitor="val_accuracy")]
)

In [None]:
# Loss and accuracy on the test split; the true decoder inputs are fed in (teacher forcing).
training_model.evaluate(x=[enc_inp_test, dec_inp_test], y=dec_tgt_test_onehot)

In [None]:
# Persist the trained model in HDF5 format on Drive.
# NOTE(review): this writes to ".../Sem_8/trained_models/" (underscore) while the sweep cell
# uses ".../Sem_8/trained-models" (hyphen) - confirm which directory is intended and exists.
training_model.save("/content/drive/MyDrive/Sem_8/trained_models/s2s_samplerun.h5")

## Testing trained model

In [None]:
# Derive the encoder / decoder inference models from the in-memory training model.
# NOTE(review): gen_enc_dec_models reads `model.netparams`; a model restored with
# load_model would presumably need that attribute set again - confirm before swapping in the load.
test_model = training_model # ks.models.load_model("/content/drive/MyDrive/Sem_8/s2s_samplerun.h5")
encoder_model, decoder_model = gen_enc_dec_models(test_model)
encoder_model.summary()
decoder_model.summary()

In [None]:
# Smoke test: greedily transliterate one romanized word and print the result.
print(decode_sequence("intel", [encoder_model, decoder_model]))

# Training Sweep

In [None]:
# Plain-text log that every sweep run appends to.
sweeplog_path = "/content/drive/MyDrive/Sem_8/sweeplog.txt"
# Directory that receives one saved model per sweep run.
# NOTE(review): the sample-run cell saves under "trained_models" (underscore) - confirm the intended name.
savemodel_path = "/content/drive/MyDrive/Sem_8/trained-models"
# Visual separator written between log entries: "-=" repeated 30 times plus a closing "-".
run_sep = '-='*30+'-'

# Make sure the log file exists before anything is appended to it.
!touch $sweeplog_path

In [None]:
def runWandbSweep():
  """Execute one W&B sweep run: build, train, save and log a model.

  Hyperparameters are read from wandb.config (set by the sweep agent) and mapped
  onto the netparams keys; "beam_size" keeps its default. The run appends a
  header with its parameters and a completion line to the file at sweeplog_path,
  and saves the trained model under savemodel_path with a timestamped name.
  """
  # NOTE(review): the project name says "no_attention", yet cfg.decoder_attention
  # is read below - confirm the intended W&B project.
  wandb.init(project="dakshina_hi_no_attention_1")
  
  # same creation time in two formats: human-readable for the log, compact for the file name
  tcr_wandb_format     = timestr("%b %d' %H:%M:%S")
  tcr_savemodel_format = timestr("%Y%m%d_%H%M%S")

  cfg = wandb.config
  netparams = DEFAULT_NETPARAMS.copy()
  netparams.update({
    "embed_size": cfg.embedding_size, 
    "latent_dim": cfg.hidden_layer_size,
    "enc_layers": cfg.num_encoder_layers,
    "dec_layers": cfg.num_decoder_layers,
    "recurrent_cell": cfg.recurrent_cell,
    "dec_attention": cfg.decoder_attention,
    "dropout": cfg.dropout,
    "enc_state_dep": cfg.encoder_state_dependencies
  })
  
  log_output = "Sweep run created on "+tcr_wandb_format+" ("+tcr_savemodel_format+") with following sweep parameters:\n"
  for position, (key, value) in enumerate(netparams.items(), start=1):
    log_output += str(position)+". "+key+" = "+str(value)+'\n'
  log_output += "\n\n"
  # context manager: the handle is flushed and closed before the long training run starts
  with open(sweeplog_path,'a') as sweeplog:
    sweeplog.write(log_output)
  
  model = fresh_training_model(netparams)
  model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
  
  model.fit(
      [enc_inp_train, dec_inp_train],
      dec_tgt_train_onehot,
      batch_size=cfg.batch_size,
      epochs=cfg.epochs,
      validation_data=( [enc_inp_val, dec_inp_val], dec_tgt_val_onehot ),
      callbacks=[WandbCallback(monitor="val_accuracy")]
  )
  model.save(savemodel_path+"/s2s_sweep_"+tcr_savemodel_format+".h5")
  with open(sweeplog_path,'a') as sweeplog:
    sweeplog.write("Sweep run completed, model saved with timestamp: "+tcr_savemodel_format+"\n\n"+run_sep+"\n\n")

In [None]:
# W&B sweep definition: Bayesian search that maximizes val_accuracy over the
# value grids below. Each parameter name is read from wandb.config in runWandbSweep.
wandbSweepCfg = {
  "name":"Dakshina HI Parameter Sweep", 
  "metric":{
    "name":"val_accuracy",
    "goal":"maximize"
  }, 
  "method": "bayes", 
  "parameters":{
    # network parameters
    "embedding_size": {"values":[16, 32, 64, 256]},
    "hidden_layer_size": {"values":[16, 32, 64, 256]},
    "num_encoder_layers": {"values":[1, 2, 3]},
    "num_decoder_layers": {"values":[1, 2, 3]},
    "recurrent_cell": {"values":["SimpleRNN", "GRU", "LSTM"]},
    "decoder_attention": {"values":[False, True]},
    "dropout": {"values":[0,0.1,0.2]},
    "encoder_state_dependencies":{"values":["first"]},
    
    # training parameters (single-valued, i.e. fixed for the whole sweep)
    "epochs": { "values":[30] },
    "batch_size": { "values":[32] }
  }
}

# Record the start of the sweep in the log; the context manager closes the handle
# instead of leaving it to the garbage collector.
with open(sweeplog_path,'a') as sweeplog:
  sweeplog.write(run_sep+"\n\nStarted sweep at "+timestr("%H:%M:%S on %b %d, %Y")+'\n\n')

# Register the sweep, then let this kernel act as the agent that executes its runs.
sweepId = wandb.sweep(wandbSweepCfg)#"vasid99/uncategorized/uklnmska"
wandb.agent(sweepId, function = runWandbSweep)