**COGNATE REFLEXES PREDICTION**

In [None]:
#install TensorFlow version 2.5
#(we ran into issues when using other TensorFlow versions)
!pip install tensorflow=='2.5'
#similarly, pandas and keras had to be installed for this .ipynb notebook (Google Colab)
!pip install pandas
!pip install keras

In [261]:
#importing appropriate modules
import json
import logging
import os
import random
import time
from typing import Sequence

from absl import app
from absl import flags
import numpy as np
import tensorflow as tf

In [262]:
#building the cognate sets and the vocab from the training file.
def buildDataset(data_dir,train_path):
  """Read the training TSV and build the cognate sets and the symbol vocabulary.

  The first line of the file is treated as a header and skipped. For every
  other non-blank line the first tab-separated column is dropped and the
  remaining columns form one cognate set. Each column is split on whitespace
  and every resulting token is added to the vocabulary.

  Args:
    data_dir: Directory that contains the training file.
    train_path: Name of the training file, joined onto `data_dir`.

  Returns:
    A `(cogsets, vocabulary)` tuple. `cogsets` holds one list of stripped
    column strings per data line; `vocabulary` is a list with the six special
    symbols first, followed by the sorted tokens found in the file.
  """
  symbols = set()
  cogsets = []
  filepath = os.path.join(data_dir, train_path)
  print('Get Training data :', filepath)
  with open(filepath, 'r', encoding='utf-8') as fp:
    # Skip the header. The default keeps an empty file from raising
    # StopIteration.
    next(fp, None)
    for line in fp:
      # A whitespace-only line would otherwise turn into an empty (or
      # all-blank) cognate set, which breaks code that indexes every row by
      # language position.
      if not line.strip():
        continue
      parts = line.strip('\n').split('\t')[1:]
      for part in parts:
        symbols.update(part.split())
      cogsets.append([part.strip() for part in parts])
  # Special symbols come first so that their indices do not depend on the data.
  vocabulary = ['<PAD>', '<EOS>', '<BOS>', '<UNK>', '<TARGET>', '<BLANK>'] + sorted(
      symbols)
  print("COGSETS:",cogsets)
  print("VOCAB:",vocabulary)
  return cogsets, vocabulary

In [263]:
#function to read from hyper parameters file
def hyperparameters(checkpoint_dir):
  """Load `hparams.json` from `checkpoint_dir`, print it, and return it.

  Args:
    checkpoint_dir: Directory that contains `hparams.json`.

  Returns:
    The object decoded from the JSON file.
  """
  with open(os.path.join(checkpoint_dir, 'hparams.json'), 'r') as handle:
    loaded = json.load(handle)
  print("HPARAMS:",loaded,"\n")
  return loaded

In [264]:
def extend_dataset(cognatesets):
  """Replace empty entries with '<BLANK>' and return the rows in random order.

  The number of languages is taken from the first cognate set; every row is
  read at exactly that many positions.

  Args:
    cognatesets: List of cognate sets, each a list with one string per
      language (an empty string marks a missing form).

  Returns:
    A new, shuffled list of rows in which every falsy entry has been replaced
    by the literal string '<BLANK>'.
  """
  width = len(cognatesets[0])
  filled_rows = [
      [row[column] if row[column] else '<BLANK>' for column in range(width)]
      for row in cognatesets
  ]
  random.shuffle(filled_rows)
  return filled_rows

In [265]:
#this function takes the training data, masks some of the words, converts everything into tensors, and returns it as a dataset
def create_dataset_forTrain(samples, blen, number_languages, maximumlen, charecterToIndex):
  """Wrap `samples` in an endless `tf.data.Dataset` of masked training examples.

  For each cognate set the generator picks a random, non-empty subset of the
  non-blank languages to stay visible, and yields four tensors:
  `(input ids, target ids, input mask, target mask)`. The input and target ids
  are the same encoding; the input mask is 1.0 on the visible rows and the
  target mask is 1.0 on every non-blank row.

  Args:
    samples: List of cognate sets whose missing entries are '<BLANK>'.
    blen: Batch dimension declared in the dataset signature.
    number_languages: Language dimension declared in the dataset signature.
    maximumlen: Number of positions every encoded row is padded/truncated to.
    charecterToIndex: Mapping from symbol to integer id.

  Returns:
    A `tf.data.Dataset` built from the generator.
  """
  # NOTE(review): the generator always emits one cognate set per element
  # (leading axis of size 1), so the declared shape presumably only matches
  # when `blen` is 1 — confirm before using another batch size.
  declared_shape = (blen, number_languages, maximumlen)

  def encode(entry):
    """Encode one entry as <BOS> ids <EOS>, padded with <BLANK> to maximumlen."""
    padded = [charecterToIndex['<BLANK>']] * maximumlen
    ids = [charecterToIndex['<BOS>']]
    for symbol in entry.split():
      ids.append(charecterToIndex[symbol] if symbol in charecterToIndex
                 else charecterToIndex['<UNK>'])
    ids.append(charecterToIndex['<EOS>'])
    # zip stops at whichever is shorter, i.e. the sequence is truncated to
    # maximumlen positions.
    for slot, value in zip(range(maximumlen), ids):
      padded[slot] = value
    return padded

  def language_generator():
    while True:
      for cogset in samples:
        positions = range(len(cogset))
        present = [pos for pos in positions if cogset[pos] != '<BLANK>']
        # randint is evaluated before sample, matching the draw order of the
        # random stream.
        visible = random.sample(present, random.randint(1, len(present)))
        encoded = [encode(cogset[pos]) for pos in positions]
        target_rows = [
            [1.0 if pos in present else 0.0] * maximumlen for pos in positions
        ]
        visible_rows = [
            [1.0 if pos in visible else 0.0] * maximumlen for pos in positions
        ]
        yield (tf.constant([encoded], dtype='int32'),
               tf.constant([encoded], dtype='int32'),
               tf.constant([visible_rows], dtype='float32'),
               tf.constant([target_rows], dtype='float32'))

  return tf.data.Dataset.from_generator(
      language_generator,
      output_signature=(
          tf.TensorSpec(shape=declared_shape, dtype='int32'),
          tf.TensorSpec(shape=declared_shape, dtype='int32'),
          tf.TensorSpec(shape=declared_shape, dtype='float32'),
          tf.TensorSpec(shape=declared_shape, dtype='float32'),
      ))

In [266]:
#creating infiller model
class Model_Cognet(tf.keras.Model):
  """Convolutional in-filling model over a languages x positions symbol grid.

  The input ids are embedded, masked, passed through one `Conv2D` whose kernel
  spans all languages, then dropout and a non-linearity, and finally through a
  `Conv2DTranspose` with one filter per vocabulary symbol.

  Hyper-parameters read from `hyperparams`: 'filters', 'kernel_width',
  'embedding_dim', 'sfactor', 'nonlinearity' and 'dropout'.
  """

  def __init__(self, vocabularySize, hyperparams, bsize, number_languages, mLen):
    """Create the layers.

    Args:
      vocabularySize: Number of symbols in the vocabulary.
      hyperparams: Mapping with the hyper-parameters listed on the class.
      bsize: Batch size used when reshaping the input mask.
      number_languages: Number of languages (rows) per cognate set.
      mLen: Number of positions per row.
    """
    super().__init__()
    self.batch = bsize
    self.units = hyperparams['filters']
    self.kenelSize = hyperparams['kernel_width']
    self.vocab_size = vocabularySize
    self.embedDim = hyperparams['embedding_dim']
    self.nlangs = number_languages
    self.maximumSize = mLen
    # Selects where the scaling factor is applied: 'inputs', 'conv', or
    # (any other value) nowhere.
    self.posVal = hyperparams['sfactor']

    kernel = (number_languages, self.kenelSize)
    self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedDim)
    self.convolution = tf.keras.layers.Conv2D(
        filters=self.units, kernel_size=kernel)

    # Any value other than 'leaky_relu' / 'relu' falls back to tanh.
    activation_factories = {
        'leaky_relu': tf.keras.layers.LeakyReLU,
        'relu': tf.keras.layers.ReLU,
    }
    make_activation = activation_factories.get(
        hyperparams['nonlinearity'],
        lambda: tf.keras.layers.Activation('tanh'))
    self.activation = make_activation()

    self.dropout = tf.keras.layers.Dropout(hyperparams['dropout'])

    self.deconvolution = tf.keras.layers.Conv2DTranspose(
        filters=self.vocab_size, kernel_size=kernel)

  def call(self, inp, inpMask, training):
    """Compute the output of the deconvolution layer for a masked input grid.

    Args:
      inp: Integer symbol ids.
      inpMask: Float mask with the same layout as `inp`; rows that must be
        hidden from the model are 0.0.
      training: Whether dropout is active.

    Returns:
      The output of the transposed convolution (one channel per vocabulary
      symbol).
    """
    # Repeat every mask value once per embedding dimension so that the mask
    # can be multiplied element-wise with the embedded input.
    embedded_shape = (self.batch, self.nlangs, self.maximumSize, self.embedDim)
    mask_per_dim = tf.reshape(
        tf.repeat(inpMask, self.embedDim, axis=-1), shape=embedded_shape)

    hidden = self.embedding(inp) * mask_per_dim

    # Ratio of the total number of grid cells to the sum of the mask.
    sfactor = (self.nlangs * self.maximumSize) / tf.math.reduce_sum(inpMask)
    if self.posVal == 'inputs':
      hidden = hidden * sfactor

    # convolution layer
    hidden = self.convolution(hidden)
    if self.posVal == 'conv':
      hidden = hidden * sfactor
    hidden = self.activation(self.dropout(hidden, training=training))

    # deconvolution layer
    return self.deconvolution(hidden)

In [267]:
@tf.function
def evaluationLoss(input, prediction, mask):
  """Return the masked, summed sparse categorical cross-entropy.

  Args:
    input: Integer target ids (passed as `y_true`).
    prediction: Logits (passed as `y_pred`; `from_logits=True`).
    mask: Float weights multiplied onto the per-position losses.

  Returns:
    A scalar tensor: the sum of `mask * loss` over all positions.
  """
  per_position = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(y_true=input, y_pred=prediction)
  return tf.reduce_sum(mask * per_position)

In [268]:
@tf.function
def trainperstep(Model_Cognet, optimizer, input, input_mask, target, target_mask):
  """Run one optimisation step and return its loss.

  Args:
    Model_Cognet: The model instance to train (called with `training=True`).
    optimizer: Optimizer whose `apply_gradients` is used for the update.
    input: Input symbol ids.
    input_mask: Mask passed to the model together with `input`.
    target: Target symbol ids for the loss.
    target_mask: Mask applied to the per-position losses.

  Returns:
    The loss computed by `evaluationLoss` for this step.
  """
  with tf.GradientTape() as tape:
    step_loss = evaluationLoss(
        target, Model_Cognet(input, input_mask, training=True), target_mask)
  weights = Model_Cognet.trainable_variables
  optimizer.apply_gradients(zip(tape.gradient(step_loss, weights), weights))
  return step_loss

In [269]:
#this function runs the model on one cognate set and returns the predicted symbol indices for the target row
def evaluate_cognate_set(Model_Cognet, cognateset, chr2idx, max_length):
  """Predict the symbol ids of the '<TARGET>' row of one cognate set.

  Args:
    Model_Cognet: The model instance (called with `training=False`).
    cognateset: One string per language; '<TARGET>' marks the row to predict
      and a whitespace-only string marks a missing form.
    chr2idx: Mapping from symbol to integer id.
    max_length: Number of positions every row is padded/truncated to.

  Returns:
    A NumPy array with the arg-max symbol id for every position of the target
    row. If no entry equals '<TARGET>', row 0 is used.
  """
  target_index = 0
  rows = []
  row_masks = []
  for position, entry in enumerate(cognateset):
    if not entry.strip():
      # Missing form: all <BLANK>, hidden from the model.
      row = [chr2idx['<BLANK>']] * max_length
      visible = 0.0
    elif entry == '<TARGET>':
      # The row to predict: all <TARGET>, hidden from the model.
      target_index = position
      row = [chr2idx['<TARGET>']] * max_length
      visible = 0.0
    else:
      symbols = [chr2idx['<BOS>']]
      symbols.extend(
          chr2idx[symbol] if symbol in chr2idx else chr2idx['<UNK>']
          for symbol in entry.split())
      symbols.append(chr2idx['<EOS>'])
      row = [chr2idx['<BLANK>']] * max_length
      # zip stops at the shorter input, truncating to max_length positions.
      for slot, value in zip(range(max_length), symbols):
        row[slot] = value
      visible = 1.0
    rows.append(row)
    row_masks.append([visible] * max_length)

  # The extra list level adds the leading batch axis of size 1.
  logits = Model_Cognet(
      tf.constant([rows], dtype='int32'),
      tf.constant([row_masks], dtype='float32'),
      training=False)
  return tf.math.argmax(logits[0, target_index, :, :], axis=-1).numpy()

In [270]:
def translate(modelCognate, cognateset, charToIndex, maxLen, indexToChar):
  """Predict the target word of a cognate set as a space-separated string.

  Args:
    modelCognate: The model instance used for prediction.
    cognateset: One string per language, with '<TARGET>' on the row to predict.
    charToIndex: Mapping from symbol to integer id.
    maxLen: Number of positions per row.
    indexToChar: Mapping from integer id back to symbol.

  Returns:
    The predicted symbols joined by single spaces, with every special symbol
    removed.
  """
  specials = ['<PAD>', '<EOS>', '<BOS>', '<UNK>', '<TARGET>', '<BLANK>']
  predicted_ids = evaluate_cognate_set(modelCognate, cognateset, charToIndex, maxLen)
  kept = []
  for index in list(predicted_ids):
    symbol = indexToChar[index]
    if symbol not in specials:
      kept.append(symbol)
  return ' '.join(kept)

In [271]:
#trains the model
def train_model(data_dir,checkpoint_dir,train_path,dev_path,dev_sol_path,
                epochs=500,steps_per_epoch=500):
  """Train the in-filling model and checkpoint it whenever the dev error improves.

  After every epoch the model predicts the '<TARGET>' entry of each dev row.
  The per-language error rate is the fraction of predictions that differ from
  the solution string. When the mean of those rates is at least as good as the
  best seen so far, a checkpoint is saved to `checkpoint_dir`, and `vocab.txt`
  is written there the first time that happens.

  Args:
    data_dir: Directory that contains the training, dev and dev-solution files.
    checkpoint_dir: Directory that contains `hparams.json` and receives the
      checkpoints and `vocab.txt`.
    train_path: Name of the training TSV inside `data_dir`.
    dev_path: Name of the dev TSV inside `data_dir` ('?' marks the target).
    dev_sol_path: Name of the dev-solution TSV inside `data_dir`.
    epochs: Number of epochs to run.
    steps_per_epoch: Number of training steps per epoch.
  """
  cognatesets, vocab = buildDataset(data_dir,train_path)
  chr2idx = {symbol: index for index, symbol in enumerate(vocab)}
  idx2chr = dict(enumerate(vocab))
  languages = len(cognatesets[0])
  hparams = hyperparameters(checkpoint_dir)
  vocab_size = len(vocab)
  all_samples = extend_dataset(cognatesets)

  def read_columns(filename):
    """Return the non-ID columns of every data line of a TSV in `data_dir`."""
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as tsv:
      # The header holds the language names, so skip it.
      next(tsv)
      return [line.strip('\n').split('\t')[1:] for line in tsv]

  # Dev rows: '?' marks the entry that has to be predicted.
  dev = [['<TARGET>' if p == '?' else p for p in parts]
         for parts in read_columns(dev_path)]
  # Dev solutions: the columns of a row are concatenated into one string.
  dev_sol = [''.join(parts).strip() for parts in read_columns(dev_sol_path)]

  batch_size = 1
  max_length = 20
  # The vocabulary file only needs to be written once.
  vocab_written = False

  infiller = Model_Cognet(vocab_size, hparams, batch_size, languages, max_length)
  optimizer = tf.keras.optimizers.Adam()

  if checkpoint_dir:
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, infiller=infiller)

  logging.info('Training the model ...')
  train_dataset = create_dataset_forTrain(all_samples, batch_size, languages,
                                          max_length, chr2idx)
  # Lowest mean dev error seen so far; None until the first evaluation.
  best_error = None

  for epoch in range(epochs):
    start = time.time()

    tloss = 0
    for inputs, target, input_mask, targ_mask in train_dataset.take(steps_per_epoch):
      tloss += trainperstep(infiller, optimizer, inputs, input_mask, target,
                            targ_mask)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        tloss / steps_per_epoch))

    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

    # Evaluate on the dev set, counting errors per target language.
    dev_errors = [0] * languages
    dev_totals = [0] * languages
    for dset, dsol in zip(dev, dev_sol):
      trgt_idx = dset.index('<TARGET>')
      dev_totals[trgt_idx] += 1
      if translate(infiller, dset, chr2idx, max_length, idx2chr) != dsol:
        dev_errors[trgt_idx] += 1
    # Languages that never occur as a target are left out of the mean.
    error_rates = [e / t for e, t in zip(dev_errors, dev_totals) if t != 0]
    mean_error = np.mean(error_rates)

    # Compare against the tracked best so that only improvements (or ties)
    # trigger a checkpoint.
    if best_error is None or mean_error <= best_error:
      print('ERROR_UPDATE:', error_rates)
      if checkpoint_dir:
        checkpoint.save(os.path.join(checkpoint_dir, 'best_model.ckpt'))
        if not vocab_written:
          # hparams.json was loaded from checkpoint_dir, so it is already in
          # place; only the vocabulary has to be written.
          with open(os.path.join(checkpoint_dir, 'vocab.txt'), 'w',
                    encoding='utf-8') as vfile:
            for v in vocab:
              vfile.write(v + '\n')
          vocab_written = True
      best_error = mean_error
    print(best_error, mean_error, '\n')

  logging.info('Done. Shutting down ...')

In [281]:
# Train on the 0.10 split: data files are read from /content/data_dir, and
# hparams.json is read from /content/checkpoint_dir.
train_model('/content/data_dir','/content/checkpoint_dir','training-mod-0.10_01.tsv','dev-0.10_01.tsv','dev_solutions-0.10_01.tsv')

In [273]:
#reading data from the already created vocab file
def vocablist(checkpoint_dir):
  """Load the vocabulary stored in `vocab.txt` inside `checkpoint_dir`.

  Args:
    checkpoint_dir: Directory that contains `vocab.txt` (one symbol per line).

  Returns:
    The list of symbols in file order, with surrounding whitespace removed and
    blank lines skipped.

  Raises:
    FileNotFoundError: If `vocab.txt` does not exist in `checkpoint_dir`.
  """
  file_path = os.path.join(checkpoint_dir, 'vocab.txt')
  # Check the file system: the joined path string itself is never empty.
  if not os.path.isfile(file_path):
    raise FileNotFoundError(f'File {file_path} does not exist')
  logging.info('Loading vocab from %s ...', file_path)
  with open(file_path, 'r', encoding='utf8') as f:
    # Every line yielded by the file is a non-empty string (it still holds
    # its newline), so the emptiness test has to run on the stripped text.
    vocab = [symbol.strip() for symbol in f if symbol.strip()]
  logging.info('%d symbols loaded.', len(vocab))
  return vocab

In [274]:
#testing data
def test(data_dir,checkpoint_dir):
  """Restore the latest checkpoint and write predictions for the test file.

  Reads `test-0.10.tsv` from `data_dir` and writes `pred-0.10.tsv` next to it.
  The header is copied unchanged; every other output row holds the row ID in
  the first column and the prediction in the column that contained '?', with
  all remaining columns left empty.

  Args:
    data_dir: Directory with the test file; also receives the predictions.
    checkpoint_dir: Directory with `hparams.json`, `vocab.txt` and the
      checkpoints.

  Raises:
    ValueError: If no checkpoint is found in `checkpoint_dir`.
  """
  hparams = hyperparameters(checkpoint_dir)
  vocab = vocablist(checkpoint_dir)
  chr2idx = {symbol: index for index, symbol in enumerate(vocab)}
  idx2chr = dict(enumerate(vocab))
  vocab_size = len(vocab)
  batch_size = 1
  max_length = 20

  test_filepath = os.path.join(data_dir, 'test-0.10.tsv')
  preds_filepath = os.path.join(data_dir, 'pred-0.10.tsv')

  # The header has one ID column followed by one column per language.
  with open(test_filepath, 'r', encoding='utf-8') as tfile:
    header_cells = next(tfile).strip('\n').split('\t')
  languages = len(header_cells) - 1

  best_ckpt_path = tf.train.latest_checkpoint(checkpoint_dir)
  if not best_ckpt_path:
    raise ValueError('No checkpoint available')
  logging.info('Restoring from checkpoint %s ...', best_ckpt_path)
  infiller = Model_Cognet(vocab_size, hparams, batch_size, languages, max_length)
  checkpoint = tf.train.Checkpoint(infiller=infiller)
  checkpoint.restore(best_ckpt_path).expect_partial()

  logging.info('Generating predictions and saving results...')
  with open(preds_filepath, 'w', encoding='utf-8') as predfile, \
       open(test_filepath, 'r', encoding='utf-8') as testfile:
    # Copy the header.
    predfile.write(next(testfile))
    for line in testfile:
      cells = line.strip('\n').split('\t')
      testset = ['<TARGET>' if cell == '?' else cell for cell in cells[1:]]
      target_index = testset.index('<TARGET>')
      out_row = [''] * len(cells)
      out_row[0] = cells[0]
      # +1 because the first column of the file is the row ID.
      out_row[target_index + 1] = translate(
          infiller, testset, chr2idx, max_length, idx2chr)
      predfile.write('\t'.join(out_row) + '\n')

In [None]:
# Write predictions for test-0.10.tsv to pred-0.10.tsv inside /content/data_dir,
# using the latest checkpoint found in /content/checkpoint_dir.
test('/content/data_dir','/content/checkpoint_dir')

In [None]:
#these packages are required for the evaluation below
!pip install lingrex
!pip install lingpy

In [277]:
#lingrex imports
from lingrex.util import bleu_score
from lingpy import *
from lingpy.evaluate.acd import _get_bcubed_score as bcubed_score
from tabulate import tabulate
from collections import defaultdict
from lingpy.sequence.ngrams import get_n_ngrams
import math

In [278]:
#loads each column of solution and pred file separately
def load_files(path):
    """
    Helper function for simplified cognate formats.

    The first row of the file is the header (an ID column followed by the
    language names). Returns a tuple `(languages, sounds, out)` where
    `out[row_id][language]` is the whitespace-split entry and
    `sounds[sound][language]` lists every `[row_id, position]` at which the
    sound occurs.
    """
    table = csv2list(path, strip_lines=False)
    languages = table[0][1:]
    out = {}
    sounds = defaultdict(lambda : defaultdict(list))
    for row in table[1:]:
        row_id, cells = row[0], row[1:]
        per_language = {}
        # A repeated row ID replaces the earlier entry for that ID.
        out[row_id] = per_language
        for language, entry in zip(languages, cells):
            tokens = entry.split()
            per_language[language] = tokens
            for position, sound in enumerate(tokens):
                sounds[sound][language].append([row_id, position])
    return languages,sounds, out

In [279]:
#evaluating various metrics
def compare_words(firstfile, secondfile, report=True):
    """
    Evaluate the predicted and attested words in two datasets.

    Every non-empty entry of `firstfile` is aligned with the entry of
    `secondfile` that has the same row ID and language. An entry that is
    missing or empty in `secondfile` is replaced by twice as many "Ø" symbols
    as the first entry has.

    Returns a list of rows `[language, ED, ED (Normalized), B-Cubed FS, BLEU]`,
    one per language that has at least one scored entry, followed by a "TOTAL"
    row whose values are the column sums divided by the number of languages in
    the header of `firstfile`. The table is printed when `report` is true.
    """
    # Both files use the same simplified format, so the same loader reads them.
    languages, _, first = load_files(firstfile)
    _, _, last = load_files(secondfile)
    all_scores = []
    for language in languages:
        scores = []
        almsA, almsB = [], []
        for key in first:
            if language not in first[key]:
                continue
            entryA = first[key][language]
            if not " ".join(entryA):
                continue
            try:
                entryB = last[key][language]
            except KeyError:
                print("Missing entry {0} / {1} / {2}".format(
                    key, language, secondfile))
                entryB = ""
            if not entryB:
                entryB = (2 * len(entryA)) * ["Ø"]
            almA, almB, _ = nw_align(entryA, entryB)
            almsA += almA
            almsB += almB
            # Number of alignment columns in which the two words differ.
            score = sum(1 for a, b in zip(almA, almB) if a != b)
            scoreD = score / len(almA)
            bleu = bleu_score(entryA, entryB, n=4, trim=False)
            scores += [[key, entryA, entryB, score, scoreD, bleu]]
        if scores:
            p, r = bcubed_score(almsA, almsB), bcubed_score(almsB, almsA)
            # Guard the harmonic mean against p == r == 0.
            fs = 2 * (p*r) / (p+r) if (p+r) else 0.0
            all_scores += [[
                language,
                sum([row[-3] for row in scores])/len(scores),
                sum([row[-2] for row in scores])/len(scores),
                fs,
                sum([row[-1] for row in scores])/len(scores)]]
    # With an empty header every sum is 0, so dividing by 1 keeps the TOTAL
    # row at zero instead of raising ZeroDivisionError.
    divisor = len(languages) or 1
    all_scores += [[
        "TOTAL",
        sum([row[-4] for row in all_scores])/divisor,
        sum([row[-3] for row in all_scores])/divisor,
        sum([row[-2] for row in all_scores])/divisor,
        sum([row[-1] for row in all_scores])/divisor,
        ]]
    if report:
        print(
                tabulate(
                    all_scores,
                    headers=[
                        "Language", "ED", "ED (Normalized)",
                        "B-Cubed FS", "BLEU"], floatfmt=".3f"))
    return all_scores

In [None]:
# Score the generated predictions (first file) against the solutions file
# (second file) and print the per-language table.
compare_words('/content/data_dir/pred-0.10.tsv','/content/data_dir/solutions-0.10.tsv')