In [None]:
from sklearn.model_selection import LeaveOneOut,KFold, StratifiedKFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import plot_precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve, auc
from matplotlib import pyplot as plt
from keras import backend as K
from random import shuffle
import itertools as tools
import tensorflow as tf
import pickle as pkl
from os import walk
import pandas as pd
import numpy as np
import ast
import gc
import io
import unicodedata
import re
import string
import pickle
import math
import random
import collections
import os,glob

%matplotlib inline

In [None]:
# Mount Google Drive into the Colab runtime; the embeddings and dataset paths
# configured further down point inside the mounted `drive/MyDrive` folder.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Indic NLP library, which provides the sentence/word tokenizers imported below.
# NOTE(review): the version is not pinned, so a fresh runtime may pull a different release.
!pip3 install indic-nlp-library

In [None]:
from collections import Counter
from collections import defaultdict

from indicnlp.tokenize import sentence_tokenize
from indicnlp.tokenize import indic_tokenize
from indicnlp import common
#common.INDIC_RESOURCES_PATH="./ExternalDependencies/indic_nlp_resources"

In [None]:
# Locations of the pre-trained word embeddings (text file, one word vector per line)
# and of the labelled CSV dataset.
# NOTE(review): both paths are relative, while Drive is mounted at '/content/drive' —
# this presumably relies on the working directory being '/content'; confirm.
embeddings_path = './drive/MyDrive/Embeddings/embeddings'
dataset_path = './drive/MyDrive/Embeddings/dataset_unbalanced.csv'

In [None]:
class Preprocessor:
    """Loads word embeddings and converts sentences into index sequences.

    After `make_embeddings` has run:
      * `embeddings_dict` maps each word to its row index (starting at 1),
      * `embeddings_matrix` holds one row per word; row 0 is all zeros and is
        shared by padding positions and out-of-vocabulary words,
      * `word_index_len` is the number of loaded words.
    """

    def __init__(self, embidding_dims= 100, max_sequence_length = 100, padding_type='post'):
        """
        -- inputs:
            embidding_dims: number of coefficients expected per word vector
            max_sequence_length: number of token slots in every vectorized sentence
            padding_type: stored on the instance; not read by any method of this class
        """
        self.word_index_len = None
        self.embeddings_matrix = None
        self.embedding_dim = embidding_dims
        self.max_sequence_length = max_sequence_length
        self.padding_type = padding_type
        self.embeddings_dict = None

    # function to generate embedding matrix
    def make_embeddings(self, path_to_embeddings=None, skip_lines=(1, 1265895)):
        """Read the embeddings text file and build the lookup dict and weight matrix.

        Each usable line is `<word> <coef_1> ... <coef_n>` separated by whitespace;
        lines whose number of coefficients differs from `self.embedding_dim` are ignored.

        -- inputs:
            path_to_embeddings (optional): path to the embeddings file. When None, the
                notebook-level `embeddings_path` is looked up at call time, so
                re-running the configuration cell takes effect.
            skip_lines (optional): 1-based line numbers that are skipped without being
                parsed. Line 1 is presumably a header row; 1265895 is a
                dataset-specific line number inherited from the original code.
                NOTE(review): confirm both against the actual file.
        """
        if path_to_embeddings is None:
            path_to_embeddings = embeddings_path
        skipped = frozenset(skip_lines)

        embeddings_index = {}
        with open(f'{path_to_embeddings}', encoding="utf8") as f:
            for line_number, line in enumerate(f, start=1):
                if line_number in skipped:
                    continue
                values = line.split()
                try:
                    word = values[0]
                    coefs = np.asarray(values[1:], dtype='float32')
                except (IndexError, ValueError):
                    # Empty line or non-numeric coefficients: report it and move on.
                    print(line_number)
                    print(values)
                    continue
                # Compare against the configured dimension, not a literal 100.
                if len(coefs) != self.embedding_dim:
                    continue
                embeddings_index[word] = coefs

        self.word_index_len = len(embeddings_index)
        self.embeddings_dict = {}
        # One extra row: index 0 stays all-zero for padding / unknown words.
        matrix = np.zeros((self.word_index_len + 1, self.embedding_dim))
        for row, (word, coefs) in enumerate(embeddings_index.items(), start=1):
            self.embeddings_dict[word] = row
            matrix[row] = coefs

        self.embeddings_matrix = matrix

    # function to convert sentences to vectors
    def vectorize_sentences(self,sentences):
        """Convert each sentence into a vector of embedding-row indices.

        Every sentence is tokenized with `indic_tokenize.trivial_tokenize`; the i-th
        token's index is written to slot i. Unknown words keep the value 0 but still
        occupy their slot, and tokens beyond `self.max_sequence_length` are dropped.

        -- inputs:
            sentences: iterable of sentence strings
        -- returns:
            list with one numpy array of length `self.max_sequence_length` per sentence
        -- raises:
            RuntimeError: if `make_embeddings` has not been called yet
        """
        if self.embeddings_dict is None:
            raise RuntimeError('make_embeddings() must be called before vectorize_sentences()')

        vectors = []
        for sentence in sentences:
            vector = np.zeros(shape=(self.max_sequence_length,))
            words = indic_tokenize.trivial_tokenize(sentence)
            # zip() with range() truncates the sentence to max_sequence_length tokens.
            for position, word in zip(range(self.max_sequence_length), words):
                index = self.embeddings_dict.get(word)
                if index is not None:
                    vector[position] = index
            vectors.append(vector)
        return vectors
        
          
        

### Model: all model-related functionality



**Similarity matrix:**
a trainable matrix $M$ that captures the similarity between two sentences according to the equation $\mathrm{sim}(x_f, x_s) = x_f^{\top} M \, x_s$:

 ![similarity function](https://drive.google.com/uc?id=1y_ojFiDHkrbOwi7LEXrGo2UTXo4Qbv5o)

where: 
*   Xf: first sentence
*   Xs: second sentence
*   M: similarity matrix (trainable weights)






In [None]:
# defining similarity layer by subclassing keras layer

class SimilarityMatrix(tf.keras.layers.Layer):
    """Bilinear similarity layer: sim(xf, xs) = xf · M · xsᵀ with a trainable matrix M.

    -- inputs:
        dims: (length, width) of M; length must equal the feature size of the first
              input and width the feature size of the second input.
    """

    def __init__(self,dims, **kwargs):
        # Initialise the Keras base class before setting attributes on the layer.
        super(SimilarityMatrix, self).__init__(**kwargs)
        self.dims_length, self.dims_width = dims

    def build(self, input_shape):

        # Create a trainable weight variable for this layer.
        self._m = self.add_weight(name='M',
                                    shape=(self.dims_length,self.dims_width),
                                    initializer='uniform',
                                    trainable=True)
        super(SimilarityMatrix, self).build(input_shape)  # Be sure to call this at the end

    def call(self, y):
        """Compute xf · M · xsᵀ independently for every sample in the batch.

        -- inputs:
            y: pair [xf, xs]; as used in this notebook both are
               (batch, 1, features) tensors produced by the pooling layers.
        -- returns:
            tensor of shape (batch, rows of xf, rows of xs)
        """
        xf, xs = y
        projected = tf.matmul(xf, self._m)
        # transpose_b swaps only the last two axes of xs. The previous
        # K.transpose + reshape reversed *all* axes (including the batch axis),
        # which mixed values from different samples whenever batch size > 1.
        return tf.matmul(projected, xs, transpose_b=True)

    def compute_output_shape(self, input_shape):
        first_shape, second_shape = input_shape
        return (first_shape[0], first_shape[1], second_shape[1])


    def get_config(self):
        """Return a config whose keys match `__init__`, so `from_config` can rebuild the layer."""
        config = super().get_config().copy()
        config.update({
            'dims': [self.dims_length, self.dims_width]
        })
        return config

Model Helper: includes all training-related functions

In [None]:
class ModelHelper:
    """Stateless helpers for compiling, training and evaluating the Keras model."""

    @staticmethod
    def negative_log_likelihood(y_true, y_pred):
        """Binary cross-entropy between labels and predictions, summed over the last axis."""
        return K.sum(K.binary_crossentropy(y_true, y_pred), axis=-1)

    @staticmethod
    def plot_ROC(y_true, y_predictions, title=''):
        """Plot the ROC curve of the predictions and show its AUC in the legend.

        -- inputs:
            y_true: ground-truth binary labels
            y_predictions: predicted scores for the positive class
            title (optional): legend prefix for the plotted curve
        """
        ## calculate the FPR, TPR, Thresholds and AUC value
        false_pos_rate, true_pos_rate, thresholds = roc_curve(y_true, y_predictions)
        auc_val = auc(false_pos_rate, true_pos_rate)

        ## plot ROC curve
        plt.figure(1)
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.plot(false_pos_rate, true_pos_rate, label=f'{title} (area = {auc_val:.3f})')
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve')
        plt.legend(loc='best')
        plt.show()

    @staticmethod
    def compile_model(model , loss_func, monitor_metrics=None, optimizer='adam'):
        """Compile `model` with the given loss, metrics and optimizer.

        -- inputs:
            model: object exposing a Keras-style `compile` method
            loss_func: loss passed to `compile`
            monitor_metrics (optional): metrics to track; defaults to a fresh ['acc']
                list on every call so no list object is shared between calls
            optimizer (optional): optimizer name or instance, default 'adam'
        """
        if monitor_metrics is None:
            monitor_metrics = ['acc']
        model.compile(optimizer=optimizer, loss=loss_func, metrics=monitor_metrics)

    @staticmethod
    def train_model_kfolds(data, model_class, loss_func, num_of_folds, verbose=2, batch_size=128, plot_roc = False, plot_prec_recall = False ):
        """Train and evaluate a freshly built model on every stratified fold.

        -- inputs:
            data: pair (X, y) of numpy arrays
            model_class: zero-argument callable returning an object that provides
                `make_model()` and, afterwards, a `.model` attribute
            loss_func: loss used to compile each fold's model
            num_of_folds: number of stratified folds
            verbose (optional): verbosity forwarded to `fit`
            batch_size (optional): batch size for `fit` and `evaluate`
            plot_roc (optional): plot a ROC curve for each fold's held-out data
            plot_prec_recall (optional): accepted but currently not used
        """

        model_callbacks = [
            tf.keras.callbacks.EarlyStopping(patience=8),
        ]

        X_data, y_data = data[0].astype(np.float32), data[1].astype(np.float32)

        splitter = StratifiedKFold(n_splits=num_of_folds, shuffle=True, random_state=42)

        # Stratification needs a flat label vector; y_data itself may be (n, 1).
        for count, (train_index, test_index) in enumerate(splitter.split(X_data, y_data.ravel())):

            X_train, X_test = X_data[train_index], X_data[test_index]
            y_train, y_test = y_data[train_index], y_data[test_index]

            # A new, untrained model per fold so that no weights leak between folds.
            fold_model = model_class()
            fold_model.make_model()
            model = fold_model.model

            # Honour the loss supplied by the caller (it used to be ignored).
            ModelHelper.compile_model(model, loss_func)

            # NOTE(review): the held-out fold is also used as validation data for
            # early stopping, so the reported test metrics are likely optimistic.
            model.fit(X_train,y_train,validation_data=(X_test,y_test),verbose=verbose,epochs=100, batch_size=batch_size, callbacks=model_callbacks)

            pred = model.predict(X_test).ravel()

            loss, acc = model.evaluate(X_test, y_test, batch_size=batch_size)

            print(f'fold #{count+1} test loss: {loss}, test acc: {acc}')

            if plot_roc:
                ModelHelper.plot_ROC(y_test, pred, 'test data')

            gc.collect()


Model: includes all the CNN-related functions.
The `make_model` function builds a convolutional neural network according to the architecture suggested by the paper, as shown in the figure below.

![cnn model architecture](https://drive.google.com/uc?id=118Olwuh9VL5_Rt_IBg6G5JfuT2KEl-Z5)

In [None]:
class Model:
    """Bundles the preprocessor, the loaded dataset and the Keras CNN model.

    NOTE: constructing an instance builds a `Preprocessor` and immediately calls its
    `make_embeddings()`, i.e. the embeddings file is read on every construction.
    """

    def __init__(self):
        self.num_of_folds = int(5)
        self.dataset = None
        self.data = None        # set by load_data_from_csv to a (features, labels) tuple
        self.model = None       # set by make_model
        self.test_data = None

        self.preprocessor = Preprocessor()
        self.preprocessor.make_embeddings()

    def make_model(self):
        '''
            build the CNN and store it in self.model

            Three sentences of word indices share one frozen embedding layer and one
            convolutional feature map; each is max-pooled over all positions, a shared
            similarity layer scores the (first, second) and (second, third) pairs, and
            the concatenated features go through three dense+dropout blocks to a
            sigmoid output.
        '''
        sequence_length = self.preprocessor.max_sequence_length
        num_filters = 100
        kernel_size = 3
        # A 'valid' convolution leaves sequence_length - kernel_size + 1 positions
        # (98 with the defaults); pooling over all of them yields one vector per sentence.
        pooled_positions = sequence_length - kernel_size + 1

        X_input =  tf.keras.Input(shape=(3, sequence_length), name="input-sentences")

        embedding_layer = tf.keras.layers.Embedding(input_dim= self.preprocessor.word_index_len+1,
                                                    output_dim=self.preprocessor.embedding_dim,
                                                    input_length=sequence_length,
                                                    trainable = False,
                                                    name='fasttext-embedding-layer')
        # Build first so the weight variable exists, then load the pre-trained matrix.
        embedding_layer.build((None,))
        embedding_layer.set_weights([self.preprocessor.embeddings_matrix])

        first_sentence =  embedding_layer(X_input[:,0,:])
        second_sentence =  embedding_layer(X_input[:,1,:])
        third_sentence =  embedding_layer(X_input[:,2,:])

        # The same convolution (shared weights) is applied to all three sentences.
        convolutional_filters_map = tf.keras.layers.Conv1D(num_filters,kernel_size=(kernel_size), activation='relu', use_bias=True, name='features-map')

        Xf = convolutional_filters_map(first_sentence)
        Xs = convolutional_filters_map(second_sentence)
        Xt = convolutional_filters_map(third_sentence)

        Xf = tf.keras.layers.MaxPool1D(pooled_positions, name='first-sentence-pool')(Xf)
        Xs = tf.keras.layers.MaxPool1D(pooled_positions, name='second-sentence-pool')(Xs)
        Xt = tf.keras.layers.MaxPool1D(pooled_positions, name='third-sentence-pool')(Xt)

        # One similarity layer instance, so both sentence pairs share the matrix M.
        similarity_fnc = SimilarityMatrix((num_filters, num_filters))

        sim_fs = similarity_fnc([Xf, Xs])
        sim_st = similarity_fnc([Xs, Xt])

        X = tf.keras.layers.concatenate([Xf, sim_fs, Xs, sim_st, Xt])

        X = tf.keras.layers.Dense(256, activation='relu', name='fc1', use_bias=True)(X)
        X = tf.keras.layers.Dropout(0.333)(X)

        X = tf.keras.layers.Dense(512, activation='relu', name='fc2', use_bias=True)(X)
        X = tf.keras.layers.Dropout(0.333)(X)

        X = tf.keras.layers.Dense(512, activation='relu', name='fc3', use_bias=True)(X)
        X = tf.keras.layers.Dropout(0.333)(X)

        X = tf.keras.layers.Dense(1, activation='sigmoid', name='output')(X)

        self.model = tf.keras.Model(inputs=[X_input], outputs=[X])

    def load_data_from_csv(self, data_path, separator=',', split_train_test=False, make_balanced=False):

        '''
            load data from CSV file, vectorize it and store it in self.data

            The file must provide a 'data' column (text) and a 'label' column.
            Each text is split into sentences, only the first three are kept and
            vectorized; rows that yield fewer than three sentences are dropped.

            -- inputs:
                data_path: path to file where data is saved
                separator (optional): value separator of the file, default is comma
                split_train_test (optional): accepted but currently not used
                make_balanced (optional): when True, rows of the less frequent of the
                    two most common labels are duplicated floor(majority/minority) - 1
                    times before the three-sentence filter is applied; nothing is
                    duplicated when that number is not positive or when fewer than
                    two labels are present

            -- result:
                self.data = (features, labels) where labels has shape (n, 1)
        '''

        frame = pd.read_csv(f'{data_path}', sep=separator)
        frame['data'] = frame['data'].apply(
            lambda text: np.array(sentence_tokenize.sentence_split(text, lang='hi')[:3]))
        frame['data'] = frame['data'].apply(
            lambda sentences: np.array(self.preprocessor.vectorize_sentences(sentences)))

        if make_balanced:
            frame = self._oversample_minority(frame)

        kept_data = []
        kept_labels = []
        for entry, label in zip(frame['data'].values.tolist(), frame['label'].values.tolist()):
            # Keep only rows that produced exactly three sentence vectors.
            if entry.shape[0] == 3:
                kept_data.append(entry)
                kept_labels.append(label)

        self.data = (np.array(kept_data), np.array(kept_labels).reshape(-1,1))

    @staticmethod
    def _oversample_minority(frame):
        # Duplicate the rows of the second most frequent label so that it roughly
        # matches the most frequent one; returns `frame` unchanged when no copy is needed.
        counts = frame['label'].value_counts()  # sorted by frequency, descending
        if len(counts) < 2:
            return frame
        extra_copies = int(counts.iloc[0] // counts.iloc[1]) - 1
        if extra_copies <= 0:
            # pd.concat([]) would raise, and the classes are already within a factor of 2.
            return frame
        minority_rows = frame.loc[frame['label'] == counts.index[1]]
        replicated = pd.concat([minority_rows] * extra_copies, ignore_index=True)
        return pd.concat([replicated, frame], ignore_index=True)
      
    

### Training the model




In [None]:
# Instantiate the model wrapper; Model.__init__ also builds a Preprocessor and
# loads the embeddings file, so this cell can take a while.
m = Model()

In [None]:
# Load the dataset as-is and print the distinct labels (first row) with their counts (second row).
m.load_data_from_csv(dataset_path)
unique_elements, counts_elements = np.unique(m.data[1], return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
# Reload the dataset with oversampling enabled (this replaces m.data from the previous cell)
# and print the label distribution again for comparison.
m.load_data_from_csv(dataset_path, make_balanced=True)
unique_elements, counts_elements = np.unique(m.data[1], return_counts=True)
print(np.asarray((unique_elements, counts_elements)))

In [None]:
# Build the Keras network and store it in m.model (used by the summary cell at the end).
m.make_model()

In [None]:
# Stratified k-fold training. The Model class itself is passed (not m), because
# train_model_kfolds instantiates and builds a fresh model for every fold.
ModelHelper.train_model_kfolds(m.data, Model, ModelHelper.negative_log_likelihood, m.num_of_folds, plot_roc=True)

In [None]:
# Print the layer-by-layer architecture of the model built by m.make_model().
m.model.summary()