In [65]:
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import tensorflow_hub as hub
import tensorflow as tf 
# word_tokenize (used by similarityChecker.clean_sentence) needs NLTK's Punkt
# tokenizer data, so fetch it once before the class is used.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

class similarityChecker():
    """Find the stored sentence most similar to a query using n-gram embeddings.

    Each sentence is cleaned and tokenized, expanded into word n-grams of
    width 1..``max_ngram``, and every n-gram is embedded with the model loaded
    from ``url``. Two sentences are compared level by level (unigrams with
    unigrams, bigrams with bigrams, ...) and the per-level scores are combined
    into a weighted average in which the n-gram width is the weight.
    """

    # Largest n-gram width that is generated, embedded and scored. The value 3
    # reproduces the original hard-coded unigram/bigram/trigram behaviour
    # (weights 1, 2, 3 and a divisor of 6).
    max_ngram = 3

    def __init__(self, url):
        """Load the embedding model from ``url`` and start with no stored sentences.

        Args:
            url: Handle passed to ``hub.load``.
        """
        self.url = url
        self.model = hub.load(self.url)
        # Everything set_sentences() fills in is initialised here so the
        # attributes exist even before the first call.
        self.sentence_list = []
        self.cleaned_list = []
        self.sentence_lens = []
        self.ngram_list = []
        self.embedded_sentences = []

    def generate_ngrams(self, sentence):
        """Return the n-grams of a tokenized sentence, one entry per width.

        Args:
            sentence: List of tokens as returned by ``clean_sentence``.

        Returns:
            A list with ``max_ngram`` entries; entry ``k`` is the result of
            ``tf.strings.ngrams(sentence, k + 1)``.
        """
        return [
            tf.strings.ngrams(sentence, width)
            for width in range(1, self.max_ngram + 1)
        ]

    def set_sentences(self, sentence_list):
        """Store the reference sentences and cache their tokens, lengths,
        n-grams and embeddings.

        Any previously stored sentences are replaced.

        Args:
            sentence_list: Iterable of raw sentence strings.
        """
        # Copy the input so later mutation of the caller's list cannot get out
        # of sync with the cached lengths and embeddings.
        self.sentence_list = list(sentence_list)
        self.cleaned_list = [self.clean_sentence(sentence) for sentence in self.sentence_list]
        self.sentence_lens = [len(tokens) for tokens in self.cleaned_list]
        self.ngram_list = [self.generate_ngrams(tokens) for tokens in self.cleaned_list]
        self.embedded_sentences = [self.embed_ngrams(grams) for grams in self.ngram_list]

    def embed_ngrams(self, ngram_list):
        """Embed every n-gram level of one sentence.

        Args:
            ngram_list: Output of ``generate_ngrams``.

        Returns:
            A list of NumPy arrays, one per n-gram width, each holding the
            model output for that level's n-grams.
        """
        return [np.array(self.model(grams)) for grams in ngram_list]

    def clean_sentence(self, sentence):
        """Clean a raw sentence and return it as a list of tokens.

        Every character that is not an ASCII letter or an apostrophe becomes a
        space; apostrophes are then removed outright (so "don't" becomes
        "dont"). The result is tokenized with NLTK's ``word_tokenize``.
        """
        sentence = re.sub(r"[^a-zA-Z']", ' ', sentence)
        sentence = re.sub(r"[^a-zA-Z ]", '', sentence)
        return word_tokenize(sentence)

    def get_score(self, emb1, emb2):
        """Score how well the n-grams of sentence 1 are matched by sentence 2.

        The similarity matrix is the matrix product of ``emb1`` with the
        transpose of ``emb2``. The original author notes that USE returns
        normalized vectors, so this dot product is the cosine similarity.
        For every n-gram of sentence 1 the best match in sentence 2 is taken
        (row-wise maximum) and the score is the mean of those maxima.

        Args:
            emb1: 2-D array, one row per n-gram of sentence 1.
            emb2: 2-D array, one row per n-gram of sentence 2.

        Returns:
            The mean best-match similarity, or ``0.0`` when either sentence
            has no n-grams at this width (e.g. a two-word sentence has no
            trigrams), since there is nothing to match.
        """
        emb1 = np.asarray(emb1)
        emb2 = np.asarray(emb2)
        # Number of n-grams in sentence 1
        N1 = len(emb1)
        # Without this guard an empty emb2 makes np.max raise ValueError and an
        # empty emb1 divides by zero and yields nan.
        if N1 == 0 or len(emb2) == 0:
            return 0.0

        # Pairwise similarities: rows follow sentence 1, columns sentence 2
        sim_matrix = np.matmul(emb1, emb2.T)
        # Best match in sentence 2 for each n-gram of sentence 1
        max_sim = np.max(sim_matrix, axis=1)
        return np.sum(max_sim) / N1

    def get_sim_score(self, embeds1, embeds2, verbose=True):
        """Combine the per-width scores of two embedded sentences.

        Each width ``n`` contributes ``n * get_score(...)`` and the total is
        divided by the sum of the weights (1 + 2 + 3 = 6 for ``max_ngram`` 3).
        A width with no n-grams on either side contributes 0, so very short
        sentences cannot reach a score of 1.

        Args:
            embeds1: Output of ``embed_ngrams`` for sentence 1.
            embeds2: Output of ``embed_ngrams`` for sentence 2.
            verbose: When true (default) print each per-width score.

        Returns:
            The weighted average of the per-width scores.
        """
        final_score = 0
        div = 0
        for i in range(self.max_ngram):
            weight = i + 1
            scoren = self.get_score(embeds1[i], embeds2[i])
            if verbose:
                print(f'{i+1}-gram score: {scoren}')
            # Multiply scoren with weight and add to final score
            final_score += weight * scoren
            div += weight
        return final_score / div

    def best_sim(self, sentence, verbose=True):
        """Return the stored sentence most similar to ``sentence``.

        For each stored sentence, the one with more tokens (input or stored)
        is passed first to ``get_sim_score``, so the score is normalized by
        the n-gram count of the longer sentence.

        Args:
            sentence: Raw input sentence.
            verbose: When true (default) print the comparison progress and
                scores for every stored sentence.

        Returns:
            Tuple ``(best_sentence, best_score)``. On ties the earliest stored
            sentence wins.

        Raises:
            ValueError: If no sentences have been stored yet.
        """
        # Fail early with a clear message instead of embedding the input and
        # then hitting max() on an empty list.
        if not self.sentence_list:
            raise ValueError(
                "No sentences stored; call set_sentences() with a non-empty list first."
            )

        cleaned_input = self.clean_sentence(sentence)
        input_emb_ngrams = self.embed_ngrams(self.generate_ngrams(cleaned_input))
        input_len = len(cleaned_input)

        score_list = []
        for stored, stored_len, stored_emb in zip(
            self.sentence_list, self.sentence_lens, self.embedded_sentences
        ):
            if verbose:
                print(f'Comparing with : {stored}')
            if input_len > stored_len:
                score = self.get_sim_score(input_emb_ngrams, stored_emb, verbose)
            else:
                score = self.get_sim_score(stored_emb, input_emb_ngrams, verbose)
            if verbose:
                print("Final score = ", score)
                print('-----------------------')
            score_list.append(score)

        # Index of the highest score; max() keeps the first one on ties
        best_index = max(range(len(score_list)), key=score_list.__getitem__)
        return self.sentence_list[best_index], score_list[best_index]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [66]:
# TF-Hub handle of the Universal Sentence Encoder model used for all embeddings.
USE_MODEL_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"
checker = similarityChecker(USE_MODEL_URL)

In [67]:
# Reference sentences that incoming text will be matched against.
sentences = [
    "Cancel my order",
    "Show recent items",
    "Show my orders",
    "Track my order",
    "Confirm my order",
]
checker.set_sentences(sentences)

In [68]:
# Look up the stored sentence closest to a free-form request. The per-n-gram
# scores for every candidate are printed, then the (sentence, score) result.
checker.best_sim("I don't want my package")

Comparing with : Cancel my order
1-gram score: 0.5626727104187011
2-gram score: 0.43262115120887756
3-gram score: 0.30491383870442706
Final score =  0.3904427548249563
-----------------------
Comparing with : Show recent items
1-gram score: 0.36089038848876953
2-gram score: 0.14835841953754425
3-gram score: 0.05635423461596171
Final score =  0.13777832190195718
-----------------------
Comparing with : Show my orders
1-gram score: 0.5521993637084961
2-gram score: 0.39841723442077637
3-gram score: 0.21370967229207358
Final score =  0.3316938082377116
-----------------------
Comparing with : Track my order
1-gram score: 0.5742344379425048
2-gram score: 0.3852464556694031
3-gram score: 0.24673978487650552
Final score =  0.3474911173184713
-----------------------
Comparing with : Confirm my order
1-gram score: 0.5626727104187011
2-gram score: 0.3768531382083893
3-gram score: 0.1868083675702413
Final score =  0.31280068159103397
-----------------------


('Cancel my order', 0.3904427548249563)

In [69]:
# Second query against the same stored sentences.
checker.best_sim('Find my package')

Comparing with : Cancel my order
1-gram score: 0.5543013413747152
2-gram score: 0.5327193737030029
3-gram score: 0.3417583107948303
Final score =  0.4408358368608687
-----------------------
Comparing with : Show recent items
1-gram score: 0.4209263324737549
2-gram score: 0.21649831533432007
3-gram score: 0.1797013282775879
Final score =  0.23217115799585977
-----------------------
Comparing with : Show my orders
1-gram score: 0.6375230948130289
2-gram score: 0.6075990200042725
3-gram score: 0.4083702564239502
Final score =  0.5129719840155708
-----------------------
Comparing with : Track my order
1-gram score: 0.6176145871480306
2-gram score: 0.6145361661911011
3-gram score: 0.5404067039489746
Final score =  0.5779845052295262
-----------------------
Comparing with : Confirm my order
1-gram score: 0.5649413267771403
2-gram score: 0.5035541653633118
3-gram score: 0.2844642400741577
Final score =  0.40424039628770614
-----------------------


('Track my order', 0.5779845052295262)

In [70]:
# Replace the stored sentences with a single one, then score a close
# paraphrase against it.
checker.set_sentences(["I hope you have read my mail"])
checker.best_sim("I hope you received my mail")

Comparing with : I hope you have read my mail
1-gram score: 0.8474416732788086
2-gram score: 0.7590710322062174
3-gram score: 0.675020408630371
Final score =  0.7317741605970594
-----------------------


('I hope you have read my mail', 0.7317741605970594)