## Import dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
from nltk.util import ngrams
from nltk import word_tokenize
from itertools import repeat

In [2]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers

## Load the pre-trained Universal Sentence Encoder (USE) model

In [3]:
# TF Hub handle of the Universal Sentence Encoder (the trailing "/4" selects version 4).
url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# `model` is kept at module level: embed_ngrams() calls it directly on a list of n-gram strings.
model = hub.load(url)

## Create suite of functions

### Function to remove special characters from the sentence and return it in a tokenized format

In [4]:
def clean_sentence(sentence):
    """Strip everything but ASCII letters from `sentence` and tokenize the rest.

    The cleaning happens in two passes:
      1. every character that is neither an ASCII letter nor an apostrophe
         is replaced by a space, so punctuation and digits act as word breaks;
      2. every remaining character that is neither an ASCII letter nor a space
         (i.e. the apostrophes kept by pass 1) is deleted, so "don't" becomes
         "dont" rather than being broken into two pieces.

    Returns whatever `word_tokenize` produces for the cleaned text.
    """
    spaced_out = re.sub(r"[^a-zA-Z']", ' ', sentence)
    letters_and_spaces = re.sub(r"[^a-zA-Z ]", '', spaced_out)
    return word_tokenize(letters_and_spaces)

### Function to generate n-grams from a given sentence

In [5]:
def get_ngrams(sentence_tokens,n):
    """Collect the n-grams of `sentence_tokens` for every order from 1 up to `n`.

    Returns a list with one entry per order, in increasing order: index 0 holds
    the unigrams, index 1 the bigrams, and so on. Each n-gram is rendered as a
    single string with its tokens joined by one space. An order that is longer
    than the token sequence yields an empty inner list.
    """
    return [
        [' '.join(window) for window in ngrams(sentence_tokens, order)]
        for order in range(1, n + 1)
    ]

### Function to embed the n-grams of a given sentence. A zero vector is returned if no n-grams could be formed

In [6]:

def embed_ngrams(ngram_list):
    """Embed a list of n-gram strings with the module-level `model`.

    Returns the model output converted to a NumPy array. When `ngram_list` is
    empty the model is not called; a single all-zero row of width 512 is
    returned instead (presumably matching the encoder's embedding width --
    confirm if a different model is loaded).
    """
    # Guard first: nothing to embed, so hand back the zero placeholder.
    if not ngram_list:
        return np.zeros((1,512))
    return np.array(model(ngram_list))

### **Function to get similarity score for given n-gram samples**
- Since USE returns normalized vectors, the cosine similarity can be calculated through the dot product of two vectors  
- The dot product is calculated by matrix multiplication of sentence 1 n-grams with the transpose of sentence 2 n-grams  
- The maximum is taken along each row (`axis=1`), giving for every n-gram of sentence 1 its highest similarity to any n-gram of sentence 2  
- Score is calculated by getting the sum of similarities and dividing by number of n-grams in sentence 1

In [7]:
def get_score(sent1_ngrams,sent2_ngrams):
    """Score how well the n-grams of sentence 1 are matched by those of sentence 2.

    Each n-gram of sentence 1 is paired with its most similar n-gram of
    sentence 2 (dot product of their embeddings), and the score is the mean of
    those best-match similarities over the n-grams of sentence 1.

    Parameters:
        sent1_ngrams: list of n-gram strings for sentence 1.
        sent2_ngrams: list of n-gram strings for sentence 2.

    Returns:
        The average best-match similarity, or 0.0 when either list is empty.
    """
    # With no n-grams on either side there is nothing to match. Returning 0.0
    # here avoids the 0/0 division (NaN plus a RuntimeWarning) that an empty
    # sentence-1 list would otherwise cause, e.g. trigrams of a two-word
    # sentence. An empty sentence-2 list already scored 0 via the zero-vector
    # placeholder, so short-circuiting it only saves a model call.
    if not sent1_ngrams or not sent2_ngrams:
        return 0.0

    N1=len(sent1_ngrams)
    ## Get embedded ngrams
    emb_ngrams1=embed_ngrams(sent1_ngrams)
    emb_ngrams2=embed_ngrams(sent2_ngrams)

    ## Pairwise dot products: rows index sentence-1 n-grams, columns sentence-2 n-grams
    sim_matrix=np.matmul(emb_ngrams1,emb_ngrams2.T)

    ## Best match in sentence 2 for every n-gram of sentence 1
    max_sim=np.max(sim_matrix,axis=1)

    ## Average the best-match similarities over the n-grams of sentence 1
    return np.sum(max_sim)/N1

### Master function to get overall similarity score
- Sentences are first cleaned and tokenized
- Then sets of n-grams are obtained
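- The similarity score is calculated separately for each n-gram order (unigrams, bigrams and trigrams)
- The final sentence similarity is the weighted average of these scores, with weights 1, 2 and 3 respectively (the weighted sum divided by 6)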

In [8]:
def get_sim_score(sentence1,sentence2,n=3):
    """Compute the overall n-gram similarity between two sentences.

    Both sentences are cleaned and tokenized, their n-grams are built for every
    order from 1 to `n`, and one score per order is obtained from `get_score`.
    The result is the weighted average of those scores, where the k-gram score
    carries weight k. Each per-order score is printed as it is computed.

    The sentence with more tokens is always used as "sentence 1", so the
    result does not depend on the order of the arguments when the token
    counts differ.

    Parameters:
        sentence1, sentence2: raw sentence strings.
        n: highest n-gram order to use (default 3: unigrams, bigrams, trigrams).

    Returns:
        The weighted average of the per-order scores.

    Raises:
        ValueError: if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Clean both sentences
    cleaned_sent1=clean_sentence(sentence1)
    cleaned_sent2=clean_sentence(sentence2)

    # Score from the point of view of the longer sentence
    if len(cleaned_sent2)>len(cleaned_sent1):
        cleaned_sent1,cleaned_sent2=cleaned_sent2,cleaned_sent1

    # Get the n-grams of every order 1..n for both sentences
    sent1_ngrams=get_ngrams(cleaned_sent1,n)
    sent2_ngrams=get_ngrams(cleaned_sent2,n)

    # The weights are 1..n, so the normaliser is their sum n(n+1)/2
    # (6 for the default n=3) instead of a constant tied to one value of n.
    div=n*(n+1)//2

    final_score=0
    for order,(grams1,grams2) in enumerate(zip(sent1_ngrams,sent2_ngrams),start=1):
        scoren=get_score(grams1,grams2)
        print(f'{order}-gram score: {scoren}')
        # Multiply scoren with its weight and add to final score
        final_score+=order*scoren
    # Divide by the sum of the weights
    return final_score/div

### Function to get most similar sentence from a list of sentences

In [9]:
def get_best_score(input_sentence,sentence_list):
    """Find the entry of `sentence_list` that is most similar to `input_sentence`.

    Every candidate is scored with `get_sim_score`; the candidate, its final
    score and a separator line are printed along the way. When several
    candidates share the highest score, the earliest one wins.

    Returns a `(best_sentence, best_score)` tuple. An empty `sentence_list`
    raises ValueError (from `max`).
    """
    scores = []
    for candidate in sentence_list:
        print(f'Comparing with : {candidate}')
        candidate_score = get_sim_score(input_sentence, candidate)
        print("Final score = ", candidate_score)
        print('-----------------------')
        scores.append(candidate_score)
    # Position of the highest score; max() keeps the first one on ties
    top = max(range(len(scores)), key=scores.__getitem__)
    return sentence_list[top], scores[top]

### Calling function on a few samples

In [10]:
# Sanity check on two near-paraphrases. get_sim_score prints the per-order
# n-gram scores itself; only the combined score is printed here.
sentence_sim=get_sim_score("I hope you received my mail","I hope you have read my mail")
print("Sentence similarity =",sentence_sim)

1-gram score: 0.8474415370396206
2-gram score: 0.7590710322062174
3-gram score: 0.6750203132629394
Sentence similarity = 0.7317740902068123


### Testing with similar words

In [11]:
# Candidate sentences; this list is reused by the later "very different sentence" test.
sentences=["Cancel my order",
           "Show recent items",
           "Show my orders",
           "Track my order",
           "Confirm my order"]
# get_best_score returns the best-matching candidate together with its score.
sentence,score=get_best_score("Find my package",sentences)


Comparing with : Cancel my order
1-gram score: 0.6111734708150228
2-gram score: 0.5327193140983582
3-gram score: 0.3417583107948303
Final score =  0.45031450523270505
-----------------------
Comparing with : Show recent items
1-gram score: 0.466095765431722
2-gram score: 0.2164982706308365
3-gram score: 0.1797012984752655
Final score =  0.23969936701986524
-----------------------
Comparing with : Show my orders
1-gram score: 0.6386563777923584
2-gram score: 0.6075990796089172
3-gram score: 0.4083702862262726
Final score =  0.5131608992815018
-----------------------
Comparing with : Track my order
1-gram score: 0.6304430961608887
2-gram score: 0.6145361661911011
3-gram score: 0.5404067039489746
Final score =  0.5801225900650024
-----------------------
Comparing with : Confirm my order
1-gram score: 0.6111734708150228
2-gram score: 0.5035542249679565
3-gram score: 0.2844642400741577
Final score =  0.4119457734955682
-----------------------


In [12]:
# Report the winner picked by get_best_score for "Find my package".
print(f"Input is most similar to '{sentence}' with a score of {score}")

Input is most similar to 'Track my order' with a score of 0.5801225900650024


### Testing with a very different sentence

In [13]:
# Same candidate list as before, now with an input whose wording differs more from the candidates.
sentence2,score2=get_best_score("I don't want my package",sentences)

Comparing with : Cancel my order
1-gram score: 0.5626726150512695
2-gram score: 0.4326210916042328
3-gram score: 0.30491383870442706
Final score =  0.3904427190621694
-----------------------
Comparing with : Show recent items
1-gram score: 0.36089043617248534
2-gram score: 0.14835841953754425
3-gram score: 0.056354264418284096
Final score =  0.13777834475040437
-----------------------
Comparing with : Show my orders
1-gram score: 0.5521992683410645
2-gram score: 0.39841729402542114
3-gram score: 0.21370969216028848
Final score =  0.331693822145462
-----------------------
Comparing with : Track my order
1-gram score: 0.5742343902587891
2-gram score: 0.3852463960647583
3-gram score: 0.24673974514007568
Final score =  0.34749106963475546
-----------------------
Comparing with : Confirm my order
1-gram score: 0.5626726150512695
2-gram score: 0.3768530488014221
3-gram score: 0.1868083874384562
Final score =  0.31280064582824707
-----------------------


In [14]:
# Report the winner picked by get_best_score for "I don't want my package".
print(f"Input is most similar to '{sentence2}' with a score of {score2}")

Input is most similar to 'Cancel my order' with a score of 0.3904427190621694
