## Import dependencies

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
from nltk.util import ngrams
from nltk import word_tokenize
from itertools import repeat

In [2]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers

## Load the pre-trained Universal Sentence Encoder (USE) model

In [3]:
# TensorFlow Hub handle for version 4 of Google's Universal Sentence Encoder.
url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the encoder once; `model` is the callable that embed_ngrams applies to lists of n-gram strings.
model = hub.load(url)

## Create suite of functions

### Function to strip every non-alphabetic character (punctuation, digits, apostrophes) from a sentence and return its word tokens

In [4]:
def clean_sentence(sentence):
    """Strip non-letter characters from ``sentence`` and tokenize what is left.

    Cleaning happens in two passes:

    1. every character that is neither an ASCII letter nor an apostrophe is
       replaced by a space, so punctuation and digits act as word separators;
    2. everything that is still not an ASCII letter or a space (i.e. the
       apostrophes) is deleted outright, so ``don't`` becomes ``dont``
       instead of being split in two.

    Args:
        sentence: the raw sentence as a string.

    Returns:
        The list of tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # Pass 1: separators -> spaces (apostrophes survive this pass on purpose).
    spaced_text = re.sub(r"[^a-zA-Z']", ' ', sentence)
    # Pass 2: drop the remaining apostrophes without inserting a gap.
    letters_only = re.sub(r"[^a-zA-Z ]", '', spaced_text)
    return word_tokenize(letters_only)

### Function to generate unigrams, bigrams and trigrams from given sentence

In [5]:
def get_ngrams(sentence_tokens):
    """Build the unigrams, bigrams and trigrams of a tokenized sentence.

    Args:
        sentence_tokens: the sentence as a sequence of word tokens.

    Returns:
        A 3-tuple ``(unigrams, bigrams, trigrams)``. Each element is a list
        of strings in which the tokens of one n-gram are joined by a single
        space. An order that cannot be formed yields an empty list.
    """
    # Orders are generated strictly in the sequence 1, 2, 3.
    grams_by_order = []
    for order in (1, 2, 3):
        joined = [' '.join(gram) for gram in ngrams(sentence_tokens, order)]
        grams_by_order.append(joined)
    unigrams, bigrams, trigrams = grams_by_order
    return unigrams, bigrams, trigrams
    

### Function to embed the n-grams of a given sentence. A single zero vector is returned if no n-grams could be formed

In [6]:
def embed_ngrams(ngram_list):
    """Embed a list of n-gram strings with the module-level ``model``.

    Args:
        ngram_list: list of n-gram strings; may be empty.

    Returns:
        A NumPy array holding the model output for ``ngram_list``. When the
        list is empty the model is not called and a ``(1, 512)`` array of
        zeros is returned instead, so downstream matrix products still work
        and produce a similarity of zero.
    """
    # Nothing to embed: hand back a single all-zero placeholder row.
    if not ngram_list:
        return np.zeros((1, 512))
    return np.array(model(ngram_list))

### **Function to get similarity score for given n-gram samples**
- Since USE returns normalized vectors, the cosine similarity of two n-grams can be calculated as the dot product of their embeddings  
- All pairwise dot products are obtained at once by multiplying the matrix of sentence 1 n-gram embeddings with the transpose of the matrix of sentence 2 n-gram embeddings  
- The maximum is taken along each row (the horizontal axis), which gives, for every n-gram of sentence 1, its best match among the n-grams of sentence 2  
- The score is calculated by summing these maximum similarities and dividing by the number of n-grams in sentence 1

In [7]:
def get_score(sent1_ngrams,sent2_ngrams):
    """Score how well the n-grams of sentence 1 are matched by sentence 2.

    Every n-gram of sentence 1 is paired with its most similar n-gram of
    sentence 2 (dot product of the embeddings, which equals the cosine
    similarity as long as the embeddings are unit-length, as the notebook
    states for USE). The score is the mean of these best-match similarities
    over the n-grams of sentence 1.

    Args:
        sent1_ngrams: list of n-gram strings of the reference sentence.
        sent2_ngrams: list of n-gram strings of the sentence compared to it.

    Returns:
        The score as a float. ``0.0`` is returned when ``sent1_ngrams`` is
        empty (there is nothing to match; previously this was a 0/0 division
        that produced ``nan``). When only ``sent2_ngrams`` is empty the
        zero-vector placeholder from ``embed_ngrams`` makes the score 0 too.
    """
    N1=len(sent1_ngrams)
    # Guard before embedding: with no reference n-grams the mean is undefined,
    # and dividing by N1 == 0 would yield nan plus a RuntimeWarning.
    if N1==0:
        return 0.0

    ## Get embedded ngrams
    emb_ngrams1=embed_ngrams(sent1_ngrams)
    emb_ngrams2=embed_ngrams(sent2_ngrams)

    ## Get cosine similarity between n-grams
    ## Row i, column j holds the dot product of n-gram i of sentence 1
    ## with n-gram j of sentence 2
    sim_matrix=np.matmul(emb_ngrams1,emb_ngrams2.T)

    ## Best match in sentence 2 for each n-gram of sentence 1
    max_sim=np.max(sim_matrix,axis=1)

    ## Get score by dividing sum by number of n-grams in sentence 1
    score=np.sum(max_sim)/N1
    return score

### Master function to get overall similarity score
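- Sentences are first cleaned and tokenized; the sentence with more tokens is then used as sentence 1
- Then unigrams, bigrams and trigrams are obtained for both sentences
- A similarity score is calculated for each n-gram order (unigram, bigram and trigram)
- The final sentence similarity is the weighted average of these scores with relative weights 1 : 2 : 3 (unigram : bigram : trigram), so longer n-grams count more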

In [8]:
def get_sim_score(sentence1,sentence2,verbose=True):
    """Return an n-gram based similarity score for two sentences.

    Both sentences are cleaned and tokenized with ``clean_sentence``. The
    sentence with more tokens is used as the reference side, so the score is
    averaged over the n-grams of the longer sentence. A unigram, a bigram
    and a trigram score are obtained from ``get_score`` and combined as a
    weighted average with relative weights 1 : 2 : 3, i.e.
    ``1*s1/6 + 2*s2/6 + 3*s3/6`` when all three orders exist.

    An order for which even the reference sentence has no n-grams (e.g.
    trigrams of a two-word sentence) has no defined score. It is left out
    and the remaining weights are renormalized, instead of letting a 0/0
    division turn the whole result into ``nan``.

    Args:
        sentence1: first sentence (string).
        sentence2: second sentence (string).
        verbose: when True (the default) print each per-order score followed
            by a separator line.

    Returns:
        The combined similarity score, or ``0.0`` when neither sentence
        contains any token after cleaning.
    """
    # Clean both sentences
    tokens_ref=clean_sentence(sentence1)
    tokens_other=clean_sentence(sentence2)
    # Always use the longer sentence as the reference side
    if len(tokens_other)>len(tokens_ref):
        tokens_ref,tokens_other=tokens_other,tokens_ref

    # Unigrams, bigrams and trigrams of each sentence, in that order
    ref_grams_by_order=get_ngrams(tokens_ref)
    other_grams_by_order=get_ngrams(tokens_other)

    weighted_scores=[]
    orders=zip(("Unigram","Bigram","Trigram"),(1,2,3),
               ref_grams_by_order,other_grams_by_order)
    for label,weight,ref_grams,other_grams in orders:
        # The reference is the longer sentence, so if it has no n-grams of
        # this order neither sentence does: skip the order entirely.
        if not ref_grams:
            continue
        score=get_score(ref_grams,other_grams)
        if verbose:
            print(label+" score:",score)
            print("_________________________________")
        weighted_scores.append((weight,score))

    if not weighted_scores:
        return 0.0

    # Accumulate term by term, each already divided by the total weight, so
    # the result is identical to 1*s1/6 + 2*s2/6 + 3*s3/6 when all three
    # orders are present.
    total_weight=sum(weight for weight,_ in weighted_scores)
    sentence_similarity=0.0
    for weight,score in weighted_scores:
        sentence_similarity+=weight*score/total_weight
    return sentence_similarity

### Calling function on a few samples

In [12]:
# Near-paraphrase pair: the sentences differ only in "received" vs. "have read".
sentence_sim=get_sim_score("I hope you received my mail","I hope you have read my mail")
print("Sentence similarity=",sentence_sim)

Unigram score: 0.8474415370396206
_________________________________
Bigram score: 0.7590710322062174
_________________________________
Trigram score: 0.6750203132629394
_________________________________
Sentence similarity= 0.7317740902068124


In [10]:
# Two requests with different intents (cancelling an order vs. listing past purchases).
print("Sentence similarity=",get_sim_score("Cancel my order please","Show my last bought items"))

Unigram score: 0.5047336101531983
_________________________________
Bigram score: 0.40157806873321533
_________________________________
Trigram score: 0.3316713372866313
_________________________________
Sentence similarity= 0.38381729324658714


In [11]:
# Different-length pair: the second sentence extends the first, so get_sim_score swaps them and uses the longer one as the reference.
print("Sentence similarity=",get_sim_score("I don't want my order","I don't want my order to be delayed"))

Unigram score: 0.7891566753387451
_________________________________
Bigram score: 0.7568678174700055
_________________________________
Trigram score: 0.7355847358703613
_________________________________
Sentence similarity= 0.7516077529816401
