In [1]:
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import tensorflow_hub as hub
import tensorflow as tf 
import fasttext.util
nltk.download('punkt')

class FTsimilarityChecker():

    # init dunder to load model  
    def __init__(self):
        #self.url= url
        self.model = fasttext.load_model('FastText Model/cc.en.300.bin')
        self.sentence_list=[]

    # Function to generate a list of unigrams, bigrams and trigrams of given sentence
    def generate_ngrams(self,sentence_tokens):
        ngram_list=[]
        for i in range(1,4): 
            grams=list(ngrams(sentence_tokens,i))
            
            ngram_list.append([' '.join(gram) for gram in grams])
        return ngram_list
    
    # Function to take a list of sentences as input and store their tokens, lengths, ngrams and embeddings
    def set_sentences(self,sentence_list):
        self.sentence_list=list(sentence_list)
        self.cleaned_list=[self.clean_sentence(sentence) for sentence in sentence_list]
        self.sentence_lens=[len(sentence) for sentence in self.cleaned_list]
        self.ngram_list=[self.generate_ngrams(tokenized_sentence) for tokenized_sentence in self.cleaned_list]
        self.embedded_sentences=[self.embed_ngrams(grams) for grams in self.ngram_list]

    # Function to get embeddings for given unigrams, bigrams and trigrams and return as list
    '''
    def embed_ngrams(self,ngram_list):
        emb_ngrams=[]
        #emb_ngrams=np.arr
        for grams in ngram_list:
            embedded_grams=np.array(self.model(grams))
            emb_ngrams.append(embedded_grams)
        return emb_ngrams
    '''
    def get_norm_vector(self,gram):
        sentence_vector=self.model.get_sentence_vector(gram)
        norm_vector=sentence_vector/np.linalg.norm(sentence_vector)
        return norm_vector

    def embed_ngrams(self,ngram_list):
        emb_list=[]
        for grams in ngram_list:
            if len(grams):
                embeds=np.array(list(map(self.get_norm_vector, grams)))
                emb_list.append(embeds)
        return emb_list

    # Function to clean a sentence and return as tokens
    def clean_sentence(self,sentence):
        sentence=re.sub(r"[^a-zA-Z']", ' ', sentence)
        sentence=re.sub(r"[^a-zA-Z ]", '', sentence)
        return word_tokenize(sentence)

    # Function to get similarity score for given n-gram samples
      ## Since USE returns normalized vectors, the cosine similarity can be calculated through the dot product of two vectors  
      ## The dot product is calculated by matrix multiplication of sentence 1 n-grams with the transpose of sentence 2 n-grams  
      ## Max of all values is taken on the horizontal axis to get the maximum similarity for each n-gram  
      ## Score is calculated by getting the sum of similarities and dividing by number of n-grams in sentence 1
    def get_score(self,emb1,emb2):
        
        # Get number of ngrams in sentence 1
        N1=len(emb1)
        
        ## Get cosine similarity between n-grams
        ## This is done by getting dot product of n-grams of sentence 1 with n-grams of sentence 2
        try:
            sim_matrix=np.matmul(emb1,emb2.T)
        except:
            sim_matrix=0

        ## Get maximum similarity score as match
        try:
            max_sim=np.max(sim_matrix,axis=1)
        except:
            max_sim=0
        
        ## Get score by dividing sum by number of n-grams in sentence 1
        score=np.sum(max_sim)/N1
        return score

    # Function to calculate similarity score for 2 embedded sentences
    def get_sim_score(self,embeds1,embeds2):
        final_score=0
        # Score has to be divided by sum of length of n-grams i.e. 1+2+3
        div=0
        for i in range(0,min(len(embeds1),len(embeds2))):
          
          scoren=self.get_score(embeds1[i],embeds2[i])
          print(f'{i+1}-gram score: {scoren}')
          
          # Multiply scoren with weight and add to final score
          if scoren != 0:
            final_score+=(i+1)*scoren
            div+= i+1
        return final_score/div

    # Function to find the stored sentence that is most similar to input
    def best_sim(self,sentence):
        cleaned_input=self.clean_sentence(sentence)
        input_ngrams=self.generate_ngrams(cleaned_input)
        input_emb_ngrams=self.embed_ngrams(input_ngrams)
        input_len=len(cleaned_input)
        score_list=[]
        for i in range(0,len(self.sentence_list)):
          print(f'Comparing with : {self.sentence_list[i]}')
          if input_len>self.sentence_lens[i]:
            score=self.get_sim_score(input_emb_ngrams,self.embedded_sentences[i])
          else:
            score=self.get_sim_score(self.embedded_sentences[i],input_emb_ngrams)
          print("Final score = ",score)
          print('-----------------------')
          score_list.append(score)
        # Select highest similarity score
        best_score=max(score_list)
        # Get sentence with highest similarity
        best_sentence=self.sentence_list[score_list.index(best_score)]
        return best_sentence,best_score

    # Function to compare 2 input sentences and give similarity score
    def compare_sentences(self,sentence1,sentence2):
        # Clean both sentences
        cleaned_sent1=self.clean_sentence(sentence1)
        cleaned_sent2=self.clean_sentence(sentence2)

        # Make sure sentence 1 is the longer sentence
        if len(cleaned_sent2)>len(cleaned_sent1):
          cleaned_sent1,cleaned_sent2=cleaned_sent2,cleaned_sent1

        # Generate ngrams for both sentences
        sent1_ngrams=self.generate_ngrams(cleaned_sent1)
        sent2_ngrams=self.generate_ngrams(cleaned_sent2)

        # Embed ngrams for both sentences
        sent1_emb_ngrams=self.embed_ngrams(sent1_ngrams)
        sent2_emb_ngrams=self.embed_ngrams(sent2_ngrams)

        # Calculate and return similarity score
        final_score=self.get_sim_score(sent1_emb_ngrams,sent2_emb_ngrams)
        print(f'Similarity between "{sentence1}" and "{sentence2} = {final_score}')
        return final_score

    # Function to find best similarity with user entered list of sentences
    def best_from_list(self,sentence1,sentences):
        # Clean list of sentences
        cleaned_list=[self.clean_sentence(sentence) for sentence in sentences]
        sentence_lens=[len(sentence) for sentence in cleaned_list]
        ngram_list=[self.generate_ngrams(tokenized_sentence) for tokenized_sentence in cleaned_list]
        embedded_sentences=[self.embed_ngrams(grams) for grams in ngram_list]

        # Clean sentence 1
        cleaned_input=self.clean_sentence(sentence1)
        input_ngrams=self.generate_ngrams(cleaned_input)
        input_emb_ngrams=self.embed_ngrams(input_ngrams)
        input_len=len(cleaned_input)
        score_list=[]
        for i in range(0,len(sentences)):
          print(f'Comparing with : {sentences[i]}')
          if input_len>=sentence_lens[i]:
            score=self.get_sim_score(input_emb_ngrams,embedded_sentences[i])
          else:
            score=self.get_sim_score(embedded_sentences[i],input_emb_ngrams)
          print("Final score = ",score)
          print('-----------------------')
          score_list.append(score)
        # Select highest similarity score
        best_score=max(score_list)
        # Get sentence with highest similarity
        best_sentence=sentences[score_list.index(best_score)]
        return best_sentence,best_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Download Fasttext model

In [None]:
#fasttext.util.download_model('en', if_exists='ignore')

### Create class object

In [2]:
checker=FTsimilarityChecker()



### Initialize sentence list

In [3]:
sentences=["Cancel my order",
           "Show recent items",
           "Show my orders",
           "Track my order",
           "Confirm my order"]
checker.set_sentences(sentences)

In [4]:
checker.ngram_list

[[['Cancel', 'my', 'order'], ['Cancel my', 'my order'], ['Cancel my order']],
 [['Show', 'recent', 'items'],
  ['Show recent', 'recent items'],
  ['Show recent items']],
 [['Show', 'my', 'orders'], ['Show my', 'my orders'], ['Show my orders']],
 [['Track', 'my', 'order'], ['Track my', 'my order'], ['Track my order']],
 [['Confirm', 'my', 'order'],
  ['Confirm my', 'my order'],
  ['Confirm my order']]]

### Check similarity for various sentences

In [5]:
checker.compare_sentences('Hello world',"Hello world")

1-gram score: 0.9999999403953552
2-gram score: 1.0
Similarity between "Hello world" and "Hello world = 0.999999980131785


0.999999980131785

In [6]:
sentence,score=checker.best_sim("Cancel my order")
print(f'Highest similarity with "{sentence}" with score = {score}')

Comparing with : Cancel my order
1-gram score: 1.0
2-gram score: 0.9999998807907104
3-gram score: 1.0
Final score =  0.9999999602635702
-----------------------
Comparing with : Show recent items
1-gram score: 0.2643624146779378
2-gram score: 0.30793842673301697
3-gram score: 0.3360864520072937
Final score =  0.3147497706943088
-----------------------
Comparing with : Show my orders
1-gram score: 0.6095080773035685
2-gram score: 0.7331385612487793
3-gram score: 0.644798219203949
Final score =  0.668363309568829
-----------------------
Comparing with : Track my order
1-gram score: 0.7109317779541016
2-gram score: 0.8036900758743286
3-gram score: 0.7677028775215149
Final score =  0.7702367603778839
-----------------------
Comparing with : Confirm my order
1-gram score: 0.7908719380696615
2-gram score: 0.8587917685508728
3-gram score: 0.8389230966567993
Final score =  0.8375374608569676
-----------------------
Highest similarity with "Cancel my order" with score = 0.9999999602635702


In [7]:
checker.best_sim("I don't want my package")

Comparing with : Cancel my order
1-gram score: 0.4870284557342529
2-gram score: 0.5441968441009521
3-gram score: 0.5295750300089518
Final score =  0.5273578723271688
-----------------------
Comparing with : Show recent items
1-gram score: 0.17735130786895753
2-gram score: 0.17905811965465546
3-gram score: 0.19400978088378906
Final score =  0.1862494816382726
-----------------------
Comparing with : Show my orders
1-gram score: 0.4551896095275879
2-gram score: 0.46752655506134033
3-gram score: 0.43185075124104816
Final score =  0.4476324955622355
-----------------------
Comparing with : Track my order
1-gram score: 0.4870284557342529
2-gram score: 0.5410385131835938
3-gram score: 0.4684861898422241
Final score =  0.49576067527135215
-----------------------
Comparing with : Confirm my order
1-gram score: 0.4870284557342529
2-gram score: 0.5453283786773682
3-gram score: 0.5475306510925293
Final score =  0.5367128610610962
-----------------------


('Confirm my order', 0.5367128610610962)

In [8]:
checker.compare_sentences("I hope you have read my mail","I hope you received my mail")

1-gram score: 0.8520445823669434
2-gram score: 0.8508358796437582
3-gram score: 0.8616189956665039
Similarity between "I hope you have read my mail" and "I hope you received my mail = 0.8564288881089953


0.8564288881089953

## Loading Test data

In [9]:
import pandas as pd
import unicodedata

In [10]:
data=pd.read_csv('Data/similarity-testing-data.csv')

In [11]:
data.columns

Index(['Alias', 'Label'], dtype='object')

In [12]:
aliases=list(data['Alias'])
labels=list(data['Label'])

In [13]:
aliases=[unicodedata.normalize("NFKD", alias) for alias in aliases]

In [14]:
labels=[unicodedata.normalize("NFKD", label) for label in labels]

In [15]:
len(labels),len(aliases)

(316, 316)

In [16]:
set(labels)

{'Add my address',
 'Add to cart',
 'Add to wishlist',
 'Cancel my order',
 'Check my points',
 'Check working hours',
 'Find stores nearby',
 'Get membership',
 'How to pay',
 'Place an order',
 'Search for items',
 'Show bestsellers',
 'Show big deals',
 'Show cart',
 'Show contact information',
 'Show my orders',
 'Show product information',
 'Show recent items',
 'Show reservation',
 'Show return policy',
 'Track my order',
 'Use my coupons',
 'View new arrivals',
 'Write a complaint',
 'Write a review'}

In [17]:
checker.set_sentences(set(labels))

In [18]:
type(checker.sentence_list)

list

In [19]:
checker.best_sim('Hello my order')

Comparing with : Show recent items
1-gram score: 0.23769301176071167
2-gram score: 0.28698933124542236
3-gram score: 0.30486783385276794
Final score =  0.28771252930164337
-----------------------
Comparing with : Add to wishlist
1-gram score: 0.287508487701416
2-gram score: 0.4982783794403076
3-gram score: 0.4846133291721344
Final score =  0.45631753901640576
-----------------------
Comparing with : Show cart
1-gram score: 0.21007925271987915
2-gram score: 0.2145421802997589
Final score =  0.21305453777313232
-----------------------
Comparing with : Search for items
1-gram score: 0.2623240152994792
2-gram score: 0.3414364457130432
3-gram score: 0.34922072291374207
Final score =  0.3321431792444653
-----------------------
Comparing with : Show reservation
1-gram score: 0.16774451732635498
2-gram score: 0.16436296701431274
Final score =  0.16549015045166016
-----------------------
Comparing with : Cancel my order
1-gram score: 0.7288436889648438
2-gram score: 0.8286188244819641
3-gram sc

('Cancel my order', 0.7923769354820251)

In [20]:
correct=0
for i in range(0,len(aliases)):
    sentence,score=checker.best_sim(aliases[i])
    if  sentence==labels[i]:
        print(sentence,labels[i])
        correct+=1

Comparing with : Show recent items
1-gram score: 0.26483603318532306
2-gram score: 0.29195651412010193
3-gram score: 0.34274986386299133
Final score =  0.31283310883575016
-----------------------
Comparing with : Add to wishlist
1-gram score: 0.287508487701416
2-gram score: 0.4982783794403076
3-gram score: 0.468625545501709
Final score =  0.44832364718119305
-----------------------
Comparing with : Show cart
1-gram score: 0.23722227414449057
2-gram score: 0.251961886882782
Final score =  0.24704868263668486
-----------------------
Comparing with : Search for items
1-gram score: 0.25453780094782513
2-gram score: 0.3414364457130432
3-gram score: 0.3833999037742615
Final score =  0.34793506728278273
-----------------------
Comparing with : Show reservation
1-gram score: 0.19488751888275146
2-gram score: 0.20559881627559662
Final score =  0.2020283838113149
-----------------------
Comparing with : Cancel my order
1-gram score: 0.7088770071665446
2-gram score: 0.8067578077316284
3-gram scor

In [21]:
correct

160

In [22]:
len(labels)

316

In [23]:
checker.best_sim("Cancel my order")

Comparing with : Show recent items
1-gram score: 0.2643624146779378
2-gram score: 0.30793842673301697
3-gram score: 0.3360864520072937
Final score =  0.3147497706943088
-----------------------
Comparing with : Add to wishlist
1-gram score: 0.336869478225708
2-gram score: 0.4982783794403076
3-gram score: 0.5469393134117126
Final score =  0.49570736289024353
-----------------------
Comparing with : Show cart
1-gram score: 0.23674865563710532
2-gram score: 0.26266980171203613
Final score =  0.2540294196870592
-----------------------
Comparing with : Search for items
1-gram score: 0.315083106358846
2-gram score: 0.3414364457130432
3-gram score: 0.39321279525756836
Final score =  0.3629323972596062
-----------------------
Comparing with : Show reservation
1-gram score: 0.19441392024358115
2-gram score: 0.2304058074951172
Final score =  0.21840851174460518
-----------------------
Comparing with : Cancel my order
1-gram score: 1.0
2-gram score: 0.9999998807907104
3-gram score: 1.0
Final score

('Cancel my order', 0.9999999602635702)