In [17]:
# check history for duplicate questions
import torch
from sentence_transformers import SentenceTransformer, util

# Sentence encoder used to embed both the new question and the stored history.
model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')

# Cosine similarity the closest history entry must exceed to count as a duplicate.
DUPLICATE_THRESHOLD = 0.8

question = 'esophageal cancer symptoms'
query_embedding = model.encode(question, convert_to_tensor=True)


history = ['symptoms of esophageal cancer',  # 4 duplicates
           'symptoms esophageal cancer',
           'what is the symptoms of esophageal cancer',
           'causes and symptoms of esophageal cancer',
           'what type of cancer does cigars cause',  # 4 new
           'lung cancer causes and symptoms',
           'mouth cancer symptoms',
           'causes of mouth cancer']

corpus_embeddings = model.encode(history, convert_to_tensor=True)

# Cosine similarity between the question and every history entry; row 0 is
# taken because there is a single query.
cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
print(cos_scores)

# Keep only the single closest history entry (k=1) and show its score tensor.
top_results = torch.topk(cos_scores, k=1)
print(top_results[0])

# torch.topk returns (values, indices); pull both out as plain Python scalars
# so the comparison and the history lookup do not operate on tensors.
best_score = top_results[0].item()
best_index = top_results[1].item()

if best_score > DUPLICATE_THRESHOLD:
    print('Duplicate detected')
    # Report which stored question matched, and how strongly.
    print('{} (Score: {:.4f})'.format(history[best_index], best_score))

tensor([0.9750, 0.9668, 0.9342, 0.9046, 0.4253, 0.5013, 0.4864, 0.4324])
tensor([0.9750])
Duplicate detected
symptoms of esophageal cancer (Score: 0.9750)


In [1]:
from sentence_transformers import SentenceTransformer, util

# Sentence encoder handed to the paraphrase-mining utility below.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

# One flat list of sentences to mine for paraphrase pairs
# (in practice this could hold tens of thousands of entries).
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Each mined entry unpacks as (score, i, j), where i and j index into `sentences`.
paraphrases = util.paraphrase_mining(model, sentences)

# Print at most the first ten mined pairs, one per line.
for score, i, j in paraphrases[:10]:
    first, second = sentences[i], sentences[j]
    print("{} \t\t {} \t\t Score: {:.4f}".format(first, second, score))

The new movie is awesome 		 The new movie is so great 		 Score: 0.9816
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6247
I love pasta 		 The new movie is so great 		 Score: 0.2605
I love pasta 		 The new movie is awesome 		 Score: 0.2526
I love pasta 		 The cat plays in the garden 		 Score: 0.2455
I love pasta 		 Do you like pizza? 		 Score: 0.1997
The cat sits outside 		 A woman watches TV 		 Score: 0.1837
The cat plays in the garden 		 A woman watches TV 		 Score: 0.1760
A man is playing guitar 		 Do you like pizza? 		 Score: 0.1080
A woman watches TV 		 The new movie is so great 		 Score: 0.1008


In [2]:
# check duplicate questions
from sentence_transformers import SentenceTransformer, util

# Sentence encoder handed to the paraphrase-mining utility below.
model = SentenceTransformer('distilbert-base-nli-stsb-quora-ranking')

# Three phrasings of the same question, to be compared pairwise.
sentences = [
    'esophageal cancer symptoms',
    'symptoms of esophageal cancer',
    'symptoms esophageal cancer',
]

# Each mined entry unpacks as (score, i, j), where i and j index into `sentences`.
paraphrases = util.paraphrase_mining(model, sentences)

# Show up to ten mined pairs together with their similarity score.
top_pairs = paraphrases[0:10]
for score, i, j in top_pairs:
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], score))

100%|██████████| 245M/245M [00:07<00:00, 33.9MB/s] 


symptoms of esophageal cancer 		 symptoms esophageal cancer 		 Score: 0.9784
esophageal cancer symptoms 		 symptoms of esophageal cancer 		 Score: 0.9750
esophageal cancer symptoms 		 symptoms esophageal cancer 		 Score: 0.9668
