# This is very simple and mini-project to calculate the similarity between two sentences using Natural Language Processing(NLP).

# Import some useful libraries

In [1]:
import re
import nltk
#!pip install Unidecode
import unidecode
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize

In [2]:
#For example:-
#sentence1 = " The symbol # is commonly used with numbers, especially in American English."
#sentence2 = "We use the symbol % to indicate a percentage (that is, an amount in 100)"

In [3]:
sentence1 = input("Enter first string: ").lower()
sentence2 = input("Enter second string: ").lower()

Enter first string: The symbol # is commonly used with numbers, especially in American English.
Enter second string: We use the symbol % to indicate a percentage (that is, an amount in 100)


# Preprocessing(Tokenization+Cleaning)of text input data.

In [4]:
stopwords=nltk.corpus.stopwords.words("english")
wn=nltk.WordNetLemmatizer()
def remove_other_symbol(text):
    cleaned_text = re.sub(r'[^\w\s]','',str(text))
    cleaned_text = re.sub(r'\d+','',str(cleaned_text))
    cleaned_text=cleaned_text.lower()
    #tokenize_word = re.split('\W+',(cleaned_text))
    #new_cleaned_text = [word for word in tokenize_word if word not in stopwords]
    return cleaned_text

# To remove special Characters and digits.

In [5]:
cleaned_sent1 = remove_other_symbol(sentence1)
print(cleaned_sent1)
print("----------------------------------------------------------------------------------")
cleaned_sent2 = remove_other_symbol(sentence2)
print(cleaned_sent2)

the symbol  is commonly used with numbers especially in american english
----------------------------------------------------------------------------------
we use the symbol  to indicate a percentage that is an amount in 


# Tokenization of two different sentences.

In [6]:
def text_tokenize(txt):
    tokenized_txt=re.split('\W+',(txt))
    return tokenized_txt

In [7]:
tokenized_sent1 = text_tokenize(cleaned_sent1)
print(tokenized_sent1)
print("------------------------------------------------------------------------------------------")
tokenized_sent2 = text_tokenize(cleaned_sent2)
print(tokenized_sent2)

['the', 'symbol', 'is', 'commonly', 'used', 'with', 'numbers', 'especially', 'in', 'american', 'english']
------------------------------------------------------------------------------------------
['we', 'use', 'the', 'symbol', 'to', 'indicate', 'a', 'percentage', 'that', 'is', 'an', 'amount', 'in', '']


# Remove Stopwords from the tokenized words

In [8]:
nltk.download('stopwords')
stopwords=nltk.corpus.stopwords.words("english")
print(" , ".join(stopwords))

i , me , my , myself , we , our , ours , ourselves , you , you're , you've , you'll , you'd , your , yours , yourself , yourselves , he , him , his , himself , she , she's , her , hers , herself , it , it's , its , itself , they , them , their , theirs , themselves , what , which , who , whom , this , that , that'll , these , those , am , is , are , was , were , be , been , being , have , has , had , having , do , does , did , doing , a , an , the , and , but , if , or , because , as , until , while , of , at , by , for , with , about , against , between , into , through , during , before , after , above , below , to , from , up , down , in , out , on , off , over , under , again , further , then , once , here , there , when , where , why , how , all , any , both , each , few , more , most , other , some , such , no , nor , not , only , own , same , so , than , too , very , s , t , can , will , just , don , don't , should , should've , now , d , ll , m , o , re , ve , y , ain , aren , 

[nltk_data] Downloading package stopwords to C:\Users\Waseem Akram
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def remove_sw(tokenized_word):
    remove_stopword=[word for word in tokenized_word if word not in stopwords]
    return remove_stopword

In [10]:
remove_sw_sent1 = remove_sw(tokenized_sent1)
print(remove_sw_sent1)
print("------------------------------------------------------------------------------------------")
remove_sw_sent2 = remove_sw(tokenized_sent2)
print(remove_sw_sent2)

['symbol', 'commonly', 'used', 'numbers', 'especially', 'american', 'english']
------------------------------------------------------------------------------------------
['use', 'symbol', 'indicate', 'percentage', 'amount', '']


# To lemmatization each sentence1 and Sentence2

In [11]:
wn=nltk.WordNetLemmatizer()
def lemmatization(token_words):
    lemmatized_txt=[wn.lemmatize(word) for word in token_words]
    return lemmatized_txt

In [12]:
nltk.download('wordnet')
lematized_sent1 = lemmatization(remove_sw_sent1)
print(lematized_sent1)
print("------------------------------------------------------------------------------------------")
lematized_sent2 = lemmatization(remove_sw_sent2)
print(lematized_sent2)

[nltk_data] Downloading package wordnet to C:\Users\Waseem Akram
[nltk_data]     Khan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['symbol', 'commonly', 'used', 'number', 'especially', 'american', 'english']
------------------------------------------------------------------------------------------
['use', 'symbol', 'indicate', 'percentage', 'amount', '']


In [13]:
complete_cleaned_sent1 = " ".join(lematized_sent1)
print(complete_cleaned_sent1)
print("---------------------------------------------------------------------------------------------")
complete_cleaned_sent2 = " ".join(lematized_sent2)
print(complete_cleaned_sent2)

symbol commonly used number especially american english
---------------------------------------------------------------------------------------------
use symbol indicate percentage amount 


# To form a single Corpus from two different sentences

In [14]:
corpus = [complete_cleaned_sent1, complete_cleaned_sent2]

# To convert the sentences into vector form

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
vectorizer1_2 = TfidfVectorizer()
features_matrix = vectorizer1_2.fit_transform(corpus)
print(vectorizer1_2.vocabulary_ )

{'symbol': 8, 'commonly': 2, 'used': 10, 'number': 6, 'especially': 4, 'american': 0, 'english': 3, 'use': 9, 'indicate': 5, 'percentage': 7, 'amount': 1}


# To Compute the Cosine Similarity between two sentences

In [17]:
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics import pairwise_distances

In [18]:
cos_sim = cosine_similarity(features_matrix)
print("--------Cosine Similarity betweeen the two given sentences-------")
print("Cosine Similarity between two sentences:", cos_sim[0][1])

--------Cosine Similarity betweeen the two given sentences-------
Cosine Similarity between two sentences: 0.09349477497536716


In [19]:
cosine_distance = 1-cos_sim[0][1]
print("Cosine Distance between two sentences:",cosine_distance)

Cosine Distance between two sentences: 0.9065052250246328
