In [21]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) + len(c))

In [22]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

In [23]:
text_1a = '50 Inch Class H6570G 4K Ultra HD Android Smart TV with Alexa Compatibility 2.5” 2020 Model Black Silver White HDR LED'
text_1b = 'Hisense H6570G'
text_2a = 'QN75Q90TAFXZA crystal 2.5” Quantum LCD'
text_2b = 'Samsung crystal UN55TU8000FXZA QLED'
text_3a = 'EGLF2 50 Ultra Full Motion Articulating TV Wall Mount Bracket swivel full'
text_3b = 'VIZIO EGLF2'

In [24]:
get_cosine_sim(text_1a.lower(), text_1b.lower())[0][1]

0.15811388300841894

In [25]:
get_jaccard_sim(text_1a.lower(), text_1b.lower())

0.041666666666666664

In [26]:
get_cosine_sim(text_2a.lower(), text_2b.lower())[0][1]

0.25

In [27]:
get_jaccard_sim(text_2a.lower(), text_2b.lower())

0.1

In [28]:
get_cosine_sim(text_3a.lower(), text_3b.lower())[0][1]

0.1889822365046136

In [29]:
get_jaccard_sim(text_3a.lower(), text_3b.lower())

0.07142857142857142

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy

In [31]:
writer = pd.ExcelWriter('Prod.xlsx', engine='xlsxwriter')

In [32]:
product_1 = [text_1a, text_1b]

count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(product_1)

prod_1_term_matrix = sparse_matrix.todense()
df = pd.DataFrame(prod_1_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['prod_1a', 'prod_1b'])
df

Unnamed: 0,2020,4k,50,alexa,android,black,class,compatibility,h6570g,hd,...,hisense,inch,led,model,silver,smart,tv,ultra,white,with
prod_1a,1,1,1,1,1,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
prod_1b,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0


In [33]:
df.to_excel(writer, sheet_name="Prod_1")

In [34]:
cosim = cosine_similarity(df, df)[0]
cosim

array([1.        , 0.15811388])

In [35]:
product_2 = [text_2a, text_2b]

count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix_2 = count_vectorizer.fit_transform(product_2)

prod_2_term_matrix = sparse_matrix_2.todense()
df_2 = pd.DataFrame(prod_2_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['prod_2a', 'prod_2b'])
df_2

Unnamed: 0,crystal,lcd,qled,qn75q90tafxza,quantum,samsung,un55tu8000fxza
prod_2a,1,1,0,1,1,0,0
prod_2b,1,0,1,0,0,1,1


In [36]:
df_2.to_excel(writer, sheet_name="Prod_2")

In [37]:
cosim = cosine_similarity(df_2, df_2)[0]
cosim

array([1.  , 0.25])

In [38]:
product_3 = [text_3a, text_3b]

count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()
sparse_matrix_3 = count_vectorizer.fit_transform(product_3)

prod_3_term_matrix = sparse_matrix_3.todense()
df_3 = pd.DataFrame(prod_3_term_matrix, 
                  columns=count_vectorizer.get_feature_names(), 
                  index=['prod_3a', 'prod_3b'])
df_3

Unnamed: 0,50,articulating,bracket,eglf2,full,motion,mount,swivel,tv,ultra,vizio,wall
prod_3a,1,1,1,1,2,1,1,1,1,1,0,1
prod_3b,0,0,0,1,0,0,0,0,0,0,1,0


In [39]:
df_3.to_excel(writer, sheet_name="Prod_3")

In [40]:
cosim = cosine_similarity(df_3, df_3)[0]
cosim

array([1.        , 0.18898224])

In [41]:
writer.save()

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
text_1a = '50 Inch Class H6570G 4K Ultra HD Android Smart TV with Alexa Compatibility 2.5” 2020 Model Black Silver White HDR LED'
text_1b = 'Hisense H6570G'
text_2a = 'QN75Q90TAFXZA crystal 2.5” Quantum LCD'
text_2b = 'Samsung crystal UN55TU8000FXZA QLED'
text_3a = 'EGLF2 50 Ultra Full Motion Articulating TV Wall Mount Bracket swivel full'
text_3b = 'VIZIO EGLF2'

In [3]:
corp_1 = [text_1a, text_1b]

In [5]:
vect = TfidfVectorizer(min_df=1, stop_words="english")                                                                                                                                                                                                   
tfidf = vect.fit_transform(corp_1)                                                                                                                                                                                                                       
pairwise_similarity = tfidf * tfidf.T 

In [7]:
pairwise_similarity.toarray()

array([[1.        , 0.09588553],
       [0.09588553, 1.        ]])

In [8]:
corp_2 = [text_2a, text_2b]

In [9]:
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corp_2)
pairwise_similarity = tfidf * tfidf.T

In [10]:
pairwise_similarity.toarray()

array([[1.        , 0.14438356],
       [0.14438356, 1.        ]])

In [11]:
corp_3 = [text_3a, text_3b]

In [12]:
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corp_3)
pairwise_similarity = tfidf * tfidf.T

In [13]:
pairwise_similarity.toarray()

array([[1.        , 0.13378509],
       [0.13378509, 1.        ]])

In [15]:
tfidf.toarray()

array([[0.32433627, 0.32433627, 0.32433627, 0.23076793, 0.32433627,
        0.32433627, 0.32433627, 0.32433627, 0.32433627, 0.        ,
        0.32433627],
       [0.        , 0.        , 0.        , 0.57973867, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.81480247,
        0.        ]])