# This notebook performs similarity analysis


In [1]:
# Import the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

In [2]:
# Load the talks dataset from the parent directory.
# The path is assembled with os.path.join so the separator matches the host
# OS; the previous raw string r'..\DatasetEnglish.csv' only resolved on Windows.
import os

filename = os.path.join('..', 'DatasetEnglish.csv')
data = pd.read_csv(filename)

# The CSV carries an 'index' column that is dropped right after loading.
data = data.drop(columns=['index'])

# Subset of columns: title, description, speaker, topics, transcript and the
# related-talk slugs.
data_use = data[['talk_title', 'talk_description', 'speakers_name',
                 'topic_0_name', 'topic_1_name', 'topic_2_name', 'topic_3_name',
                 'topic_4_name', 'topic_5_name', 'topic_6_name', 'topic_7_name',
                 'transcript',
                 'related_talk_1_slug', 'related_talk_2_slug', 'related_talk_3_slug',
                 'related_talk_4_slug', 'related_talk_5_slug']]

# Preview goes last: only the final expression of a cell is displayed.
data.head()

### Calculating cosine similarity

In [3]:
# Build one text document per talk and compare all talks pairwise.
tfidf = TfidfVectorizer(stop_words='english')

# Fill missing values column by column BEFORE concatenating. String
# concatenation with `+` propagates NaN, so a single missing field used to turn
# the whole combined document into NaN (and then into '' after fillna),
# discarding every other field of that talk.
text_columns = ['talk_slug', 'talk_description', 'speakers_name', 'topic_0_name', 'transcript']
text_parts = data[text_columns].fillna('')

combined = text_parts[text_columns[0]]
for column in text_columns[1:]:
    combined = combined + ' ' + text_parts[column]
data['combined_features'] = combined

# Rows are talks, columns are TF-IDF weighted terms.
tfidf_matrix = tfidf.fit_transform(data['combined_features'])

# Square matrix: entry [i, j] is the cosine similarity between talk i and talk j.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

### Pushing vectors to Zilliz vector db

In [8]:
import os

from pymilvus import MilvusClient

# Connection details are read from the environment so that credentials never
# have to be pasted into (and committed with) the notebook. Set ZILLIZ_URI and
# ZILLIZ_TOKEN before starting the kernel; when they are unset the values fall
# back to empty strings, exactly as in the previous placeholder version.
client = MilvusClient(
    uri=os.environ.get('ZILLIZ_URI', ''),
    token=os.environ.get('ZILLIZ_TOKEN', '')
)

In [45]:
# Upload the similarity rows to the 'CosineSimilarity' collection.
#
# START_ROW: first row to upload. It is kept at 4000, the offset that was
#            hard-coded in the loop; set it to 0 to upload every row.
# BATCH_SIZE: number of rows sent per insert request. Sending a list of rows
#             avoids one network round trip per talk.
START_ROW = 4000
BATCH_SIZE = 100

# Positional array of ids: row numbers below index the similarity matrix by
# position, so the ids are read by position as well (not by index label).
talk_ids = data['talk_id'].to_numpy()
total_rows = len(data)

for batch_start in range(START_ROW, total_rows, BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, total_rows)
    # Rows are converted to plain lists one batch at a time instead of
    # materialising the whole matrix as nested Python lists up front.
    entities = [
        {'talk_id': int(talk_ids[row]), 'vector': cosine_sim[row].tolist()}
        for row in range(batch_start, batch_end)
    ]
    res = client.insert(
      collection_name='CosineSimilarity',
      data=entities
    )

In [27]:
# Ask the server for the collection's metadata and print it for inspection.
collection_name = 'CosineSimilarity'
schema = client.describe_collection(collection_name=collection_name)
print(schema)

{'collection_name': 'CosineSimilarity', 'auto_id': True, 'num_shards': 1, 'description': 'Collection with cosine similarity vectors', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': 5, 'params': {}, 'element_type': 0, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'vector', 'description': '', 'type': 101, 'params': {'dim': 4959}, 'element_type': 0}], 'aliases': [], 'collection_id': 445765802482541573, 'consistency_level': 2, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}


In [6]:
# user selects a talk 
def get_title_from_index(index):
    """Return the ``talk_slug`` of the first row of ``data`` whose index label equals ``index``."""
    row_mask = data.index == index
    matching_slugs = data.loc[row_mask, 'talk_slug']
    return matching_slugs.values[0]

def get_index_from_title(title):
    """Return the index label of the first row of ``data`` whose ``talk_slug`` equals ``title``."""
    matching_rows = data.loc[data['talk_slug'] == title]
    return matching_rows.index.values[0]

# Pick the talk the user likes and find its row in the similarity matrix.
talk_user_likes = data['talk_slug'].loc[1]
talk_index = get_index_from_title(talk_user_likes)

# Pair every talk with its similarity to the selected one, leaving out the
# selected talk itself: its self-similarity is the maximum, so it used to be
# listed as its own recommendation.
# NOTE(review): talk_index is an index label used as a matrix position; this
# assumes `data` keeps the default 0..n-1 index - confirm.
similar_talks = [
    (idx, score)
    for idx, score in enumerate(cosine_sim[talk_index])
    if idx != talk_index
]
similar_talks = sorted(similar_talks, key=lambda x: x[1], reverse=True)

print('Talks similar to ' + talk_user_likes + ' are:\n')
# Slicing (rather than range(10)) also works when fewer than 10 other talks exist.
for idx, _score in similar_talks[:10]:
    print(get_title_from_index(idx))

In [7]:
# Display the slug stored in the row labelled 1.
data.loc[1, 'talk_slug']

'aala_el_khani_what_it_s_like_to_be_a_parent_in_a_war_zone'

In [None]:
## 3. Cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Pass every transcript through clean_text (defined outside this cell) and
# keep the result in a new column.
df['fully_cleaned'] = df['transcript'].apply(clean_text)

# TF-IDF representation of the cleaned transcripts, default vectorizer settings.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['fully_cleaned'])

# With a single argument the rows of the matrix are compared with each other.
cosine_sim = cosine_similarity(tfidf_matrix)

# Show the matrix shape followed by the first row of similarities.
print(cosine_sim.shape, cosine_sim[0], sep='\n')

In [None]:
# Hierarchically clustered heat map of the cosine-similarity matrix, with the
# cell values annotated.
sns.clustermap(
    cosine_sim,
    cmap='hot',
    annot=True,
    dendrogram_ratio=0.1,
)
plt.show()

In [None]:
## 4. Jaccard similarity

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score

sentences = list(df['fully_cleaned'])

# binary=True records word presence (0/1) instead of counts, so each row
# describes the set of words used by one document.
vectorizer = CountVectorizer(binary=True)

vectors = vectorizer.fit_transform(sentences)
vectors = vectors.toarray()

# One (score, i, j) tuple per unordered pair of documents, with i < j.
jaccard_sim = []

for i in range(len(vectors)):
    for j in range(i+1, len(vectors)):
        # average='binary' scores only the "word present" class, i.e.
        # |A & B| / |A | B| for the two word sets. The previous
        # average='micro' also counted words absent from BOTH documents as
        # agreement, which inflates every score on a sparse vocabulary.
        score = jaccard_score(vectors[i], vectors[j], average='binary')
        jaccard_sim.append((score, i, j))

In [None]:
# Imported here because the only other matplotlib import in the notebook comes
# after this cell.
import matplotlib.pyplot as plt

# Split the (score, i, j) tuples into plot coordinates and colour values.
x = [i for score, i, j in jaccard_sim]
y = [j for score, i, j in jaccard_sim]
c = [score for score, i, j in jaccard_sim]

# One point per document pair, coloured by its Jaccard score.
plt.scatter(x, y, c=c, cmap='Blues')
plt.colorbar()
plt.xlabel('Word set index')
# ylabel and show were fused on one line, which is a SyntaxError.
plt.ylabel('Word set index')
plt.show()

In [None]:
# Imports come first so the names exist before they are used in this cell.
import matplotlib.pyplot as plt
import numpy as np

# Number of documents. len(df) replaces the undefined name `word_sets`; the
# matrix is labelled with df.index below, so it must have one row per df row.
n = len(df)
jaccard_matrix = np.zeros((n, n))  # Initialize an empty matrix

# Extract the x and y coordinates and the Jaccard similarity scores from the list
for score, i, j in jaccard_sim:
    jaccard_matrix[i, j] = score  # Fill the matrix with the scores
    jaccard_matrix[j, i] = score  # The matrix is symmetric

# jaccard_sim only holds pairs with i < j, so the diagonal would stay 0;
# a document compared with itself has similarity 1.
np.fill_diagonal(jaccard_matrix, 1.0)

jaccard_df = pd.DataFrame(jaccard_matrix, index=df.index, columns=df.index)

# Plain ASCII quotes: the typographic quotes around Blues were a SyntaxError.
# NOTE(review): assumes seaborn is available as `sns` - it is not imported in
# the visible part of the notebook.
sns.heatmap(jaccard_df, annot=True, cmap='Blues')
plt.show()

In [None]:
## 5. Euclidean distance

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

# Raw term counts (non-binary) for every cleaned transcript.
vectorizer2 = CountVectorizer()
count_matrix = vectorizer2.fit_transform(df['fully_cleaned'])

# With a single argument the rows of the matrix are compared with each other.
euclidean_dist = euclidean_distances(count_matrix)

# Show the matrix shape followed by the first row of distances.
print(euclidean_dist.shape, euclidean_dist[0], sep='\n')