# Cosine Similarity
$cos(\theta) = \frac{A\cdot B}{|A||B|} = \frac{dot(A\cdot B)}{norm(A)norm(B)}$

In [2]:
# cosine similarity
import numpy as np
vec1 = [4,45,3,5]
vec2 = [3,52,12,14]
cosine_similarity = np.dot(vec1,vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))
print(cosine_similarity)

0.9765835979265888


# Jaccard Similarity

$J(A,B) = \frac{|A\cap B|}{|A\cup B|}$

In [3]:
# Jacquard similarity
import numpy as np
def jaccard(list1,list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

a = [0.1,2,4,2,4,5,6]
b = [4,2,3,3,4,4,0]

jaccard(a,b)

0.16666666666666666

# Word Mover's Distance

WMD enables us to assess the "distance" between two documents in a meaningful way even when they have no words in common. It uses **word2vec** vector embeddings of words.

The intuition behind WMD is that we find the minimum "traveling distance" between documents. It is superior to BOW because WMD can take the underlying geometry into account.

This method is introduced by Matt Kusner et al. in [From Word Embedding To Document Distances](http://proceedings.mlr.press/v37/kusnerb15.pdf)

In [16]:
# Word Mover's Distance
import logging # initialize logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = 'The president greets the press in Chicago'

In [17]:
# remove stopwords
from nltk.corpus import stopwords
from nltk import download
download('stopwords') # Download stopwords list
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\610\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def preprocess(sentence):
    return [w for w in sentence.lower().split() if w not in stop_words]

sentence_1 = preprocess(sentence_1)
sentence_2 = preprocess(sentence_2)

In [25]:
import gensim.downloader as api
import json
info = api.info()
# print(json.dumps(info, indent=4))

# available corpora
for corpus_name, corpus_data in sorted(info['corpora'].items()):
    print(
        '%s (%d records): %s' % (
            corpus_name,
            corpus_data.get('num_records', -1),
            corpus_data['description'][:40] + '...',
        )
    )

20-newsgroups (18846 records): The notorious collection of approximatel...
__testing_matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Synopsis of t...
__testing_multipart-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Synopsis of t...
fake-news (12999 records): News dataset, contains text and metadata...
patent-2017 (353197 records): Patent Grant Full Text. Contains the ful...
quora-duplicate-questions (404290 records): Over 400,000 lines of potential question...
semeval-2016-2017-task3-subtaskA-unannotated (189941 records): SemEval 2016 / 2017 Task 3 Subtask A una...
semeval-2016-2017-task3-subtaskBC (-1 records): SemEval 2016 / 2017 Task 3 Subtask B and...
text8 (1701 records): First 100,000,000 bytes of plain text fr...
wiki-english-20171001 (4924894 records): Extracted Wikipedia dump from October 20...


In [26]:
# available models
for model_name, model_data in sorted(info['models'].items()):
    print(
        '%s (%d records): %s' % (
            model_name,
            model_data.get('num_records', -1),
            model_data['description'][:40] + '...',
        )
    )

__testing_word2vec-matrix-synopsis (-1 records): [THIS IS ONLY FOR TESTING] Word vecrors ...
conceptnet-numberbatch-17-06-300 (1917247 records): ConceptNet Numberbatch consists of state...
fasttext-wiki-news-subwords-300 (999999 records): 1 million word vectors trained on Wikipe...
glove-twitter-100 (1193514 records): Pre-trained vectors based on  2B tweets,...
glove-twitter-200 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-25 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-twitter-50 (1193514 records): Pre-trained vectors based on 2B tweets, ...
glove-wiki-gigaword-100 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-200 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-300 (400000 records): Pre-trained vectors based on Wikipedia 2...
glove-wiki-gigaword-50 (400000 records): Pre-trained vectors based on Wikipedia 2...
word2vec-google-news-300 (3000000 records): Pre-trai

In [34]:
# model = api.load('glove-wiki-gigaword-300')
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
model = KeyedVectors.load_word2vec_format(datapath(r"C:/Users/610/gensim-data/word2vec-google-news-300/GoogleNews-vectors-negative300.bin"), binary=True)

2022-02-19 13:44:54,200 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-02-19 13:44:54,201 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2022-02-19 13:44:54,201 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2022-02-19T13:44:54.201922', 'gensim': '4.1.2', 'python': '3.9.8 (tags/v3.9.8:bb3fdcf, Nov  5 2021, 20:48:33) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042', 'event': 'created'}
2022-02-19 13:44:54,245 : INFO : loading projection weights from C:/Users/610/gensim-data/word2vec-google-news-300/GoogleNews-vectors-negative300.bin
2022-02-19 13:45:09,664 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:/Users/610/gensim-data/word2vec-google-news-

In [35]:
model.most_similar("glass")

[('R._Mazzei_fused', 0.6665399670600891),
 ('Christian_Audigier_nightclub', 0.6632695198059082),
 ('copper_alloy_garnets', 0.6343654990196228),
 ('Nelmeus', 0.6274422407150269),
 ('fiber_fusion_splicing', 0.6229819655418396),
 ('Plexiglass', 0.5858588814735413),
 ('slashing_Leonardo_DiCaprio', 0.5850011110305786),
 ('plexiglass', 0.5823022723197937),
 ('Plexiglas', 0.5803930759429932),
 ("#Q'##_unaudited", 0.5798528790473938)]

In [37]:
distance = model.wmdistance(sentence_1, sentence_2)
print('distance = %.4f' % distance)

2022-02-19 13:47:19,050 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-02-19 13:47:19,050 : INFO : built Dictionary(8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...) from 2 documents (total 8 corpus positions)
2022-02-19 13:47:19,051 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(8 unique tokens: ['illinois', 'media', 'obama', 'speaks', 'chicago']...) from 2 documents (total 8 corpus positions)", 'datetime': '2022-02-19T13:47:19.051494', 'gensim': '4.1.2', 'python': '3.9.8 (tags/v3.9.8:bb3fdcf, Nov  5 2021, 20:48:33) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19042', 'event': 'created'}


distance = 1.0175
