In [1]:
# https://radimrehurek.com/gensim/models/deprecated/keyedvectors.html
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
from gensim.models import KeyedVectors

In [2]:
# Location of the pretrained word2vec vectors, relative to the notebook.
filename = './data/GoogleNews-vectors-negative300.bin'

# The file is stored in word2vec's binary format, hence binary=True.
model = KeyedVectors.load_word2vec_format(
    fname=filename,
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# Look up the embedding of a single word and inspect its dimensionality
# (the output below shows a 300-element vector).
v = model['centrist']
v.shape

(300,)

In [4]:
"""Find the top-N most similar words. Positive words contribute positively towards the similarity, negative words negatively.

This method computes cosine similarity between a simple mean of the projection weight vectors of the given words and 
the vectors for each word in the model. 
The method corresponds to the word-analogy and distance scripts in the original word2vec implementation."""

model.most_similar(positive=['centralized'], negative=['decentralized'], topn=10)

[('McAfee_ePO', 0.33990055322647095),
 ('Centralized', 0.3273981809616089),
 ('OneSign_SSO', 0.3217175006866455),
 ('Tripwire_Log', 0.3142997622489929),
 ('Sun_xVM_Ops', 0.31407076120376587),
 ('customizable_dashboards', 0.306085467338562),
 ('centralized_repository', 0.3059951663017273),
 ('Stratusphere', 0.3033236265182495),
 ('NitroView_ESM', 0.3030901551246643),
 ('Absolute_Monitoring', 0.3026299476623535)]

In [5]:
"""
Get the entity from entities_list most similar to entity1.
"""

model.most_similar_to_given('centrist', ['boss', 'promotion', 'hierarchy', 'slow', 'decision'])

'boss'

In [6]:
"""
Compute cosine similarity between two sets of words.
"""

job_desc = ['Collaborate', 'design', 'Front', 'end', 'Back', 'end', 'programming', 'teams', 
            'concept', 'build', 'test', 'launch', 'dynamic', 'websites', 'industry',
            'best', 'practices', 'Work', 'closely', 'other', 'web', 'developers', 'ensure',
            'client',  'marketing', 'goals', 'objectives', 'understood', 'met', 
            'established', 'timelines', 'highest', 'level','quality']
cent = model.n_similarity(['boss', 'promotion', 'hierarchy', 'slow', 'decision'], job_desc)
desc = model.n_similarity(['disperse', 'leader',  'project', 'organization','structure'], job_desc)
print(f'cent ({cent}), desc ({desc})')
prag = model.n_similarity(['pragmatic', 'practical', 'logical', 'efficient', 'realistic', 'feedback'], job_desc)
emot = model.n_similarity(['affecting', 'exciting', 'passionate', 'sentimental', 'spontaneous', 'critics'], job_desc)
print(f'prag ({prag}), emot ({emot})')

cent (0.3759588393837981), desc (0.45332777873369534)
prag (0.5008825755824164), emot (0.3243110580524497)


> **Issue with this approach:** it seems to capture antonyms as well, which we want to avoid since we are trying to retrieve antonym concepts. A solution may need to be proposed to overcome this behavior; it also seems that the issue can be overcome with the `most_similar` method. Furthermore, `most_similar` with just a positive word is the same as this method.

In [10]:
"""
Find the top-N most similar words.
"""

model.similar_by_word('centric', topn=10)

[('oriented', 0.6055328845977783),
 ('centricity', 0.6044423580169678),
 ('orientated', 0.5023170113563538),
 ('componentised', 0.4599813222885132),
 ('CoCreate_3rd', 0.45777708292007446),
 ('socially_networked', 0.4537206292152405),
 ('holistic', 0.4428867995738983),
 ('silo_ed', 0.44246262311935425),
 ('gamified', 0.4413653016090393),
 ('integrated', 0.43601399660110474)]

In [12]:
# Pairwise similarity score between two individual words
# (cosine similarity, per the gensim KeyedVectors docs).
model.similarity('centric', 'liberal')

0.2286931824495394

In [19]:
# Pairwise distance between two individual words
# (presumably 1 - cosine similarity, per the gensim KeyedVectors docs -- confirm).
model.distance('centralized', 'decision')

0.8977424775823725