In [1]:
# https://radimrehurek.com/gensim/models/deprecated/keyedvectors.html
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
# https://nlp.stanford.edu/projects/glove/
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import os

In [2]:
# Embedding width; it selects which GloVe 6B file is read and names the converted copy.
dim_n = 300
glove_input_file = f'./data/glove.6B.{dim_n}d.txt'
# The converted file sits next to the input, with a '.word2vec' suffix appended.
word2vec_output_file = glove_input_file + '.word2vec'

# The conversion result is cached on disk: skip the work when the output already exists.
already_converted = os.path.isfile(word2vec_output_file)
if not already_converted:
    glove2word2vec(glove_input_file, word2vec_output_file)

In [3]:
# Load the converted vectors; the file was written as text, hence binary=False.
model = KeyedVectors.load_word2vec_format(
    word2vec_output_file,
    binary=False,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [4]:
# Spot-check one embedding: its length should equal dim_n.
v = model.get_vector('centrist')
v.shape

(300,)

In [19]:
"""Find the top-N most similar words. Positive words contribute positively towards the similarity, negative words negatively.

This method computes cosine similarity between a simple mean of the projection weight vectors of the given words and 
the vectors for each word in the model. 
The method corresponds to the word-analogy and distance scripts in the original word2vec implementation."""

# Query: the 10 words pulled towards 'informatics' and pushed away from 'decentralized'.
model.most_similar(
    positive=['informatics'],
    negative=['decentralized'],
    topn=10,
)

[('biomedical', 0.4151154160499573),
 ('gerontology', 0.38601911067962646),
 ('sciences', 0.3781599998474121),
 ('microbiology', 0.3649481534957886),
 ('biostatistics', 0.3528899550437927),
 ('science', 0.3505825996398926),
 ('neurology', 0.3437531292438507),
 ('radiology', 0.3431781828403473),
 ('pharmacology', 0.3430456221103668),
 ('biology', 0.3413241505622864)]

In [7]:
"""
Get the entity from entities_list most similar to entity1.
"""

# Which of the candidate words is the best match for 'centralized'?
model.most_similar_to_given(
    'centralized',
    ['slow', 'decision', 'making'],
)

'slow'

In [37]:
"""
Compute cosine similarity between two sets of words.
"""

# Tokenised phrases of the job description under study (one list per phrase).
ph1 = ['collaborate', 'design', 'front', 'end', 'back', 'end', 'programming', 'teams',
       'concept', 'build', 'test', 'launch', 'dynamic', 'websites', 'industry',
       'best', 'practices']
ph2 = ['work', 'closely', 'other', 'web', 'developers', 'ensure',
       'client', 'marketing', 'goals', 'objectives', 'understood', 'met',
       'established', 'timelines', 'highest', 'level', 'quality']
ph3 = ['expert', 'wordpress']
ph4 = ['working', 'e-commerce', 'agile', 'environment']
ph5 = ['coding', 'wordpress', 'environment', 'develop', 'update', 'code',
       'themes', 'plugins']
ph6 = ['modify', 'existing', 'code', 'needed']
ph7 = ['coding', 'custom', 'wordpress', 'theme', 'template',
       'files', 'using']

job_desc_list = [ph1, ph2, ph3, ph4, ph5, ph6, ph7]
# Flat list of every token, used to score the description as a whole.
job_desc = ph1 + ph2 + ph3 + ph4 + ph5 + ph6 + ph7

# Reference vocabularies, defined once so the whole-description pass and the
# per-phrase pass below can never drift apart.
CENTRALIZED_TERMS = ['boss', 'promotion', 'hierarchy', 'slow', 'decision-making', 'bureaucracy']
# Printed under the label 'desc'; presumably the "decentralized" counterpart -- confirm.
DECENTRALIZED_TERMS = ['disperse', 'leader', 'project', 'organization', 'structure']
PRAGMATIC_TERMS = ['pragmatic', 'practical', 'logical', 'efficient', 'realistic', 'feedback']
EMOTIONAL_TERMS = ['affecting', 'exciting', 'passionate', 'sentimental', 'spontaneous', 'critics']


def report_concept_similarity(words):
    """Print and return how similar ``words`` is to each reference vocabulary.

    Each score comes from ``model.n_similarity(reference_terms, words)``.
    Two lines are printed: ``cent``/``desc`` first, then ``prag``/``emot``.

    Args:
        words: list of tokens to score (a single phrase or the whole description).

    Returns:
        The four scores as a tuple ``(cent, desc, prag, emot)``.
    """
    cent = model.n_similarity(CENTRALIZED_TERMS, words)
    desc = model.n_similarity(DECENTRALIZED_TERMS, words)
    print(f'cent ({cent}), desc ({desc})')
    prag = model.n_similarity(PRAGMATIC_TERMS, words)
    emot = model.n_similarity(EMOTIONAL_TERMS, words)
    print(f'prag ({prag}), emot ({emot})')
    return cent, desc, prag, emot


# Whole description first ...
cent, desc, prag, emot = report_concept_similarity(job_desc)

# ... then phrase by phrase, echoing each phrase before its scores.
for phrase in job_desc_list:
    print(phrase)
    cent, desc, prag, emot = report_concept_similarity(phrase)


cent (0.47398929274116886), desc (0.5955710323450699)
prag (0.5655847590140385), emot (0.3561073412126275)
['collaborate', 'design', 'front', 'end', 'back', 'end', 'programming', 'teams', 'concept', 'build', 'test', 'launch', 'dynamic', 'websites', 'industry', 'best', 'practices']
cent (0.45004881310673656), desc (0.611796903062255)
prag (0.4949022530931788), emot (0.39344615353288764)
['work', 'closely', 'other', 'web', 'developers', 'ensure', 'client', 'marketing', 'goals', 'objectives', 'understood', 'met', 'established', 'timelines', 'highest', 'level', 'quality']
cent (0.5069139327421917), desc (0.583098296478995)
prag (0.5190717879291836), emot (0.334331363199194)
['expert', 'wordpress']
cent (0.030650676948431866), desc (0.06383689309691577)
prag (0.09509065855783401), emot (0.029601195638117736)
['working', 'e-commerce', 'agile', 'environment']
cent (0.3572978455394575), desc (0.36084835626671624)
prag (0.4932104876341778), emot (0.24048926428182987)
['coding', 'wordpress', 'en

> Issue with this approach: it also seems to capture antonyms, which we want to avoid since we are trying to retrieve antonym concepts. A solution may need to be proposed to overcome this behavior; the "most_similar" method appears to overcome the issue. Furthermore, "most_similar" with just a positive word is the same as this method.

In [41]:
"""
Find the top-N most similar words.
"""

# Ten nearest neighbours of 'centralized'.
model.similar_by_word(
    'centralized',
    topn=10,
)

[('centralised', 0.8113839030265808),
 ('decentralized', 0.706671953201294),
 ('hierarchical', 0.5671628713607788),
 ('decentralised', 0.5647639036178589),
 ('bureaucracy', 0.5224347114562988),
 ('centralization', 0.5045151710510254),
 ('centrally', 0.5039045214653015),
 ('bureaucracies', 0.49719032645225525),
 ('centralizing', 0.496930330991745),
 ('computerized', 0.47032514214515686)]

In [39]:
# Pairwise similarity of the two opposing terms.
model.similarity(
    'centralized',
    'decentralized',
)

0.7066719959008728

In [38]:
# Distance for the same pair; the recorded outputs show it equals 1 - similarity.
model.distance(
    'centralized',
    'decentralized',
)

0.2933280040991272

In [40]:
# Position of 'decentralized' in the neighbour ranking of 'centralized'.
model.rank(
    'centralized',
    'decentralized',
)

2

In [12]:
# Score the single term 'rule-based' against every token of the job description.
model.n_similarity(
    ['rule-based'],
    job_desc,
)

-0.10952533800340147

In [13]:
# Score the 'emotional'/'relationship' pair against every token of the job description.
model.n_similarity(
    ['emotional', 'relationship'],
    job_desc,
)

0.4081094339442425

In [14]:
# Repeat the antonym checks with the British spellings.
# 1) pairwise similarity of the two terms
print(
    model.similarity('centralised', 'decentralised')
)
# 2) neighbours of 'centralised' with 'decentralised' used as a negative example
print(
    model.most_similar(
        positive=['centralised'],
        negative=['decentralised'],
    )
)
# 3) plain nearest neighbours, shown as the cell result
model.similar_by_word(
    'centralised',
    topn=10,
)

0.6436651890065143
[('centralized', 0.29213541746139526), ('control', 0.289389044046402), ('kings', 0.274245023727417), ('required', 0.2557501494884491), ('royal', 0.2550400495529175), ('factory', 0.2513066232204437), ('traffic', 0.2509596347808838), ('proving', 0.24832189083099365), ('training', 0.24686117470264435), ('sign', 0.246623694896698)]


[('centralized', 0.8113839030265808),
 ('decentralised', 0.6436651945114136),
 ('decentralized', 0.6053079962730408),
 ('centrally', 0.47633200883865356),
 ('computerised', 0.45558688044548035),
 ('hierarchical', 0.45417290925979614),
 ('centralize', 0.44843581318855286),
 ('decentralize', 0.43602314591407776),
 ('two-tier', 0.4281821548938751),
 ('centralizing', 0.42436620593070984)]

In [15]:
# Words that sit closer to 'centralised' than 'decentralised' does.
model.words_closer_than(
    'centralised',
    'decentralised',
)

['centralized']

In [16]:
# Neighbours of the combined 'emotional' + 'relationship' query (positive terms only).
model.most_similar(
    positive=[
        'emotional',
        'relationship',
    ],
)

[('relationships', 0.7057441473007202),
 ('feelings', 0.5835412740707397),
 ('psychological', 0.5553158521652222),
 ('intimacy', 0.5453625917434692),
 ('friendship', 0.5399602651596069),
 ('intimate', 0.537808895111084),
 ('emotionally', 0.5338315963745117),
 ('kind', 0.5336318016052246),
 ('emotions', 0.5320711135864258),
 ('relations', 0.5308881998062134)]

In [17]:
# Pairwise similarity between 'centric' and 'liberal'.
model.similarity(
    'centric',
    'liberal',
)

0.024979651269173793

In [29]:
# Extended vocabulary for the 'centralized' concept.
ter_cen = [
    'boss', 'slow', 'decision-making', 'bureaucracy', 'promotion', 'hierarchy',
    'functional', 'centralized', 'vertical',
]

# Score the job description against the whole vocabulary, then against 'functional' alone.
for query_terms in (ter_cen, ['functional']):
    print(model.n_similarity(query_terms, job_desc))


0.5386022876272359
0.3430903698391148
