## Imports

In [10]:
from gensim.models import Word2Vec
from gensim.test.utils import datapath
import gensim.downloader as api

import multiprocessing

## Load the dataset

In [24]:
# load the Text8-dataset
dataset = api.load("text8")

## Train the model

In [13]:
# get the number of cores
num_processes = multiprocessing.Pool()._processes

# train the Word2Vec-model
model = Word2Vec(dataset, min_count=5, workers=num_processes)

## Get index from key

In [36]:
for index, word in enumerate(model.wv.index_to_key):
    if index == 10:
        break
        
    print(f"word #{index}/{len(model.wv.index_to_key)} is: {word}")

word #0/71290 is: the
word #1/71290 is: of
word #2/71290 is: and
word #3/71290 is: one
word #4/71290 is: in
word #5/71290 is: a
word #6/71290 is: to
word #7/71290 is: zero
word #8/71290 is: nine
word #9/71290 is: two


## Get the word vector

In [39]:
king_vw = model.wv['king']

print(len(king_vw))
print(king_vw)

100
[ 0.6099961   2.807867    2.06987     0.83153987  0.26194066  2.29551
  0.02573501  1.4557197  -0.16459201  0.3859     -0.24922632 -2.8708599
  2.2126596   4.1852174  -0.5333733  -0.1643643   0.83662015  1.104826
  2.1866484   0.82956624 -1.2793326   0.61501837 -2.152865   -0.48138657
 -0.6696346   1.2233275  -0.37354955 -0.24445823  0.2144425  -0.66999483
  1.9057432  -1.2599318  -0.4251985  -0.6840563   3.6047142  -1.1980398
  1.3020844  -1.3659182  -0.7605614  -1.2023554  -0.67999476  2.2126727
  1.6952603  -0.9444782  -2.3419273  -4.177555   -0.9417651   0.5004199
  3.2611377  -0.80868864  4.3374867  -0.4358617  -0.02254349  3.465822
  0.07535721 -1.7328625   2.1816294  -0.47624537  0.24411225 -1.5124265
  0.46778315  0.38498572  0.89346856 -0.6038507   0.17813212 -0.70155334
 -2.4322836   1.297255    2.4208796  -1.8331443   2.7508209  -0.45999214
 -2.8364363  -3.1342063  -1.0767957  -0.5769557  -1.49582     2.299692
 -2.2343066   3.05936    -2.753236   -1.5379939  -0.2151284  

## Similarity

In [40]:
def get_similarity(w1, w2):
    print('%r\t%r\t%.2f' % (w1, w2, model.wv.similarity(w1, w2)))

In [41]:
get_similarity('car', 'train')

'car'	'train'	0.57


In [44]:
get_similarity('car', 'art')

'car'	'art'	-0.03


In [51]:
get_similarity('person', 'emotion')

'person'	'emotion'	0.45


In [14]:
def find_similar_words(word):
    similar_words = model.wv.most_similar(word)
    print(f"Similar words to '{word}':")
    
    for word, similarity in similar_words:
        print(word, similarity)

In [22]:
find_similar_words('game')

Similar words to 'game':
games 0.8612557649612427
gameplay 0.7024416327476501
match 0.6732522249221802
console 0.6679466366767883
gurps 0.6583449244499207
fps 0.6534695625305176
multiplayer 0.6337052583694458
card 0.6317827105522156
craps 0.6222929358482361
scoring 0.6219169497489929


In [56]:
res = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.similar_by_vector(res)[:4]

[('king', 0.665647566318512),
 ('kings', 0.5485600829124451),
 ('women', 0.545667290687561),
 ('queen', 0.5354700684547424)]