# Creating Word Vectors with word2vec

**N.B.**: run "`git pull`" anywhere in the `TensorFlow-LiveLessons` directory to update to the latest notebooks

#### Load dependencies

In [None]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [None]:
# English-language sentence tokenizer (not all periods end sentences; not all sentences start with a capital letter)
# NOTE(review): newer NLTK releases look for the 'punkt_tab' resource instead -- confirm against the installed version
nltk.download('punkt')

#### Load data

In [None]:
# download the 'gutenberg' corpus that is imported from nltk.corpus in the next cell
nltk.download('gutenberg')

In [None]:
from nltk.corpus import gutenberg

In [None]:
# list the identifiers of the files that make up the corpus
gutenberg.fileids()

In [None]:
# number of files in the corpus
len(gutenberg.fileids())

#### Tokenize text

In [None]:
# split the raw text of the whole corpus into a list of sentence strings
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [None]:
# peek at the first five sentence strings
gberg_sent_tokens[0:5]

In [None]:
# inspect a single sentence (index 1)
gberg_sent_tokens[1]

In [None]:
# break that sentence into word-level tokens
word_tokenize(gberg_sent_tokens[1])

In [None]:
# pick out the token at index 14 of the tokenized sentence
word_tokenize(gberg_sent_tokens[1])[14]

In [None]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gberg_sents = gutenberg.sents()

In [None]:
# first five sentences; each one is already split into word tokens
gberg_sents[0:5]

In [None]:
# the sentence at index 4, as a sequence of word tokens
gberg_sents[4]

In [None]:
# the token at index 14 of that sentence
gberg_sents[4][14]

#### Run word2vec

In [None]:
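# training is commented out because the saved model is loaded under "Explore model" below; uncomment this cell and the next one to retrain and re-save
# NOTE: gensim documents that `seed` alone does not give a fully reproducible run with multiple workers (workers=1 is also required)
# model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, seed=42, workers=8)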

In [None]:
# model.save('../raw_gutenberg_model.w2v')

#### Explore model

In [None]:
# Restore the previously saved model from disk so the (commented-out) training cell does not have to be re-run.
model = Word2Vec.load('../raw_gutenberg_model.w2v')

In [None]:
# look up the learned vector for 'dog' through the keyed vectors in `model.wv`
# (indexing the model object directly is deprecated in gensim)
model.wv['dog']

In [None]:
# number of dimensions in a word vector (accessed via `model.wv`; direct model indexing is deprecated in gensim)
len(model.wv['dog'])

In [None]:
# nearest neighbours of 'dog': (word, cosine similarity) pairs, most similar first
# called on `model.wv` because the model-level `most_similar` wrapper is deprecated in gensim
model.wv.most_similar('dog')

In [None]:
# nearest neighbours of 'think' (via `model.wv`; the model-level wrapper is deprecated in gensim)
model.wv.most_similar('think')

In [None]:
# nearest neighbours of 'day' (via `model.wv`; the model-level wrapper is deprecated in gensim)
model.wv.most_similar('day')

In [None]:
# nearest neighbours of 'father' (via `model.wv`; the model-level wrapper is deprecated in gensim)
model.wv.most_similar('father')

In [None]:
# word-vector arithmetic: father - man + woman
# (the original cell passed empty positive/negative lists, which gives most_similar nothing to compute on)
# close, but not quite; distinctly in female direction:
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

In [None]:
# word-vector arithmetic: king - man + woman, listing the 30 closest words
# (the original cell passed empty positive/negative lists, which gives most_similar nothing to compute on;
#  substitute any other in-vocabulary words to explore further analogies)
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

In [None]:
# impressive for such a small data set, without any cleaning, e.g., to lower case (covered next)

#### Reduce word vector dimensionality with t-SNE

In [None]:
# display the model's vocabulary mapping (shows every entry, not a sample)
# NOTE(review): `wv.vocab` is the gensim < 4 API; later releases expose `wv.key_to_index` -- confirm installed version
model.wv.vocab

In [None]:
# number of distinct tokens in the model's vocabulary
len(model.wv.vocab)

In [None]:
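# the t-SNE cells in this section are commented out; the saved 2D coordinates are read from '../raw_gutenberg_tsne.csv' in the visualization section instead
# X = model[model.wv.vocab]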

In [None]:
# tsne = TSNE(n_components=2, n_iter=1000) # 250 is minimum iter; default is 1000

In [None]:
# X_2d = tsne.fit_transform(X)

In [None]:
# coords_df = pd.DataFrame(X_2d, columns=['x','y'])
# coords_df['token'] = model.wv.vocab.keys()

In [None]:
# coords_df.to_csv('../raw_gutenberg_tsne.csv', index=False)

#### Visualize 2D representation of word vectors

In [None]:
# load the saved 2D coordinates (the plotting cells below use its 'x', 'y' and 'token' columns)
coords_df = pd.read_csv('../raw_gutenberg_tsne.csv')

In [None]:
# static overview of every token position: small, mostly transparent dots on a 12x12 canvas
_ = coords_df.plot.scatter(
    x='x',
    y='y',
    s=10,
    marker='.',
    alpha=0.2,
    figsize=(12, 12),
)

In [None]:
output_notebook() # output bokeh plots inline in notebook

In [None]:
# Draw a random subset of at most 5000 rows for the labelled plot below.
# n is capped at the frame length because sample() without replacement raises when n exceeds it,
# and random_state is fixed so the same subset is drawn on every run.
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [None]:
# 800x800 bokeh figure with each sampled token drawn as a text glyph at its (x, y) coordinates
p = figure(plot_width=800, plot_height=800)
_ = p.text(
    x=subset_df['x'],
    y=subset_df['y'],
    text=subset_df['token'],
)

In [None]:
# render the bokeh figure built in the previous cell
show(p)