In [2]:
import gensim
from gensim.models import word2vec
import logging

from keras.layers import Input, Embedding, merge
from keras.models import Model

import tensorflow as tf
import numpy as np

import urllib.request
import os
import zipfile

vector_dim = 100
root_path = os.getcwd()


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
def download_file(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


In [4]:
# convert the input data into a list of integer indexes aligning with the wv indexes
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = f.read(f.namelist()[0]).split()
    return data

In [5]:
def convert_data_to_index(string_data, wv):
    index_data = []
    for word in string_data:
        if word in wv:
            index_data.append(wv.vocab[word].index)
    return index_data

In [25]:
## download the data 
url = 'http://mattmahoney.net/dc/'
filename = download_file('text8.zip', url, 31344016)
if not os.path.exists((root_path +"\\"+ filename).strip('.zip')):
    zipfile.ZipFile(root_path+"\\"+filename).extractall()

Found and verified text8.zip


# Gensim Word2Vec

In [7]:
from datetime import datetime
tstart = datetime.now()
sentences = word2vec.Text8Corpus((root_path +"\\"+ filename).strip('.zip'))
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = word2vec.Word2Vec(sentences, iter=10, min_count=10, size=100, workers=4)
tend = datetime.now()
print (tend - tstart)

Found and verified text8.zip


2018-02-02 22:59:39,771 : INFO : collecting all words and their counts
2018-02-02 22:59:39,774 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-02-02 22:59:44,930 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2018-02-02 22:59:44,931 : INFO : Loading a fresh vocabulary
2018-02-02 22:59:45,225 : INFO : min_count=10 retains 47134 unique words (18% of original 253854, drops 206720)
2018-02-02 22:59:45,225 : INFO : min_count=10 leaves 16561031 word corpus (97% of original 17005207, drops 444176)
2018-02-02 22:59:45,402 : INFO : deleting the raw counts dictionary of 253854 items
2018-02-02 22:59:45,417 : INFO : sample=0.001 downsamples 38 most-common words
2018-02-02 22:59:45,418 : INFO : downsampling leaves estimated 12333563 word corpus (74.5% of prior 16561031)
2018-02-02 22:59:45,419 : INFO : estimated required memory for 47134 words and 100 dimensions: 61274200 bytes
2018-02-02 22:59:45,636 : INFO : resetting l

0:02:40.415000


In [8]:
# save the model
model.save(root_path + "w2v_gensim.model")

2018-02-02 23:03:12,158 : INFO : saving Word2Vec object under C:\Users\sarora\Documents\workshop\Keras_Gensim_Workshopw2v_gensim.model, separately None
2018-02-02 23:03:12,158 : INFO : not storing attribute syn0norm
2018-02-02 23:03:12,159 : INFO : not storing attribute cum_table
2018-02-02 23:03:13,167 : INFO : saved C:\Users\sarora\Documents\workshop\Keras_Gensim_Workshopw2v_gensim.model


In [9]:
## get the word vector
word_vectors = model.wv
## delete the model to free RAM space
del model

In [10]:
# get the word vector of "of"
print(word_vectors['of'])

[ 0.3061564  -0.96115386 -0.8630267   1.8129046  -0.06767506  0.92952305
  0.3761787  -1.4928774   1.3614581  -1.4495217  -0.28127846  0.2622818
 -0.54526836  1.2730536  -0.6922045   0.2692608   0.94456685 -0.06606608
  0.22785585 -0.26709765 -1.1088084   1.0772316   2.0432854   1.8930763
  0.30403492  0.7586897  -0.493812   -0.01467971 -0.22679994  1.3555989
 -1.6310763  -0.42341593  0.3056518  -0.46745062 -0.83427227  0.03583196
 -1.1058726  -0.26517883  0.60105604 -0.91067505 -0.64660984 -0.7610009
 -0.8554591   0.7420312  -0.8882003  -1.1422915   2.2729921  -0.1819321
  0.48678255  0.5457513  -0.62141335  0.4376641  -0.02157935 -0.45318267
  0.0508013   2.344787    0.04180916 -0.74063176 -0.39010966  1.0210367
 -0.81849724 -2.2233632  -0.2897524   0.07855511  0.8952815  -0.89665014
  0.48190504 -0.47030613  0.5526116  -0.7043509  -1.3606175  -0.66960335
 -1.1854233   1.2719721  -1.137403    2.2702281   2.0156488  -0.21478519
  0.49565825  0.05780362  1.4134271   0.05242933 -0.52665

In [11]:
# get the most common words
print(word_vectors.index2word[0], word_vectors.index2word[1], word_vectors.index2word[2])  

the of and


In [12]:
# get the least common words
vocab_size = len(word_vectors.vocab)
print(word_vectors.index2word[vocab_size - 1], word_vectors.index2word[vocab_size - 2], word_vectors.index2word[vocab_size - 3]) 

zechs lilies campylobacter


In [14]:
# get female for actor
word_vectors.most_similar_cosmul(positive=['woman', 'actor'], negative=['man'])

[('actress', 1.1354467868804932),
 ('singer', 1.0441545248031616),
 ('playwright', 1.0072388648986816),
 ('judi', 0.9815788269042969),
 ('entertainer', 0.9794433116912842),
 ('comedienne', 0.9782858490943909),
 ('musician', 0.9759577512741089),
 ('novelist', 0.9746918082237244),
 ('ballerina', 0.9724231958389282),
 ('comedian', 0.9664108157157898)]

In [15]:
# similarity check
print ("Similarity between woman and man = ",word_vectors.similarity('woman', 'man'))
print ("Similarity between man and girl = " ,word_vectors.similarity('man', 'girl'))
print ("Similarity between man and tiger = ",word_vectors.similarity('man', 'tiger'))
print ("Similarity between man and fox = ",word_vectors.similarity('man', 'fox'))   

Similarity between woman and man =  0.7183470966698886
Similarity between man and girl =  0.6545588697169208
Similarity between man and tiger =  0.15145760451573023
Similarity between man and fox =  0.1565765262268731


In [16]:
#odd one out
print(word_vectors.doesnt_match("green blue red zebra".split()))   

zebra


# KERAS using word embedding

In [17]:
def create_embedding_matrix(model):
    # convert the wv word vectors into a numpy matrix that is suitable for insertion
    # into our Keras models
    embedding_matrix = np.zeros((len(word_vectors.vocab), vector_dim))
    for i in range(len(word_vectors.vocab)):
        embedding_vector = word_vectors[word_vectors.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [19]:
#model = gensim.models.Word2Vec.load(root_path + "\\w2v_gensim.model")
embedding_matrix = create_embedding_matrix(word_vectors)

In [20]:
valid_size = 5  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# input words - in this case we do sample by sample evaluations of the similarity
valid_word = Input((1,), dtype='int32')
other_word = Input((1,), dtype='int32')

In [21]:
# setup the embedding layer
embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                  weights=[embedding_matrix])
embedded_a = embeddings(valid_word)
embedded_b = embeddings(other_word)
similarity = merge([embedded_a, embedded_b], mode='cos', dot_axes=2)
# create the Keras model
k_model = Model(input=[valid_word, other_word], output=similarity)

  name=name)


In [22]:
def get_sim(valid_word_idx, vocab_size):
    sim = np.zeros((vocab_size,))
    in_arr1 = np.zeros((1,))
    in_arr2 = np.zeros((1,))
    in_arr1[0,] = valid_word_idx
    for i in range(vocab_size):
        in_arr2[0,] = i
        out = k_model.predict_on_batch([in_arr1, in_arr2])
        sim[i] = out
    return sim

In [24]:
word_vectors
# now run the model and get the closest words to the valid examples
for i in range(valid_size):
    valid_word = word_vectors.index2word[valid_examples[i]]
    top_k = 6  # number of nearest neighbors
    sim = get_sim(valid_examples[i], len(word_vectors.vocab))
    nearest = (-sim).argsort()[1:top_k + 1]
    log_str = 'Nearest to %s:' % valid_word
    for k in range(top_k):
        close_word = word_vectors.index2word[nearest[k]]
        log_str = '%s %s,' % (log_str, close_word)
    print(log_str)

Nearest to his: her, him, himself, faramir, my, their,
Nearest to no: nothing, any, none, little, neither, only,
Nearest to between: irreconcilable, among, across, within, respectively, with,
Nearest to they: we, you, themselves, them, theirs, these,
Nearest to from: back, through, onto, across, periodically, via,
