In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from word2vec import Word2Vec

In [2]:
# Open an interactive TensorFlow session. Later cells call `.eval()` without
# passing a session, so they rely on this one being the default session.
sess = tf.InteractiveSession()

In [3]:
# Load training results.
# Build the Word2Vec wrapper on the session created above with
# for_training = False and point it at the "training-results" path.
# NOTE(review): presumably this restores the trained variables from that
# path -- confirm against the word2vec module.
training_results_path = "training-results"
model = Word2Vec(sess,
                 for_training = False,
                 training_results_path = training_results_path)

In [4]:
# Evaluate the model's embedding tensor into concrete values. Later cells index
# the result as a 2-D array whose rows are selected by word id.
embeddings = model.tf_embeddings.eval()

In [5]:
# Visualize embeddings.
def plot_with_labels(low_dim_embeds, labels):
    """Draw a 20x20 inch scatter plot with one annotated point per label.

    Args:
        low_dim_embeds: 2-D coordinates, indexable as `low_dim_embeds[i, :]`;
            row i gives the (x, y) position of `labels[i]`.
        labels: text to write next to each point.
    """
    plt.figure(figsize=(20, 20))
    for row, text in enumerate(labels):
        px, py = low_dim_embeds[row, :]
        # One scatter call per point, so every point gets its own colour
        # from the colour cycle.
        plt.scatter(px, py)
        plt.annotate(text,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords="offset points",
                     ha="right",
                     va="bottom")

# Project the first `plot_only` embedding rows to 2-D with t-SNE and plot them.
# t-SNE is stochastic; a fixed random_state makes the layout repeatable when
# the cell is re-run.
tsne = TSNE(perplexity = 30, n_components = 2, init = "pca", n_iter = 5000,
            random_state = 0)
plot_only = 500
low_dim_embeds = tsne.fit_transform(embeddings[:plot_only, :])
# Label exactly the rows that were embedded: the slice above yields fewer than
# `plot_only` rows when the embedding matrix is smaller than that.
labels = [model.words[i] for i in range(low_dim_embeds.shape[0])]
plot_with_labels(low_dim_embeds, labels)

In [6]:
def read_analogy_questions(filename, word2id = None):
    """Read an analogy question file and group its questions by section.

    A line starting with ":" opens a new section named by the rest of the
    line. Every other non-blank line is a question: whitespace-separated
    words, of which the first four are checked against the vocabulary. A
    question is kept only if all four, lower-cased, are in `word2id`; kept
    questions retain their original casing. Blank lines and lines with fewer
    than four words are skipped.

    Args:
        filename: path of the question file.
        word2id: container used for the vocabulary membership test. Defaults
            to the notebook-level `model.word2id`.

    Returns:
        A dict mapping each section name to the list of kept questions, each
        question being the list of words on its line. A repeated section
        header restarts that section's list.

    Raises:
        ValueError: if a question passes the vocabulary filter before any
            section header has been seen.
    """
    if word2id is None:
        word2id = model.word2id
    questions_by_section = {}
    questions = None
    # Text mode: the lines are compared against str literals below.
    with open(filename, "r") as fp:
        for line in fp:
            line = line.strip()
            if not line:
                continue
            if line.startswith(":"):
                # Accept both ": name" and ":name".
                section_name = line[1:].strip()
                questions = questions_by_section[section_name] = []
                continue
            q = line.split()
            if len(q) < 4:
                continue
            if any(w.lower() not in word2id for w in q[:4]):
                continue
            if questions is None:
                raise ValueError(
                    "question found before any ':' section header in %r" % (filename,))
            questions.append(q)
    return questions_by_section
            
            
# Parse the analogy question file and list the section names that were found.
questions_by_section = read_analogy_questions("questions-words.txt")
print(list(questions_by_section.keys()))

['gram3-comparative', 'gram8-plural', 'capital-common-countries', 'city-in-state', 'family', 'gram9-plural-verbs', 'gram2-opposite', 'currency', 'gram4-superlative', 'gram6-nationality-adjective', 'gram7-past-tense', 'gram5-present-participle', 'capital-world', 'gram1-adjective-to-adverb']


In [7]:
def plot_connections(low_dim_beds, labels):
    """Plot labelled 2-D points and join consecutive pairs with dashed lines.

    NOTE: the same function is defined again in the next cell; the later
    definition shadows this one, so one of the two can be removed.

    Args:
        low_dim_beds: 2-D coordinates, indexable as `low_dim_beds[i, :]`;
            row i gives the (x, y) position of `labels[i]`.
        labels: point labels. Labels 2k and 2k+1 are treated as a pair and
            connected; with an odd count the last point stays unconnected.
    """
    plt.figure(figsize = (20, 20))
    # Read the points from the argument, not from the notebook-level
    # `low_dim_embeds`, so the plot always shows what the caller passed in.
    for i, label in enumerate(labels):
        x, y = low_dim_beds[i, :]
        plt.scatter(x, y)
        plt.text(x, y, label, fontsize=16)
    # Floor division keeps the pair count an integer on Python 2 and 3.
    for i in range(len(labels) // 2):
        x1, y1 = low_dim_beds[i*2, :]
        x2, y2 = low_dim_beds[i*2+1, :]
        plt.plot([x1, x2], [y1, y2], 'b--')

In [8]:
def plot_connections(low_dim_beds, labels):
    """Plot labelled 2-D points and join consecutive pairs with dashed lines.

    Args:
        low_dim_beds: 2-D coordinates, indexable as `low_dim_beds[i, :]`;
            row i gives the (x, y) position of `labels[i]`.
        labels: point labels. Labels 2k and 2k+1 are treated as a pair and
            connected; with an odd count the last point stays unconnected.
    """
    plt.figure(figsize = (20, 20))
    # Read the points from the argument, not from the notebook-level
    # `low_dim_embeds`, so the plot always shows what the caller passed in.
    for i, label in enumerate(labels):
        x, y = low_dim_beds[i, :]
        plt.scatter(x, y)
        plt.text(x, y, label, fontsize=16)
    # Floor division keeps the pair count an integer on Python 2 and 3.
    for i in range(len(labels) // 2):
        x1, y1 = low_dim_beds[i*2, :]
        x2, y2 = low_dim_beds[i*2+1, :]
        plt.plot([x1, x2], [y1, y2], 'b--')


# Collect the distinct word pairs used by the "gram3-comparative" questions.
questions = questions_by_section["gram3-comparative"]
question_pairs = set()
for q in questions:
    # Skip a question when any of its four words (lower-cased) is not in the
    # vocabulary.
    if any(q[k].lower() not in model.word2id for k in range(4)):
        continue
    question_pairs.update([tuple(q[0:2]), tuple(q[2:4])])
question_pairs = list(question_pairs)

# Flatten the pairs so that entries 2k and 2k+1 belong to the same pair, which
# is the layout plot_connections uses when drawing the connecting lines.
question_words = []
for pair in question_pairs:
    question_words.extend(pair)
question_words_indices = [model.word2id[word.lower()] for word in question_words]

# Project the selected embedding rows onto two principal components and plot.
pca = PCA(n_components=2)
high_dim_embeds = embeddings[question_words_indices, :]
pca.fit(high_dim_embeds)
low_dim_embeds = pca.transform(high_dim_embeds)
plot_connections(low_dim_embeds, question_words)

In [9]:
def compute_analogy(a, b, c):
    """Answer the analogy "a is to b as c is to ?" with the loaded model.

    The three words are lower-cased, mapped to ids through `model.word2id`
    and sent as a single one-row int32 query to `model.predict_analogy`.
    The candidates returned for that row are scanned in order and the first
    one that is not one of the three query ids is returned as a word.

    Requires `numpy` to be imported as `np` at the top of the notebook.

    Args:
        a, b, c: query words; each must be present in `model.word2id` once
            lower-cased.

    Returns:
        The predicted word taken from `model.words`, or None when every
        candidate is one of the query words.

    Raises:
        KeyError: if a query word is missing from the vocabulary (assuming
            `model.word2id` is a dict).
    """
    query_ids = [model.word2id[w.lower()] for w in (a, b, c)]
    q = np.asarray([query_ids], np.int32)
    results = model.predict_analogy(q)
    for r in results[0]:
        # The model may rank the query words themselves first; skip those.
        if r not in query_ids:
            return model.words[r]
    return None


# Spot-check the "family" section: print the sampled questions for which the
# model's answer differs from the expected fourth word.
questions = questions_by_section["family"]
# Draw the sample without shuffling in place, so the list cached in
# questions_by_section keeps its order. The locally seeded generator makes
# re-runs print the same questions; change the seed to see a different sample.
rng = random.Random(0)
for q in rng.sample(questions, min(30, len(questions))):
    r = compute_analogy(q[0], q[1], q[2])
    if r != q[3].lower():
        # Single pre-formatted string: prints the same on Python 2 and 3.
        print("%s : %s --- %s :( %s v.s. %s )" % (q[0], q[1], q[2], q[3], r))

man : woman --- groom :( bride v.s. marriage )
son : daughter --- stepfather :( stepmother v.s. vipsania )
grandson : granddaughter --- grandfather :( grandmother v.s. paternal )
stepfather : stepmother --- grandpa :( grandma v.s. captioned )
uncle : aunt --- brothers :( sisters v.s. gummo )
stepfather : stepmother --- grandson :( granddaughter v.s. infanta )
groom : bride --- dad :( mom v.s. mother )
prince : princess --- dad :( mom v.s. gnasher )
grandpa : grandma --- brothers :( sisters v.s. brother )
brother : sister --- groom :( bride v.s. longwood )
brother : sister --- grandfather :( grandmother v.s. paternal )
husband : wife --- grandpa :( grandma v.s. tapp )
he : she --- brothers :( sisters v.s. brother )
husband : wife --- grandfather :( grandmother v.s. father )
uncle : aunt --- groom :( bride v.s. nellis )
husband : wife --- brother :( sister v.s. son )
prince : princess --- nephew :( niece v.s. wife )
grandpa : grandma --- husband :( wife v.s. daughter )
man : woman --- gr