In [1]:
import math
import os
from collections import defaultdict

import gensim
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import scipy.spatial.distance as dist
# import spacy

from graph import DataGraph

In [2]:
# Plotly credentials come from the environment so that no API key is stored in
# the notebook. Export PLOTLY_API_KEY (and optionally PLOTLY_USERNAME) before
# starting the kernel; a missing key fails here with a KeyError rather than
# later during upload.
# NOTE: any key that was ever saved in this cell should be treated as leaked
# and regenerated in the Plotly account settings.
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'tadas.t'),
    api_key=os.environ['PLOTLY_API_KEY'],
)

In [3]:
def get_word_clusters(df, dg):
    """Translate the index-based clusters of ``dg`` into clusters of words.

    Each entry of ``dg.clusters`` is unpacked as a ``(members, homology)``
    pair. Every member is looked up in column ``0`` of ``df`` and the
    homology sequence is truncated to its first 10 items.

    Returns a list of ``(words, homology[:10])`` tuples, in the same order
    as ``dg.clusters``.
    """
    return [
        ([df[0][member] for member in members], homology[:10])
        for members, homology in dg.clusters
    ]

In [4]:
# Model back-ends that get_wordvec knows how to query.
GOOD_TYPES = ['word2vec', 'glove']
def get_wordvec(word, model, model_type):
    """Return the embedding vector of ``word`` from ``model``.

    ``model_type`` selects how the model is queried:

    * ``'word2vec'`` -- the model is indexed with the word (``model[word]``).
    * ``'glove'``    -- the model is called with the word and the ``.vector``
      attribute of the result is returned.

    Raises ``ValueError`` when ``model_type`` is not listed in ``GOOD_TYPES``.
    """
    if model_type in GOOD_TYPES:
        if model_type == 'word2vec':
            return model[word]
        if model_type == 'glove':
            return model(word).vector
    else:
        raise ValueError('bad model type')

In [5]:
def get_arccosdist_matrix(model, model_type, words_path='/home/tadas/words_smaller.txt'):
    """Build the pairwise angular-distance matrix (in degrees) of the word list.

    Parameters
    ----------
    model : object
        Embedding model, queried through ``get_wordvec``.
    model_type : str
        Model flavour understood by ``get_wordvec``.
    words_path : str, optional
        CSV file with a ``word`` column. Defaults to the path this notebook
        has always used.

    Returns
    -------
    df : pandas.DataFrame
        Single column ``0`` holding the unique words; row ``i`` of ``df``
        corresponds to row/column ``i`` of the matrix.
    distance_matrix : numpy.ndarray
        Square matrix whose ``[i, j]`` entry is ``degrees(arccos(cos_sim))``
        between word ``i`` and word ``j``; the diagonal is 0.

    Raises
    ------
    ValueError
        Propagated from ``get_wordvec`` for an unsupported ``model_type``.
    """
    # drop_duplicates keeps first-seen order, so the row order is the same on
    # every run (iterating a set of strings is not stable across kernels).
    words = pd.read_csv(words_path)['word'].drop_duplicates().tolist()
    df = pd.DataFrame(data=words, columns=[0])
    if not words:
        return df, np.zeros((0, 0))

    # Fetch each vector once instead of once per pair of words.
    vectors = np.array([get_wordvec(word, model, model_type) for word in words],
                       dtype=float)

    cosine_distance = dist.squareform(dist.pdist(vectors, metric='cosine'))
    # Rounding can leave the cosine marginally outside [-1, 1], where arccos
    # yields NaN; clip before taking the angle.
    cosine_similarity = np.clip(1.0 - cosine_distance, -1.0, 1.0)
    distance_matrix = np.degrees(np.arccos(cosine_similarity))
    return df, distance_matrix

In [6]:
def get_euclidean_matrix(model, model_type, words_path='/home/tadas/words_smaller.txt'):
    """Build the pairwise Euclidean-distance matrix of the word list.

    Parameters
    ----------
    model : object
        Embedding model, queried through ``get_wordvec``.
    model_type : str
        Model flavour understood by ``get_wordvec``.
    words_path : str, optional
        CSV file with a ``word`` column. Defaults to the path this notebook
        has always used.

    Returns
    -------
    df : pandas.DataFrame
        Single column ``0`` holding the unique words; row ``i`` of ``df``
        corresponds to row/column ``i`` of the matrix.
    distance_matrix : numpy.ndarray
        Square matrix whose ``[i, j]`` entry is the L2 norm of the difference
        between the vectors of word ``i`` and word ``j``; the diagonal is 0.

    Raises
    ------
    ValueError
        Propagated from ``get_wordvec`` for an unsupported ``model_type``.
    """
    # drop_duplicates keeps first-seen order, so the row order is the same on
    # every run (iterating a set of strings is not stable across kernels).
    words = pd.read_csv(words_path)['word'].drop_duplicates().tolist()
    df = pd.DataFrame(data=words, columns=[0])
    if not words:
        return df, np.zeros((0, 0))

    # Fetch each vector once instead of once per pair of words.
    vectors = np.array([get_wordvec(word, model, model_type) for word in words],
                       dtype=float)

    # pdist evaluates each unordered pair once; squareform mirrors the result
    # into a symmetric matrix with a zero diagonal.
    distance_matrix = dist.squareform(dist.pdist(vectors, metric='euclidean'))
    return df, distance_matrix

In [7]:
def produce_data(df, matrix, start, end, step):
    """Cluster the distance matrix once per epsilon in ``np.arange(start, end, step)``.

    For every epsilon a ``DataGraph`` is built from ``matrix``, clustered with
    homology reporting enabled, and its clusters are converted to words via
    ``get_word_clusters``. A progress line is printed for each epsilon.

    Returns a list with one word-cluster list per epsilon, in sweep order.
    """
    def _clusters_at(epsilon):
        # The third DataGraph argument is fixed at 300.
        # NOTE(review): its meaning is defined in graph.DataGraph -- confirm.
        graph = DataGraph(matrix, epsilon, 300)
        print('Started clustering {0}'.format(epsilon))
        graph.cluster(report_homology=True)
        return get_word_clusters(df, graph)

    return [_clusters_at(epsilon) for epsilon in np.arange(start, end, step)]

In [8]:
def prepare_data(cluster_list):
    """Regroup per-epsilon cluster lists into per-word series.

    ``cluster_list`` holds one list of ``(words, homology)`` pairs per sweep
    step. For every word two parallel lists are accumulated, one entry per
    step in which the word occurs:

    * the position of its cluster within that step's list, and
    * the homology attached to that cluster.

    Returns ``(word_y_values, word_homologies)``, both ``defaultdict(list)``
    keyed by word.
    """
    positions_by_word = defaultdict(list)
    homologies_by_word = defaultdict(list)

    for step_clusters in cluster_list:
        for position, (members, homology) in enumerate(step_clusters):
            for member in members:
                positions_by_word[member].append(position)
                homologies_by_word[member].append(homology)

    return positions_by_word, homologies_by_word

In [9]:
def plot_data(relevant_words, x_values, word_y_values, word_homologies, filename, title,
              xaxis_title='Angle', yaxis_title='Words'):
    """Plot one scatter trace per word and send the figure to plotly.

    Parameters
    ----------
    relevant_words : iterable of str
        Words to draw; one trace is created per word, named after it.
    x_values : sequence
        x coordinates shared by every trace.
    word_y_values : mapping of str -> sequence
        y coordinates per word (e.g. the first result of ``prepare_data``).
    word_homologies : mapping of str -> sequence
        Per-point hover annotations; each entry is rendered with ``str()``.
    filename : str
        Name handed to ``plotly.plotly.plot``.
    title : str
        Figure title.
    xaxis_title : str, optional
        Label of the x axis. Defaults to ``'Angle'``, the label that used to
        be hard-coded; pass something else when the sweep is not over angles
        (for example a Euclidean distance threshold).
    yaxis_title : str, optional
        Label of the y axis. Defaults to ``'Words'``.
    """
    data = [
        go.Scatter(
            x=x_values,
            y=word_y_values[word],
            text=[str(homology) for homology in word_homologies[word]],
            name=word,
            hoverinfo='text+name',
        )
        for word in relevant_words
    ]

    layout = go.Layout(
        title=title,
        hovermode='x',
        xaxis=dict(
            title=xaxis_title,
            ticklen=5,
            zeroline=False,
            gridwidth=2,
        ),
        yaxis=dict(
            title=yaxis_title,
            ticklen=5,
            gridwidth=2,
        ),
        showlegend=False,
    )

    fig = go.Figure(data=data, layout=layout)
    py.plot(fig, filename=filename)

In [10]:
# Load the pre-trained GoogleNews word2vec vectors (binary file).
# To use GloVe through spaCy instead, load the model with
#     model = spacy.load('en_vectors_glove_md')
# and pass 'glove' as the model type below.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/tadas/GoogleNews-vectors-negative300.bin',
    binary=True,
)
# Pairwise Euclidean distances between the word vectors; the epsilon sweep and
# the plotting are done in later cells.
df, matrix = get_euclidean_matrix(model, 'word2vec')

In [11]:
# Largest entry of the pairwise distance matrix.
matrix.max()

6.2600741386413574

In [12]:
# Mean over every entry of the distance matrix (the zero diagonal included).
matrix.mean()

4.1030439713147242

In [13]:
# Standard deviation over every entry of the distance matrix.
matrix.std()

0.63930167263783655

In [11]:
# Epsilon sweep: np.arange(1, 3.5, 0.5) -> 1.0, 1.5, 2.0, 2.5, 3.0 (stop is exclusive).
start, stop, step = 1, 3.5, 0.5
cluster_list = produce_data(df, matrix, start=start, end=stop, step=step)
# Regroup the per-epsilon clusters into per-word series for plotting.
word_y_values, word_homologies = prepare_data(cluster_list)

Started clustering 1.0
Started clustering 1.5
Started clustering 2.0
Started clustering 2.5
Started clustering 3.0


In [18]:
# Words whose cluster membership is traced in the plot.
relevant_words = [
    'riverfront', 'oceanfront', 'reef', 'cay',
    'beach', 'shore', 'coast', 'seaboard',
    'river', 'hill', 'incline', 'camber',
    'bend', 'slope', 'ledge', 'embankment',
    'levee', 'reservoir', 'cliff', 'mound',
    'pitch', 'field', 'riverside', 'lakeshore',
    'lakeside', 'seafront', 'waterfront', 'lakefront',
]

In [19]:
# Draw the cluster index of every selected word across the epsilon sweep.
plot_data(
    relevant_words,
    x_values=list(np.arange(start, stop, step)),
    word_y_values=word_y_values,
    word_homologies=word_homologies,
    filename='word2vec_{0}_{1}_{2}_nature-euclidean'.format(start, stop, step),
    title='Word2vec nature related words - Euclidean distance',
)