In [None]:
import numpy as np
import pickle
import json
import tensorflow as tf

In [None]:
def get_embedding(data, model_type, vector_type, embed_size):
    """Load a saved embedding matrix together with its vocabulary lookups.

    The three files are located under the module-level ``output_folder_name``
    (read when the function is called) and share the prefix
    ``<data>_<model_type>_<vector_type>_<embed_size>``:

    * ``<prefix>.pkl``              -- the embedding matrix
    * ``<prefix>_dict.json``        -- vocabulary lookup
    * ``<prefix>_reversedict.json`` -- reverse vocabulary lookup

    Args:
        data: Dataset name used in the file prefix; echoed back under "data".
        model_type: Model name component of the file prefix.
        vector_type: Vector/initialisation name component of the file prefix.
        embed_size: Embedding size component of the prefix (str or int).

    Returns:
        dict with keys "data", "embed", "vocab" and "reverse_vocab".
    """
    # str() lets callers pass the size as an int; the original concatenation
    # raised TypeError for anything but a string.
    prefix = output_folder_name + "_".join(
        [data, model_type, vector_type, str(embed_size)])

    # NumPy >= 1.16.3 refuses to read pickled payloads unless allow_pickle is
    # set explicitly.
    # SECURITY: unpickling can execute arbitrary code -- only load files that
    # were produced locally by a trusted run.
    embedding = np.load(prefix + ".pkl", allow_pickle=True)

    with open(prefix + "_dict.json", "r") as f:
        vocab = json.load(f)
    with open(prefix + "_reversedict.json", "r") as f:
        # NOTE(review): json.load returns *string* keys for JSON objects, while
        # the rest of this notebook indexes reverse_vocab with integers.
        # Confirm the file stores a list (or convert the keys) -- TODO verify.
        reverse_vocab = json.load(f)

    return {
        "data": data,
        "embed": embedding,
        "vocab": vocab,
        "reverse_vocab": reverse_vocab,
    }

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points and annotate each one with its label.

    Args:
        low_dim_embs: Array whose i-th row unpacks to the (x, y) position of
            ``labels[i]``; it must have at least ``len(labels)`` rows.
        labels: Sequence of annotation texts, one per plotted row.
        filename: Accepted for interface compatibility but not used -- the
            figure is only displayed via ``plt.show()``, never written to disk.

    Raises:
        AssertionError: If there are more labels than rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(14, 14))  # in inches

    # Text placement shared by every annotation.
    annotation_style = dict(xytext=(5, 2),
                            textcoords='offset points',
                            ha='right',
                            va='bottom')

    for row, text in enumerate(labels):
        x_pos, y_pos = low_dim_embs[row, :]
        # One scatter call per point, as before.
        plt.scatter(x_pos, y_pos)
        plt.annotate(text, xy=(x_pos, y_pos), **annotation_style)

    plt.show()
    
def plot_embedding(embedding, m, n, reverse_dictionary):
    """Project rows ``m:n`` of ``embedding`` to 2-D with t-SNE and plot them.

    Args:
        embedding: 2-D array of embedding vectors, one row per vocabulary index.
        m: First vocabulary index (inclusive) to project.
        n: Last vocabulary index (exclusive) to project.
        reverse_dictionary: Lookup from vocabulary index to word, indexed with
            integers.

    Returns:
        The t-SNE output for the selected rows; row ``i`` corresponds to
        vocabulary index ``m + i``.
    """
    # No random_state is passed, so the layout is presumably not reproducible
    # from run to run.
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    low_dim_embs = tsne.fit_transform(embedding[m:n, :])

    # `range` works on both Python 2 and 3 (the original `xrange` is Python 2
    # only).  The upper bound is clamped to the rows actually projected so a
    # slice that runs past the end of `embedding` no longer yields more labels
    # than points.
    last = min(n, m + low_dim_embs.shape[0])
    labels = [reverse_dictionary[i] for i in range(m, last)]

    plot_with_labels(low_dim_embs, labels)
    return low_dim_embs

def top_similar(low_dim_embs, mapping, reverse_mapping, valid_words,
                top_k=20, max_candidates=500):
    """Print the nearest neighbours of each query word by dot-product similarity.

    Args:
        low_dim_embs: 2-D array of vectors; row ``i`` belongs to the word
            ``reverse_mapping[i]``.
        mapping: Lookup from word to row index.
        reverse_mapping: Lookup from row index to word.
        valid_words: Query words. Words missing from ``mapping`` are skipped
            silently; words whose row index lies beyond ``low_dim_embs`` are
            skipped with a message.
        top_k: Number of neighbours to print per word (default 20).
        max_candidates: Only the first ``max_candidates`` rows are considered
            as neighbours (default 500).

    Returns:
        None. One ``Nearest to <word>: ...`` line is printed per resolved word.
    """
    points = np.asarray(low_dim_embs)
    # Plain NumPy product: same result as tf.matmul(a, a, transpose_b=True)
    # without opening a TensorFlow session that is never closed or adding nodes
    # to the default graph on every call.
    sim = np.matmul(points, np.swapaxes(points, -1, -2))
    n_rows = sim.shape[0]

    for word in valid_words:
        if word not in mapping:
            continue
        index = mapping[word]
        if index >= n_rows:
            # The original indexed straight into `sim` and raised IndexError.
            print('Skipping %s: index %d is outside the %d available rows'
                  % (word, index, n_rows))
            continue

        valid_word = reverse_mapping[index]
        order = (-sim[index, :max_candidates]).argsort()
        # Drop the query word explicitly instead of assuming it sorts first:
        # with an unnormalised dot product (or an index beyond the candidate
        # window) position 0 may be a genuine neighbour.
        nearest = [int(j) for j in order if j != index][:top_k]

        log_str = 'Nearest to %s:' % valid_word
        # Iterate over what is actually available; there may be fewer than
        # top_k candidates.
        for j in nearest:
            log_str = '%s %s,' % (log_str, reverse_mapping[j])
        print(log_str)

In [None]:
def similar_clusters(data, words, embed_range=(0, 500)):
    """Plot a t-SNE projection of part of the vocabulary and list neighbours.

    Args:
        data: dict with keys "embed", "vocab" and "reverse_vocab" (the shape
            returned by ``get_embedding``).
        words: Query words whose nearest neighbours are printed.
        embed_range: ``(start, stop)`` vocabulary indices to project. Any
            two-element sequence works; the default is a tuple so it cannot be
            mutated between calls.

    Query words that are unknown or fall outside ``embed_range`` are skipped.
    """
    start, stop = embed_range[0], embed_range[1]
    vocab = data["vocab"]
    reverse_vocab = data["reverse_vocab"]

    low_dim_embs = plot_embedding(data["embed"], start, stop, reverse_vocab)
    n_rows = len(low_dim_embs)

    # Row i of the projection is vocabulary index start + i.  Re-base both
    # lookups onto row numbers so top_similar reads the right row; passing the
    # absolute indices mislabelled every word whenever start != 0 and raised
    # IndexError for words beyond the projected range.
    local_vocab = {
        word: idx - start
        for word, idx in vocab.items()
        if start <= idx < start + n_rows
    }
    local_reverse = {
        idx - start: reverse_vocab[idx]
        for idx in range(start, start + n_rows)
    }

    top_similar(low_dim_embs, local_vocab, local_reverse, words)

In [None]:
# Which saved embedding to inspect; these values select the files that
# get_embedding reads from `output_folder_name`.
data = "formspring"
model_type = "blstm_attention"
vector_type = "sswe"
embed_size = "50"
output_folder_name = "results/"

# Query words for the nearest-neighbour listing.
words =  ['gay','slave','evidence','hate','fat']

data_dict = get_embedding(data, model_type, vector_type, embed_size)

# Reuse the embedding loaded above instead of reading the same files a second
# time.
similar_clusters(data_dict, words)

In [None]:
def get_similarityMatrix(low_dim_embs):
    """Return the pairwise dot-product matrix of the given vectors.

    Args:
        low_dim_embs: Array-like of vectors, one per row.

    Returns:
        NumPy array ``S`` with ``S[i, j] = dot(row_i, row_j)``.
    """
    vectors = np.asarray(low_dim_embs)
    # Same product as tf.matmul(a, a, transpose_b=True), computed directly in
    # NumPy.  The original opened a tf.Session per call without closing it and
    # added a new matmul node to the default graph each time.
    return np.matmul(vectors, np.swapaxes(vectors, -1, -2))

def similarWords(word, sim, mapping, reverse_mapping, top_k, max_candidates=1000):
    """Return the ``top_k`` nearest neighbours of ``word`` as one string.

    Args:
        word: Query word.
        sim: 2-D NumPy similarity matrix; row/column ``i`` belongs to the word
            ``reverse_mapping[i]``.
        mapping: Lookup from word to row index.
        reverse_mapping: Lookup from row index to word.
        top_k: Maximum number of neighbours to return.
        max_candidates: Only the first ``max_candidates`` columns are
            considered as neighbours (default 1000).

    Returns:
        Neighbour words joined as ``"w1, w2, "`` (each followed by ``", "``).
        An empty or partial string is returned, after printing a message, when
        the word is unknown or an index cannot be resolved.
    """
    if word not in mapping:
        # Previously an uncaught KeyError; handled like the other best-effort
        # failures so one missing word does not abort a whole comparison.
        print("Word not in vocabulary")
        return ''

    index = mapping[word]
    log_str = ''
    try:
        # Probe the reverse lookup first so an unresolvable index is reported
        # before any neighbours are collected (as the original did).
        reverse_mapping[index]
        order = (-sim[index, :max_candidates]).argsort()
        # Exclude the query word explicitly rather than assuming it sorts
        # first; otherwise a genuine best match can be dropped.
        nearest = [int(j) for j in order if j != index][:top_k]
        for j in nearest:
            log_str = '%s%s, ' % (log_str, reverse_mapping[j])
    except (IndexError, KeyError):
        # Only lookup failures are expected here; anything else should surface.
        print("Word beyond 10k")
    return log_str

def get_tsneembedding(embedding, m, n):
    """Return the 2-D t-SNE projection of rows ``m:n`` of ``embedding``.

    Uses the same t-SNE settings as the plotting helper (perplexity 30, PCA
    initialisation, 5000 iterations) but draws nothing.
    """
    projector = TSNE(
        perplexity=30,
        n_components=2,
        init='pca',
        n_iter=5000,
    )
    selected_rows = embedding[m:n, :]
    return projector.fit_transform(selected_rows)

def comparison(dict_1, dict_2, dict_3, words, top_k):
    """Print, for each word, its nearest neighbours in three embeddings.

    Args:
        dict_1, dict_2, dict_3: dicts with keys "data", "embed", "vocab" and
            "reverse_vocab" (the shape returned by ``get_embedding``).
        words: Query words.
        top_k: Number of neighbours to list per word and dataset.

    The printed layout is unchanged: a ``Word:`` header, one line per dataset,
    then a separator.  Diagnostics printed by ``similarWords`` now appear
    before the results rather than interleaved with them, because neighbours
    are collected dataset by dataset.
    """
    words = list(words)  # iterated once per dataset, so materialise it
    datasets = (dict_1, dict_2, dict_3)

    # Build each similarity matrix once per dataset instead of once per
    # (word, dataset) pair, and keep only one matrix alive at a time.
    neighbours = []
    for dataset in datasets:
        sim = get_similarityMatrix(dataset["embed"])
        neighbours.append([
            similarWords(word, sim, dataset["vocab"],
                         dataset["reverse_vocab"], top_k)
            for word in words
        ])
        del sim

    for position, word in enumerate(words):
        print("Word: " + str(word))
        for dataset, per_word in zip(datasets, neighbours):
            print(dataset["data"] + ": " + per_word[position])
        print("----------------------------")


In [None]:
# Query words used from this cell on; rebinds the module-level `words` list.
words = [
    'gay',
    'slave',
    'evidence',
    'fat',
    'religion',
    'ass',
]