In [None]:
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from gensim.models import KeyedVectors #keyed vectors saves space
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import matplotlib.cm as cm


In [None]:
# Load our own pre-trained model so the corpus does not have to be parsed
# and the embeddings trained again on every run.
filename = 'Word2Vec/Word2Vec.txt'
w2v_model = KeyedVectors.load_word2vec_format(
    filename,
    unicode_errors='ignore',  # skip bytes that cannot be decoded instead of failing
)

In [None]:
# Background on the unicode_errors workaround used below:
#https://github.com/RaRe-Technologies/gensim/wiki/Recipes-&-FAQ#q10-loading-a-word2vec-model-fails-with-unicodedecodeerror-utf-8-codec-cant-decode-bytes-in-position-

# Pre-trained model that was trained on wiki sites. It was saved without
# proper UTF-8 encoding, hence undecodable bytes are ignored while loading.
filename = 'SloTrainedModel/model.txt'
pre_trained_model = KeyedVectors.load_word2vec_format(
    filename,
    unicode_errors='ignore',
)

# English vectors, stored in the binary word2vec format.
pre_trained_model_english = KeyedVectors.load_word2vec_format(
    'SloTrainedModel/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# 1. Plotting with perplexity 4, suited to a small amount of data on the graph

def _plot_similar_word_clusters(model, title, keys, image_name, topn, perplexity):
    """Project the nearest neighbours of each key word to 2-D with t-SNE and plot them.

    Shared implementation behind the ``PlotGraphPreplexity*`` entry points,
    which previously were four copies of the same code.

    Parameters
    ----------
    model : a ``KeyedVectors`` instance, or an object exposing one as ``.wv``.
    title : title passed on to ``tsne_plot_similar_words``.
    keys : iterable of words; each one produces one cluster on the plot.
    image_name : file name passed on to ``tsne_plot_similar_words``.
    topn : number of most similar words requested per key.
    perplexity : perplexity handed to ``TSNE``.

    Raises
    ------
    ValueError
        If ``keys`` is empty.
    """
    # Vectors loaded with ``load_word2vec_format`` are already KeyedVectors;
    # going through ``.wv`` on them is deprecated in gensim 3.x and fails in
    # gensim 4, so only dereference ``.wv`` for other model objects.
    vectors = model if isinstance(model, KeyedVectors) else model.wv

    keys = list(keys)
    if not keys:
        raise ValueError("keys must contain at least one word")

    word_clusters = []
    embedding_clusters = []
    for key in keys:
        neighbours = [similar_word for similar_word, _ in vectors.most_similar(key, topn=topn)]
        word_clusters.append(neighbours)
        embedding_clusters.append([vectors[similar_word] for similar_word in neighbours])

    # Stack to (keys, neighbours, vector size), flatten for t-SNE, then
    # restore the per-key grouping on the 2-D result.
    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape
    # NOTE(review): newer scikit-learn releases rename ``n_iter`` to
    # ``max_iter`` -- confirm against the installed version.
    tsne_model_2d = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=3500, random_state=32)
    flat_2d = tsne_model_2d.fit_transform(embedding_clusters.reshape(n * m, k))
    embeddings_2d = np.array(flat_2d).reshape(n, m, 2)
    tsne_plot_similar_words(title, keys, embeddings_2d, word_clusters, 1, image_name)


def PlotGraphPreplexity4(title,keys, image_name):
    """Plot the 50 nearest ``w2v_model`` neighbours of each key using perplexity 4."""
    _plot_similar_word_clusters(w2v_model, title, keys, image_name, topn=50, perplexity=4)


def PlotGraphPreplexity50(title,keys, image_name):
    """Plot the 100 nearest ``pre_trained_model`` neighbours of each key using perplexity 50."""
    _plot_similar_word_clusters(pre_trained_model, title, keys, image_name, topn=100, perplexity=50)


def PlotGraphPreplexity50_english(title,keys, image_name):
    """Plot the 100 nearest ``pre_trained_model_english`` neighbours of each key using perplexity 50."""
    _plot_similar_word_clusters(pre_trained_model_english, title, keys, image_name, topn=100, perplexity=50)


def PlotGraphPreplexity15(title,keys, image_name):
    """Plot the 100 nearest ``w2v_model`` neighbours of each key using perplexity 15."""
    _plot_similar_word_clusters(w2v_model, title, keys, image_name, topn=100, perplexity=15)

In [None]:
def Plot3D(title, label):
    """Embed the whole ``w2v_model`` vocabulary in 3-D with t-SNE and plot it.

    Parameters
    ----------
    title : title passed on to ``tsne_plot_3d``.
    label : legend label passed on to ``tsne_plot_3d``.
    """
    # ``w2v_model`` is loaded as KeyedVectors; ``.wv`` on KeyedVectors is
    # deprecated in gensim 3.x and absent in gensim 4.
    vectors = w2v_model if isinstance(w2v_model, KeyedVectors) else w2v_model.wv

    # gensim 4 replaced the ``vocab`` dict with the ``index_to_key`` list.
    if hasattr(vectors, 'index_to_key'):
        words_wp = list(vectors.index_to_key)
    else:
        words_wp = list(vectors.vocab)

    # One row per vocabulary word, in the same order as ``words_wp``.
    embeddings_wp = np.array([vectors[word] for word in words_wp])

    # NOTE(review): newer scikit-learn releases rename ``n_iter`` to
    # ``max_iter`` -- confirm against the installed version.
    tsne_wp_3d = TSNE(perplexity=70, n_components=3, init='pca', n_iter=3500, random_state=12)
    embeddings_wp_3d = tsne_wp_3d.fit_transform(embeddings_wp)
    tsne_plot_3d(title, label, embeddings_wp_3d, a=0.8)
    
def tsne_plot_3d(title, label, embeddings, a=1):
    """Draw a 3-D scatter plot of ``embeddings``.

    Parameters
    ----------
    title : figure title.
    label : legend label for the point cloud.
    embeddings : 2-D array indexed as ``embeddings[:, 0..2]`` for x, y and z.
    a : alpha (opacity) of the markers.
    """
    fig = plt.figure()
    # ``add_subplot`` attaches the 3-D axes to the figure; a bare
    # ``Axes3D(fig)`` is no longer added automatically by recent matplotlib.
    ax = fig.add_subplot(111, projection='3d')
    colors = cm.rainbow(np.linspace(0, 1, 1))
    # Must be the axes method: ``plt.scatter``'s third positional argument is
    # the marker size, so the z column was being used as sizes, not depth.
    ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2], c=colors, alpha=a, label=label)
    ax.legend(loc=4)
    ax.set_title(title)
    plt.show()

In [None]:
# Method for generating 2d graph of similar words by clusters
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word embeddings, one colour per cluster, with word annotations.

    Parameters
    ----------
    title : figure title.
    labels : one legend label per cluster.
    embedding_clusters : per-cluster 2-D arrays indexed as ``[:, 0]`` (x) and ``[:, 1]`` (y).
    word_clusters : per-cluster lists of words, aligned with the points.
    a : alpha (opacity) of the markers.
    filename : if truthy, the figure is also saved there as a PNG.
    """
    fig, ax = plt.subplots(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        embeddings = np.asarray(embeddings)
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the colour as a single-row 2-D sequence: a bare 1-D RGBA vector
        # is ambiguous and gets treated as values to colormap when the cluster
        # happens to have exactly four points.
        ax.scatter(x, y, c=[color], alpha=a, label=label)
        for word, x_pos, y_pos in zip(words, x, y):
            ax.annotate(word, alpha=0.5, xy=(x_pos, y_pos), xytext=(5, 2),
                        textcoords='offset points', ha='right', va='bottom', size=12)
    ax.legend(loc=4)
    ax.set_title(title)
    ax.grid(True)
    if filename:
        fig.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()