**After tokenizing the data** (the cells below expect `tokenized_doc`, a list of token lists, one per document)

In [None]:
# Train a Doc2Vec model on the tokenized documents

import gensim

def tagged_document(list_of_list_of_words):
   """Lazily wrap every token list in a gensim ``TaggedDocument``.

   Each document is tagged with its position in ``list_of_list_of_words``.

   Args:
      list_of_list_of_words: iterable of token lists, one per document.

   Yields:
      ``gensim.models.doc2vec.TaggedDocument(tokens, [index])`` for each
      document, in input order.
   """
   yield from (
      gensim.models.doc2vec.TaggedDocument(tokens, [doc_id])
      for doc_id, tokens in enumerate(list_of_list_of_words)
   )

# Materialise the generator: the corpus is iterated more than once
# (once by build_vocab, then again by train).
data_for_training = [*tagged_document(tokenized_doc)]

# 64-dimensional vectors, min_count=2, 30 training epochs.
doc2vec = gensim.models.doc2vec.Doc2Vec(
    vector_size=64,
    min_count=2,
    epochs=30,
)

# Vocabulary must be built before training; train() then reuses the corpus
# size and epoch count recorded on the model.
doc2vec.build_vocab(data_for_training)
doc2vec.train(
    data_for_training,
    total_examples=doc2vec.corpus_count,
    epochs=doc2vec.epochs,
)

In [None]:
# Select the query keywords

# Fill in the words to explore, e.g. keys = ['fast', 'slow', ...]
keys = []

# For every keyword, collect the 15 most similar words (one inner list per
# keyword, in the order returned by most_similar) ...
word_clusters = [
    [neighbour for neighbour, _score in doc2vec.wv.most_similar(key, topn=15)]
    for key in keys
]

# ... and the embedding vector of each of those words, kept in the same
# nested layout so both structures can be zipped together later.
embedding_clusters = [
    [doc2vec[neighbour] for neighbour in cluster]
    for cluster in word_clusters
]

In [None]:
#Tsne plotting

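#n_components: dimension of the embedded space (default: 2)
#perplexity: roughly the number of nearest neighbors considered per point (default: 30); must be smaller than the number of samples
#init: how the embedding is initialized; 'random' or 'pca' ('pca' is usually more stable)
#n_iter: maximum number of optimization iterations (renamed to max_iter in scikit-learn 1.5)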

from sklearn.manifold import TSNE
import numpy as np

# Stack the per-keyword clusters into a single array of shape
# (n keywords, m similar words per keyword, k embedding dimensions).
embedding_clusters = np.array(embedding_clusters)
if embedding_clusters.ndim != 3:
    # Fail with an actionable message instead of an unpacking error below.
    raise ValueError(
        "embedding_clusters must have shape (keywords, neighbours, dims); got {} "
        "- is `keys` empty, or do the clusters have different lengths?".format(embedding_clusters.shape)
    )
n, m, k = embedding_clusters.shape

# t-SNE rejects perplexity >= number of samples. With a single keyword there
# are only 15 samples, so cap the requested perplexity of 15 accordingly.
tsne_kwargs = dict(perplexity=min(15, n * m - 1), n_components=2, init='pca', random_state=32)
try:
    # scikit-learn >= 1.5 names the iteration budget `max_iter`.
    tsne_model_en_2d = TSNE(max_iter=3500, **tsne_kwargs)
except TypeError:
    # Older scikit-learn only accepts the previous name, `n_iter`.
    tsne_model_en_2d = TSNE(n_iter=3500, **tsne_kwargs)

# Project all n*m vectors together, then restore the per-keyword grouping.
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

In [None]:
# Test visualizing

from matplotlib import rc
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import font_manager
# Font file providing the Hangul glyphs used in the plots (Windows location of
# Malgun Gothic). Point this at another font file on other platforms.
f_path = "C:/windows/Fonts/malgun.ttf" #font path
try:
    # Use the family name read from the font file rather than discarding it.
    font_family = font_manager.FontProperties(fname=f_path).get_name()
except (OSError, RuntimeError):
    # Font file missing or unreadable: keep the previously hard-coded family
    # name and let matplotlib resolve it (it warns and falls back if unknown).
    font_family = 'Malgun Gothic'
rc('font', family=font_family)
plt.title('테스트입니다')

In [None]:
#TSNE plotting similar_words

import matplotlib.cm as cm
import seaborn as sns

def tsne_plot_similar_words(labels, embedding_clusters, word_clusters, a=0.7,
                            filename="전체_clustering.png"):
    """Scatter-plot 2-D word clusters, one colour per label, and save the figure.

    Args:
        labels: one legend label per cluster.
        embedding_clusters: one entry per cluster; each entry is indexed as
            ``[:, 0]`` / ``[:, 1]`` to obtain the x / y coordinates.
        word_clusters: one sequence of words per cluster, aligned with the
            rows of the matching entry in ``embedding_clusters``.
        a: alpha (opacity) of the scatter markers.
        filename: path the PNG is written to. The file is always saved in PNG
            format regardless of the extension.
    """
    # Apply the seaborn theme/font before the figure is created so that it
    # affects this figure too, not only the ones drawn afterwards.
    sns.set(font="Malgun Gothic", rc={"axes.unicode_minus":False})
    plt.figure(figsize=(16, 11))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:,0]
        y = embeddings[:,1]
        # `color=` rather than `c=`: a single RGBA row passed as `c` is
        # ambiguous and is value-mapped when a cluster has exactly 4 points.
        plt.scatter(x, y, color=color, alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.6, xy=(x[i], y[i]), xytext=(10, 10),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.grid(True)
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()


tsne_plot_similar_words(keys, embeddings_en_2d, word_clusters)

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

# t-SNE plot of a query word together with its most similar words
def tsne_plot(for_word, w2v_model):
    """Plot ``for_word`` and its 25 most similar words in 2-D (PCA, then t-SNE).

    The query word is drawn in blue and the similar words in black. The
    figure is created on the current matplotlib state; nothing is returned.

    Args:
        for_word: query word; looked up through ``w2v_model.wv``.
        w2v_model: model exposing ``wv.most_similar`` and list indexing on
            ``wv`` (both are used below).

    Raises:
        ValueError: if fewer than two vectors are available to plot.
    """
    sns.set_style('whitegrid')

    # Query word first, followed by its nearest neighbours.
    sim_words = w2v_model.wv.most_similar(for_word, topn=25)
    word_labels = [for_word] + [word for word, _score in sim_words]
    color_list = ['Blue'] + ['black'] * len(sim_words)

    # Single vectorised lookup instead of growing an array with np.append
    # inside a loop (which copies the whole array on every iteration).
    arrays = np.asarray(w2v_model.wv[word_labels], dtype='f')
    n_samples, dim_size = arrays.shape
    if n_samples < 2:
        raise ValueError(
            'need at least 2 word vectors to plot, got {}'.format(n_samples))

    #---------------------- Apply PCA and tsne to reduce dimension --------------

    # Pre-reduce to at most 10 principal components; PCA cannot keep more
    # components than there are samples or input dimensions.
    n_pca = min(10, n_samples, dim_size)
    reduced = PCA(n_components=n_pca).fit_transform(arrays)

    # Find the 2-D t-SNE coordinates; perplexity must stay below n_samples.
    # Kept from the original: global numpy print option, unrelated to the plot.
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0,
             perplexity=min(15, n_samples - 1)).fit_transform(reduced)

    # Gather everything needed for plotting in one frame
    df_plot = pd.DataFrame({'x': Y[:, 0],
                            'y': Y[:, 1],
                            'words_name': word_labels,
                            'words_color': color_list})

    #------------------------- tsne plot Python -----------------------------------

    # plot dots with color and position
    plt.rcParams["font.family"]='Malgun Gothic'
    plt.figure(figsize=(12, 7))
    plot_dot = sns.regplot(data=df_plot,
                     x="x",
                     y="y",
                     fit_reg=False,
                     marker="o",
                     scatter_kws={'s': 40,
                                  'facecolors': df_plot['words_color']
                                 }
                    )

    # Annotate every point with its (title-cased) word in the point's colour
    for row in df_plot.itertuples(index=False):
        plot_dot.text(row.x,
                      row.y,
                      '  ' + row.words_name.title(),
                      horizontalalignment='left',
                      verticalalignment='bottom',
                      size=15,
                      color=row.words_color,
                      weight='normal')

    # Leave a margin of 50 units around the outermost points
    plt.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    plt.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    plt.title('t-SNE visualization for word "{}"'.format(for_word.title()))

**Running Code**

In [None]:

# 'word' is presumably a placeholder: replace it with a token from the model vocabulary.
# NOTE(review): no `word2vec` object is created in the cells shown here, so the
# model trained above (`doc2vec`) is passed instead; tsne_plot only uses `.wv`.
tsne_plot(for_word='word', w2v_model=doc2vec)