In [None]:
import csv
import string
import multiprocessing
import gensim
import nltk
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import logging

from time import time
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from adjustText import adjust_text

# Configure the root logger: INFO level, records shaped "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [None]:
# Fetch the NLTK resources the tokenization cell relies on: the Punkt
# tokenizer data used by nltk.word_tokenize and the stopword lists.
# NLTK 3.8.2+ loads the tokenizer from 'punkt_tab' instead of the pickled
# 'punkt' package, so request both; on older releases the unknown id is only
# reported by nltk.download() (it returns False rather than raising).
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Count the number of cores in a computer
cores = multiprocessing.cpu_count()
t = time()

# `words` is the flat list of every kept token; `sentences` holds the same
# tokens grouped per input row.
words = []
sentences = []

# Load the stopword list once. It used to be re-evaluated for every single
# token, and a set gives constant-time membership tests instead of a list scan.
turkish_stopwords = set(stopwords.words('turkish'))

# Translation table that deletes ASCII punctuation; built once, reused per row.
punctuation_table = str.maketrans('', '', string.punctuation)

# newline="" leaves line-ending handling to the csv module, as its docs ask.
with open("data/data_origin.DUMP", encoding="utf8", newline="") as tsv:

    for line in csv.reader(tsv, dialect="excel-tab"):
        # The text is read from the third column; skip blank or truncated
        # rows, which would otherwise raise IndexError on line[2].
        if len(line) < 3:
            continue

        # trim punctuation, make it lowercase
        sentence = line[2].translate(punctuation_table).lower()

        all_words = nltk.word_tokenize(sentence)

        # Drop stopwords, then record the survivors both flat and per row.
        cleared_words = [word for word in all_words if word not in turkish_stopwords]
        words.extend(cleared_words)
        sentences.append(cleared_words)

print('Time to tokenize: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Load word vectors from the binary word2vec-format file on disk.
w2v_model = KeyedVectors.load_word2vec_format(
    'models/model_ep30.bin',
    binary=True,
)

In [None]:
# Keep only tokens the model has a vector for, so each word stays paired with
# its own vector. Filtering the vectors alone and then zipping them against the
# unfiltered `words` shifts every pairing after the first unknown token.
# Membership and lookup go through the KeyedVectors object directly, which
# avoids the `.wv` / `.vocab` attributes that gensim 4 removed.
known_words = [word for word in words if word in w2v_model]
vector_list = [w2v_model[word] for word in known_words]

# Zip the words together with their vector representations
word_vec_zip = zip(known_words, vector_list)

# Cast to a dict so we can turn it into a DataFrame (repeated words collapse
# into a single row, indexed by the word itself)
word_vec_dict = dict(word_vec_zip)
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')
df.head(5)

In [None]:
# Use at most 400 rows to shorten processing time; fewer if df is smaller
n_points = min(400, len(df))

# Initialize t-SNE. scikit-learn requires perplexity < n_samples, so clamp it
# when fewer than 101 rows are available (unchanged at 100 otherwise).
tsne = TSNE(n_components = 2, init = 'random', random_state = 10,
            perplexity = min(100, max(1, n_points - 1)))

# 2-D embedding as an (n_points, 2) array, row-aligned with df.index
tsne_df = tsne.fit_transform(df.iloc[:n_points])

sns.set()
# Initialize figure
fig, ax = plt.subplots(figsize = (11.7, 8.27))
# x / y must be passed by keyword: seaborn 0.12+ rejects positional data vectors
sns.scatterplot(x = tsne_df[:, 0], y = tsne_df[:, 1], alpha = 0.5, ax = ax)

# initialize list of texts
texts = []
# Label every 10th embedded point; bounded by the rows actually embedded
words_to_plot = list(np.arange(0, n_points, 10))

# Append words to list
for word in words_to_plot:
    texts.append(ax.text(tsne_df[word, 0], tsne_df[word, 1], df.index[word], fontsize = 14))

# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points = 0.4, force_text = 0.4,
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5),
            ax = ax)

plt.show()