In [16]:
import csv
import string
from time import time  # To time our operations
import multiprocessing
import gensim
from gensim.models import Word2Vec, KeyedVectors
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from adjustText import adjust_text

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JesterPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JesterPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Count the number of cores in a computer
cores = multiprocessing.cpu_count()
t = time()

words = []
sentences = []

with open("data/data_origin.DUMP", encoding="utf8") as tsv:

    for line in csv.reader(tsv, dialect="excel-tab"):
        sentence = line[2]

        # trim punctuation, make it lowercase
        sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

        cleared_words = []
        # print(sentence)

        all_words = nltk.word_tokenize(sentence)
        # print(all_words)

        for word in all_words:
            if word not in stopwords.words('turkish'):
                cleared_words.append(word)
                words.append(word)

        sentences.append(cleared_words)
        
print('Time to tokenize: {} mins'.format(round((time() - t) / 60, 2)))

Time to tokenize: 44.18 mins


In [None]:
# print(cleared_words)
w2v_model = Word2Vec(min_count=2,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores - 1)
t = time()

w2v_model.build_vocab(sentences, progress_per=100)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

w2v_model.init_sims(replace=True)

w2v_model.wv.save_word2vec_format('models/model_ep30.bin', binary=True)

In [None]:
model = gensim.models.KeyedVectors.load_word2vec_format('models/model_ep30.bin', binary=True)

In [15]:
vector_list = [w2v_model.wv[word] for word in cleared_words if word in w2v_model.wv.vocab]
# Zip the words together with their vector representations
word_vec_zip = zip(cleared_words, vector_list)

# Cast to a dict so we can turn it into a DataFrame
word_vec_dict = dict(word_vec_zip)
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
ip2location,-0.101466,0.037244,-0.020054,-0.01124,-0.055811,0.087108,0.051623,-0.042056,-0.066669,-0.018448,...,0.003971,0.060623,0.061555,0.015079,0.028624,-0.073406,-0.027791,-0.026536,0.005175,-0.026865
ziyaretçilerin,-0.157579,0.01582,0.056065,0.007031,-0.082792,0.060464,-0.031279,0.045727,0.025722,-0.03749,...,-0.007301,0.016531,-0.066774,0.003128,-0.126874,-0.099833,0.021727,-0.015166,0.014899,-0.071311
ip,-0.022103,0.127344,0.031323,-0.014199,-0.024021,0.013495,0.000649,0.039871,-0.08884,-0.081739,...,0.010576,0.072798,-0.0409,-0.105983,0.070004,-0.079335,-0.030621,-0.084071,0.030121,-0.012683
adresi,0.066656,-0.04585,0.011353,0.023845,-0.0183,-0.019117,0.049746,0.023449,-0.079959,-0.014143,...,-0.03891,0.117772,0.019391,-0.074028,-0.060765,-0.113108,0.088901,-0.002159,0.018068,-0.017342
ülke,0.059328,-0.0226,0.03343,-0.00018,-0.121959,-0.134915,-0.174018,-0.038452,-0.04877,0.017513,...,-0.078978,-0.008231,-0.014137,-0.064235,-0.001165,-0.096079,-0.064948,0.02583,-0.038948,-0.139727


In [None]:
# Initialize t-SNE
tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = 100)

# Use only 400 rows to shorten processing time
tsne_df = tsne.fit_transform(df[:400])

sns.set()
# Initialize figure
fig, ax = plt.subplots(figsize = (11.7, 8.27))
sns.scatterplot(tsne_df[:, 0], tsne_df[:, 1], alpha = 0.5)

# initialize list of texts
texts = []
words_to_plot = list(np.arange(0, 400, 10))

# Append words to list
for word in words_to_plot:
    texts.append(plt.text(tsne_df[word, 0], tsne_df[word, 1], df.index[word], fontsize = 14))
    
# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points = 0.4, force_text = 0.4, 
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))

plt.show()