In [1]:
import csv
import string
from time import time  # To time our operations
import multiprocessing
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JesterPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JesterPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
# Count the number of cores in a computer
cores = multiprocessing.cpu_count()

with open("data/data_origin.DUMP", encoding="utf8") as tsv:
    counter = 0

    sentences = []
    for line in csv.reader(tsv, dialect="excel-tab"):
        counter += 1
        sentence = line[2]

        # trim punctuation
        sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

        cleared_words = []
        # print(sentence)

        all_words = nltk.word_tokenize(sentence)
        # print(all_words)

        for word in all_words:
            if word not in stopwords.words('turkish'):
                cleared_words.append(word)

        sentences.append(cleared_words)

        if counter == 10000:
            break
            

In [35]:
#print(sentences)
w2v_model = Word2Vec(min_count=2,
                     window=2,
                     size=500,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores - 1)
t = time()

w2v_model.build_vocab(sentences, progress_per=100)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 22:39:16: collecting all words and their counts
INFO - 22:39:16: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:39:16: PROGRESS: at sentence #100, processed 1377 words, keeping 933 word types
INFO - 22:39:16: PROGRESS: at sentence #200, processed 2701 words, keeping 1771 word types
INFO - 22:39:16: PROGRESS: at sentence #300, processed 3818 words, keeping 2475 word types
INFO - 22:39:16: PROGRESS: at sentence #400, processed 5095 words, keeping 3163 word types
INFO - 22:39:16: PROGRESS: at sentence #500, processed 6415 words, keeping 3816 word types
INFO - 22:39:16: PROGRESS: at sentence #600, processed 7992 words, keeping 4349 word types
INFO - 22:39:16: PROGRESS: at sentence #700, processed 9303 words, keeping 4943 word types
INFO - 22:39:16: PROGRESS: at sentence #800, processed 10733 words, keeping 5537 word types
INFO - 22:39:16: PROGRESS: at sentence #900, processed 12145 words, keeping 6107 word types
INFO - 22:39:16: PROGRESS: at sentence #10

INFO - 22:39:16: PROGRESS: at sentence #8700, processed 113941 words, keeping 33769 word types
INFO - 22:39:16: PROGRESS: at sentence #8800, processed 115252 words, keeping 34107 word types
INFO - 22:39:16: PROGRESS: at sentence #8900, processed 116663 words, keeping 34392 word types
INFO - 22:39:16: PROGRESS: at sentence #9000, processed 118001 words, keeping 34691 word types
INFO - 22:39:16: PROGRESS: at sentence #9100, processed 119118 words, keeping 34898 word types
INFO - 22:39:16: PROGRESS: at sentence #9200, processed 120333 words, keeping 35138 word types
INFO - 22:39:16: PROGRESS: at sentence #9300, processed 121641 words, keeping 35364 word types
INFO - 22:39:16: PROGRESS: at sentence #9400, processed 122885 words, keeping 35545 word types
INFO - 22:39:16: PROGRESS: at sentence #9500, processed 124546 words, keeping 35903 word types
INFO - 22:39:16: PROGRESS: at sentence #9600, processed 126084 words, keeping 36250 word types
INFO - 22:39:16: PROGRESS: at sentence #9700, proc

Time to build vocab: 0.09 mins


In [36]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 22:39:21: training model with 5 workers on 14069 vocabulary and 500 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 22:39:22: worker thread finished; awaiting finish of 4 more threads
INFO - 22:39:22: worker thread finished; awaiting finish of 3 more threads
INFO - 22:39:22: worker thread finished; awaiting finish of 2 more threads
INFO - 22:39:22: worker thread finished; awaiting finish of 1 more threads
INFO - 22:39:22: worker thread finished; awaiting finish of 0 more threads
INFO - 22:39:22: EPOCH - 1 : training on 131166 raw words (78074 effective words) took 0.8s, 93104 effective words/s
INFO - 22:39:23: worker thread finished; awaiting finish of 4 more threads
INFO - 22:39:23: worker thread finished; awaiting finish of 3 more threads
INFO - 22:39:23: worker thread finished; awaiting finish of 2 more threads
INFO - 22:39:23: worker thread finished; awaiting finish of 1 more threads
INFO - 22:39:23: worker thread finished; awaiting finish of 0 more thread

INFO - 22:39:35: worker thread finished; awaiting finish of 1 more threads
INFO - 22:39:35: worker thread finished; awaiting finish of 0 more threads
INFO - 22:39:35: EPOCH - 17 : training on 131166 raw words (78012 effective words) took 0.8s, 97607 effective words/s
INFO - 22:39:36: worker thread finished; awaiting finish of 4 more threads
INFO - 22:39:36: worker thread finished; awaiting finish of 3 more threads
INFO - 22:39:36: worker thread finished; awaiting finish of 2 more threads
INFO - 22:39:36: worker thread finished; awaiting finish of 1 more threads
INFO - 22:39:36: worker thread finished; awaiting finish of 0 more threads
INFO - 22:39:36: EPOCH - 18 : training on 131166 raw words (77854 effective words) took 0.8s, 98691 effective words/s
INFO - 22:39:37: worker thread finished; awaiting finish of 4 more threads
INFO - 22:39:37: worker thread finished; awaiting finish of 3 more threads
INFO - 22:39:37: worker thread finished; awaiting finish of 2 more threads
INFO - 22:39:3

Time to train the model: 0.4 mins


In [47]:
# print(w2v_model.wv.most_similar(positive=["torres"]))
#print(w2v_model.wv['liverpool'])
print(len(w2v_model.wv['liverpool']))


500


In [49]:
from sklearn.manifold import TSNE

# Initialize t-SNE
tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = 100)

# Use only 400 rows to shorten processing time
tsne_df = tsne.fit_transform(df[:400])

sns.set()
# Initialize figure
fig, ax = plt.subplots(figsize = (11.7, 8.27))
sns.scatterplot(tsne_df[:, 0], tsne_df[:, 1], alpha = 0.5)

# Import adjustText, initialize list of texts
from adjustText import adjust_text
texts = []
words_to_plot = list(np.arange(0, 400, 10))

# Append words to list
for word in words_to_plot:
    texts.append(plt.text(tsne_df[word, 0], tsne_df[word, 1], df.index[word], fontsize = 14))
    
# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points = 0.4, force_text = 0.4, 
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))

plt.show()


IndexError: too many indices for array