In [1]:
import csv
import string
from time import time  # To time our operations
import multiprocessing
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JesterPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JesterPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Count the number of cores in a computer
cores = multiprocessing.cpu_count()

cleared_words = []
with open("data/data_origin.DUMP", encoding="utf8") as tsv:
    counter = 0

    sentences = []
    for line in csv.reader(tsv, dialect="excel-tab"):
        counter += 1
        sentence = line[2]

        # trim punctuation
        sentence = sentence.translate(str.maketrans('', '', string.punctuation)).lower()

        
        # print(sentence)

        all_words = nltk.word_tokenize(sentence)
        # print(all_words)

        for word in all_words:
            if word not in stopwords.words('turkish'):
                cleared_words.append(word)
    

        sentences.append(cleared_words)

        if counter == 100:
            break

In [12]:
#print(sentences)
w2v_model = Word2Vec(min_count=2,
                     window=2,
                     size=20,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores - 1)
t = time()

w2v_model.build_vocab(sentences, progress_per=100)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 23:12:05: collecting all words and their counts
INFO - 23:12:05: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 23:12:05: collected 933 word types from a corpus of 137700 raw words and 100 sentences
INFO - 23:12:05: Loading a fresh vocabulary
INFO - 23:12:05: effective_min_count=2 retains 933 unique words (100% of original 933, drops 0)
INFO - 23:12:05: effective_min_count=2 leaves 137700 word corpus (100% of original 137700, drops 0)
INFO - 23:12:05: deleting the raw counts dictionary of 933 items
INFO - 23:12:05: sample=6e-05 downsamples 933 most-common words
INFO - 23:12:05: downsampling leaves estimated 38573 word corpus (28.0% of prior 137700)
INFO - 23:12:05: estimated required memory for 933 words and 20 dimensions: 615780 bytes
INFO - 23:12:05: resetting layer weights


Time to build vocab: 0.01 mins


In [13]:
t = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 23:12:07: training model with 5 workers on 933 vocabulary and 20 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 23:12:07: worker thread finished; awaiting finish of 4 more threads
INFO - 23:12:07: worker thread finished; awaiting finish of 3 more threads
INFO - 23:12:07: worker thread finished; awaiting finish of 2 more threads
INFO - 23:12:07: worker thread finished; awaiting finish of 1 more threads
INFO - 23:12:07: worker thread finished; awaiting finish of 0 more threads
INFO - 23:12:07: EPOCH - 1 : training on 137700 raw words (38763 effective words) took 0.1s, 363510 effective words/s
INFO - 23:12:08: worker thread finished; awaiting finish of 4 more threads
INFO - 23:12:08: worker thread finished; awaiting finish of 3 more threads
INFO - 23:12:08: worker thread finished; awaiting finish of 2 more threads
INFO - 23:12:08: worker thread finished; awaiting finish of 1 more threads
INFO - 23:12:08: worker thread finished; awaiting finish of 0 more threads


INFO - 23:12:09: worker thread finished; awaiting finish of 1 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 0 more threads
INFO - 23:12:09: EPOCH - 17 : training on 137700 raw words (38773 effective words) took 0.1s, 525505 effective words/s
INFO - 23:12:09: worker thread finished; awaiting finish of 4 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 3 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 2 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 1 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 0 more threads
INFO - 23:12:09: EPOCH - 18 : training on 137700 raw words (38568 effective words) took 0.1s, 610758 effective words/s
INFO - 23:12:09: worker thread finished; awaiting finish of 4 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 3 more threads
INFO - 23:12:09: worker thread finished; awaiting finish of 2 more threads
INFO - 23:12

Time to train the model: 0.04 mins


In [15]:
# print(w2v_model.wv.most_similar(positive=["torres"]))
#print(w2v_model.wv['liverpool'])
#print(len(w2v_model.wv['liverpool']))


import pandas as pd

vector_list = [w2v_model.wv[word] for word in cleared_words if word in w2v_model.wv.vocab]
# Zip the words together with their vector representations
word_vec_zip = zip(cleared_words, vector_list)

# Cast to a dict so we can turn it into a DataFrame
word_vec_dict = dict(word_vec_zip)
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')
df.head(5)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
ngc,-0.209774,-0.241672,0.092565,-0.140927,0.152449,0.608804,0.020826,-0.055008,-0.293602,-0.183029,-0.216971,-0.391334,0.144403,0.085533,0.256235,0.148873,-0.08138,0.069558,0.151434,0.038945
5713,-0.216018,-0.24279,0.090713,-0.148366,0.153302,0.611608,0.005301,-0.031468,-0.321257,-0.17105,-0.19331,-0.369749,0.110494,0.086334,0.273616,0.176703,-0.065582,0.04045,0.159417,0.043743
başak,-0.093968,-0.103567,0.145338,-0.190256,-0.052438,0.569115,-0.067764,-0.176069,-0.101432,-0.08326,-0.294332,-0.420528,0.212798,0.123074,0.342216,0.060055,-0.207561,0.184594,0.127818,0.074506
takımyıldızı,-0.106712,-0.094184,0.146957,-0.188268,-0.059321,0.573527,-0.033603,-0.19619,-0.083539,-0.083332,-0.321583,-0.413458,0.198734,0.10735,0.347165,0.065343,-0.195594,0.167131,0.124381,0.069785
bölgesinde,-0.104716,-0.11271,0.163021,-0.189474,-0.064034,0.55886,-0.019822,-0.212901,-0.096824,-0.073751,-0.326998,-0.403558,0.174738,0.128722,0.353276,0.052133,-0.222858,0.153916,0.123988,0.061288


In [16]:
from sklearn.manifold import TSNE
import seaborn as sns

# Initialize t-SNE
tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = 100)

# Use only 400 rows to shorten processing time
tsne_df = tsne.fit_transform(df[:400])

sns.set()
# Initialize figure
fig, ax = plt.subplots(figsize = (11.7, 8.27))
sns.scatterplot(tsne_df[:, 0], tsne_df[:, 1], alpha = 0.5)

# Import adjustText, initialize list of texts
from adjustText import adjust_text
texts = []
words_to_plot = list(np.arange(0, 400, 10))

# Append words to list
for word in words_to_plot:
    texts.append(plt.text(tsne_df[word, 0], tsne_df[word, 1], df.index[word], fontsize = 14))
    
# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points = 0.4, force_text = 0.4, 
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))

plt.show()


NameError: name 'sns' is not defined