In [4]:
!wget http://lsa.tmit.bme.hu/files/wiki_train_doc2vec_text.txt

--2023-11-07 12:00:37--  http://lsa.tmit.bme.hu/files/wiki_train_doc2vec_text.txt
Resolving lsa.tmit.bme.hu (lsa.tmit.bme.hu)... 152.66.246.99
Connecting to lsa.tmit.bme.hu (lsa.tmit.bme.hu)|152.66.246.99|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 243973232 (233M) [text/plain]
Saving to: ‘wiki_train_doc2vec_text.txt’


2023-11-07 12:01:08 (7.74 MB/s) - ‘wiki_train_doc2vec_text.txt’ saved [243973232/243973232]



In [14]:
# Print the working directory the corpus was downloaded into; the corpus
# reader below refers to the file by an absolute path under /content.
!pwd

/content


In [15]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """Restartable iterable over a text corpus, yielding one token list per line.

    Every call to ``__iter__`` re-opens the file and streams it from the start,
    so the corpus can be traversed any number of times without ever holding
    the whole file in memory.

    Parameters
    ----------
    corpus_path : str, optional
        Path of the corpus file: one document per line, tokens separated by
        whitespace.  Defaults to the file downloaded earlier in the notebook.
    """

    def __init__(self, corpus_path='/content/wiki_train_doc2vec_text.txt'):
        self.corpus_path = corpus_path

    def __iter__(self):
        # Open the path directly: the corpus is a user download, not one of the
        # files addressed by gensim.test.utils.datapath.
        # The ``with`` block closes the handle when iteration ends (or the
        # generator is discarded); the encoding is pinned so non-ASCII text is
        # decoded the same way regardless of the platform's locale.
        with open(self.corpus_path, encoding='utf-8') as corpus_file:
            for line in corpus_file:
                # one document per line -> list of preprocessed tokens
                yield utils.simple_preprocess(line)

In [16]:
import gensim.models

# Stream the corpus from disk through MyCorpus and train a Word2Vec model on it.
# No hyperparameters are overridden here, so gensim's defaults apply.
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

In [29]:
# Show the 20 entries the trained model ranks as most similar to 'király',
# one (word, score) pair per line.
for neighbour in model.wv.most_similar(
    positive=['király'],
    topn=20,
):
    print(neighbour)

('királynak', 0.8022586107254028)
('királyt', 0.7918736338615417)
('királlyal', 0.7583029270172119)
('herceg', 0.7438232898712158)
('királyné', 0.7249489426612854)
('trón', 0.7246353626251221)
('ulászló', 0.7228331565856934)
('királynő', 0.7227612733840942)
('királyhoz', 0.7164027690887451)
('uralkodó', 0.716214120388031)
('henrik', 0.713676393032074)
('császár', 0.7059786915779114)
('trónt', 0.7007117867469788)
('ferdinánd', 0.6963745951652527)
('eduárd', 0.6831820011138916)
('fejedelem', 0.68222576379776)
('királyok', 0.6734300851821899)
('szultán', 0.668687105178833)
('trónra', 0.665826141834259)
('hunyadi', 0.6651430726051331)


In [None]:
from sklearn.decomposition import IncrementalPCA    # initial reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model, max_words=None):
    """Project the model's word vectors to two dimensions with t-SNE.

    Parameters
    ----------
    model
        Object exposing ``model.wv.vectors`` (one row per word) and
        ``model.wv.index_to_key`` (the matching words, in the same order).
    max_words : int or None, optional
        When given, only the first ``max_words`` vectors/words are embedded.
        This keeps t-SNE tractable for large vocabularies.  ``None`` (the
        default) embeds every vector, exactly as before.
        NOTE(review): gensim normally stores the vocabulary in descending
        frequency order, so the cap would keep the most frequent words —
        confirm for your model.  t-SNE's perplexity setting may also need the
        sample count to exceed it; see the scikit-learn TSNE documentation.

    Returns
    -------
    tuple
        ``(x_vals, y_vals, labels)``: two lists holding the first and second
        t-SNE coordinate of each embedded word, and a numpy array with the
        corresponding words.

    Raises
    ------
    ValueError
        If ``max_words`` is given but is smaller than 1.
    """
    num_dimensions = 2  # t-SNE output size; exactly two coordinates are returned

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    if max_words is not None:
        if max_words < 1:
            raise ValueError("max_words must be a positive integer or None")
        # rows and labels share their order, so slicing both keeps them aligned
        vectors = vectors[:max_words]
        labels = labels[:max_words]

    # reduce using t-SNE (fixed random_state for repeatable layouts)
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


# Compute the 2-D t-SNE coordinates and labels for the trained model.
# NOTE(review): this embeds every vector in the model; for a large vocabulary
# the t-SNE step may take a very long time — confirm the runtime is acceptable.
x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as text markers at their 2-D coordinates using plotly.

    Parameters
    ----------
    x_vals, y_vals
        Coordinates of each word.
    labels
        Text shown at each coordinate.
    plot_in_notebook : bool, optional
        If truthy (default), initialise plotly's notebook mode and render
        inline with ``iplot``; otherwise hand the data to ``plot`` with the
        file name 'word-embedding-plot.html'.
    """
    # plotly is imported lazily so it is only required when this renderer runs
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the points with matplotlib and annotate a random subset.

    Parameters
    ----------
    x_vals, y_vals
        Coordinates of each point (indexable, same length as ``labels``).
    labels
        Text label of each point.
    num_labels : int, optional
        Maximum number of points to annotate (default 25).  If fewer points
        exist, all of them are annotated instead of raising ``ValueError``.

    Notes
    -----
    The subset is drawn from a private generator seeded with 0, so the same
    points are chosen on every call and the process-wide ``random`` state is
    left untouched.
    NOTE(review): ``plt.show()`` is not called here; outside an environment
    that renders figures automatically the plot may not appear — confirm.
    """
    import matplotlib.pyplot as plt
    import random

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Private RNG seeded with 0: the same selection as seeding the global
    # generator with 0, without the global side effect.
    rng = random.Random(0)

    # Never request more samples than there are points (random.sample would
    # raise ValueError), and treat a negative request as "label nothing".
    sample_size = max(0, min(num_labels, len(labels)))

    #
    # Label a randomly subsampled set of data points
    #
    selected_indices = rng.sample(range(len(labels)), sample_size)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Probe for an IPython/Jupyter session: calling get_ipython() succeeds there
# and raises when the name is not available.
try:
    get_ipython()
except Exception:
    running_in_ipython = False
else:
    running_in_ipython = True

# plotly renderer inside IPython, matplotlib renderer everywhere else
plot_function = plot_with_plotly if running_in_ipython else plot_with_matplotlib

plot_function(x_vals, y_vals, labels)