Uncomment the following cell if you are running this notebook in Google Colab:

In [None]:
# !pip install pymorphy2==0.9.1
# !pip install gensim==4.1.2

In [None]:
import re
from typing import Dict, Iterable, List

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly
import pymorphy2
from IPython.display import display
from scipy.stats import pearsonr, spearmanr
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
from tqdm import tqdm_notebook

# Use the ggplot look for every matplotlib figure drawn below.
plt.style.use("ggplot")

# Fetch the NLTK resources this notebook relies on: the stop-word lists and the
# Punkt sentence-tokenizer models. Recent NLTK releases load the sentence
# tokenizer from the separate "punkt_tab" package, so request it as well;
# downloading it is harmless where it is not needed.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

In [None]:
# Seed NumPy's global RNG so the random index sampling below is repeatable;
# SEED is also passed to TSNE as its random_state.
SEED = 42
np.random.seed(SEED)

# Zipped CSV with the training sentences.
DATA_PATH = "./data/w2v_dataset.csv.zip"
# Tab-separated word-pair file read in the evaluation section.
EVAL_PATH = "./data/ru_simlex965.tsv"

Uncomment the following cell if you are running this notebook in Google Colab:

In [None]:
# !mkdir ./data
# !wget https://raw.githubusercontent.com/vadim0912/ML2023/main/lecture08/data/w2v_dataset.csv.zip -O $DATA_PATH
# !wget https://raw.githubusercontent.com/vadim0912/ML2023/main/lecture08/data/ru_simlex965.tsv -O $EVAL_PATH

# Dataset

In [None]:
# Load the corpus from the zipped CSV at DATA_PATH.
df = pd.read_csv(DATA_PATH)

# Preview the first rows.
df.head()

In [None]:
# Histogram of sentence lengths in characters; the trailing ";" hides the
# notebook's text output for the cell.
df.sentence.str.len().hist(bins=200);

In [None]:
# Number of rows per value of the "language" column.
df.language.value_counts()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Raw sentences as an array; this is the input to the preprocessing below.
corpus = df.sentence.values

# Preprocessing

In [None]:
# Frequency of every lower-cased character across all sentences.
lowercase_chars = df.sentence.apply(lambda sentence: list(sentence.lower()))
char2count = lowercase_chars.explode().value_counts()

# All observed characters in one string, in value_counts order; useful for
# hand-picking the alphabet used by the tokenizer.
"".join(char2count.index)

In [None]:
# Maximal runs of the hand-picked alphabet; compiled once and reused.
_WORD_PATTERN = re.compile("[оаеитнсврмлкдпузябгчіьыжхйшцющъoэфєёї]+")


def tokenize(text: str) -> List[str]:
    """Lower-case *text* and return every maximal run of alphabet characters.

    Anything outside the hard-coded character class (digits, punctuation,
    whitespace, other letters) acts as a separator and is dropped.
    """
    return [match.group() for match in _WORD_PATTERN.finditer(text.lower())]

In [None]:
# NLTK's Russian stop-word list, stored as a set for fast membership tests.
# NOTE(review): only the Russian list is loaded, while the examples further
# down also query Ukrainian words — confirm whether other languages need
# filtering too.
stopword_set = set(nltk.corpus.stopwords.words("russian"))

In [None]:
# Shared pymorphy2 analyzer, created once for the whole notebook.
lemmatizer = pymorphy2.MorphAnalyzer()

# Memo of token -> normal form, filled lazily by lemmatize().
lemmatizer_cache: Dict[str, str] = {}


def lemmatize(token: str) -> str:
    """Return the normal form of *token*, or *token* itself if it is unknown.

    Only words the analyzer reports as known are lemmatized (using the first
    parse); unknown words are passed through untouched. Every result is
    memoized in ``lemmatizer_cache``.
    """
    # Consult the memo first so repeated tokens skip the analyzer entirely.
    cached = lemmatizer_cache.get(token)
    if cached is not None:
        return cached

    if lemmatizer.word_is_known(token):
        lemma = lemmatizer.parse(token)[0].normal_form
    else:
        lemma = token
    lemmatizer_cache[token] = lemma
    return lemma

In [None]:
def prepare_sentence_dataset(documents: Iterable[str]) -> List[List[str]]:
    """Turn raw documents into lemmatized, stop-word-free token lists.

    Each document is split into sentences with NLTK; every sentence is
    tokenized, lemmatized and stripped of stop words. One list is emitted per
    sentence, including an empty list when nothing survives the filtering.
    """
    # NOTE(review): sent_tokenize is called without a language argument, so
    # NLTK's default sentence model is used — confirm that suits this corpus.
    result = []
    for doc in tqdm_notebook(documents):
        for raw_sentence in nltk.sent_tokenize(doc):
            lemmas = map(lemmatize, tokenize(raw_sentence))
            kept = [lemma for lemma in lemmas if lemma not in stopword_set]
            result.append(kept)
    return result

In [None]:
# Preprocess the whole corpus into per-sentence token lists.
sentence_dataset = prepare_sentence_dataset(corpus)

In [None]:
# Number of sentences produced by preprocessing.
len(sentence_dataset)

In [None]:
# A raw document, shown for comparison with the preprocessed output.
corpus[4]

In [None]:
# Preprocessed token list at position 4.
# NOTE(review): sentence_dataset has one entry per sentence, not per document,
# so this lines up with corpus[4] only if documents 0-4 each yield exactly one
# sentence.
sentence_dataset[4]

# Word2Vec training

In [None]:
# Create the gensim Word2Vec model; the vocabulary is built and the model is
# trained in the cells below.
# NOTE(review): per the gensim docs these settings should mean 100-dimensional
# vectors, CBOW (sg=0), a context window of 5, words seen fewer than 5 times
# dropped, and 20 negative samples — confirm against the installed version.
word2vec = gensim.models.Word2Vec(
    vector_size=100, sg=0, window=5, min_count=5, negative=20
)

In [None]:
# Build the model's vocabulary from the prepared sentences before training.
word2vec.build_vocab(sentence_dataset)

In [None]:
# Vocabulary size after build_vocab.
len(word2vec.wv.index_to_key)

In [None]:
%%time

# Train on the prepared sentences for 30 epochs; the trailing ";" hides the
# return value in the notebook output.
word2vec.train(sentence_dataset, total_examples=word2vec.corpus_count, epochs=30);

In [None]:
# Sanity check: nearest neighbours of one common word in the trained space.
word2vec.wv.most_similar("мама")

In [None]:
# Probe words whose nearest neighbours are printed as tables.
test_words = ["можливість", "чоловік", "возможность", "мужчина"]

for word in test_words:
    print(word)
    neighbors = word2vec.wv.most_similar(word)
    table = tabulate(neighbors, tablefmt="orgtbl", headers=("neighbor", "score"))
    # Blank line after each table keeps consecutive words visually separated.
    print(table, end="\n\n")

# Visualization

In [None]:
# Vocabulary as a NumPy array so it can be indexed with integer arrays below.
index2word = np.array(word2vec.wv.index_to_key)

In [None]:
# Trained word-vector matrix; rows are looked up below through
# word2vec.wv.key_to_index and paired with index2word by position.
embeddings = word2vec.wv.vectors

In [None]:
# Dimensions of the word-vector matrix.
embeddings.shape

In [None]:
# Pick up to 2000 distinct vocabulary positions for the t-SNE plot. Sampling
# without replacement avoids plotting the same word several times, and the
# min() keeps this valid for vocabularies smaller than 2000 words.
ids = np.random.choice(
    index2word.size, size=min(2000, index2word.size), replace=False
)

In [None]:
# Project the sampled word vectors to 2-D with t-SNE, seeded with SEED.
sampled_embeddings = embeddings[ids]
embeddings_reduced = TSNE(n_components=2, random_state=SEED).fit_transform(
    sampled_embeddings
)

In [None]:
def plot_tsne_embeddings(embeddings: np.ndarray, annotations: np.ndarray) -> None:
    """Display an 800x800 plotly scatter plot of 2-D points.

    Column 0 of *embeddings* is used as x and column 1 as y; *annotations*
    is attached as the per-point text.
    """
    marker_style = {
        "colorscale": "Viridis",
        "size": 6,
        "line": {"width": 0.5},
        "opacity": 0.75,
    }
    points = plotly.graph_objs.Scattergl(
        x=embeddings[:, 0],
        y=embeddings[:, 1],
        name="Embedding",
        mode="markers",
        marker=marker_style,
        text=annotations,
    )

    figure = plotly.graph_objs.Figure(
        data=[points],
        layout={
            "title": "Word2Vec 2D TSNE Embeddings",
            "yaxis": {"zeroline": False},
            "xaxis": {"zeroline": False},
            "hovermode": "closest",
            "width": 800,
            "height": 800,
        },
    )
    display(figure)

In [None]:
# Plot the 2-D word vectors, annotating each point with its word.
plot_tsne_embeddings(embeddings_reduced, index2word[ids])

# Sentence Embeddings

In [None]:
def embed_text(
    text: Iterable[str], word2index: Dict[str, int], word_embeddings: np.ndarray
) -> np.ndarray:
    """Embed a tokenized text as the mean of its word vectors.

    Args:
        text: Tokens of the text.
        word2index: Maps a word to its row in ``word_embeddings``.
        word_embeddings: 2-D array with one row per word.

    Returns:
        Array of shape ``(1, dim)``: the mean vector of all tokens that are in
        ``word2index`` and not in the module-level ``stopword_set``, or zeros
        when no token qualifies.
    """
    rows = [
        word2index[word]
        for word in text
        if word in word2index and word not in stopword_set
    ]

    if not rows:
        # Keep the fallback in the embeddings' dtype so that concatenating the
        # results does not silently upcast the whole matrix.
        return np.zeros((1, word_embeddings.shape[1]), dtype=word_embeddings.dtype)

    return word_embeddings[rows].mean(axis=0, keepdims=True)

In [None]:
# Word -> row index mapping of the trained vocabulary.
word2index = word2vec.wv.key_to_index

In [None]:
# One token list per document, preprocessed the same way as the Word2Vec
# training data: lemmatize first, then drop stop words by their lemma.
talks = [
    [
        lemma
        for lemma in map(lemmatize, tokenize(text))
        if lemma not in stopword_set
    ]
    for text in corpus
]

In [None]:
# One mean-of-word-vectors row per talk, stacked into a single matrix.
talk_vectors = [embed_text(talk, word2index, embeddings) for talk in talks]
talk2vec = np.concatenate(talk_vectors, axis=0)

In [None]:
# Sample talks, not vocabulary entries: the indices select rows of talk2vec
# and df.sentence, so they must be drawn from the number of talks. Sampling
# without replacement avoids duplicate points in the plot.
num_talks = talk2vec.shape[0]
ids = np.random.choice(num_talks, size=min(10_000, num_talks), replace=False)

# Project the sampled talk vectors to 2-D with t-SNE, seeded with SEED.
talk2vec_reduced = TSNE(n_components=2, random_state=SEED).fit_transform(talk2vec[ids])

# Annotate each point with the raw sentence it was computed from.
plot_tsne_embeddings(talk2vec_reduced, df.sentence.values[ids])

# Evaluation

In [None]:
# Load the tab-separated evaluation pairs.
eval_set = pd.read_csv(EVAL_PATH, sep="\t")

# Give the columns the fixed names used by the cells below.
# NOTE(review): read_csv is called with its default header handling — confirm
# the file's first line is a header and not a data row.
eval_set.columns = ["word1", "word2", "human_score"]

In [None]:
def _both_in_vocab(row):
    """Tell whether both words of an evaluation row are in the vocabulary."""
    return (row["word1"] in word2index) and (row["word2"] in word2index)


# Keep only the pairs the model can score, then renumber the rows.
mask = eval_set.apply(_both_in_vocab, axis=1)

eval_set = eval_set.loc[mask].reset_index(drop=True)

In [None]:
def _pair_similarity(row):
    """Cosine similarity between the vectors of the two words in one row."""
    # Double brackets keep each lookup 2-D, as cosine_similarity expects.
    first = embeddings[[word2index[row["word1"]]]]
    second = embeddings[[word2index[row["word2"]]]]
    return cosine_similarity(first, second)[0][0]


eval_set["model_score"] = eval_set.apply(_pair_similarity, axis=1)

In [None]:
# Scatter of model similarity (x) against human score (y).
plt.scatter(eval_set["model_score"], eval_set["human_score"], alpha=0.8);

In [None]:
# Pearson correlation between model and human scores.
pearsonr(eval_set["model_score"], eval_set["human_score"])

In [None]:
# Spearman rank correlation between model and human scores.
spearmanr(eval_set["model_score"], eval_set["human_score"])

In [None]:
# The last 20 rows after sorting by human_score, with the model_score column
# shaded by a background gradient for comparison.
eval_set.sort_values("human_score").tail(20).style.background_gradient(
    subset=["model_score"]
)

# Appendix
* FastText: https://arxiv.org/abs/1607.01759
* Byte Pair Encoding:
    * https://arxiv.org/abs/1508.07909
    * https://www.derczynski.com/papers/archive/BPE_Gage.pdf
* Stop Using Word2Vec: https://multithreaded.stitchfix.com/blog/2017/10/18/stop-using-word2vec/