# Word2Vec Dataset Preparation for OOV Words

This notebook loads the pretrained Word2Vec model and builds an embedding matrix for the training vocabulary, including embeddings for out-of-vocabulary (OOV) words.

## Load and process dataset

In [None]:
from utils.text import tokenize
from datasets import load_dataset

# Fetch the Rotten Tomatoes dataset; only its training split is used below.
dataset = load_dataset("rotten_tomatoes")

# Run the project tokenizer over the training split. Later cells read a
# "tokens" column from the result (presumably added by `tokenize` -- confirm
# against utils.text).
train_dataset = tokenize(dataset["train"])

## Import Word2Vec Model

In [2]:
import gensim.downloader
import os

w2v_model_path = "models/word2vec-google-news-300"

# Download the pretrained embeddings once and cache them on disk; later runs
# load the cached copy instead of downloading again.
if not os.path.exists(w2v_model_path):
    # Make sure the cache directory exists before the (slow) download starts,
    # so a missing directory cannot make the save step fail afterwards.
    w2v_model_dir = os.path.dirname(w2v_model_path)
    if w2v_model_dir:
        os.makedirs(w2v_model_dir, exist_ok=True)

    # Takes around 7mins
    model = gensim.downloader.load("word2vec-google-news-300")
    model.save(w2v_model_path)

    # Alternatively, download the pretrained binary manually (around 1.5GB) from
    # https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
    # and load it with:
    # model = gensim.models.KeyedVectors.load_word2vec_format(r'model\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin', binary=True)
else:
    # Cached copy is available: load it from disk. (The freshly downloaded
    # model above is used directly, avoiding a redundant reload.)
    model = gensim.models.KeyedVectors.load(w2v_model_path)

## Prepare Word Embedding Matrix

In [4]:
import numpy as np
from utils.text import get_context_average_embedding

# Read the token column once; it is scanned twice below (vocabulary + OOV pass).
train_sentences = train_dataset["tokens"]

# Training vocabulary, and the subset of it missing from the pretrained model.
vocab_train = {word for sentence in train_sentences for word in sentence}
oov_words = {word for word in vocab_train if word not in model}

embedding_dim = model.vector_size

# Initialize embedding matrix with zeros (Add 2 for <PAD> and <UNK>).
offset = 2
embedding_matrix = np.zeros((len(vocab_train) + offset, embedding_dim))

# Word to index dictionary for easy lookup.
# The vocabulary is sorted first: iterating a set of strings yields a different
# order in each Python process, which would make the exported matrix and index
# map differ between kernel restarts.
index_from_word = {word: i + offset for i, word in enumerate(sorted(vocab_train))}
index_from_word["<PAD>"] = 0
index_from_word["<UNK>"] = 1

# Mean vector of the pretrained w2v embeddings.
# `model.vectors` already holds every pretrained vector as one array, so there
# is no need to rebuild it key by key through a Python list.
vectors = model.vectors
mean_vector = np.mean(vectors, axis=0)

# Populate embedding matrix with known words.
for word, i in index_from_word.items():
    if word in model:
        embedding_matrix[i] = model[word]

# Populate OOV words with context average embedding: sum the context embedding
# of every occurrence, then divide by the occurrence count.
oov_words_map = {}
for sentence in train_sentences:
    for word in sentence:
        if word in oov_words:
            oov_words_map[word] = oov_words_map.get(word, 0) + 1
            embedding_matrix[index_from_word[word]] += get_context_average_embedding(word, sentence, model)

for word, count in oov_words_map.items():
    embedding_matrix[index_from_word[word]] /= count

# <PAD> (row 0) and <UNK> (row 1) both use the mean pretrained vector.
embedding_matrix[0] = mean_vector
embedding_matrix[1] = mean_vector

## Export Word2Vec Embedding Matrix

In [None]:
import json
from pathlib import Path

# Output locations for the embedding matrix and its word -> row-index lookup.
embedding_path = Path("models/embedding_matrix_oov.npy")
index_from_word_path = Path("models/index_from_word_oov.json")

# Persist the matrix in NumPy's .npy format.
np.save(embedding_path, embedding_matrix)

# Persist the lookup table as JSON text next to the matrix.
index_from_word_path.write_text(json.dumps(index_from_word))