# GloVe Dataset Preparation

This notebook loads the pretrained GloVe model and builds the embedding matrix for the training vocabulary.

## Import GloVe Dataset

In [1]:
## GloVe vectors: https://nlp.stanford.edu/projects/glove/
## Download glove.6B.zip from the link above, unzip it, and place glove.6B.100d.txt in the models/ directory

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Load the GloVe vectors straight from the original text file.
# The GloVe text file has no leading "<count> <dim>" header line, which the
# word2vec text format expects; `no_header=True` tells gensim to infer both
# values itself. This replaces the deprecated glove2word2vec() call and
# avoids parsing a converted copy of the file a second time.
glove_input_file = 'models/glove.6B.100d.txt'
word2vec_output_file = 'models/glove.6B.100d.word2vec.txt'
model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)

# Still write the word2vec-format copy that the conversion step used to
# produce, so anything else that reads this file keeps working.
model.save_word2vec_format(word2vec_output_file, binary=False)

  glove2word2vec(glove_input_file, word2vec_output_file)


(a) Check the number of words in the GloVe vocabulary

In [2]:
# Count the keys (words) for which the loaded model holds a vector.
glove_vocab = model.index_to_key
len(glove_vocab)

400000

## Import Train Dataset

In [3]:
from utils.text import tokenize
from datasets import load_dataset

# Load the Rotten Tomatoes dataset; `dataset` keeps the full load result.
dataset = load_dataset("rotten_tomatoes")

# Tokenize the training split only.
# NOTE(review): later cells read train_dataset["tokens"], so tokenize()
# presumably adds a "tokens" column — confirm in utils.text.
train_dataset = tokenize(dataset["train"])

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/juinlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     /Users/juinlee/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/juinlee/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Prepare Word Embedding Matrix

In [None]:
import numpy as np
from utils.text import get_context_average_embedding

# Build the training vocabulary and the embedding matrix that backs it.
#
# Row layout of `embedding_matrix`:
#   0        -> <PAD>  (mean of all GloVe vectors)
#   1        -> <UNK>  (mean of all GloVe vectors)
#   2 ..     -> training-vocabulary words, in sorted order
# Words present in GloVe get their pretrained vector. Words missing from
# GloVe get the average, over every occurrence in the training set, of
# get_context_average_embedding(word, sentence, model).

# Read the token column once and reuse it for both passes below.
train_tokens = train_dataset["tokens"]

vocab_train = {word for sentence in train_tokens for word in sentence}
oov_words = {word for word in vocab_train if word not in model}

embedding_dim = model.vector_size

# Initialize embedding matrix with zeros (Add 2 for <PAD> and <UNK>).
offset = 2
embedding_matrix = np.zeros((len(vocab_train) + offset, embedding_dim))

# Word to index dictionary for easy lookup.
# Enumerate the vocabulary in sorted order: iterating the raw set would give
# a different order on every kernel start (string hashes are randomized per
# process), making the exported matrix/index pair non-reproducible.
index_from_word = {word: i + offset for i, word in enumerate(sorted(vocab_train))}
index_from_word["<PAD>"] = 0
index_from_word["<UNK>"] = 1

# Mean vector of the pretrained GloVe embeddings.
# `model.vectors` is the model's own array of all vectors, so no per-word
# Python loop or copy is needed.
vectors = model.vectors
mean_vector = np.mean(vectors, axis=0)

# Populate embedding matrix with known words.
for word, i in index_from_word.items():
    if word in model:
        embedding_matrix[i] = model[word]


# Populate OOV words with context average embedding.
# First pass: sum the context embedding of every occurrence and count them.
oov_words_map = {}
for sentence in train_tokens:
    for word in sentence:
        if word in oov_words:
            oov_words_map[word] = oov_words_map.get(word, 0) + 1
            embedding_matrix[index_from_word[word]] += get_context_average_embedding(word, sentence, model)

# Second pass: turn each accumulated sum into a mean over its occurrences.
for word, occurrences in oov_words_map.items():
    embedding_matrix[index_from_word[word]] /= occurrences

# The reserved rows are written last so nothing above can overwrite them.
embedding_matrix[0] = mean_vector
embedding_matrix[1] = mean_vector

[-1.58165002  0.73010498  0.31462398  0.49088001 -1.30036497  0.18174499
  0.36901999 -0.25784999 -0.00873     0.482885    0.94368005  0.179148
 -0.18084499  0.25459     0.51633501 -0.28136    -0.10804     0.0409885
  0.02675501  0.73750001  0.38751501 -0.116468   -0.13833001 -0.04469065
  0.00855999  1.08362508  0.01229     0.17734501  0.302495   -0.94288999
  0.1841445   0.33857     0.21223499 -0.24423     0.56564498  0.84698999
  0.59086001  0.94275498  0.1679      0.47887501  0.99735498 -0.78103501
 -0.57097    -0.95722502  0.02849    -0.65161002  0.093355    0.34443551
 -0.101515   -0.078851    0.2157      0.17446475 -0.49163499 -0.31758499
 -1.19385004  0.14773001 -0.22755501  0.29258499  0.136235    0.42129499
 -0.60543501 -0.21785501 -0.98325002 -0.35978502  1.30250001  0.57874501
 -0.21573301 -0.13982449  1.10615003  0.33975002 -0.94596493  0.2096225
  0.12854199  0.12942749  0.33677    -0.70151997 -0.0560895  -0.376095
 -0.22258501 -1.44799995 -0.06923    -1.09092999 -0.04515

## Export GloVe Embedding Matrix

In [7]:
import json
from pathlib import Path

# Output locations for the two artifacts: row i of the saved matrix is the
# vector of the word that maps to i in the JSON index.
embedding_path = Path("models/glove_matrix.npy")
index_from_word_path = Path("models/glove_index.json")

# Embedding matrix, stored in NumPy's .npy format.
np.save(embedding_path, embedding_matrix)

# Word -> row-index lookup, serialized as a single JSON object.
with open(index_from_word_path, "w") as index_file:
    index_file.write(json.dumps(index_from_word))