# 0. Dataset preparation

## 0.1 Import dataset from part 0

In [1]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset through `datasets.load_dataset` and keep a
# handle on its "train" split, which is the only split used in this notebook section.
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset["train"]

In [2]:
# Display the training split's summary (feature names and number of rows).
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

## 0.2 Process text data

In [3]:
from utils.text import tokenize

# Replace the training split with the result of the project's `tokenize` helper.
# Later cells read train_dataset["tokens"], so the helper presumably adds a
# "tokens" column holding one token list per example -- confirm in utils/text.py.
# NOTE(review): this rebinds `train_dataset`, so re-running the cell tokenizes the
# already-tokenized dataset; verify that `tokenize` is safe to apply twice.
train_dataset = tokenize(train_dataset)

[nltk_data] Downloading package punkt to /home/yuri/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to /home/yuri/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/yuri/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# 1. Preparing Word Embeddings

- using `Word2Vec`

Load a pretrained Word2Vec model (trained on the Google News dataset, which contains about 100 billion words).

In [4]:
import gensim.downloader
import os

# Local cache location for the pretrained Word2Vec KeyedVectors.
w2v_model_path = "models/word2vec-google-news-300"

if os.path.exists(w2v_model_path):
    # A previous run already cached the model locally: load it from disk.
    w2v_model = gensim.models.KeyedVectors.load(w2v_model_path)
else:
    # First run: download the pretrained embeddings (takes around 7 minutes) and
    # keep using the returned object directly instead of re-reading it from disk.
    w2v_model = gensim.downloader.load("word2vec-google-news-300")

    # Make sure the target directory exists; otherwise save() fails on a fresh
    # checkout that has no `models/` directory yet.
    os.makedirs(os.path.dirname(w2v_model_path), exist_ok=True)
    w2v_model.save(w2v_model_path)

    # Alternatively, download the pretrained model manually (around 1.5GB) from
    # https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
    # and load it with (forward slashes avoid invalid "\G" escape sequences):
    # w2v_model = gensim.models.KeyedVectors.load_word2vec_format("model/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin", binary=True)

In [5]:
# Number of keys in the pretrained Word2Vec model's vocabulary.
len(w2v_model.index_to_key)

3000000

## (a) size of vocabulary from the training data.

In [17]:
# Unique tokens found across all tokenized training examples.
vocab_train = {token for tokens in train_dataset["tokens"] for token in tokens}

# Vocabulary size of the training data.
len(vocab_train)

16331

## (b) number of OOV (out-of-vocabulary)
- (words that appear in the training data but not in the Word2Vec dictionary)

In [19]:
# Training-vocabulary tokens that are not present in the pretrained Word2Vec model.
oov_words = [token for token in vocab_train if token not in w2v_model]

# Report the OOV count, then a sample of (up to) the first 20 OOV words.
print(len(oov_words), oov_words[:20], sep="\n")

1445
['kosminsky', 'desplechin', 'reeses', 'alcatraz', 'weissman', 'esteticamente', 'premissa', 'bugsy', 'manhunter', 'khouri', 'mctiernan', 'francamente', 'kubrick', 'glamour', 'pianista', 'shamu', 'vittorio', 'weigel', 'clements', 'mulan']


## (c) initialize an embedding matrix
- Handling of OOV words: assign each OOV word the mean of all pretrained Word2Vec vectors

In [10]:
import numpy as np

embedding_dim = w2v_model.vector_size  # 300

# Size of the training vocabulary. Defined here so this cell does not depend on a
# variable left over from another (possibly deleted) cell.
vocab_size_train = len(vocab_train)

# Initialize embedding matrix (train_data_vocab_size+1 X embedding dimension)
# `+1` reserves row 0 for OOV words in val and test set
embedding_matrix = np.zeros((vocab_size_train + 1, embedding_dim))

# Vocab-to-index dict (indices start at 1; 0 is the reserved OOV row).
# The vocabulary is sorted because set iteration order for strings can change
# between kernel sessions; sorting makes the word -> row mapping reproducible.
vocab_index_train = {vocab: i + 1 for i, vocab in enumerate(sorted(vocab_train))}
vocab_index_train[""] = 0  # For OOV words in val and test set

# Mean vector of the pretrained w2v.
# `w2v_model.vectors` is the full embedding array, with rows in `index_to_key`
# order, so there is no need to rebuild it one key at a time through a Python list.
w2v_vectors = w2v_model.vectors
mean_vector = np.mean(w2v_vectors, axis=0)

In [17]:
# Fill in embedding matrix: one row per training-vocabulary word.
# Membership is tested directly against the pretrained model (the same condition
# that defines the OOV list) instead of scanning the `oov_words` list for every
# word, which was a linear search per lookup and tied this cell to another cell's state.
for vocab in vocab_train:
    # Use the pretrained vector when available; otherwise fall back to the mean vector.
    embedding_matrix[vocab_index_train[vocab]] = (
        w2v_model[vocab] if vocab in w2v_model else mean_vector
    )

# Assign mean vector for OOV words in val and test set (reserved "" entry, row 0)
embedding_matrix[vocab_index_train[""]] = mean_vector

**Export Embedding Matrix & vocab_index_train**

In [18]:
from utils.file import save_to_local_file
from pathlib import Path

# Destination files for the two exported artefacts.
embedding_path = Path("models/embedding_matrix.pckl")
vocab_to_index_path = Path("models/embedding_matrix_train_dataset_vocab_to_index.pckl")

# Save the embedding matrix first, then the vocab-to-index mapping, using the
# project's `save_to_local_file` helper (presumably pickles the object, judging by
# the .pckl extension -- confirm in utils/file.py).
for artifact_path, artifact in (
    (embedding_path, embedding_matrix),
    (vocab_to_index_path, vocab_index_train),
):
    save_to_local_file(artifact_path, artifact)

Saving object to local...
Object saved to local!
Saving object to local...
Object saved to local!
