# 0. Dataset preparation

## 0.1 Import dataset from part 0

In [1]:
import pandas as pd

train_df = pd.read_csv("datasets/train.csv")
val_df = pd.read_csv("datasets/val.csv")
test_df = pd.read_csv("datasets/test.csv")

In [2]:
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


## 0.2 Process text data

In [3]:
from utils.text import preprocessText

train_split = preprocessText(train_df)
val_split = preprocessText(val_df)
test_split = preprocessText(test_df)

[nltk_data] Downloading package punkt to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# 1. Preparing Word Embeddings

- using `Word2Vec`

load a pretrained word2vec model (trained on Google News dataset contained about 100 billion words)

In [4]:
import gensim
import os

# Download pretrained embeddings model if haven't done so
if not os.path.exists("models/googleNews_w2v_model"):
  # Takes around 7mins
  googleNews_w2v_model = gensim.downloader.api.load('models/word2vec-google-news-300')
  googleNews_w2v_model.save("googleNews_w2v_model")

  # Alternatively, download from the link below
  # googleNews_w2v_model = gensim.models.KeyedVectors.load_word2vec_format('model\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin', binary=True)  
  # download the pretrained model from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g (take around 1.5GB)

googleNews_w2v_model = gensim.models.KeyedVectors.load("models/googleNews_w2v_model")

In [5]:
len(googleNews_w2v_model.index_to_key)

3000000

## (a) size of vocabulary from the training data.

In [6]:
trainDataset_words = [word for sentence in train_split for word in sentence]
trainDataset_vocabs = set(trainDataset_words)
trainDataset_vocab_size = len(trainDataset_vocabs)

print(trainDataset_vocab_size)

16331


## (b) number of OOV (out-of-vocabulary)
- (those words appeared in the training data but not in the Word2vec dictionary)

In [7]:
oov_words = [word for word in trainDataset_vocabs if word not in googleNews_w2v_model]
oov_words_size = len(oov_words)

print(oov_words_size)
print(oov_words)

1445
['cativante', 'bruckheimer', 'javier', 'zhao', 'arrancar', 'peploe', 'chabrol', 'superlarge', 'gantz', 'brockovich', 'naturedness', 'villeneuve', 'friel', 'dafoe', 'wenders', 'revigorates', 'stallone', 'tolkien', 'foxworthy', 'desplat', 'fillm', 'joaquin', 'argento', 'ecks', 'condensada', 'granger', 'rhames', 'puttingly', 'carente', 'hitchens', 'laboriousness', 'diciness', 'herek', 'zellweger', 'murdock', 'kilmer', 'enga', 'intera', 'theatres', 'koyaanisqatsi', 'cotswolds', 'ricci', 'darwinian', 'sorimachi', 'auteil', 'breen', 'esquerdo', 'uberviolence', 'wendigo', 'petin', 'aviv', 'preciosista', 'superficiale', 'assistir', 'englishmen', 'welty', 'aborbing', 'orlean', 'fuelled', 'herrmann', 'preocupar', 'picpus', 'ouro', 'generaciones', 'whaley', 'kaige', 'zucker', 'neuwirth', 'jacquot', 'covardia', 'spielberg', 'frustrado', 'piesiewicz', 'djeinaba', 'crappola', 'jaglomized', 'aprovechar', 'deblois', 'gordy', 'repellantly', 'elfriede', 'xtc', 'callar', 'wisegirls', 'coriat', 'emot

## (c) initialize an embedding matrix
- handling of OOV words: using the mean vector from the pretrained w2v vector

In [10]:
import numpy as np

embedding_dim = googleNews_w2v_model.vector_size # 300

# Initialize embedding matrix (train_data_vocab_size+1 X embedding dimension)
# `+1` for OOV words in val and test set
embedding_matrix = np.zeros((trainDataset_vocab_size+1, embedding_dim))

# Vocab-to-index dict
trainDataset_vocab_index = {vocab: i+1 for i, vocab in enumerate(trainDataset_vocabs)}
trainDataset_vocab_index[""] = 0 # For OOV words in val and test set

# Mean vector of the pretrained w2v
w2v_vectors = np.array([googleNews_w2v_model[vocab] for vocab in googleNews_w2v_model.index_to_key])
mean_vector = np.mean(w2v_vectors, axis=0)


In [17]:
# Fill in embedding matrix
for vocab in trainDataset_vocabs:
    # Assign mean vector for OOV words
    if vocab in oov_words:
        embedding_matrix[trainDataset_vocab_index[vocab]] = mean_vector
    else:
        embedding_matrix[trainDataset_vocab_index[vocab]] = googleNews_w2v_model[vocab]

# Assign mean vector for OOV words in val and test set
embedding_matrix[trainDataset_vocab_index[""]] = mean_vector

**Export Embedding Matrix & trainDataset_vocab_index**

In [18]:
from utils.file import save_to_local_file
save_to_local_file("models/embedding_matrix.pckl", embedding_matrix)
save_to_local_file("models/embedding_matrix_train_dataset_vocab_to_index.pckl", trainDataset_vocab_index)

Saving object to local...
Object saved to local!
Saving object to local...
Object saved to local!
