# 0. Dataset preparation

## 0.1 Import dataset from part 0

In [1]:
import pandas as pd

train_df = pd.read_csv("datasets/train.csv")
val_df = pd.read_csv("datasets/val.csv")
test_df = pd.read_csv("datasets/test.csv")

In [2]:
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


## 0.2 Process text data

In [3]:
import re
import nltk

nltk.download('punkt')
nltk.download('treebank')
nltk.download('punkt_tab')

def preprocessText(dataset):
    texts = []

    for i in range(0, len(dataset)):
        text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i]) #remove numbers and non-alphabetical symbols
        text = text.lower() # lower case
        text = text.strip()

        if isinstance(text, str):    
            tokens = nltk.tokenize.word_tokenize(text) 
        else:     
            print("Input is not a valid string.")
        #text = nltk.tokenize.word_tokenize(text) # tokenize
        
        texts.append(tokens)
        
    return texts

[nltk_data] Downloading package punkt to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
train_split = preprocessText(train_df)
val_split = preprocessText(val_df)
test_split = preprocessText(test_df)

# 1. Preparing Word Embeddings

- using `Word2Vec`

load a pretrained word2vec model (trained on Google News dataset contained about 100 billion words)

In [5]:
import gensim
import os

# Download pretrained embeddings model if haven't done so
if not os.path.exists("models/googleNews_w2v_model"):
  # Takes around 7mins
  googleNews_w2v_model = gensim.downloader.api.load('models/word2vec-google-news-300')
  googleNews_w2v_model.save("googleNews_w2v_model")

  # Alternatively, download from the link below
  # googleNews_w2v_model = gensim.models.KeyedVectors.load_word2vec_format('model\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin', binary=True)  
  # download the pretrained model from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g (take around 1.5GB)

googleNews_w2v_model = gensim.models.KeyedVectors.load("models/googleNews_w2v_model")

In [6]:
len(googleNews_w2v_model.index_to_key)

3000000

## (a) size of vocabulary from the training data.

In [7]:
trainDataset_words = [word for sentence in train_split for word in sentence]
trainDataset_vocabs = set(trainDataset_words)
trainDataset_vocab_size = len(trainDataset_vocabs)

print(trainDataset_vocab_size)

16331


## (b) number of OOV (out-of-vocabulary)
- (those words appeared in the training data but not in the Word2vec dictionary)

In [8]:
oov_words = [word for word in trainDataset_vocabs if word not in googleNews_w2v_model]
oov_words_size = len(oov_words)

print(oov_words_size)
print(oov_words)

1445
['iwai', 'meyjes', 'matheson', 'inuit', 'sequer', 'koepp', 'flck', 'flavour', 'clockstoppers', 'darwinian', 'testud', 'dario', 'carlen', 'rampling', 'avary', 'penotti', 'yiddish', 'phonce', 'molony', 'gadzooks', 'abrams', 'qatsi', 'sillified', 'hudlin', 'sonnenfeld', 'ayres', 'clearasil', 'hepburn', 'recoing', 'toolbags', 'balto', 'splittingly', 'giles', 'romething', 'apted', 'enfrentar', 'achronological', 'lohman', 'unlaughable', 'coriat', 'caddyshack', 'exporing', 'landbound', 'byron', 'colgate', 'shapiro', 'rdida', 'tambor', 'oscura', 'russos', 'mergulha', 'recurre', 'fresnadillo', 'haynes', 'scorsese', 'collosum', 'niro', 'kieran', 'alain', 'montied', 'spader', 'shandling', 'headbangingly', 'antwone', 'guei', 'diferen', 'hinton', 'schrader', 'feardotcom', 'jed', 'kubrick', 'convencional', 'kalesniko', 'romijn', 'clements', 'deniro', 'silbersteins', 'musicais', 'aqueles', 'rosenthal', 'thekids', 'premisa', 'bolado', 'phocion', 'banderas', 'niels', 'michell', 'desaponta', 'dicap

## (c) initialize an embedding matrix
- handling of OOV words: using the mean vector from the pretrained w2v vector

In [9]:
import numpy as np

embedding_dim = googleNews_w2v_model.vector_size # 300

#initialize embedding matrix (train_data_vocab_size X embedding dimension)
embedding_matrix = np.zeros((trainDataset_vocab_size, embedding_dim))

#vocab-to-index dict
trainDataset_vocab_index = {vocab: i for i, vocab in enumerate(trainDataset_vocabs)}

#mean vector of the pretrained w2v
w2v_vectors = np.array([googleNews_w2v_model[vocab] for vocab in googleNews_w2v_model.index_to_key])
mean_vector = np.mean(w2v_vectors, axis=0)


In [10]:
#fill in embedding matrix
for vocab in trainDataset_vocabs:
    #assign mean vector for OOV words
    if vocab in oov_words:
        embedding_matrix[trainDataset_vocab_index[vocab]] = mean_vector
    else:
        embedding_matrix[trainDataset_vocab_index[vocab]] = googleNews_w2v_model[vocab]
        

**Export Embedding Matrix**

In [11]:
from utils.file import save_to_local_file
save_to_local_file("models/embedding_matrix.pckl", embedding_matrix)

Saving object to local...
Object saved to local!
