# 0. Dataset preparation

In [1]:
%pip install datasets
%pip install nltk
%pip install --upgrade gensim

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print("train: ", train_dataset)
print("validate: ", validation_dataset)
print("test: ", test_dataset)

train:  Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})
validate:  Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
test:  Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})


In [4]:
# convert dataset into pandas dataframe
train_df = train_dataset.to_pandas()
val_df = validation_dataset.to_pandas()
test_df = test_dataset.to_pandas()

In [5]:
import re
import nltk

nltk.download('punkt')
nltk.download('treebank')
nltk.download('punkt_tab')

def preprocessText(dataset):
    texts = []

    for i in range(0, len(dataset)):
        text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i]) #remove numbers and non-alphabetical symbols
        text = text.lower() # lower case
        text = text.strip()

        if isinstance(text, str):    
            tokens = nltk.tokenize.word_tokenize(text) 
        else:     
            print("Input is not a valid string.")
        #text = nltk.tokenize.word_tokenize(text) # tokenize
        
        texts.append(tokens)
        
    return texts

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cihui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\cihui\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\cihui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [6]:
train_split = preprocessText(train_df)
val_split = preprocessText(val_df)
test_split = preprocessText(test_df)

# 1. Preparing Word Embeddings

- using `Word2Vec`

load a pretrained word2vec model (trained on Google News dataset contained about 100 billion words)

In [7]:
import gensim

googleNews_w2v_model = gensim.models.KeyedVectors.load_word2vec_format('model\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin', binary=True)  
# download the pretrained model from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g (take around 1.5GB)

'''
# or maybe this works also, takes around 7 mins to load

import gensim.downloader as api

googleNews_w2v_model = api.load('word2vec-google-news-300')
googleNews_w2v_model.save("googleNews_w2v_model")
'''

'\n# or maybe this works also, takes around 7 mins to load\n\nimport gensim.downloader as api\n\ngoogleNews_w2v_model = api.load(\'word2vec-google-news-300\')\ngoogleNews_w2v_model.save("googleNews_w2v_model")\n'

In [8]:
len(googleNews_w2v_model.index_to_key)

3000000

(a) size of vocabulary from the training data.

In [9]:
trainDataset_words = [word for sentence in train_split for word in sentence]
trainDataset_vocabs = set(trainDataset_words)
trainDataset_vocab_size = len(trainDataset_vocabs)

print(trainDataset_vocab_size)

16331


(b) number of OOV (out-of-vocabulary) ( those words appeared in the training data but
not in the Word2vec dictionary)

In [10]:
oov_words = [word for word in trainDataset_vocabs if word not in googleNews_w2v_model]
oov_words_size = len(oov_words)

print(oov_words_size)
print(oov_words)

1445
['fantasti', 'mamet', 'cassel', 'shimizu', 'stonehenge', 'radcliffe', 'profundamente', 'dickensian', 'mesmerised', 'vonnegut', 'gosford', 'collosum', 'sarandon', 'moretti', 'montias', 'labute', 'zaidan', 'ozpetek', 'endeavour', 'besson', 'pender', 'ballhaus', 'unsalvageability', 'cierta', 'schindler', 'elizabethan', 'runyon', 'arteta', 'interspliced', 'sorprender', 'donati', 'certamente', 'makmalbaf', 'polanski', 'jir', 'pompeo', 'blanchett', 'rhames', 'abrahams', 'pryor', 'charlize', 'desplechin', 'shapelessly', 'tamb', 'addessi', 'grenier', 'kosashvili', 'aprovechar', 'hemmingway', 'cattaneo', 'bergmanesque', 'uzumaki', 'kaputschnik', 'ltimo', 'navajos', 'krige', 'hubac', 'spookies', 'godfrey', 'ryoko', 'komediant', 'criar', 'eisenberg', 'kouyate', 'janey', 'bugsy', 'lanie', 'beavis', 'chouraqui', 'garc', 'abderrahmane', 'flavour', 'crappola', 'arwen', 'tampoco', 'transforma', 'revigorates', 'farrelly', 'wewannour', 'prejuicios', 'torna', 'pinocchio', 'minkoff', 'rmino', 'goldba

(c) initialize an embedding matrix
- handling of OOV words: using the mean vector from the pretrained w2v vector

In [16]:
import numpy as np

embedding_dim = googleNews_w2v_model.vector_size # 300

#initialize embedding matrix (train_data_vocab_size X embedding dimension)
embedding_matrix = np.zeros((trainDataset_vocab_size, embedding_dim))

#vocab-to-index dict
trainDataset_vocab_index = {vocab: i for i, vocab in enumerate(trainDataset_vocabs)}

#mean vector of the pretrained w2v
w2v_vectors = np.array([googleNews_w2v_model[vocab] for vocab in googleNews_w2v_model.index_to_key])
mean_vector = np.mean(w2v_vectors, axis=0)


In [17]:
#fill in embedding matrix
for vocab in trainDataset_vocabs:
    #assign mean vector for OOV words
    if vocab in oov_words:
        embedding_matrix[trainDataset_vocab_index[vocab]] = mean_vector
    else:
        embedding_matrix[trainDataset_vocab_index[vocab]] = googleNews_w2v_model[vocab]
        