## Loading the dataset & pre-trained word embeddings

As required, we'll use the pre-trained GloVe 6B word embeddings.

In [104]:
import os

# torch is needed below for `dtype=torch.long`; importing it here keeps this cell
# runnable on a fresh kernel instead of relying on a later cell's `import torch`.
import torch
import torchtext.data
import torchtext.datasets

# Directory handed to the dataset loader as its `root`
data_dir = os.path.expanduser('~/.pytorch-datasets')

# torchtext Field objects parse text (e.g. a review) and create a tensor representation

# This Field object will be used for tokenizing the movie reviews text:
# lower-cased spaCy tokens, wrapped in <sos>/<eos> markers, mapped to long indices.
review_parser = torchtext.data.Field(
    sequential=True, use_vocab=True, lower=True,
    init_token='<sos>', eos_token='<eos>', dtype=torch.long,
    tokenize='spacy', tokenizer_language='en_core_web_sm'
)

# This Field object converts the text labels into numeric values (0,1,2)
label_parser = torchtext.data.Field(
    is_target=True, sequential=False, unk_token=None, use_vocab=True
)

# Load SST, tokenize the samples and labels
# ds_X are Dataset objects which will use the parsers to return tensors
ds_train, ds_valid, ds_test = torchtext.datasets.SST.splits(
    review_parser, label_parser, root=data_dir
)

n_train = len(ds_train)
print(f'Number of training samples: {n_train}')
print(f'Number of test     samples: {len(ds_test)}')




Number of training samples: 8544
Number of test     samples: 2210


In [None]:
# Show a few hand-picked training samples together with their sentiment label.
sample_indices = (111, 4321, 7000, 0)
for i in sample_indices:
    example = ds_train[i]
    label = example.label
    # example.text is a sequence of tokens; glue them back into one line
    review = " ".join(example.text)
    print(f'sample#{i:04d} [{label:8s}]:\n > {review}\n')

And now let's load the pre-trained word embeddings:

In [106]:
import torch
import os
import numpy as np 

# GloVe 6B vocabulary size is 400k tokens; the chosen embedding size is 50.
# The path is assembled with os.path.join so the literal carries no backslash
# escape sequences (\p, \G, \g) and resolves on any OS; on Windows the result is
# the same '.\project\GloVe\GloVe\glove.6B\glove.6B.50d.txt' as before.
glove_path = os.path.join(
    '.', 'project', 'GloVe', 'GloVe', 'glove.6B', 'glove.6B.50d.txt'
)

vocab, embeddings = [], []
with open(glove_path, 'rt', encoding='utf8') as fi:
    full_content = fi.read().strip().split('\n')

# Each line is "<token> <value> <value> ..."; split it once and peel off the token.
for line in full_content:
    i_word, *i_values = line.split(' ')
    i_embeddings = [float(val) for val in i_values]
    vocab.append(i_word)
    embeddings.append(i_embeddings)
    

sample#0111 [positive]:
 > the film aims to be funny , uplifting and moving , sometimes all at once .

sample#4321 [neutral ]:
 > the most anti - human big studio picture since 3000 miles to graceland .

sample#7000 [negative]:
 > it 's a barely tolerable slog over well - trod ground .

sample#0000 [positive]:
 > the rock is destined to be the 21st century 's new ` ` conan '' and that he 's going to make a splash even greater than arnold schwarzenegger , jean - claud van damme or steven segal .



In [79]:
# Add the padding and the unknown tokens to the vocab and embeddings arrays

SPECIAL_TOKENS = ['<pad>', '<unk>']

vocab = np.array(vocab)
embeddings = np.array(embeddings)

# np.insert casts the inserted value to vocab's fixed-width string dtype and would
# silently truncate '<pad>'/'<unk>' if every word were shorter than 5 characters;
# np.concatenate promotes to a width that fits both inputs.
# The guard keeps the cell idempotent: re-running it does not prepend the tokens twice.
if vocab[:2].tolist() != SPECIAL_TOKENS:
    vocab = np.concatenate((np.array(SPECIAL_TOKENS), vocab))

# All-zero row with the same width as one embedding vector, shaped (1, dim)
pad_emb = np.zeros_like(embeddings[0]).reshape(1,-1)
print(vocab[:10])

['<pad>' '<unk>' 'the' ',' '.' 'of' 'to' 'and' 'in' 'a']


In [98]:
# Apply the pre-trained Embedding layer to a random input of 10 token ids
# NOTE(review): in file order this cell sits before the cell that stacks the
# <pad>/<unk> rows onto `embeddings`, while `vocab` already contains those two
# tokens, so on a top-to-bottom run the layer rows look shifted relative to the
# vocab indices. Confirm the intended cell order.
EMBEDDING_DIM = 50
pretrained_weights = torch.from_numpy(embeddings).float()
my_embedding_layer = torch.nn.Embedding.from_pretrained(pretrained_weights)

in_p = torch.randint(low=1, high=3, size=(10,))
embedded = my_embedding_layer(in_p)
print(embedded.shape)

torch.Size([10, 50])


In [80]:
# Prepend one row per special token so that row i of `embeddings` belongs to vocab[i]:
# row 0 = <pad> (all zeros), row 1 = <unk> (mean of all pre-trained vectors).
# The all-zero pad row at index 0 doubles as the "already done" marker, so re-running
# this cell neither stacks duplicate rows nor folds the special rows into the <unk> mean.
special_rows_present = not np.any(embeddings[0])
if not special_rows_present:
    unk_emb = np.mean(embeddings, axis=0, keepdims=True)
    embeddings = np.vstack((pad_emb, unk_emb, embeddings))
print(embeddings.shape)

(400002, 50)


## Baseline Model - Sentiment Analysis using RNN - LSTM

As for the first part in our experiment


