In [1]:
"""
Convolutional Neuroscience
Academic year 2019-2020
Homework 3

Author: Tommaso Tabarelli
Period: december 2019
"""

# Importing libraries

import argparse
import torch
import json
import re
import numpy as np
from torch.utils.data import Dataset, DataLoader
from functools import reduce
from torch import optim, nn
from network import Network, train_batch
from torch.utils.data import DataLoader
from torchvision import transforms
from pathlib import Path

# Importing packages to implement word2vec
from torch.autograd import Variable
import torch.functional as F
import torch.nn.functional as F

In [3]:
# Defining network class

class Network(nn.Module):
    """Sequence model: a (possibly stacked) LSTM followed by a linear read-out.

    The linear layer maps every LSTM output vector back to ``input_size``
    values, so the output has the same feature size as the input.

    Args:
        input_size: number of features of each input time step.
        hidden_units: size of the LSTM hidden state.
        layers_num: number of stacked LSTM layers.
        dropout_prob: dropout probability handed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_units, layers_num, dropout_prob=0):
        # The parent constructor must run before sub-modules are registered
        super().__init__()
        # Recurrent part; batch_first=True means inputs are (batch, seq, feature)
        lstm_settings = dict(
            input_size=input_size,
            hidden_size=hidden_units,
            num_layers=layers_num,
            dropout=dropout_prob,
            batch_first=True,
        )
        self.rnn = nn.LSTM(**lstm_settings)
        # Read-out: hidden state -> vector with the input feature size
        self.out = nn.Linear(hidden_units, input_size)

    def forward(self, x, state=None):
        """Run the LSTM over ``x`` and project every time step.

        Args:
            x: input tensor laid out as (batch, seq, input_size).
            state: optional initial LSTM state, forwarded to ``nn.LSTM`` as is.

        Returns:
            Tuple ``(output, rnn_state)``: the projected sequence and the
            final state returned by the LSTM.
        """
        hidden_seq, rnn_state = self.rnn(x, state)
        return self.out(hidden_seq), rnn_state
    



def train_batch(net, batch_onehot, loss_fn, optimizer):
    """Run one optimisation step on a batch of one-hot encoded sequences.

    The last element of every sequence is used as the target class; all the
    previous elements are the network input. Only the last network output is
    compared with the target.

    Args:
        net: callable returning ``(output, state)`` for a (batch, seq, feature) input.
        batch_onehot: tensor indexed as (batch, seq, feature), one-hot on the last axis.
        loss_fn: loss taking ``(predictions, class_indices)``.
        optimizer: optimizer holding the parameters of ``net``.

    Returns:
        The batch loss as a Python float.
    """
    # Split each sequence: everything but the last step feeds the network,
    # the last step is the label (converted from one-hot to a class index)
    net_input, target_onehot = batch_onehot[:, :-1, :], batch_onehot[:, -1, :]
    target_classes = target_onehot.argmax(dim=1)

    # Discard gradients accumulated by a previous step
    optimizer.zero_grad()

    # Forward pass; the returned recurrent state is not needed here
    predictions, _ = net(net_input)

    # Loss on the last time step only, then backward pass and update
    loss = loss_fn(predictions[:, -1, :], target_classes)
    loss.backward()
    optimizer.step()

    return loss.item()

### Implementing word2vec

This section follows the tutorial available at: https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb

In [2]:
### Load data
# Read the whole book; the context manager guarantees the file handle is closed
with open("Picture_of_Dorian_Gray.txt", 'r') as book_file:
    text = book_file.read()

# Removing titles: keep the chunk that follows the first run of 7 newlines
text = re.split(r'\n{7}', text)[1]

# Lowering all text
text = text.lower()

# Sentences are delimited by punctuation marks
sentences = re.split(r'[\.,!?;:]', text)

# Word delimiters: punctuation, newline, every character from ' ' to "'", and '-'.
# Written as a raw string: same pattern as before, without invalid-escape warnings.
word_delimiters = r"[\.,!?;:\n -\'-]"

# Unique words in sorted order (a set avoids the quadratic list-membership scan)
all_words = re.split(word_delimiters, text)
vocabulary = sorted(set(all_words))

# Removing the first 3 elements since they are numbers or empty string
del vocabulary[:3]

print(len(text.split(".")))
print(len(sentences))

# Lookup tables between words and their position in the vocabulary
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

# Tokens that must not survive in the tokenised sentences.
# NOTE(review): "152" and "1820" are presumably the numeric tokens dropped
# from the vocabulary above -- confirm against the source text.
unwanted_tokens = {'', ' ', '\n', "152", "1820"}

print(sentences[0])
# Overwrite every sentence with its list of words. enumerate() gives the
# position directly, and every occurrence of an unwanted token is dropped,
# so no sentence can keep a word that is missing from word2idx.
for index, sentence in enumerate(sentences):
    sentences[index] = [
        word
        for word in re.split(word_delimiters, sentence)
        if word not in unwanted_tokens
    ]
print(sentences[0])

5746
12519
the studio was filled with the rich odour of roses
['the', 'studio', 'was', 'filled', 'with', 'the', 'rich', 'odour', 'of', 'roses']


### Creating "context" environment to then train the net

In [3]:
# Number of neighbours considered on each side of the center word
window_size = 2
idx_pairs = []
for sentence in sentences:
    # Sentence translated to vocabulary indices
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # Every word is treated, in turn, as the center word
    for center_pos, center_idx in enumerate(indices):
        # Clip the window so that it never leaves the sentence
        first_ctx = max(center_pos - window_size, 0)
        stop_ctx = min(center_pos + window_size + 1, sentence_len)
        # One (center, context) pair per neighbour, skipping the center itself
        idx_pairs.extend(
            (center_idx, indices[ctx_pos])
            for ctx_pos in range(first_ctx, stop_ctx)
            if ctx_pos != center_pos
        )

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [4]:
# Number of (center word, context word) index pairs stored in idx_pairs
len(idx_pairs)

248350

## Starting to define the Neural Network structure

### Input layer

It should be as large as the vocabulary.

In [84]:
def get_input_layer(word_idx, vocab_size=None):
    """Build the one-hot input vector of a single word.

    Args:
        word_idx: position of the word that must be set to 1.
        vocab_size: length of the returned vector. When omitted, the length
            of the notebook-level ``vocabulary`` is used (original behaviour).

    Returns:
        A 1-D float32 tensor of length ``vocab_size`` that is 0 everywhere
        except at ``word_idx``, where it is 1.
    """
    if vocab_size is None:
        # Fall back to the global vocabulary so existing calls keep working
        vocab_size = len(vocabulary)
    x = torch.zeros(vocab_size, dtype=torch.float32)
    x[word_idx] = 1.0
    return x

### Hidden layers

A single hidden layer is used; its size is the embedding dimension, which is chosen arbitrarily.

W1 is the weight matrix. It has dimensions: [embedding_dims, vocabulary_size]

There is no activation function — just plain matrix multiplication. (This will be clearer during training)

In [81]:
# Size of the word embedding (hidden layer width)
embedding_dims = 5
# Input -> hidden weights, shape [embedding_dims, vocabulary size].
# A plain tensor with requires_grad=True replaces the deprecated Variable wrapper.
W1 = torch.randn(embedding_dims, len(vocabulary), dtype=torch.float32, requires_grad=True)

### Output layer

In [87]:
# Hidden -> output weights, shape [vocabulary size, embedding_dims].
# A plain tensor with requires_grad=True replaces the deprecated Variable wrapper.
W2 = torch.randn(len(vocabulary), embedding_dims, dtype=torch.float32, requires_grad=True)

### Training

In [96]:
num_epochs = 101
learning_rate = 0.001

# W1 and W2 are updated by hand with plain gradient descent, so no optimizer
# is created here (the previous one was unused and was built from `my_net`,
# which does not exist yet when the notebook is run top to bottom).
for epo in range(num_epochs):
    # Sum of the per-pair losses over the epoch
    loss_val = 0
    for data, target in idx_pairs:

        # Forward pass: one-hot center word -> embedding -> vocabulary scores
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)
        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        # Negative log-likelihood of the true context word
        log_softmax = F.log_softmax(z2, dim=0)
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        loss_val += loss.item()

        # Backward pass
        loss.backward()

        # Gradient-descent step; no_grad keeps the update out of the graph
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            # Gradients accumulate across backward() calls: reset them
            W1.grad.zero_()
            W2.grad.zero_()

    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 9.346418637197958
Loss at epo 10: 7.211674061661062
Loss at epo 20: 6.778484885877758
Loss at epo 30: 6.574319957424175
Loss at epo 40: 6.451177387590353
Loss at epo 50: 6.367360884102869
Loss at epo 60: 6.305997703103249
Loss at epo 70: 6.258726537012349
Loss at epo 80: 6.22101781125469
Loss at epo 90: 6.190239186260286
Loss at epo 100: 6.164635735624557


In [97]:
# Display the tensor type string of W2
W2.type()

'torch.FloatTensor'

In [106]:
# Persist both weight matrices so that they can be reloaded later
for tensor, path in ((W1, "W1_weights"), (W2, "W2_weights")):
    torch.save(tensor, path)

In [103]:
# Print the shape of W2
print(W2.shape)

torch.Size([6779, 5])


### Loading saved parameters for the embedding

In [14]:
# Reload the two matrices written by torch.save
W1_try = torch.load("W1_weights")
W2_try = torch.load("W2_weights")

# Show the tensors first ...
for loaded in (W1_try, W2_try):
    print(loaded)
# ... then the largest entry of each one
for loaded in (W1_try, W2_try):
    print(torch.max(loaded).item())

tensor([[-0.6536, -0.2067, -0.2938,  ..., -0.3121, -0.6741,  1.1771],
        [ 0.2630,  1.2787,  1.1708,  ..., -0.9615,  0.1497,  1.2112],
        [ 0.5051,  0.5456, -0.4282,  ..., -1.2316,  0.3074,  0.5471],
        [-0.2665, -0.3556, -0.5891,  ...,  0.3163,  1.4133,  1.0261],
        [ 0.4310,  1.6727,  1.3909,  ...,  1.3273,  0.9603,  0.7538]],
       requires_grad=True)
tensor([[-4.7673,  0.7313,  1.7859, -1.6416,  3.9699],
        [ 0.9328, -0.2321,  0.7114,  0.2274, -0.5491],
        [-0.0083, -0.1352,  0.4440, -0.7173, -0.1907],
        ...,
        [ 0.1161, -1.2554,  1.0307, -1.0508, -0.2169],
        [ 1.4459, -1.2231, -0.3555,  0.4521,  1.1497],
        [-0.5359,  0.0490,  0.0687, -0.1203, -1.4884]], requires_grad=True)
3.3844878673553467
4.363911151885986


W2 is taken as the word representation because it depends on the context. In other words, I think the center-word term is the less biased one, whereas what I need is to be able to predict the words of a given text/style/piece of writing; for this reason the context representation is the better choice.

In [15]:
# Use the reloaded W2 matrix as the word representation
my_words = W2_try

## Using LSTM network

Our network is already well implemented and it uses an LSTM layer.

In [17]:
# Hyper-parameters: embedding size, LSTM hidden size, number of stacked layers
embedding_dims, hidden_units, num_layers = 5, 500, 2
# The network input size is the vocabulary size
my_net = Network(len(vocabulary), hidden_units, num_layers, dropout_prob=0.3)

In [None]:
def train_batch(net, batch_W2V, loss_fn, optimizer):
    """Run one optimisation step on a batch of encoded word sequences.

    The last element of every sequence (all its coordinates) is the target;
    the previous elements are the network input. Only the last network
    output is compared with the target.

    Args:
        net: callable returning ``(output, state)`` for a (batch, seq, feature) input.
        batch_W2V: tensor indexed as (batch, seq, feature).
        loss_fn: loss taking ``(predictions, targets)``.
        optimizer: optimizer holding the parameters of ``net``.

    Returns:
        The batch loss as a Python float.
    """
    # Last step of each sequence is the label, the rest is the input
    net_input, targets = batch_W2V[:, :-1, :], batch_W2V[:, -1, :]

    # Discard gradients accumulated by a previous step
    optimizer.zero_grad()

    # Forward pass; the returned recurrent state is not needed here
    predictions, _ = net(net_input)

    # Loss on the last time step only, then backward pass and update
    loss = loss_fn(predictions[:, -1, :], targets)
    loss.backward()
    optimizer.step()

    return loss.item()

### Trying to use LSTM

In [18]:
# Building the encoded data
def encode_text(word_to_index, text):
    encoded = [word_to_index[w] for w in text]
    return encoded

{'a': 0,
 'abandon': 1,
 'abdicate': 2,
 'abject': 3,
 'able': 4,
 'aborde': 5,
 'about': 6,
 'above': 7,
 'absence': 8,
 'absences': 9,
 'absolute': 10,
 'absolutely': 11,
 'absolution': 12,
 'absorb': 13,
 'absorbed': 14,
 'absorption': 15,
 'abstract': 16,
 'abstracted': 17,
 'abstruse': 18,
 'absurd': 19,
 'absurdly': 20,
 'abused': 21,
 'academicians': 22,
 'academy': 23,
 'acanthus': 24,
 'acanthuslike': 25,
 'accentuating': 26,
 'accept': 27,
 'acceptance': 28,
 'accepting': 29,
 'access': 30,
 'accident': 31,
 'accidental': 32,
 'accompanied': 33,
 'accompaniment': 34,
 'accompany': 35,
 'accord': 36,
 'accordance': 37,
 'according': 38,
 'account': 39,
 'accounts': 40,
 'accumulate': 41,
 'accurate': 42,
 'accursed': 43,
 'accusing': 44,
 'accustomed': 45,
 'achilles': 46,
 'acid': 47,
 'acknowledge': 48,
 'acquaintance': 49,
 'acquaintances': 50,
 'acrobats': 51,
 'across': 52,
 'act': 53,
 'acted': 54,
 'acting': 55,
 'action': 56,
 'actions': 57,
 'active': 58,
 'activity':

In [None]:
#%% Train network
# Batch training of `net` followed by saving of the results to `args.out_dir`.
# NOTE(review): `dataset`, `args`, `net` and `device` are not defined in any
# cell visible in this notebook -- this cell looks like it was copied from a
# standalone script and cannot run until those names are provided.
    
# Define Dataloader
dataloader = DataLoader(dataset, batch_size=args.batchsize, shuffle=True, num_workers=1)
# Define optimizer
optimizer = optim.Adam(net.parameters(), weight_decay=5e-4)
# Define loss function
loss_fn = nn.MSELoss()

# Start training
for epoch in range(args.num_epochs):
    print('##################################')
    print('## EPOCH %d' % (epoch + 1))
    print('##################################')
    # Iterate batches
    for batch_sample in dataloader:
        # Extract batch (each sample is indexed by the 'encoded_onehot' key)
        batch_onehot = batch_sample['encoded_onehot'].to(device)
        # Update network (one optimisation step, returns the batch loss)
        batch_loss = train_batch(net, batch_onehot, loss_fn, optimizer)
        print('\t Training loss (single batch):', batch_loss)

### Save all needed parameters
# Create output dir (parents included; no error if it already exists)
out_dir = Path(args.out_dir)
out_dir.mkdir(parents=True, exist_ok=True)
# Save network parameters
torch.save(net.state_dict(), out_dir / 'net_params.pth')
# Save training parameters
with open(out_dir / 'training_args.json', 'w') as f:
    json.dump(vars(args), f, indent=4)
# Save encoder dictionary
with open(out_dir / 'char_to_number.json', 'w') as f:
    json.dump(dataset.char_to_number, f, indent=4)
# Save decoder dictionary
# NOTE(review): no code follows this heading -- the decoder dictionary is not saved here.
# Save decoder dictionary

In [None]:
num_epochs = 101
learning_rate = 0.001

# Define optimizer (lr is passed explicitly; 0.001 is also Adam's default)
optimizer = optim.Adam(my_net.parameters(), lr=learning_rate, weight_decay=5e-4)

for epo in range(num_epochs):
    # Sum of the per-pair losses over the epoch
    loss_val = 0
    for data, target in idx_pairs:
        ### Forward pass
        # Clear the gradients recorded by the previous step
        optimizer.zero_grad()

        # The LSTM is built with batch_first=True, so the one-hot vector is
        # reshaped to (batch=1, seq=1, features) before being fed to my_net
        x = get_input_layer(data).view(1, 1, -1)
        y_true = torch.tensor([int(target)], dtype=torch.long)
        z2, _ = my_net(x)

        # Scores of the last (only) time step -> log-probabilities over words
        log_softmax = F.log_softmax(z2[:, -1, :], dim=1)

        loss = F.nll_loss(log_softmax, y_true)
        loss_val += loss.item()

        ### Update network
        # Only my_net takes part in this loss: W1 and W2 receive no gradient
        # here, so the update is delegated to the optimizer of my_net
        loss.backward()
        optimizer.step()

    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

### Making predictions