In [1]:
# Import external dependencies
import sys
from IPython import display
from ipywidgets import Output
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [2]:
# Import local dependencies
sys.path.insert(0, "../src")
from collection import Collection

In [3]:
# Set up device and manual seed
torch.manual_seed(1)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [4]:
# Load data
test_collection = Collection("../test_data/test_collection")
posting_lists = []
posting_length_to_use = 128
for _, pl in enumerate(test_collection):
    if len(pl[0]) >= posting_length_to_use:
        posting_lists.append(np.array(pl[0], dtype=np.int32))
posting_lists.sort(key=lambda x:np.shape(x)[0], reverse=True)

In [5]:
lengths = [len(pl) for pl in posting_lists]
print("Longest seq: {}".format(max(lengths)))
print("Shortest seq: {}".format(min(lengths)))
print("Average seq: {:.2f}".format(np.array(lengths).mean()))

Longest seq: 9366
Shortest seq: 128
Average seq: 699.22


In [6]:
def list_of_tensors(data):
    tensor_list = []
    for pl in data:
        tensor_list.append(torch.tensor(pl, dtype=torch.float32))
    return tensor_list

In [7]:
data = list_of_tensors(posting_lists)

In [8]:
def get_batch(source, source_lengths, i, bsz):
    batch_size = min(bsz, len(source_lengths))
    return source[i:i+batch_size], source_lengths[i:i+batch_size]

In [9]:
def get_data(source, i, bptt):
    data = [d.unsqueeze(dim=1).to(device) for d in source]
    target = [d.to(device) for d in source]
    return data, target

In [41]:
# for batch_idx, i in enumerate(range(0, len(lengths), 3)):
#     print(batch_idx, i)

In [31]:
class RNNModel(nn.Module):
    def __init__(self, rnn_type, ninp, nhid, nlayers):
        super(RNNModel, self).__init__()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        if self.rnn_type == "LSTM":
            self.rnn = nn.LSTM(ninp, nhid, nlayers, batch_first=False)
        else:
            self.rnn = nn.GRU(ninp, nhid, nlayers, batch_first=False)
        self.linear = nn.Linear(nhid, 1)  # go from hidden dim to dim of 1
            
    def forward(self, input_seqs, input_lengths, hidden):
        input_padded = nn.utils.rnn.pad_sequence(input_seqs, padding_value=0.0, batch_first=False)
        input_packed = nn.utils.rnn.pack_padded_sequence(input_padded, input_lengths, batch_first=False)
        output_packed, hidden = self.rnn(input_packed, hidden)
        output, _ = nn.utils.rnn.pad_packed_sequence(output_packed, padding_value=0.0, batch_first=False)
        batch_size, max_seq_len, _ = output.size()
        output = output.contiguous()
        output = output.view(-1, output.shape[2])
        output = self.linear(output)
        output = output.view(max_seq_len, batch_size, 1)
        return output

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == "LSTM":
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

In [32]:
def train(model, optimizer, posting_lists, lengths,
          scheduler=None, epochs=2000, batch_size=3, bptt=10000, log_interval=10, plots=False):
    model.train()
    epoch_losses = []
    current_epoch_progress = Output()
    display.display(current_epoch_progress)
    total_epoch_progress = Output()
    display.display(total_epoch_progress)
    loss_plot = Output()
    display.display(loss_plot)
    
    # Loop for number of epochs:
    for e in range(1, epochs+1):
        with total_epoch_progress:
            print("Epoch {}/{} [{:.2f}%]".format(e, epochs, (e/epochs)*100))
        total_epoch_progress.clear_output(wait=True)
        epoch_loss = 0
        
        # Loop for batches within data:
        for batch_idx, i in enumerate(range(0, len(lengths), batch_size)):
            batch, batch_lengths = get_batch(posting_lists, lengths, i, batch_size)
            with current_epoch_progress:
                print("Current Epoch {}: {}/{} [{:.2f}%]".format(e, 
                                                         i+batch_size, 
                                                         len(lengths), 
                                                         ((i+batch_size)/len(lengths))*100))
            current_epoch_progress.clear_output(wait=True)
            hidden = model.init_hidden(batch_size)
            
            # Get data
            data, target = get_data(batch, i, bptt)

            # Zero out the grad
            optimizer.zero_grad()
                
            # Get output
            prediction = model(data, batch_lengths, hidden)
                
            # Calculate loss
            target = nn.utils.rnn.pad_sequence(target, padding_value=0.0, batch_first=False)
            loss = F.mse_loss(prediction, target)
            epoch_loss += loss.item()
                
            # Take gradient step
            loss.backward()
            optimizer.step()

            # Take scheduler step
            if scheduler:
                scheduler.step(loss)
        epoch_losses.append(epoch_loss)
        
        # Print loss and plot predicitons vs. ground truth
        if e % log_interval == 0:
            with loss_plot:
                plt.plot(list(range(len(epoch_losses))), epoch_losses)
                plt.title("Loss per epoch")
                plt.xlabel("Epochs")
                plt.ylabel("MSE Loss")
                plt.show()
            loss_plot.clear_output(wait=True)
            print("Train Epoch {}: Loss - {}, Avg Loss - {}".format(e, epoch_losses[-1], sum(epoch_losses)/(e+1)))
            if plots:
                with torch.no_grad():
                    plot_pred = prediction.cpu().detach().numpy().reshape(-1)
                    plot_index = list(range(plot_pred.shape[0]))
                    plot_target = target.cpu().numpy().reshape(-1)
#                     error = target - prediction
#                     max_error = abs(error.max().item())
                    error = 0
                    max_error = 0
                    title = "Train Epoch {}: Loss - {}; Max error - {}".format(e, loss.item(), max_error)
                    plt.plot(plot_index, plot_target, linestyle="None", marker="o", label="True")
                    plt.plot(plot_index, plot_pred, linestyle="None", marker="o", label="Prediction")
                    plt.title(title)
                    plt.legend(loc="best")
                    plt.show()
    return prediction

In [33]:
# Train model
rnn_type = "GRU"
input_size = 1
hidden_size = 10
layers = 1
lii_rnn = RNNModel(rnn_type, input_size, hidden_size, layers)
lii_rnn.to(device)
optimizer = optim.Adam(params=lii_rnn.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=20, verbose=True, threshold=10e-6)
predictions = train(lii_rnn,
                    optimizer,
                    data,
                    lengths,
                    scheduler,
                    epochs=1000,
                    batch_size=274,
                    log_interval=10,
                    plots=False)

<class 'list'> 0


KeyboardInterrupt: 

In [30]:
print("Model's state_dict:")
for param_tensor in lii_lstm.state_dict():
    print(param_tensor, "\t", lii_lstm.state_dict()[param_tensor].size())
    
#torch.save(lii_lstm.state_dict(), "/Users/yairschiff/Development/Python/DeepLearning/Project/test_lstm.pth")

Model's state_dict:


NameError: name 'lii_lstm' is not defined

In [None]:
# Compare prediction and ground truth
for i in range(predictions.size()[0]):
    print("True value: {} vs. Predicted: {}".format(int((posting_lists[0][i])), int((predictions[i].item()))))

In [None]:
# Gather and print metrics
# columns = ['MSE', 'R2 score', 'MAE', 'Expl.Var']
# rows = ['Performance']
# metrics = pd.DataFrame(0.0, columns=columns, index=rows)
# prediction_values = prediction_scaled.detach().numpy().reshape(-1)
# inverted_index_values = inverted_index_scaled.detach().numpy().reshape(-1)
# metrics.iloc[0,0] = np.sqrt(mean_squared_error(prediction_values, inverted_index_values))
# metrics.iloc[0,1] = r2_score(prediction_values, inverted_index_values)
# metrics.iloc[0,2] = mean_absolute_error(prediction_values, inverted_index_values)
# metrics.iloc[0,3] = explained_variance_score(prediction_values, inverted_index_values)
# print(metrics)
# print('Number of incorrect indexes (Mismatched) ={}, for {} keys'
#       .format(np.count_nonzero(prediction_values != inverted_index_values), inverted_index_values.size))

In [None]:
# # Create tensor data
# data = np.zeros((total_seq_length, 2))
# current_row = 0
# for i in range(len(posting_lists)):
#     idxs = np.array([i]*len(posting_lists[i]))
#     data[current_row:current_row+len(posting_lists[i])] = np.vstack((idxs, posting_lists[i])).T
#     current_row += len(posting_lists[i])
# data_tensor = torch.tensor(data, device=device)

In [None]:
# def masked_loss(prediction, target):
#     prediction = prediction.squeeze()
#     zeros = torch.zeros_like(prediction)
#     masked_prediction = torch.where(prediction == -1, zeros, prediction)
#     masked_target = torch.where(prediction == -1, zeros, target)
#     return F.mse_loss(masked_prediction, masked_target)