In [None]:
# Import external dependencies
import pdb
import os
import sys
from IPython import display
from ipywidgets import Output
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from itertools import accumulate

In [None]:
# Import local dependencies
sys.path.insert(0, "../src")
from collection import Collection
from global_model import RNNModel, LSTMAE

In [None]:
# Seed PyTorch's RNG so runs are repeatable, then prefer the first GPU when CUDA is usable
torch.manual_seed(1)
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)

In [None]:
# Set up params
# Every tunable of the experiment lives in this cell; the banner printed at the
# end echoes the effective configuration.
MODE = "index2doc"  # options: "doc2doc" (predict next doc id from previous) or "index2doc" (predict from index)
RNN_TYPE = "LSTM"  # options: LSTM, GRU, RNN_TANH, RNN_RELU, LSTMAE
EMBED = True  # set this flag if you want to concatenate term id to input
SCALE = False  # set this flag to scale doc ids [0, 1] (by dividing by max doc id)
LOSS = "L1"  # options: "L1" or "L2" (any value other than "L1" selects nn.MSELoss in the training cell)
THRESHOLD = 128  # minimum length posting list to use
INPUT_SIZE = 2 if EMBED else 1  # one feature for the doc id / index, plus one for the term id when EMBED is set
HIDDEN_SIZE = 100  # passed to the model constructor
LAYERS = 5  # passed to the model constructor
EPOCHS = 10000  # number of passes train() makes over the training data
BATCH_SIZE = 247  # number of posting lists per optimisation step
LOG_INTERVAL = 1000  # train() plots and prints the loss every LOG_INTERVAL epochs
# NOTE: the backslash continuations sit inside the string literal, so the leading
# spaces of each continued line are part of the printed text.
print("Running: {} {} with {} and {} loss \
      \n\t- posting list threshold: {} \
      \n\t- input size: {} \
      \n\t- hidden units: {} \
      \n\t- layers: {} \
      \n\t- batch size: {} \
      \n\t- epochs: {}".format(MODE, 
                               RNN_TYPE, 
                               "scaling" if SCALE else "no scaling",
                               LOSS,
                               THRESHOLD,
                               INPUT_SIZE,
                               HIDDEN_SIZE,
                               LAYERS,
                               BATCH_SIZE,
                               EPOCHS))

In [None]:
# Load data
# Keep only posting lists with at least THRESHOLD entries, each stored as an
# (int32 doc-id array, term id) pair, ordered from longest to shortest.
test_collection = Collection("../test_data/test_collection")
posting_length_to_use = THRESHOLD
posting_lists = []
for term_id, pl in enumerate(test_collection):
    if len(pl[0]) < posting_length_to_use:
        continue
    doc_ids = np.array(pl[0], dtype=np.int32)
    posting_lists.append((doc_ids, term_id))
posting_lists.sort(key=lambda entry: np.shape(entry[0])[0], reverse=True)

In [None]:
# Gather length and max doc id info
# These globals are read later by list_of_tensors() (scale_factor, max_term_id)
# and by the training / evaluation cells (lengths).
lengths = list(map(lambda entry: len(entry[0]), posting_lists))
max_doc_id = max(entry[0].max() for entry in posting_lists)
max_term_id = float(max([entry[1] for entry in posting_lists]))
# Dividing by 1.0 is a no-op, so scaling is effectively disabled unless SCALE is set
scale_factor = 1.0 if not SCALE else float(max_doc_id)
print("Number of seqs: {}".format(len(lengths)))
print("Longest seq: {}".format(max(lengths)))
print("Shortest seq: {}".format(min(lengths)))
print("Average seq: {:.2f}".format(np.mean(lengths)))
print("Max doc id: {}".format(max_doc_id))

In [None]:
def embed(input_seq, term_id):
    """Pair every element of a sequence with a constant term id.

    Args:
        input_seq: 1-D tensor of sequence values. A 0-dim (scalar) tensor is
            accepted as well and treated as a sequence of length one.
        term_id: value written into the second column; a Python number or a
            0-dim tensor.

    Returns:
        A new tensor of shape ``(len(input_seq), 2)`` on the same device as
        ``input_seq``: column 0 holds the sequence values, column 1 ``term_id``.
    """
    # A scalar tensor has no dim 1 to unsqueeze; promote it to a length-1 sequence.
    if input_seq.dim() == 0:
        input_seq = input_seq.unsqueeze(dim=0)
    embedding = input_seq.unsqueeze(dim=1).repeat(1, 2)
    # The index tensor must live on the same device as the tensor being filled,
    # otherwise index_fill_ fails for CUDA inputs.
    term_column = torch.tensor([1], device=embedding.device)
    embedding.index_fill_(1, term_column, term_id)
    return embedding

In [None]:
def list_of_tensors(device, source, mode="doc2doc", embedding=False):
    """Turn posting lists into per-list input and label tensors.

    Relies on the notebook globals ``scale_factor`` and ``max_term_id``.

    Args:
        device: torch device every produced tensor is moved to.
        source: iterable of ``(doc_id_array, term_id)`` entries.
        mode: ``"doc2doc"`` uses the doc ids themselves as inputs; any other
            value uses the positions ``0..n-1`` of the list as inputs.
        embedding: when true, each input row also carries the term id
            (divided by ``max_term_id``) as a second feature.

    Returns:
        ``(data, labels)``: two lists with one tensor per posting list. Labels
        are always the doc ids divided by ``scale_factor``.
    """
    data, labels = [], []
    for entry in source:
        doc_ids = entry[0]
        if mode == "doc2doc":
            features = torch.tensor(doc_ids, dtype=torch.float32)
            # Only the embedded variant feeds scaled doc ids to the model
            if embedding:
                features = features / scale_factor
        else:
            features = torch.arange(doc_ids.size, dtype=torch.float32)

        if embedding:
            # Scale term id from 0 to 1
            features = embed(features, entry[1] / max_term_id)
        else:
            features = features.unsqueeze(dim=1)
        data.append(features.to(device))

        scaled_ids = torch.tensor(doc_ids, dtype=torch.float32) / scale_factor
        labels.append(scaled_ids.to(device))
    return data, labels

In [None]:
# Materialise the model inputs and labels for every retained posting list
source_data, source_labels = list_of_tensors(device=DEVICE,
                                             source=posting_lists,
                                             mode=MODE,
                                             embedding=EMBED)

In [None]:
# Inspect the posting list (index 2224) that the rest of the notebook trains on
print(source_data[2224].shape,
      source_data[2224],
      source_labels[2224],
      sep="\n")

In [None]:
def get_batch(data, target, source_lengths, i, bsz):
    """Slice one batch out of three parallel sequences.

    Args:
        data, target, source_lengths: sliceable sequences indexed alike.
        i: index of the first element of the batch.
        bsz: requested batch size; capped at ``len(source_lengths)``.

    Returns:
        The tuple ``(data, target, source_lengths)`` restricted to the batch.
    """
    stop = i + min(bsz, len(source_lengths))
    window = slice(i, stop)
    return data[window], target[window], source_lengths[window]

In [None]:
def get_data(data, target, mode="doc2doc"):
    """Align inputs and targets for the chosen prediction mode.

    In ``"doc2doc"`` mode each input loses its last element and each target its
    first, so position ``k`` of the input is paired with element ``k + 1`` of
    the sequence. In every other mode both arguments are returned untouched.
    """
    if mode != "doc2doc":
        return data, target
    inputs = [seq[:-1] for seq in data]
    shifted_targets = [seq[1:] for seq in target]
    return inputs, shifted_targets

In [None]:
def train(model, criterion, optimizer, source_data, source_labels, lengths,
          mode="doc2doc", scheduler=None, epochs=2000, batch_size=3, log_interval=10):
    """Train ``model`` on one or more posting lists, showing live progress widgets.

    Args:
        model: network called as ``model(inputs, lengths)`` and returning
            ``(prediction, extra)``; only ``prediction`` is used.
        criterion: loss callable applied to ``(prediction, padded_target)``.
        optimizer: optimizer stepped once per batch.
        source_data: list of input tensors, or a single tensor (wrapped in a list).
        source_labels: list of label tensors, or a single tensor.
        lengths: list of sequence lengths, or a single length.
        mode: forwarded to ``get_data``; ``"doc2doc"`` shifts inputs/targets by one.
        scheduler: optional scheduler whose ``step(loss)`` is called after
            every batch.
        epochs: number of passes over the data.
        batch_size: number of posting lists per optimisation step.
        log_interval: plot and print the loss every this many epochs.

    Returns:
        None. The model and optimizer are updated in place.
    """
    model.train()
    epoch_losses = []

    # Three output areas: overall progress, progress inside the epoch, loss plot
    total_epoch_progress = Output()
    display.display(total_epoch_progress)
    current_epoch_progress = Output()
    display.display(current_epoch_progress)
    loss_plot = Output()
    display.display(loss_plot)

    # Wrap single posting lists as lists
    if not isinstance(source_data, list):
        source_data = [source_data]
        source_labels = [source_labels]
        lengths = [lengths]

    # Loop for number of epochs:
    for e in range(1, epochs+1):
        with total_epoch_progress:
            print("Epoch {}/{} [{:.2f}%]".format(e, epochs, (e/epochs)*100))
        # wait=True defers the clearing until the next message arrives
        total_epoch_progress.clear_output(wait=True)
        epoch_loss = 0

        # Loop for batches within data:
        for i in range(0, len(lengths), batch_size):
            batch_data, batch_target, batch_lengths = get_batch(source_data,
                                                                source_labels, lengths, i, batch_size)
            processed = i + min(batch_size, len(batch_data))
            with current_epoch_progress:
                print("Current Epoch {}: {}/{} [{:.2f}%]".format(e,
                                                         processed,
                                                         len(lengths),
                                                         (processed/len(lengths))*100))
            current_epoch_progress.clear_output(wait=True)

            # Get data
            data, target = get_data(batch_data, batch_target, mode)

            # Zero out the grad
            optimizer.zero_grad()

            # Get output (no explicit hidden state is passed to the model here)
            prediction, _ = model(data, batch_lengths)

            # Calculate loss; targets are padded to a common length, time-major
            target = nn.utils.rnn.pad_sequence(target, padding_value=0.0, batch_first=False)
            if prediction.dim() == 1:
                target = target.squeeze()
            loss = criterion(prediction, target)
            epoch_loss += loss.item()

            # Take gradient step
            loss.backward()
            optimizer.step()

            # Take scheduler step
            if scheduler:
                scheduler.step(loss)
        epoch_losses.append(epoch_loss)

        # Print loss and plot the loss curve
        if e % log_interval == 0:
            with loss_plot:
                plt.plot(list(range(len(epoch_losses))), epoch_losses)
                plt.title("Loss per epoch")
                plt.xlabel("Epochs")
                plt.ylabel("Loss")
                plt.show()
            loss_plot.clear_output(wait=True)
            # Exactly len(epoch_losses) == e epochs have been recorded at this point
            avg_loss = sum(epoch_losses) / len(epoch_losses)
            print("Train Epoch {}: Loss - {}, Avg Loss - {}".format(e, epoch_losses[-1], avg_loss))

In [None]:
# Instantiate model and setup training
# The autoencoder variant has its own class; every other RNN_TYPE goes through RNNModel.
lii_rnn = (LSTMAE(MODE, INPUT_SIZE, HIDDEN_SIZE, LAYERS)
           if RNN_TYPE == "LSTMAE"
           else RNNModel(MODE, RNN_TYPE, INPUT_SIZE, HIDDEN_SIZE, LAYERS))
lii_rnn.to(DEVICE)
optimizer = optim.Adam(lii_rnn.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                 threshold=10e-6,
                                                 patience=20,
                                                 verbose=True)
# The scheduler built above is discarded right away, so training runs with a fixed learning rate
scheduler = None

In [None]:
# Train model
# Only the posting list at index 2224 is used; train() wraps the single tensors in lists.
train(model=lii_rnn,
      optimizer=optimizer,
      scheduler=scheduler,
      criterion=nn.L1Loss() if LOSS == "L1" else nn.MSELoss(),
      source_data=source_data[2224],
      source_labels=source_labels[2224],
      lengths=lengths[2224],
      mode=MODE,
      batch_size=BATCH_SIZE,
      epochs=EPOCHS,
      log_interval=LOG_INTERVAL)

In [None]:
def evaluate_doc2doc(model, posting_lists, lengths, primer_tokens=1, embedding=False, scaler=1,
                     scale_factor=None):
    """Autoregressively generate doc ids, feeding each prediction back as the next input.

    Args:
        model: network called as ``model(inputs, lengths, hidden)`` returning
            ``(prediction, hidden)`` and providing ``init_hidden(batch_size)``.
        posting_lists: list of input tensors, or a single tensor (wrapped in a list).
        lengths: list of sequence lengths, or a single length.
        primer_tokens: number of leading ground-truth elements used to start
            the generation.
        embedding: whether inputs carry the term id as a second column.
        scaler: factor by which predictions are multiplied in the
            non-embedding case.
        scale_factor: alias of ``scaler`` (the name used by the notebook's
            caller); takes precedence over ``scaler`` when given.

    Returns:
        A list with one tensor of generated values per posting list.
    """
    # Use the explicit argument instead of silently reading a notebook global
    if scale_factor is None:
        scale_factor = scaler

    model.eval()
    predictions = []
    prediction_progress = Output()
    display.display(prediction_progress)
    pl_progress = Output()
    display.display(pl_progress)

    # Wrap single posting lists as lists
    if not isinstance(posting_lists, list):
        posting_lists = [posting_lists]
        lengths = [lengths]

    # Inference only: skip autograd bookkeeping so the graph does not grow
    # with every generated element.
    with torch.no_grad():
        for idx, pl in enumerate(posting_lists):
            with prediction_progress:
                print("Prediction Progres {}/{} [{:.2f}%]".format(idx+1,
                                                                  len(posting_lists),
                                                                  ((idx+1)/len(posting_lists))*100))
            prediction_progress.clear_output(wait=True)
            if embedding:
                # The term id is constant along the list; read it from the first row
                term_id = pl[0, 1]
            hidden = model.init_hidden(1)
            # Get first output
            prediction, hidden = model([pl[:primer_tokens]], [primer_tokens+1], hidden)
            if not embedding:
                prediction = prediction.unsqueeze(dim=0) * scale_factor
            # NOTE(review): in the embedding case predictions are never multiplied
            # by scale_factor - confirm whether that is intended.
            generated = [prediction]
            next_prediction = prediction
            for i in range(lengths[idx] - primer_tokens):
                with pl_progress:
                    print("PL {} Progres {}/{} [{:.2f}%]".format(idx+1,
                                                                 i+1+primer_tokens,
                                                                 lengths[idx],
                                                                 ((i+1+primer_tokens)/lengths[idx])*100))
                pl_progress.clear_output(wait=True)
                if embedding:
                    next_token = embed(next_prediction.squeeze(), term_id)
                else:
                    next_token = next_prediction.unsqueeze(dim=0)
                next_prediction, hidden = model([next_token], [2], hidden)
                if not embedding:
                    next_prediction = next_prediction.unsqueeze(dim=0) * scale_factor
                generated.append(next_prediction)
            # Concatenate once instead of re-copying the growing tensor each step
            full_sequence = torch.cat(generated) if len(generated) > 1 else generated[0]
            predictions.append(full_sequence.squeeze())
    return predictions


def evaluate_index2doc(model, indexes, lengths, scale_factor=1):
    """Predict the doc ids of each posting list from its index sequence in one pass.

    Args:
        model: network called as ``model(inputs, lengths, hidden)`` returning
            a tuple whose first item is the prediction, and providing
            ``init_hidden(batch_size)``.
        indexes: list of input tensors, or a single tensor (wrapped in a list).
        lengths: list of sequence lengths, or a single length.
        scale_factor: factor the model output is multiplied by.

    Returns:
        A list with one prediction tensor per posting list.
    """
    model.eval()
    predictions = []
    prediction_progress = Output()
    display.display(prediction_progress)

    # Wrap single posting lists as lists
    if not isinstance(indexes, list):
        indexes = [indexes]
        lengths = [lengths]

    # Inference only: no need to record an autograd graph for the outputs
    with torch.no_grad():
        for i, pl in enumerate(indexes):
            with prediction_progress:
                print("Prediction Progres {}/{} [{:.2f}%]".format(i+1,
                                                                  len(indexes),
                                                                  ((i+1)/len(indexes))*100))
            prediction_progress.clear_output(wait=True)
            hidden = model.init_hidden(1)
            output = model([pl], [lengths[i]], hidden)[0]
            predictions.append(output.squeeze() * scale_factor)
    return predictions

In [None]:
# Generate predictions
# Both evaluators wrap a single posting list in a list, so `predictions` is a list of tensors.
if MODE == "doc2doc":
    # evaluate_doc2doc names its scaling parameter `scaler`; passing
    # `scale_factor=` raised a TypeError.
    predictions = evaluate_doc2doc(lii_rnn, source_data[2224], lengths[2224],
                                   primer_tokens=1, scaler=scale_factor, embedding=EMBED)
else:
    predictions = evaluate_index2doc(lii_rnn, source_data[2224], lengths[2224], scale_factor)

In [None]:
# Overlay the model output on the ground-truth labels of posting list 2224.
# Both curves share an x axis of positions 0..n-1, with n taken from the first prediction tensor.
plt.plot(np.arange(predictions[0].size(0)), source_labels[2224].cpu().detach().numpy(), label="Ground truth")
plt.plot(np.arange(predictions[0].size(0)), predictions[0].cpu().detach().numpy(), label="Predictions")
plt.xlabel("Posting list Index")
plt.ylabel("Doc id")
plt.title("Predicted vs. Ground truth Doc IDs")
plt.legend(loc="best")
plt.show()
# Dump the raw tensors below the figure for a numeric comparison
print(source_labels[2224])
print(predictions)

In [None]:
# Save model
print("Model's state_dict:")
# `logging` is never imported in this notebook, so list the parameters with
# print like the header line above.
for param_tensor, param_value in lii_rnn.state_dict().items():
    print("{} \t {}".format(param_tensor, param_value.size()))
# Resolved against the working directory, like the other "../" paths in this notebook
MODEL_DIR = os.path.join("..", "models")
os.makedirs(MODEL_DIR, exist_ok=True)  # torch.save does not create missing directories
# The file name encodes the configuration constants from the params cell
model_path = os.path.join(MODEL_DIR, "{}_{}_h{}_l{}_e{}.pth".format(MODE, RNN_TYPE, HIDDEN_SIZE, LAYERS, EPOCHS))
print("Saving model to: {}".format(model_path))
torch.save(lii_rnn.state_dict(), model_path)

In [None]:
def zigzag_encode(i):
    """Map a signed integer to a non-negative one with zigzag encoding.

    0, -1, 1, -2, 2, ... become 0, 1, 2, 3, 4, ...: non-negative ``i`` maps to
    ``2 * i`` and negative ``i`` to ``-2 * i - 1``.

    The sign mask is derived from a comparison rather than a fixed ``>> 31``
    shift, so the result is also correct for values outside the int32 range.
    """
    sign_mask = -1 if i < 0 else 0
    return (i << 1) ^ sign_mask

In [None]:
# Save delta between prediction and target
# `predictions` was generated for posting list 2224 only, so that is the single
# target. Labels are stored divided by scale_factor; multiply it back so both
# sides are in doc-id units.
# NOTE(review): this matches evaluate_index2doc, which rescales its output. In
# doc2doc mode the prediction length/scale may differ - confirm before use.
targets = [source_labels[2224] * scale_factor]
if len(targets) != len(predictions):
    raise ValueError("Expected {} prediction tensor(s), got {}".format(len(targets), len(predictions)))

deltas_list = []
global_max = 0
for tens, target in zip(predictions, targets):
    delta = (tens.round() - target).detach().tolist()
    # Record layout: [list length, running sums of the zigzag-encoded deltas...]
    delta_zigzag = [target.size(0)] + list(accumulate(zigzag_encode(int(d)) for d in delta))
    global_max = max(delta_zigzag[-1], global_max)
    deltas_list.append(np.array(delta_zigzag, dtype=np.uint32))
# File header: the pair [1, largest final running sum], followed by the records
global_max_list = [1, global_max]
deltas_array = np.concatenate([np.array(global_max_list, dtype=np.uint32)] + deltas_list).ravel()
# Resolved against the working directory, like the other "../" paths in this notebook
DELTA_DIR = os.path.join("..", "deltas")
os.makedirs(DELTA_DIR, exist_ok=True)  # open() does not create missing directories
delta_file = os.path.join(DELTA_DIR, "{}_{}_h{}_l{}_e{}.docs".format(MODE, RNN_TYPE, HIDDEN_SIZE, LAYERS, EPOCHS))
print("Saving deltas to: {}".format(delta_file))
with open(delta_file, "wb") as binfile:
    deltas_array.tofile(binfile)