## Fetch Code

In [0]:
!rm *.py
!git clone https://github.com/tanmaybinaykiya/CS-7643-Deep-Learning-Final-Project.git 
!mv CS-7643-Deep-Learning-Final-Project/*.py .
!rm -rf 'CS-7643-Deep-Learning-Final-Project'
!mkdir data
!mkdir dataset

# Install Libraries

## Install PyTorch

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'
print("platform, accelerator:", platform, accelerator)
!pip install -v -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision

In [0]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader

from constants import DatasetPaths

from DataLoader import SquadDataset, collate_fn, GloVeEmbeddings
from models import EncoderBILSTM, DecoderLSTM
from train import train, greedy_search

# Data Processing

### Download GloVe

In [0]:
!wget 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
!unzip 'glove.840B.300d.zip'
!rm glove.840B.300d.zip
!mv glove.840B.300d.txt data/

### Download SQuAD

In [0]:
!mkdir dataset
!curl -o 'dataset/squad-train-v1.1.json' 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' 
!curl -o 'dataset/squad-dev-v1.1.json' 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' 

### Data Preprocessor

- Builds question answer pairs: (Question: [index], Answer [index], Paragraph [index])
- Builds a map of paragraphs: {index: paragraph}
- Builds a word_to_idx map for questions and answers: {word: index}
- Builds an idx_to_word map for questions and answers: {index: word}
- Prunes GloVe embeddings for questions and answers: {word: embedding}
- Keeps the 45000 most frequent words in answers and the 28000 most frequent words in questions; all other words are stored as `<unk>`


In [0]:
from DataProcessor import SquadPreProcessor, GlovePreproccesor

# The preprocessors get their own names on purpose: binding one to `train`
# would shadow the `train` function imported from the `train` module, and the
# later `train(encoder=..., ...)` call would then hit the preprocessor object.
# NOTE(review): the markdown above says 45000 words for answers and 28000 for
# questions, but the keywords here give 45000 to questions and 28000 to
# answers - confirm which assignment is intended.
train_preprocessor = SquadPreProcessor(path=DatasetPaths["squad"]["train"], split="train", q_vocab_size=45000, a_vocab_size=28000)
paragraphs, question_answer_pairs = train_preprocessor.preprocess()
train_preprocessor.persist(paragraphs, question_answer_pairs)

dev_preprocessor = SquadPreProcessor(path=DatasetPaths["squad"]["dev"], split="dev", q_vocab_size=45000, a_vocab_size=28000)
paragraphs, question_answer_pairs = dev_preprocessor.preprocess()
dev_preprocessor.persist(paragraphs, question_answer_pairs)

# Prune the full GloVe file down to the training vocabularies: one pruned file
# for the answer-side words and one for the question-side words.
GlovePreproccesor().obtain_glove_embeddings(glove_filename=DatasetPaths["glove"]["original-embeddings"],
                                            word_to_ix=train_preprocessor.a_word_to_idx,
                                            pruned_glove_filename=DatasetPaths["glove"]["answer-embeddings"])

GlovePreproccesor().obtain_glove_embeddings(glove_filename=DatasetPaths["glove"]["original-embeddings"],
                                            word_to_ix=train_preprocessor.q_word_to_idx,
                                            pruned_glove_filename=DatasetPaths["glove"]["question-embeddings"])

# Train Model

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline  
import numpy as np

def plot_losses(losses):
    """Render the given loss values as a line chart and show it.

    Args:
        losses: sequence of numeric loss values; each value is drawn at its
            position in the sequence along the x-axis.
    """
    axes = plt.gca()
    axes.plot(losses)

    # Label the chart so it can be read on its own.
    axes.set_xlabel('Epoch')
    axes.set_ylabel('Loss')
    axes.set_title('Loss vs Epoch')
    axes.grid(True)

    plt.show()

### Train Model

In [0]:
# Request the GPU, but fall back to CPU when PyTorch cannot see one.
use_cuda = True
use_cuda = use_cuda and torch.cuda.is_available()

train_dataset = SquadDataset(split="train")
word_to_idx_sent = train_dataset.get_answer_word_to_idx()
# NOTE(review): this is filled from get_question_idx_to_word(), so despite the
# name it is presumably the index->word map; only its length is used below.
word_to_idx_q = train_dataset.get_question_idx_to_word()

train_vocab_size_sent = len(word_to_idx_sent)
train_vocab_size_q = len(word_to_idx_q)
num_epoch = 15
batch_size = 64
# Page-locked host memory only pays off when batches are copied to a GPU.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=use_cuda)

word_embeddings_glove_q = GloVeEmbeddings.load_glove_embeddings(True)
word_embeddings_glove_sent = GloVeEmbeddings.load_glove_embeddings(False)

encoder = EncoderBILSTM(vocab_size=train_vocab_size_sent, n_layers=2, embedding_dim=300, hidden_dim=500, dropout=0, embeddings=word_embeddings_glove_sent)
decoder = DecoderLSTM(vocab_size=train_vocab_size_q, embedding_dim=300, hidden_dim=500, n_layers=1, encoder_hidden_dim=500, embeddings=word_embeddings_glove_q)

if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

# len() of a DataLoader is already the number of batches per epoch, so it must
# not be divided by batch_size a second time.
n_train = len(train_loader)
batch_per_epoch = n_train

# ignore_index=0: targets equal to 0 do not contribute to the loss
# (presumably index 0 is the padding token - confirm against the vocabulary).
criterion = nn.CrossEntropyLoss(ignore_index=0)
# NOTE(review): lr=1 is unusually large for RMSprop - confirm it is intended.
optimizer_enc = torch.optim.RMSprop(encoder.parameters(), lr=1, momentum=0.7)
optimizer_dec = torch.optim.RMSprop(decoder.parameters(), lr=1, momentum=0.7)

# exist_ok=True already covers the "directory exists" case.
os.makedirs("model_weights", exist_ok=True)

losses = train(encoder=encoder, decoder=decoder, epoch_count=num_epoch, batch_per_epoch=batch_per_epoch,
               train_loader=train_loader, criterion=criterion, optimizer_enc=optimizer_enc, optimizer_dec=optimizer_dec,
               is_cuda=use_cuda, debug=False)

### Visualize Loss Profile

In [0]:
# Chart the loss values returned by train() in the training cell.
plot_losses(losses)

### Predict

In [0]:
dev_dataset = SquadDataset(split="dev")

# Page-locked host memory only pays off when batches are copied to a GPU.
dev_loader = DataLoader(
    dev_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=use_cuda)
dev_idx_to_word_q = dev_dataset.get_question_idx_to_word()
dev_idx_to_word_sent = dev_dataset.get_answer_idx_to_word()
# NOTE(review): the models below are sized with the *training* vocabularies;
# confirm the dev index->word maps use the same indices before trusting the
# decoded dev output.

# Rebuild the networks with the same hyper-parameters used for training so the
# saved state dicts fit.
encoder = EncoderBILSTM(vocab_size=train_vocab_size_sent, n_layers=2, embedding_dim=300, hidden_dim=500, dropout=0, embeddings=word_embeddings_glove_sent)
decoder = DecoderLSTM(vocab_size=train_vocab_size_q, embedding_dim=300, hidden_dim=500, n_layers=1, encoder_hidden_dim=500, embeddings=word_embeddings_glove_q)
if use_cuda:
    encoder.cuda()
    decoder.cuda()

# Deserialize every storage onto the CPU so checkpoints written on a GPU also
# load on a CPU-only runtime; load_state_dict then copies the values into the
# parameters on whichever device the module lives.
keep_on_cpu = lambda storage, location: storage
# NOTE(review): the "1-" checkpoint prefix is hard-coded; presumably these
# files are written by train() - confirm which checkpoint should be evaluated.
encoder.load_state_dict(torch.load("model_weights/1-encoder.pth", map_location=keep_on_cpu))
decoder.load_state_dict(torch.load("model_weights/1-decoder.pth", map_location=keep_on_cpu))

# Inference only: put the modules in evaluation mode (disables dropout).
encoder.eval()
decoder.eval()

idx_to_word_sent = train_dataset.get_answer_idx_to_word()
idx_to_word_q = train_dataset.get_question_idx_to_word()

# NOTE(review): the fourth positional argument is hard-coded to True; if it is
# the CUDA flag it should probably be `use_cuda` - confirm against
# greedy_search's signature.
greedy_search(encoder, decoder, train_loader, True, idx_to_word_q, idx_to_word_sent, batch_size=batch_size)
greedy_search(encoder, decoder, dev_loader, True, dev_idx_to_word_q, dev_idx_to_word_sent, batch_size=batch_size)