# Notebook for the SCAN project: trains a Transformer seq2seq model on the SCAN splits and reports exact-match accuracy

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch import Tensor
import torchtext
from torchtext.data import Field, TabularDataset, BucketIterator

from typing import List, Tuple
import re
import pandas as pd
import numpy as np
import math
import time
import random

#plt.switch_backend('agg')
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# further down live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# load and format data
def load_data(data):
    """Read a SCAN task file into a two-column DataFrame.

    Every line of the file is split on the literal marker ``OUT:``; the part
    before it goes into column ``In`` and the part after it into column
    ``Out``. Any ``IN:`` marker is then removed from the ``In`` column.
    Surrounding whitespace is left untouched.

    Args:
        data: Path of the text file to read.

    Returns:
        pandas.DataFrame with the columns ``In`` and ``Out``.
    """
    frame = pd.read_csv(
        data,
        sep="OUT:",
        header=None,
        names=["In", "Out"],
        engine="python",  # multi-character separators need the python engine
    )
    frame["In"] = frame["In"].map(lambda cell: str(cell).replace("IN:", ""))
    return frame

In [None]:
# Experiment selection.
#
# The split directory and the train/test file names must belong to the same
# experiment. They are therefore chosen together through a single switch
# instead of by commenting lines in and out (which left the directory pointing
# at the simple split while the file names were the add-primitive "jump" ones).
EXPERIMENTS = {
    # name: (split directory, train file stem, test file stem)
    # Experiment 1
    "simple": ("simple_split", "tasks_train_simple", "tasks_test_simple"),
    # Experiment 2
    "length": ("length_split", "tasks_train_length", "tasks_test_length"),
    # Experiment 3, primitive "turn left"
    "addprim_turn_left": ("add_prim_split", "tasks_train_addprim_turn_left", "tasks_test_addprim_turn_left"),
    # Experiment 3, primitive "jump"
    "addprim_jump": ("add_prim_split", "tasks_train_addprim_jump", "tasks_test_addprim_jump"),
}

# Change this key to run another experiment.
experiment = "addprim_jump"

split_dir, train_filename, test_filename = EXPERIMENTS[experiment]
data_path = "/content/drive/MyDrive/SCAN/" + split_dir + "/"

train_data = load_data(data_path + train_filename + ".txt")
test_data = load_data(data_path + test_filename + ".txt")

In [5]:
# Write the parsed splits back out as CSV files (with a header row, without the
# index column) into the same directory, so they can be loaded by column name.
train_csv = train_filename + ".csv"
test_csv = test_filename + ".csv"

# NOTE: DataFrame.to_csv returns None when a path is given, so the two names
# below only ever hold None; the calls are made for their file-writing effect.
train_to_tsv = train_data.to_csv(data_path + train_csv, index=False)
test_to_tsv = test_data.to_csv(data_path + test_csv, index=False)

In [6]:
# One Field per side of the task; both wrap every sequence in <SOS> ... <EOS>.
command = Field(sequential=True, use_vocab=True, init_token='<SOS>', eos_token = '<EOS>')
action = Field(sequential=True, use_vocab=True, init_token='<SOS>', eos_token = '<EOS>')
# CSV column "In" becomes example attribute `i`, column "Out" becomes `o`.
fields = {"In":("i", command),"Out":("o", action)}
data_train, data_test = TabularDataset.splits(path=data_path, train=train_csv, test=test_csv, format="csv", fields=fields)

In [7]:
# Both vocabularies are built from the training split only.
command.build_vocab(data_train)
action.build_vocab(data_train)

In [8]:
# One example per batch. evaluate() further down calls .item() on the predicted
# token, which only works for a single-element tensor, i.e. batch size 1.
batch_size = 1
# Examples are bucketed by the length of the input command (attribute `i`).
train_iterator, test_iterator = BucketIterator.splits((data_train, data_test), sort_key=lambda x: len(x.i), batch_size=batch_size, device=device)

In [9]:
def command_to_sentence(sequence):
  """Translate command-vocabulary indices into their token strings.

  Args:
    sequence: Iterable of indices into ``command.vocab.itos``.

  Returns:
    List with one token per index, in the same order.
  """
  return [command.vocab.itos[idx] for idx in sequence]
  
def action_to_sentence(sequence):
  """Translate action-vocabulary indices into their token strings.

  Args:
    sequence: Iterable of indices into ``action.vocab.itos``.

  Returns:
    List with one token per index, in the same order.
  """
  return [action.vocab.itos[idx] for idx in sequence]

In [10]:
# Vocabulary sizes determine the embedding tables and the output layer width.
src_vocab_size = len(command.vocab.itos)
trg_vocab_size = len(action.vocab.itos)
# Index of the padding token in the *command* (source) vocabulary.
src_pad_idx = command.vocab.stoi['<pad>']

In [11]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer with learned positional embeddings.

    Token tensors use the ``(seq_len, batch)`` layout: ``forward`` unpacks
    ``src.shape`` and ``trg.shape`` in that order.

    Args:
        emb_dim: Embedding size, also used as the Transformer model width.
        src_vocab_size: Number of entries in the source embedding table.
        trg_vocab_size: Number of entries in the target embedding table and
            width of the output layer.
        src_pad_idx: Source-vocabulary index treated as padding.
        n_heads: Number of attention heads.
        n_encoder_layers: Number of encoder layers.
        n_decoder_layers: Number of decoder layers.
        dim_forward: Hidden size of the feed-forward sub-layers.
        dropout_p: Dropout probability (embeddings and Transformer).
        max_len: Number of positions in the positional embedding tables;
            sequences must not be longer than this.
        device: Device on which position indices and masks are created.
    """

    def __init__(self, emb_dim, src_vocab_size, trg_vocab_size, src_pad_idx, n_heads, n_encoder_layers, n_decoder_layers, dim_forward, dropout_p, max_len, device):
        super(TransformerModel, self).__init__()
        self.device = device
        self.src_emb = nn.Embedding(src_vocab_size, emb_dim)
        self.src_pos = nn.Embedding(max_len, emb_dim)
        self.trg_emb = nn.Embedding(trg_vocab_size, emb_dim)
        self.trg_pos = nn.Embedding(max_len, emb_dim)
        self.transformer = nn.Transformer(emb_dim, n_heads, n_encoder_layers, n_decoder_layers, dim_forward, dropout_p)
        self.fc = nn.Linear(emb_dim, trg_vocab_size)
        self.dropout = nn.Dropout(dropout_p)
        self.src_pad_idx = src_pad_idx

    def generate_src_padding_mask(self, src):
        """Return a ``(batch, src_len)`` boolean mask, True where ``src`` is padding."""
        src_mask = src.transpose(0, 1) == self.src_pad_idx
        return src_mask.to(self.device)

    def forward(self, src, trg):
        """Compute output-vocabulary logits for every target position.

        Args:
            src: ``(src_len, batch)`` tensor of source token indices.
            trg: ``(trg_len, batch)`` tensor of target token indices fed to
                the decoder.

        Returns:
            ``(trg_len, batch, trg_vocab_size)`` tensor of unnormalised scores.
        """
        src_seq_length, n_batch = src.shape
        trg_seq_length, _ = trg.shape

        # Position indices 0..len-1, repeated for every sequence in the batch.
        src_positions = torch.arange(src_seq_length, device=self.device).unsqueeze(1).expand(src_seq_length, n_batch)
        trg_positions = torch.arange(trg_seq_length, device=self.device).unsqueeze(1).expand(trg_seq_length, n_batch)

        emb_src = self.dropout(self.src_emb(src) + self.src_pos(src_positions))
        emb_trg = self.dropout(self.trg_emb(trg) + self.trg_pos(trg_positions))

        src_mask = self.generate_src_padding_mask(src)
        # Causal mask: a target position may only attend to itself and earlier ones.
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)

        # The padding mask is applied in the encoder self-attention and again
        # in the decoder cross-attention, so padded source positions never
        # influence the result. With batch size 1 there is no padding and the
        # mask is all False.
        output = self.transformer(
            emb_src,
            emb_trg,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_mask,
            memory_key_padding_mask=src_mask,
        )
        return self.fc(output)

In [12]:
# Hyperparameters shared by Experiments 1, 2 and 3
emb_dim = 200
n_heads = 8
n_encoder_layers = 2
n_decoder_layers = 2
dim_forward = 480
dropout_p = 0
max_len = 100
lr = 0.0001

# Experiment 3 differs only in the embedding size.
# Comment this override out when running Experiment 1 or 2.
emb_dim = 240

model = TransformerModel(emb_dim, src_vocab_size, trg_vocab_size, src_pad_idx, n_heads, n_encoder_layers, n_decoder_layers, dim_forward, dropout_p, max_len, device).to(device)

# The loss is computed over *target* tokens, so the index to ignore has to be
# looked up in the action (target) vocabulary rather than the command one.
trg_pad_idx = action.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [13]:
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Both parts are rendered with ``%d``, so fractional seconds are truncated.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero.

    Returns:
        String of the form ``'<elapsed> (- <remaining>)'`` where both parts
        are formatted by ``asMinutes``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

clip = 5  # maximum gradient norm passed to clip_grad_norm_ during training

def trainIters(model, n_iters, print_every=1000, plot_every=100):
    """Train ``model`` for ``n_iters`` single-batch steps and plot the loss.

    Uses the module-level ``train_iterator``, ``criterion``, ``optimizer``,
    ``device`` and ``clip``. Teacher forcing is applied: the decoder is fed
    the target without its last token and is scored against the target
    without its first token.

    Args:
        model: Network called as ``model(src, trg)``.
        n_iters: Number of optimisation steps.
        print_every: Print the average loss every this many steps.
        plot_every: Record an average loss for the plot every this many steps.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0.0  # Reset every print_every
    plot_loss_total = 0.0  # Reset every plot_every

    for i in range(1, n_iters + 1):
        model.train()
        optimizer.zero_grad()

        # NOTE(review): a new iterator is created on every step and only its
        # first batch is used; presumably this restarts (and reshuffles) the
        # training iterator each time - confirm that this is intended.
        training_pair = next(iter(train_iterator))
        input_tensor = training_pair.i.long().to(device)
        target_tensor = training_pair.o.long().to(device)

        # forward: the decoder input is the target without its final token
        pred = model(input_tensor, target_tensor[:-1, :])
        pred = pred.reshape(-1, pred.shape[2])
        # remove the start token so labels line up with the predictions
        target_tensor = target_tensor[1:].reshape(-1)

        loss = criterion(pred, target_tensor)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # Accumulate a plain Python float. Summing the loss tensor itself
        # would keep autograd history alive between steps and would hand
        # (possibly CUDA) tensors to matplotlib.
        loss_value = loss.item()
        print_loss_total += loss_value
        plot_loss_total += loss_value

        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0.0
            print('%s (%d %d%%) %.4f' % (timeSince(start, i / n_iters), i, i / n_iters * 100, print_loss_avg))

        if i % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0.0

    showPlot(plot_losses)

def showPlot(points):
    """Plot a sequence of loss values as a line chart.

    Args:
        points: Iterable of scalars (Python numbers or single-element
            tensors); each is converted with ``float`` before plotting.
    """
    # plt.subplots() already creates the figure; an extra plt.figure() call
    # would only leave an empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot([float(p) for p in points])

In [14]:
def evaluate(model, input_tensor, max_length = 50):
    """Greedily decode an output sequence for a single input sequence.

    Decoding starts from ``<SOS>``. At each step the whole sequence so far is
    fed to the model and the highest-scoring token at the last position is
    appended. It stops after ``<EOS>`` or after ``max_length`` steps.

    Args:
        model: Network called as ``model(src, trg)``.
        input_tensor: Source token indices; the use of ``.item()`` below
            requires a batch of exactly one sequence.
        max_length: Maximum number of tokens to generate after ``<SOS>``.

    Returns:
        List of action-vocabulary indices, beginning with the ``<SOS>`` index.
    """
    sos_idx = action.vocab.stoi["<SOS>"]
    eos_idx = action.vocab.stoi["<EOS>"]
    decoded = [sos_idx]

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            decoder_input = torch.LongTensor(decoded).unsqueeze(1).to(device)
            logits = model(input_tensor, decoder_input)

            # Most likely token at the last decoded position.
            next_token = logits.argmax(2)[-1, :].item()
            decoded.append(next_token)

            if next_token == eos_idx:
                break
    return decoded

In [None]:
# start training: 50000 single-batch steps, printing the average loss every 100 steps
trainIters(model, 50000, print_every=100)

In [None]:
# get training accuracy (exact sequence match), overall and per target length
total = {}   # correctly predicted sequences, keyed by target length

counts = {}  # evaluated sequences, keyed by target length

print('Testing on ' + str(len(train_iterator)) + ' examples')
for batch_idx, batch in enumerate(train_iterator):
    l = len(batch.o)
    pred_out = evaluate(model, batch.i.long().to(device), max_length = 50)
    is_correct = batch.o.squeeze(1).tolist() == pred_out
    # Record every length in both dicts so a length with no correct prediction
    # shows up as 0 instead of being absent (which would give NaN below).
    total[l] = total.get(l, 0) + int(is_correct)
    counts[l] = counts.get(l, 0) + 1
# Sort both series by length so x positions and bar heights line up.
a = pd.Series(total).sort_index()
b = pd.Series(counts).sort_index()
accuracy = a.sum()/b.sum()
print('Training accuracy: ' + str(accuracy))
per_length_accuracy = a / b
plt.bar(per_length_accuracy.index, per_length_accuracy.values)
plt.xlabel('target length (tokens incl. <SOS>/<EOS>)')
plt.ylabel('exact-match accuracy')
plt.show()

In [None]:
# get test accuracy (exact sequence match), overall and per target length

total = {}   # correctly predicted sequences, keyed by target length

counts = {}  # evaluated sequences, keyed by target length

print('Testing on ' + str(len(test_iterator)) + ' examples')
for batch_idx, batch in enumerate(test_iterator):
    l = len(batch.o)
    pred_out = evaluate(model, batch.i.long().to(device), max_length = 50)
    is_correct = batch.o.squeeze(1).tolist() == pred_out
    # Record every length in both dicts so a length with no correct prediction
    # shows up as 0 instead of being absent (which would give NaN below).
    total[l] = total.get(l, 0) + int(is_correct)
    counts[l] = counts.get(l, 0) + 1
# Sort both series by length so x positions and bar heights line up.
a = pd.Series(total).sort_index()
b = pd.Series(counts).sort_index()
accuracy = a.sum()/b.sum()
print('correct items.', a.sum())
print('all items', b.sum())
print('Test  accuracy: ' + str(accuracy))
per_length_accuracy = a / b
plt.bar(per_length_accuracy.index, per_length_accuracy.values)
plt.xlabel('target length (tokens incl. <SOS>/<EOS>)')
plt.ylabel('exact-match accuracy')
plt.show()