In [1]:
%load_ext autoreload
%autoreload 2

## Imports

In [2]:
import numpy as np
import torch

In [3]:
import utils
import structs
import data_utils

In [4]:
from models import BOW, CBOW, DeepCBOW, LSTMClassifier, TreeLSTMClassifier

## Data

In [5]:
# Path of the GloVe text file that the next cell reads line by line.
# NOTE(review): judging by the file name this is the 840B / 300-d release
# restricted to SST words -- confirm against the data preparation step.
GLOVE_PATH = "data/glove.840B.300d.sst.txt"

In [6]:
# Read the raw GloVe file into memory, one string per line.
# The encoding is given explicitly: without it open() uses the platform
# locale, which can raise UnicodeDecodeError or mis-decode non-ASCII tokens.
with open(GLOVE_PATH, encoding="utf-8") as f:
    lines = f.readlines()

In [7]:
# Load the SST data from disk; later cells index the result by split name
# (e.g. dataset["train"]) and read `.tokens` from each example.
dataset = data_utils.get_SST_data("data/trees/")

## Vocabulary

In [8]:
# Task vocabulary: token counts come from the training split only.
v = structs.Vocabulary()
# One-element tuple of splits; only dataset["train"] contributes counts.
for data_set in (dataset["train"],):
    for ex in data_set:
        for token in ex.tokens:
            v.count_token(token)

# NOTE(review): build() presumably turns the counts into the w2i mapping -- confirm in structs.
v.build()
print("Vocabulary size:", len(v.w2i))

Vocabulary size: 18280


In [9]:
# GloVe vocabulary and the matching embedding matrix
GLOVE_DIM = 300  # number of values that follow the token on every line

glove_v = structs.Vocabulary()
glove_v.build()

# +2 rows: indices 0 and 1 are reserved for unk and pad
# (assumes Vocabulary.build() assigns them in that order -- TODO confirm in structs)
glove_vectors = np.zeros((len(lines) + 2, GLOVE_DIM), dtype=np.float64)
# unk gets a random vector; the pad row (index 1) stays all-zero
# NOTE(review): no seed is set in this cell, so the unk row may differ between kernel restarts
glove_vectors[0] = np.random.normal(0, 1, GLOVE_DIM)

for i, line in enumerate(lines):
    # Drop the line terminator so the last value is a clean number, and split
    # on single spaces only (the token itself may contain other whitespace).
    split = line.rstrip("\n").split(" ")
    if len(split) <= GLOVE_DIM:
        raise ValueError(
            f"GloVe line {i + 1}: expected a token followed by {GLOVE_DIM} values, "
            f"got {len(split)} space-separated fields"
        )
    # The vector is always the LAST GLOVE_DIM fields; everything before it is
    # the token. Some GloVe 840B tokens contain spaces, so taking split[0] /
    # split[1:] would mis-parse those lines.
    glove_v.add_token(" ".join(split[:-GLOVE_DIM]))
    # offset by two because rows 0 and 1 belong to unk and pad
    glove_vectors[i + 2] = split[-GLOVE_DIM:]

## Globals

In [10]:
# Sizes passed to the model constructors.
V_SIZE = len(v.w2i)              # entries in the task vocabulary `v`
GLOVE_V_SIZE = len(glove_v.w2i)  # entries in the GloVe vocabulary `glove_v`
N_CLASSES = len(structs.i2t)     # one class per entry of structs.i2t

## Model Instantiation 

In [11]:
def use_pretrained_embeddings(model, vectors):
    """Copy `vectors` into `model.embed` and freeze that layer.

    Args:
        model: module exposing an `embed` layer that has a `weight` parameter.
        vectors: NumPy array with exactly the shape of `model.embed.weight`.

    Returns:
        The same `model` object, so the call can wrap a constructor.

    Raises:
        ValueError: if the shapes differ. Without this check `copy_` would
            silently broadcast a compatible-but-wrong array.
    """
    weight = model.embed.weight
    if tuple(weight.shape) != tuple(vectors.shape):
        raise ValueError(
            f"embedding shape {tuple(weight.shape)} does not match "
            f"pretrained vectors {tuple(vectors.shape)}"
        )
    # in-place copy outside autograd; copy_ converts the dtype as needed
    with torch.no_grad():
        weight.copy_(torch.from_numpy(vectors))
    # keep the pretrained vectors fixed during training
    weight.requires_grad = False
    return model


# Models trained from scratch on the task vocabulary
bow = BOW(V_SIZE, N_CLASSES, v)
cbow = CBOW(V_SIZE, 300, N_CLASSES, v)
dcbow = DeepCBOW(V_SIZE, 300, 100, N_CLASSES, v)

# Models on the GloVe vocabulary: embeddings start from the pretrained
# vectors and are frozen.
ptdcbow = use_pretrained_embeddings(
    DeepCBOW(GLOVE_V_SIZE, 300, 100, N_CLASSES, glove_v), glove_vectors
)
lstm = use_pretrained_embeddings(
    LSTMClassifier(GLOVE_V_SIZE, 300, 100, N_CLASSES, glove_v), glove_vectors
)
treelstm = use_pretrained_embeddings(
    TreeLSTMClassifier(GLOVE_V_SIZE, 300, 100, N_CLASSES, glove_v), glove_vectors
)

In [12]:
# Name -> model registry. Each key matches the variable it points to:
# previously "bow" pointed at `cbow`, so the BOW model was never registered
# and CBOW results were reported under the wrong name.
model_zoo = {
    "bow": bow,
    "cbow": cbow,
    "dcbow": dcbow,
    "ptdcbow": ptdcbow,
    "lstm": lstm,
    "treelstm": treelstm,
}