In [1]:
import re
import random
import time
import math
import numpy as np
from utils import print_results
import matplotlib.pyplot as plt
plt.style.use('default')

## Preprocessing

In [6]:
# this function reads in a textfile and fixes an issue with "\\"
def filereader(path):
  """Lazily yield the cleaned lines of a UTF-8 text file.

  Each line has its surrounding whitespace stripped and then every
  backslash character removed before it is yielded.
  """
  with open(path, mode="r", encoding="utf-8") as handle:
    yield from (raw.strip().replace("\\", "") for raw in handle)

# Read only the first cleaned line of the training file into `s`
# (filereader is a generator, so the rest of the file is not read here).
s = next(filereader('trees/train.txt'))

In [7]:
# We will also need the following function, but you can ignore this for now.
# It is explained later on.

def transitions_from_treestring(s):
  """Convert a bracketed sentiment-tree string into a list of 0/1 transitions.

  Every leaf of the tree becomes a 0 and every closing bracket that is left
  over once the leaves have been collapsed becomes a 1, in left-to-right
  order. (Presumably 0 = shift and 1 = reduce; the notebook explains the
  transitions later on.)

  Args:
    s: one tree as a string, e.g. "(3 (2 It) (4 (2 's) (3 good)))".

  Returns:
    A list of ints, e.g. [0, 0, 0, 1, 1] for the example above.
  """
  # All patterns are raw strings: "\(" in a normal string literal is an
  # invalid escape sequence and triggers a warning on recent Python versions.

  # Collapse every "(label token)" leaf into a single "0". Because [^)]+ also
  # matches "(", any opening brackets directly in front of the leaf are
  # swallowed by the same match.
  s = re.sub(r"\([0-5] ([^)]+)\)", "0", s)
  # Detach each remaining ")" so that it becomes a token of its own.
  s = re.sub(r"\)", " )", s)
  # Remove leftover "(label " openers. The substitution is applied twice on
  # purpose: deleting a match can join text into a new match, which the
  # second pass then removes.
  s = re.sub(r"\([0-4] ", "", s)
  s = re.sub(r"\([0-4] ", "", s)
  # What remains of the brackets are the closers; each one becomes a "1".
  s = re.sub(r"\)", "1", s)
  return list(map(int, s.split()))

def tokens_from_treestring(s):
  """Return the leaf tokens of a bracketed sentiment-tree string, in order."""
  # Delete every "(<digit> " opener and every ")" closer; only the
  # whitespace-separated tokens are left.
  without_brackets = re.sub(r"\([0-9] |\)", "", s)
  return without_brackets.split()

In [8]:
from collections import namedtuple
from nltk import Tree

# namedtuple gives us a small record type without writing a full class.
# One Example holds the tokens, tree, label and transitions of one sentence.
Example = namedtuple(
    typename="Example",
    field_names=["tokens", "tree", "label", "transitions"],
)
   

def examplereader(path, lower=False):
  """Yield one Example for every line of the tree file at `path`.

  When `lower` is true, each line is lowercased before it is parsed.
  """
  for raw_line in filereader(path):
    text = raw_line.lower() if lower else raw_line
    # Fields are evaluated in the order written: tokens, tree, label,
    # transitions.
    yield Example(
        tokens=tokens_from_treestring(text),
        tree=Tree.fromstring(text),  # parsed with NLTK's Tree
        label=int(text[1]),  # character at index 1, right after the first one
        transitions=transitions_from_treestring(text))

# Let's load the data into memory.
# Each split is read completely and stored as a list of Example tuples,
# so it can be indexed and iterated more than once.
LOWER = False  # we will keep the original casing
train_data = list(examplereader("trees/train.txt", lower=LOWER))
dev_data = list(examplereader("trees/dev.txt", lower=LOWER))
test_data = list(examplereader("trees/test.txt", lower=LOWER))

# Report the number of examples in each split.
print("train", len(train_data))
print("dev", len(dev_data))
print("test", len(test_data))

train 8544
dev 1101
test 2210


In [9]:
# Here we first define a class that can map a word to an ID (w2i)
# and back (i2w).

from collections import Counter, OrderedDict, defaultdict


class OrderedCounter(Counter, OrderedDict):
  """A Counter whose keys keep the order in which they were first counted."""

  def __repr__(self):
    # Show the class name wrapped around an OrderedDict view of the counts.
    class_name = self.__class__.__name__
    contents = OrderedDict(self)
    return '%s(%r)' % (class_name, contents)

  def __reduce__(self):
    # Rebuild instructions: call the class on an OrderedDict snapshot.
    snapshot = OrderedDict(self)
    return self.__class__, (snapshot,)


class Vocabulary:
  """A vocabulary, assigns IDs to tokens.

  Typical use: call count_token() for every token in the data, then build()
  once to assign the IDs.

  Attributes:
    freqs: OrderedCounter mapping token -> times count_token() saw it.
    w2i: dict mapping token -> integer ID.
    i2w: list mapping integer ID -> token (the inverse of w2i).
  """
  
  def __init__(self):
    self.freqs = OrderedCounter()
    self.w2i = {}
    self.i2w = []

  def count_token(self, t):
    """Record one more occurrence of token `t`."""
    self.freqs[t] += 1
    
  def add_token(self, t):
    """Assign the next free ID to token `t`.

    A token that already has an ID is left untouched. Without this guard a
    repeated token (e.g. a literal "<unk>" in the data, or a second call to
    build()) would overwrite its ID, grow i2w without growing w2i, and cause
    the next new token to receive a duplicate ID.
    """
    if t in self.w2i:
      return
    self.w2i[t] = len(self.w2i)
    self.i2w.append(t)    
    
  def build(self, min_freq=0):
    '''
    Assign IDs to the special tokens and to all sufficiently frequent tokens.

    min_freq: minimum number of occurrences for a word to be included  
              in the vocabulary
    '''
    self.add_token("<unk>")  # reserve 0 for <unk> (unknown words)
    self.add_token("<pad>")  # reserve 1 for <pad> (discussed later)   
    
    # Most frequent tokens get the lowest IDs. sorted() is stable, so tokens
    # with equal counts keep the order in which they were first counted.
    tok_freq = sorted(self.freqs.items(), key=lambda x: x[1], reverse=True)
    for tok, freq in tok_freq:
      if freq >= min_freq:
        self.add_token(tok)

In [10]:
# This process should be deterministic and should have the same result 
# if run multiple times on the same data set.

v = Vocabulary()
# Only the training split contributes to the token counts.
for data_set in (train_data,):
  for ex in data_set:
    for token in ex.tokens:
      v.count_token(token)

# build() adds <unk> and <pad> first, then the counted tokens ordered by
# descending frequency; the printed size therefore includes both specials.
v.build()
print("Vocabulary size:", len(v.w2i))

Vocabulary size: 18280


In [11]:
i2t = ["very negative", "negative", "neutral", "positive", "very positive"]
# The inverse mapping, from sentiment name to its numeric class.
# The labels are already integers, so a plain ordered dict is enough and
# no Vocabulary is needed.
t2i = OrderedDict((name, idx) for idx, name in enumerate(i2t))

## word2vec

In [12]:
# # This downloads the word2vec 300D Google News vectors 
# # The file has been truncated to only contain words that appear in our data set.
# # You can find the original file here: https://code.google.com/archive/p/word2vec/

# # You only need to do this once.
# # Please comment this out after downloading.
# !wget https://gist.githubusercontent.com/bastings/4d1c346c68969b95f2c34cfbc00ba0a0/raw/76b4fefc9ef635a79d0d8002522543bc53ca2683/googlenews.word2vec.300d.txt

In [13]:
embedding_size = 300
v_embed = Vocabulary()

# Rows 0 and 1 are all-zero vectors for the two special tokens that
# Vocabulary.build() always places first (<unk> and <pad>).
vectors = [np.zeros(embedding_size), np.zeros(embedding_size)]
# The encoding is given explicitly (as in filereader) so the result does not
# depend on the platform's default encoding.
with open('googlenews.word2vec.300d.txt', 'r', encoding='utf-8') as fin:
    for embedding in fin:
        # Each line is "<word> <v1> <v2> ...": split off the word, then parse
        # the numbers straight to float instead of stacking strings.
        word, vector = embedding.split(None, maxsplit=1)
        vectors.append([float(value) for value in vector.split()])
        v_embed.count_token(word)

v_embed.build()        
print("Vocabulary size:", len(v_embed.w2i))
vectors = np.stack(vectors, axis=0).astype(float)

# Row i of `vectors` is meant to be the embedding of v_embed.i2w[i]. build()
# orders tokens by frequency, so this only holds when every word occurs
# exactly once in the file; a duplicate would show up as extra rows here.
assert vectors.shape[0] == len(v_embed.w2i), (
    "embedding rows (%d) and vocabulary size (%d) differ; "
    "the embedding file probably contains duplicate words"
    % (vectors.shape[0], len(v_embed.w2i)))

Vocabulary size: 18922


## Torch

In [14]:
import torch
print("Using torch", torch.__version__) 

# PyTorch can run on CPU or on Nvidia GPU (video card) using CUDA
# This cell selects the GPU if one is available.
# `device` is what the experiment cells below pass to model.to(...).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# When running on the CuDNN backend two further options must be set for reproducibility
# They are only touched when CUDA is available; on CPU nothing is changed.
if torch.cuda.is_available():
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

Using torch 1.10.0
cpu


# Experiments:

### N-ary Tree-LSTM vs Child-Sum Tree-LSTM

In [None]:
# Baseline training
# Trains one TreeLSTMClassifier per seed and collects, for each run, the
# values returned by train_model (best iteration, train/dev/test accuracy).
# NOTE(review): TreeLSTMClassifier, train_model, prepare_treelstm_minibatch,
# evaluate and get_minibatch are neither defined nor imported in the part of
# the notebook visible here -- confirm they exist before this cell runs.
# NOTE(review): only torch is seeded below; if batching relies on `random`
# or numpy, runs are not fully determined by `seed` -- confirm.
best_iters, train_accs, dev_accs, test_accs = [], [], [], []
for seed in [1,2,3]:
  torch.manual_seed(seed)

  # Presumably 300 is the embedding size and 150 the hidden size -- verify
  # against the TreeLSTMClassifier signature.
  tree_model = TreeLSTMClassifier(
      len(v_embed.w2i), 300, 150, len(t2i), v_embed)

  # Load the pre-trained word vectors into the embedding layer and freeze
  # them so they receive no gradient.
  with torch.no_grad():
    tree_model.embed.weight.data.copy_(torch.from_numpy(vectors))
    tree_model.embed.weight.requires_grad = False
    
  model = tree_model.to(device)

  optimizer = torch.optim.Adagrad(model.parameters(), lr=0.05)
    
  best_iter, train_acc, dev_acc, test_acc = train_model(
        model, optimizer, train_data, dev_data, 
        test_data, num_iterations=10000, 
        print_every=250, eval_every=250,
        prep_fn=prepare_treelstm_minibatch,
        eval_fn=evaluate,
        batch_fn=get_minibatch,
        batch_size=25, eval_batch_size=25)

  # Keep the per-seed results for the summary below.
  best_iters.append(best_iter)
  train_accs.append(train_acc)
  dev_accs.append(dev_acc)
  test_accs.append(test_acc)

# Summarise the three runs (print_results is imported from utils).
print_results(best_iters, train_accs, dev_accs, test_accs)

In [None]:
# Now let's train the Child-Sum Tree LSTM!
# Same procedure as the baseline cell above, but with
# ChildSumTreeLSTMClassifier as the model.
# NOTE(review): this cell uses num_iterations=30000 while the baseline cell
# uses 10000 -- confirm the different training budgets are intended when the
# two models are compared.
# NOTE(review): train_model, prepare_treelstm_minibatch, evaluate and
# get_minibatch are not defined in the visible part of the notebook.
from LSTM import ChildSumTreeLSTMClassifier

# The result lists are reset here, so the baseline numbers are overwritten.
best_iters, train_accs, dev_accs, test_accs = [], [], [], []
for seed in [1, 2, 3]:
    torch.manual_seed(seed)
    tree_model = ChildSumTreeLSTMClassifier(
        len(v_embed.w2i), 300, 150, len(t2i), v_embed)

    # Load the pre-trained word vectors into the embedding layer and freeze
    # them so they receive no gradient.
    with torch.no_grad():
        tree_model.embed.weight.data.copy_(torch.from_numpy(vectors))
        tree_model.embed.weight.requires_grad = False

    model = tree_model.to(device)

    optimizer = torch.optim.Adagrad(model.parameters(), lr=0.05)

    best_iter, train_acc, dev_acc, test_acc = train_model(
        model, optimizer, train_data, dev_data,
        test_data, num_iterations=30000,
        print_every=250, eval_every=250,
        prep_fn=prepare_treelstm_minibatch,
        eval_fn=evaluate,
        batch_fn=get_minibatch,
        batch_size=25, eval_batch_size=25)

    # Keep the per-seed results for the summary below.
    best_iters.append(best_iter)
    train_accs.append(train_acc)
    dev_accs.append(dev_acc)
    test_accs.append(test_acc)


# Summarise the three runs (print_results is imported from utils).
print_results(best_iters, train_accs, dev_accs, test_accs)