# End model to predict financial incentives and disincentives

John Brandt

Last updated: Aug 19, 2019


This notebook contains a gold standard baseline (LSTM with gold standard labels) as well as a noisy implementation of snorkel labels with roBERTa encoding.

In [1]:
import numpy as np
import torch
import pandas as pd
from torch import nn

## Shared classes

#### Data Loader

In [2]:
import torch
from torch.utils import data
from torch.utils.data import DataLoader, TensorDataset

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, list_IDs, labels, mode):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs
        self.mode = mode
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.list_IDs[index]

        # Load data and get label
        X = np.load('../data/processed/' + self.mode + '_embeddings/' + str(ID) + '.npy')
        y = self.labels[ID]

        return X.reshape((50, 1024)), y

In [13]:
df = pd.read_csv("../data/processed/gold_standard.csv")
gs_probas = np.load('../data/interim/snorkel_proba.npy')
noisy_probas = np.load('../data/interim/snorkel_noisy_proba.npy')

params = {'batch_size': 50,
          'shuffle': True,
          'num_workers': 2}

Y = df['class'] - 1
    
# Datasets
partition = {'train': [x for x in range(0, 800)],
             'validation': [x for x in range(800, 1000)]}

noisy_labels = {k:w for w,k in zip(noisy_probas, range(1000))}
gs_labels = {k:w for w,k in zip(Y, range(800, 1000))}

# Generators
training_set = Dataset(partition['train'], gs_probas[0:800], 'test')
training_generator = data.DataLoader(training_set, **params)

validation_set = Dataset(partition['validation'], gs_labels, 'test')
validation_generator = data.DataLoader(validation_set, **params)


#### Soft label loss

In [14]:
import torch.nn as nn
import torch.nn.functional as F


class SoftCrossEntropyLoss(nn.Module):
    """Computes the CrossEntropyLoss while accepting probabilistic (float) targets
    Args:
        weight: a tensor of relative weights to assign to each class.
            the kwarg name 'weight' is used to match CrossEntropyLoss
        reduction: how to combine the elementwise losses
            'none': return an unreduced list of elementwise losses
            'mean': return the mean loss per elements
            'sum': return the sum of the elementwise losses
    Accepts:
        input: An [n, k] float tensor of prediction logits (not probabilities)
        target: An [n, k] float tensor of target probabilities
    """

    def __init__(self, weight=None, reduction="mean"):
        super().__init__()
        # Register as buffer is standard way to make sure gets moved /
        # converted with the Module, without making it a Parameter
        if weight is None:
            self.weight = None
        else:
            # Note: Sets the attribute self.weight as well
            self.register_buffer("weight", torch.FloatTensor(weight))
        self.reduction = reduction

    def forward(self, input, target):
        n, k = input.shape
        # Note that t.new_zeros, t.new_full put tensor on same device as t
        cum_losses = input.new_zeros(n)
        for y in range(k):
            cls_idx = input.new_full((n,), y, dtype=torch.long)
            y_loss = F.cross_entropy(input, cls_idx, reduction="none")
            if self.weight is not None:
                y_loss = y_loss * self.weight[y]
            cum_losses += target[:, y].float() * y_loss
        if self.reduction == "none":
            return cum_losses
        elif self.reduction == "mean":
            return cum_losses.mean()
        elif self.reduction == "sum":
            return cum_losses.sum()
        else:
            raise ValueError(f"Unrecognized reduction: {self.reduction}")

#### Training function

In [15]:
def apply(model, criterion, batch, targets):
    pred = model(torch.autograd.Variable(batch))
    loss = criterion(pred, torch.autograd.Variable(targets))
    return pred, loss


def train(model,
         n_epochs,
         train_generator,
         val_generator):
    
    softcriterion = SoftCrossEntropyLoss()
    hardcriterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.002)
    counter = 0
    
    for epoch in range(1, n_epochs + 1):
        for inputs, labels in train_generator:
            counter += 1
            model.zero_grad()
            inputs = inputs.type(torch.FloatTensor)
            pred, loss = apply(model, softcriterion, inputs, labels)
            loss.backward()
            optimizer.step()
            if counter % print_every == 0:
                # Get validation loss
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                for inputs, labels in val_generator:
                    inputs = inputs.type(torch.FloatTensor)
                    output = model(inputs)
                    val_loss = hardcriterion(output.reshape((batch_size, 3)), labels.long())
                    val_losses.append(val_loss.item())

                model.train()
                print("Epoch: {}/{}...".format(epoch+1, n_epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
    return model

## Gold standard baseline

Shallow RNN with RoBERTa encoded words.

In [16]:
import torch.autograd as autograd

class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_size = output_size

        # Defining the layers
        # RNN Layer
        self.lstm = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True)   
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, self.output_size)
    
    def forward(self, batch):
        self.hidden = self.init_hidden(batch.size(-2))
        outputs, (ht, ct) = self.lstm(batch, self.hidden)
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = self.fc(ht[-1])
        return out
    
    def init_hidden(self, batch_size):
        return(autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
                                 autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))

In [17]:
model = Model(input_size = 1024, output_size = 3, hidden_dim = 200, n_layers = 1)
model.to("cpu")

n_epochs = 100

In [18]:
device = "cpu"
print_every = 100
batch_size = 50

model = train(model = model,
             n_epochs = 250,
             train_generator = training_generator,
             val_generator = validation_generator)

Epoch: 8/250... Step: 100... Loss: 0.344589... Val Loss: 1.252464
Epoch: 14/250... Step: 200... Loss: 0.342282... Val Loss: 1.358753
Epoch: 20/250... Step: 300... Loss: 0.328632... Val Loss: 1.242303
Epoch: 26/250... Step: 400... Loss: 0.283295... Val Loss: 1.359109
Epoch: 33/250... Step: 500... Loss: 0.300449... Val Loss: 1.385932
Epoch: 39/250... Step: 600... Loss: 0.310107... Val Loss: 1.391400
Epoch: 45/250... Step: 700... Loss: 0.257704... Val Loss: 1.406703
Epoch: 51/250... Step: 800... Loss: 0.256873... Val Loss: 1.419379


Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connectio

KeyboardInterrupt: 

## Snorkel end model with RoBERTA-LSTM input module and noise-aware output head

In [11]:
Y = df['class'] - 1
    
# Datasets
partition = {'train': [x for x in range(0, 15000)],
             'validation': [x for x in range(0, 1000)]}

noisy_labels = {k:w for w,k in zip(noisy_probas, range(15000))}
gs_labels = {k:w for w,k in zip(Y, range(0, 1000))}

# Generators
training_set = Dataset(partition['train'], noisy_labels, 'train')
training_generator = data.DataLoader(training_set, **params)

validation_set = Dataset(partition['validation'], gs_labels, 'test')
validation_generator = data.DataLoader(validation_set, **params)

In [12]:
model = Model(input_size = 1024, output_size = 3, hidden_dim = 200, n_layers = 1)
model.to("cpu")

n_epochs = 100

device = "cpu"
print_every = 100
batch_size = 50

model = train(model = model,
             n_epochs = 250,
             train_generator = training_generator,
             val_generator = validation_generator)

Epoch: 2/250... Step: 100... Loss: 0.528898... Val Loss: 1.002922
Epoch: 2/250... Step: 200... Loss: 0.576691... Val Loss: 1.018679
Epoch: 2/250... Step: 300... Loss: 0.472449... Val Loss: 1.097656
Epoch: 3/250... Step: 400... Loss: 0.579681... Val Loss: 1.114452
Epoch: 3/250... Step: 500... Loss: 0.586448... Val Loss: 1.136326
Epoch: 3/250... Step: 600... Loss: 0.521979... Val Loss: 1.020871


Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/john.brandt/anaconda3/lib/python3.7/multiprocessing/connectio

KeyboardInterrupt: 

## To do: Snorkel end model with RoBERTa-LSTM input module and multi-task output head

## To do: Snorkel end model with RoBERTa-LSTM input module and multi-task output head, with concatenation of additional feature engineering

## To do: Snorkel end model with RoBERTa-LSTM input module and multi-task output head, with concatenation of additional feature engineering and synonym augmentation