# Natural Language Inference with Mixed Effects Models

In [63]:
import numpy as np
import pandas as pd
import torch

from typing import Tuple
from pandas.api.types import CategoricalDtype
from torch import softmax
from torch.nn import Module, ModuleList, Linear, LogSoftmax, ReLU, Sequential, CrossEntropyLoss, BCEWithLogitsLoss, Dropout
from torch.optim import Adam
from torch.distributions.multivariate_normal import MultivariateNormal

from fairseq.data.data_utils import collate_tokens

# Load data

## MegaVeridicality

We first pull the [MegaVeridicality v2](http://megaattitude.io/projects/mega-veridicality/) data from the [MegaAttitude website](http://megaattitude.io/).

In [2]:
ver_url = 'http://megaattitude.io/projects/mega-veridicality/mega-veridicality-v2/mega-veridicality-v2.csv'

ver = pd.read_csv(ver_url)

ver.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,voice,polarity,conditional,sentence,veridicality,acceptability,nativeenglish,exclude
0,487,0,1,surmise,that_S,active,positive,False,Someone surmised that a particular thing happened,maybe,6,True,False
1,487,0,2,update,that_S,active,positive,False,Someone updated that a particular thing happened,yes,2,True,False
2,487,0,3,disregard,that_S,active,negative,False,Someone didn\'t disregard that a particular th...,maybe,4,True,False
3,487,0,4,agree,that_S,active,positive,False,Someone agreed that a particular thing happened,yes,4,True,False
4,487,0,5,surmise,that_S,active,negative,False,Someone didn\'t surmise that a particular thin...,maybe,3,True,False


### Filtering

Next, we remove non-native English speakers.

In [3]:
ver = ver[ver.nativeenglish]

ver.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,voice,polarity,conditional,sentence,veridicality,acceptability,nativeenglish,exclude
0,487,0,1,surmise,that_S,active,positive,False,Someone surmised that a particular thing happened,maybe,6,True,False
1,487,0,2,update,that_S,active,positive,False,Someone updated that a particular thing happened,yes,2,True,False
2,487,0,3,disregard,that_S,active,negative,False,Someone didn\'t disregard that a particular th...,maybe,4,True,False
3,487,0,4,agree,that_S,active,positive,False,Someone agreed that a particular thing happened,yes,4,True,False
4,487,0,5,surmise,that_S,active,negative,False,Someone didn\'t surmise that a particular thin...,maybe,3,True,False


MegaVeridicality contains judgments to the same items presented under two different prompts: 

1. Conditional prompt: If someone \_ed that a particular thing happened, did that thing happen?
2. Unconditional prompt: Someone \_ed that a particular thing happened. Did that thing happen?

We remove responses to conditional items.

In [4]:
ver = ver[~ver.conditional]

ver.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,voice,polarity,conditional,sentence,veridicality,acceptability,nativeenglish,exclude
0,487,0,1,surmise,that_S,active,positive,False,Someone surmised that a particular thing happened,maybe,6,True,False
1,487,0,2,update,that_S,active,positive,False,Someone updated that a particular thing happened,yes,2,True,False
2,487,0,3,disregard,that_S,active,negative,False,Someone didn\'t disregard that a particular th...,maybe,4,True,False
3,487,0,4,agree,that_S,active,positive,False,Someone agreed that a particular thing happened,yes,4,True,False
4,487,0,5,surmise,that_S,active,negative,False,Someone didn\'t surmise that a particular thin...,maybe,3,True,False


Finally, we remove NA responses, which arise from MTurk errors.

In [5]:
ver = ver[~ver.veridicality.isnull()]

### Hypothesis Generation

Next, we add a column containing the hypothesis corresponding to each item. This is just the declarative form of the question that participants were asked.

In [6]:
def make_hypothesis(frame):
    if frame in ['that_S', 'for_NP_to_VP']:
        return 'That thing happened.'
    elif frame in ['to_VPeventive', 'NP_to_VPeventive']:
        return 'That person did that thing.'
    elif frame in ['to_VPstative', 'NP_to_VPstative']:
        return 'That person had that thing.'
    
ver['hypothesis'] = ver.frame.map(make_hypothesis)

ver.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,voice,polarity,conditional,sentence,veridicality,acceptability,nativeenglish,exclude,hypothesis
0,487,0,1,surmise,that_S,active,positive,False,Someone surmised that a particular thing happened,maybe,6,True,False,That thing happened.
1,487,0,2,update,that_S,active,positive,False,Someone updated that a particular thing happened,yes,2,True,False,That thing happened.
2,487,0,3,disregard,that_S,active,negative,False,Someone didn\'t disregard that a particular th...,maybe,4,True,False,That thing happened.
3,487,0,4,agree,that_S,active,positive,False,Someone agreed that a particular thing happened,yes,4,True,False,That thing happened.
4,487,0,5,surmise,that_S,active,negative,False,Someone didn\'t surmise that a particular thin...,maybe,3,True,False,That thing happened.


### Column hashing

We then convert the response itself to an integer: _no_ = 0, _maybe_ = 1, _yes_ = 2. This is necessary for the model.

In [7]:
ver['veridicality'] = ver.veridicality.astype(CategoricalDtype(['no', 'maybe', 'yes']))
ver['target'] = ver.veridicality.cat.codes

ver.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,voice,polarity,conditional,sentence,veridicality,acceptability,nativeenglish,exclude,hypothesis,target
0,487,0,1,surmise,that_S,active,positive,False,Someone surmised that a particular thing happened,maybe,6,True,False,That thing happened.,1
1,487,0,2,update,that_S,active,positive,False,Someone updated that a particular thing happened,yes,2,True,False,That thing happened.,2
2,487,0,3,disregard,that_S,active,negative,False,Someone didn\'t disregard that a particular th...,maybe,4,True,False,That thing happened.,1
3,487,0,4,agree,that_S,active,positive,False,Someone agreed that a particular thing happened,yes,4,True,False,That thing happened.,2
4,487,0,5,surmise,that_S,active,negative,False,Someone didn\'t surmise that a particular thin...,maybe,3,True,False,That thing happened.,1


We similarly convert the participant indices to contiguous integers. This step is necessary since we removed some participants, meaning the participant identifiers are not necessarily contiguous. This conversion is necessary for the random effects component of the model.

In [8]:
ver['participant'] = ver.participant.astype('category').cat.codes

### Majority response

Lastly, we compute the modal response for each verb-frame pair. This will allow us to determine how well the model does in comparison to the best possible model.

In [9]:
ver['modal_response'] = ver.groupby(['verb', 'frame']).target.transform(lambda x: int(np.round(np.mean(x))))
(ver.target == ver.modal_response).mean()

0.6535400834164939

## MegaNegRaising

Next, we pull the [MegaNegRaising v1](http://megaattitude.io/projects/mega-negraising/) data from the [MegaAttitude website](http://megaattitude.io/).

In [10]:
neg_url = 'http://megaattitude.io/projects/mega-negraising/mega-negraising-v1/mega-negraising-v1.tsv'

neg = pd.read_csv(neg_url, sep='\t')

neg.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,tense,subject,sentence,negraising,acceptability,nativeenglish
0,229,81,8,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.3,0.97,True
1,259,81,15,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.09,0.34,True
2,883,81,14,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.5,0.71,True
3,665,81,22,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.0,0.87,True
4,901,81,12,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.33,0.61,True


## Hypothesis Generation

To generate the hypotheses, it is useful to use the sentences found in [MegaAcceptability v2](http://megaattitude.io/projects/mega-acceptability/). And so we pull those data as well.

In [11]:
acc_url = 'http://megaattitude.io/projects/mega-acceptability/mega-acceptability-v2/mega-acceptability-v2.tsv'

acc = pd.read_csv(acc_url, sep='\t')

acc.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,tense,response,nativeenglish,sentence,version
0,192,504,13,abhor,NP V,past,7.0,True,Someone abhorred.,1
1,638,504,13,abhor,NP V,past,3.0,True,Someone abhorred.,1
2,200,504,13,abhor,NP V,past,1.0,True,Someone abhorred.,1
3,336,504,13,abhor,NP V,past,3.0,True,Someone abhorred.,1
4,339,504,13,abhor,NP V,past,4.0,True,Someone abhorred.,1


Using this data for hypothesis generation requires that we have access to the verb lemma and verb form found in the sentence. These functions extract that information.

In [12]:
def get_idx(sentence, tense, template, verblemma):
    tokens = sentence.split()
    lemmasplit = verblemma.split('_')
    idx = np.where([w=='V' for w in template.split()])[0][0]
    
    if template == 'S, I V':
        if len(lemmasplit) > 1:
            return [len(tokens)-3, len(tokens)-2]
        else:
            return [len(tokens)-2]
        
    elif tense == 'past_progressive':
        
        if len(lemmasplit) > 1:
            return [idx+1, idx+2]
        else:
            return [idx+1]
        
    else:
        if len(lemmasplit) > 1:
            return [idx, idx+1]
        else:
            return [idx]

def get_verb_form(sentence, idx):
    tokens = np.array(sentence.split())
    return ' '.join([c.replace('.', '') for t in tokens[idx] for c in t.split('_')])

We then extract that information.

In [13]:
acc['verbidx'] = acc[['sentence', 'tense', 'frame', 'verb']].apply(lambda x: get_idx(*x), axis=1)
acc['verbform'] = acc[['sentence', 'verbidx']].apply(lambda x: get_verb_form(*x), axis=1)

acc.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,tense,response,nativeenglish,sentence,version,verbidx,verbform
0,192,504,13,abhor,NP V,past,7.0,True,Someone abhorred.,1,[1],abhorred
1,638,504,13,abhor,NP V,past,3.0,True,Someone abhorred.,1,[1],abhorred
2,200,504,13,abhor,NP V,past,1.0,True,Someone abhorred.,1,[1],abhorred
3,336,504,13,abhor,NP V,past,3.0,True,Someone abhorred.,1,[1],abhorred
4,339,504,13,abhor,NP V,past,4.0,True,Someone abhorred.,1,[1],abhorred


Because we just care about the mapping from verb, frame, and tense to sentence, we drop all the other columns, de-dupe, and rename the sentence column to `hypothesis`.

In [14]:
sentence_map = acc[['verb', 'verbform', 'frame', 'tense', 'sentence']].drop_duplicates().reset_index(drop=True)

sentence_map = sentence_map.rename(columns={'sentence': 'hypothesis'})

sentence_map.head()

Unnamed: 0,verb,verbform,frame,tense,hypothesis
0,abhor,abhorred,NP V,past,Someone abhorred.
1,abhor,abhorred,NP V NP,past,Someone abhorred something.
2,abhor,abhorring,NP V NP,past_progressive,Someone was abhorring something.
3,abhor,abhors,NP V NP,present,Someone abhors something.
4,abhor,abhorred,NP V NP VP,past,Someone abhorred someone do something.


We then add the negation in to make the neg-raising hypotheses and then add them to the neg-raising data.

In [15]:
sentence_map['hypothesis'] = sentence_map.hypothesis.str.replace('something happened.', "that thing didn't happen.")
sentence_map['hypothesis'] = sentence_map.hypothesis.str.replace('to do something.', "not to do that thing.")
sentence_map['hypothesis'] = sentence_map.hypothesis.str.replace('to have something.', "not to have that thing.")

neg = pd.merge(neg, sentence_map)

neg.head()

Unnamed: 0,participant,list,presentationorder,verb,frame,tense,subject,sentence,negraising,acceptability,nativeenglish,verbform,hypothesis
0,229,81,8,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.3,0.97,True,abhorred,Someone abhorred that that thing didn't happen.
1,259,81,15,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.09,0.34,True,abhorred,Someone abhorred that that thing didn't happen.
2,883,81,14,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.5,0.71,True,abhorred,Someone abhorred that that thing didn't happen.
3,665,81,22,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.0,0.87,True,abhorred,Someone abhorred that that thing didn't happen.
4,901,81,12,abhor,NP V that S,past,first,I didn't abhor that a particular thing happened.,0.33,0.61,True,abhorred,Someone abhorred that that thing didn't happen.


The last thing we need to do is convert the subject to first person when the neg-raising sentence being judged has a first person subject.

In [16]:
def convert_subject(subject, tense, verb, frame, verbform, hypothesis):
    if subject == 'first':
        hypothesis = hypothesis.replace('Someone', 'I')
        
        if tense == 'present':
            hypothesis = hypothesis.replace('I is', "I'm")
            
            if 'be' not in frame:
                hypothesis = hypothesis.replace(verbform, verb)
        
        return hypothesis
    else:
        return hypothesis.replace('Someone', 'That person')
    
neg['hypothesis'] = neg[['subject', 'tense', 'verb', 'frame', 'verbform', 'hypothesis']].apply(lambda x: convert_subject(*x), axis=1)

neg[(neg.subject=='first')&(neg.tense=='present')]

Unnamed: 0,participant,list,presentationorder,verb,frame,tense,subject,sentence,negraising,acceptability,nativeenglish,verbform,hypothesis
40,1012,39,18,abhor,NP be V that S,present,first,I'm not abhorred that a particular thing happe...,0.00,0.27,True,abhorred,I'm abhorred that that thing didn't happen.
41,1018,39,3,abhor,NP be V that S,present,first,I'm not abhorred that a particular thing happe...,0.64,0.36,True,abhorred,I'm abhorred that that thing didn't happen.
42,265,39,17,abhor,NP be V that S,present,first,I'm not abhorred that a particular thing happe...,0.76,0.56,True,abhorred,I'm abhorred that that thing didn't happen.
43,488,39,10,abhor,NP be V that S,present,first,I'm not abhorred that a particular thing happe...,0.04,0.64,True,abhorred,I'm abhorred that that thing didn't happen.
44,643,39,3,abhor,NP be V that S,present,first,I'm not abhorred that a particular thing happe...,0.58,0.58,True,abhorred,I'm abhorred that that thing didn't happen.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78845,550,209,28,yell,NP V to VP[+eventive],present,first,I don't yell to do a particular thing.,0.00,0.60,True,yells,I yell not to do that thing.
78846,377,209,31,yell,NP V to VP[+eventive],present,first,I don't yell to do a particular thing.,0.69,0.00,True,yells,I yell not to do that thing.
78847,937,209,30,yell,NP V to VP[+eventive],present,first,I don't yell to do a particular thing.,0.27,0.31,True,yells,I yell not to do that thing.
78848,771,209,25,yell,NP V to VP[+eventive],present,first,I don't yell to do a particular thing.,0.63,0.41,True,yells,I yell not to do that thing.


### Column Hashing

As for MegaVeridicality, we need to map the participants to contiguous integers. The neg-raising slider response itself does not need to be changed, since it is already numeric.

In [17]:
neg['participant'] = neg.participant.astype('category').cat.codes
neg['target'] = neg.negraising

### Majority Response

We will be using a binary cross entrop loss, and the best possible response for this loss is the mean

In [18]:
neg['modal_response'] = neg.groupby(['verb', 'frame', 'tense', 'subject']).negraising.transform(np.mean)

-(neg.negraising * np.log(neg.modal_response) + (1-neg.negraising) * np.log(1-neg.modal_response)).mean()

0.6229927846085578

# Fit models

In [19]:
roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()

Using cache found in /Users/wgantt/.cache/torch/hub/pytorch_fairseq_master


RobertaHubInterface(
  (model): RobertaModel(
    (decoder): RobertaEncoder(
      (sentence_encoder): TransformerSentenceEncoder(
        (embed_tokens): Embedding(50265, 768, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 768, padding_idx=1)
        (layers): ModuleList(
          (0): TransformerSentenceEncoderLayer(
            (self_attn): MultiheadAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
            (final_layer_norm): LayerNorm((768

In [20]:
token_ids = roberta.encode(neg.loc[100,'sentence'], neg.loc[100,'hypothesis'])

print(neg.loc[100,'sentence'], neg.loc[100,'hypothesis'])
print(roberta.decode(token_ids))

I didn't accept that a particular thing happened. I accepted that that thing didn't happen.
["I didn't accept that a particular thing happened.", "I accepted that that thing didn't happen."]


In [21]:
with torch.no_grad():
    emb = roberta.extract_features(token_ids)
    
emb.shape, emb

(torch.Size([1, 23, 768]),
 tensor([[[-0.1013,  0.0900, -0.0011,  ..., -0.1106, -0.0496, -0.0244],
          [-0.0525,  0.0970, -0.0762,  ...,  0.2017, -0.1770,  0.0313],
          [-0.0429,  0.1596, -0.0277,  ..., -0.1038, -0.2597,  0.1564],
          ...,
          [ 0.0680, -0.0329,  0.1913,  ..., -0.1216,  0.2177, -0.0265],
          [-0.1008,  0.0795, -0.0270,  ..., -0.1501, -0.0499, -0.0519],
          [-0.0529, -0.0010,  0.0384,  ..., -0.0635,  0.0235,  0.0492]]]))

In [237]:
class NaturalLanguageInference(torch.nn.Module):
    
    def __init__(self, embedding_dim: int, n_predictor_layers: int,  
                 output_dim: int, n_participants: int,
                 use_random_slopes=False, device=torch.device('cpu')):
        super().__init__()
        
        self.embedding_dim = embedding_dim
        self.n_predictor_layers = n_predictor_layers
        self.output_dim = output_dim
        self.n_participants = n_participants
        self.use_random_slopes = use_random_slopes
        self.device = device
        
        self._initialize_random_effects()
        
        # TODO: comment
        if self.use_random_slopes:
            self.predictor = self._initialize_predictor_for_random_slopes(self.n_participants)
        else:
            self.predictor = self._initialize_predictor()
     
    def _initialize_predictor(self):
        seq = []
        
        prev_size = self.embedding_dim
        
        for l in range(self.n_predictor_layers):
            curr_size = int(prev_size/2)
                
            seq += [Linear(prev_size,
                           curr_size),
                    ReLU(),
                    Dropout(0.5)]

            prev_size = curr_size

        seq += [Linear(prev_size,
                       self.output_dim)]
        
        return Sequential(*seq)

    def _initialize_predictor_for_random_slopes(self, n_participants):
        # Separate MLP for each annotator. We assume the annotator IDs
        # are zero-indexed and range up to n_participants.
        heads = []
        for _ in range(n_participants):
            seq = []
            prev_size = self.embedding_dim
        
            for l in range(self.n_predictor_layers):
                curr_size = int(prev_size/2)

                seq += [Linear(prev_size,
                               curr_size),
                        ReLU(),
                        Dropout(0.5)]

                prev_size = curr_size

            seq += [Linear(prev_size,
                           self.output_dim)]
            
            heads.append(Sequential(*seq))
            
        return ModuleList(heads)
        
    def forward(self, embeddings, participant=None) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.use_random_slopes:
            # Not sure this is right, but given that the *entire* prediction head
            # is random, I don't know what would count as the fixed component.
            # Maybe just the embedding?
            fixed = None
            
            # In the random slopes setting, 'predictor' is actually a Tensor of 
            # n_participants MLPs, one for each participant. Couldn't figure out
            # a way to vectorize this, unfortunately. This should probably go in
            # the _random_effects method of a CategoricalNaturalLanguageInferenceRandomSlopes
            # class.
            random = torch.stack([self.predictor[p](e.mean(0)) for p, e in zip(participant, embeddings)], dim=0)
            random_loss = self._random_loss(self._random_effects(participant))
            
        else:
            fixed = self.predictor(embeddings.mean(1))
        
            if participant is None:
                random = None
                random_loss = 0.
            else:
                random = self._random_effects(participant)
                random_loss = self._random_loss(random)
        
        prediction = self._link_function(fixed, random, participant)
        
        return prediction, random_loss

    def embed(self, items: pd.DataFrame) -> torch.Tensor:
        texts, hypotheses = items.sentence.values, items.hypothesis.values
                
        token_ids = collate_tokens([roberta.encode(t, h) 
                                    for t, h in zip(texts, hypotheses)], 
                                   pad_idx=1)

        with torch.no_grad():
            embedding = roberta.extract_features(token_ids)
            
        return embedding

class CategoricalNaturalLanguageInference(NaturalLanguageInference):
    
    def _initialize_random_effects(self):
        self.random_effects = torch.randn(self.n_participants, self.output_dim)
    
    # Not sure why "participant" is a parameter here
    def _random_effects(self, participant):
        # No mean subtraction for random slopes
        if self.use_random_slopes:
            return self.random_effects
        else:
            return self.random_effects - self.random_effects.mean(0)[None,:]
    
    def _link_function(self, fixed, random, participant):
        # Random slopes + random intercepts case
        if fixed is None:
            return random
        # Fixed effects (standard setting)
        elif random is None:
            return fixed
        # Random intercepts alone (extended setting)
        else:
            return fixed + random[participant]
    
    def _random_loss(self, random):
        # TODO: Handle random slopes for tied covariance? May not be worth it.
        print(random.std(0))
        return torch.mean(torch.square(random/random.std(0)[None,:]))

class CategoricalNaturalLanguageInferenceUntiedCovariance(CategoricalNaturalLanguageInference):
    
    def _random_loss(self, random):
        # Random slopes + random intercepts: may have non-zero mean
        if self.use_random_slopes:
            mean = random.mean(0)
            # This is currently incorrect
            cov = torch.matmul(torch.transpose(random, 1, 0), random) / (self.n_participants - 1)
        # Random intercepts only: mean is zero
        else:
            mean = torch.zeros(self.n_participants, self.output_dim)
            cov = torch.matmul(torch.transpose(random, 1, 0), random) / (self.n_participants - 1)
        return torch.mean(MultivariateNormal(mean, cov).log_prob(random)[None,:])

In [133]:
nli = CategoricalNaturalLanguageInference(embedding_dim=768, n_predictor_layers=2, output_dim=3, n_participants=10)
nli(torch.cat([emb, emb]), [0, 1])

torch.Size([10, 3])


(tensor([[ 1.2562,  0.5619, -0.2292],
         [ 2.4061, -0.1792,  0.0312]], grad_fn=<AddBackward0>),
 tensor(0.9000))

In [91]:
nli(torch.cat([emb, emb]))

(tensor([[ 0.2046,  0.0377,  0.1039],
         [ 0.0072,  0.0219, -0.0238]], grad_fn=<AddmmBackward>),
 0.0)

In [92]:
nli = CategoricalNaturalLanguageInferenceUntiedCovariance(embedding_dim=768, n_predictor_layers=2, output_dim=3, n_participants=10)
nli(torch.cat([emb, emb]), [0, 1])

AttributeError: 'list' object has no attribute 'shape'

In [239]:
class UnitNaturalLanguageInference(NaturalLanguageInference):
    
    # TODO: Convert to using beta distribution
    def _initialize_random_effects(self):
        self.random_effects = torch.randn(self.n_participants, 2)
        
    def _random_effects(self, participant):
        random_scale = torch.square(self.random_effects[:,0])
        random_shift = self.random_effects[:,1] - self.random_effects[:,1].mean(0)
        
        return random_scale, random_shift
    
    def _link_function(self, fixed, random, participant):
        if random is None:
            return torch.square(self.random_effects[:,0]).mean()*fixed.squeeze(1)
        else:
            random_scale, random_shift = random
            return (random_scale[participant][:,None]*fixed +\
                    random_shift[participant][:,None]).squeeze(1)
    
    def _random_loss(self, random):
        random_scale, random_shift = random
        
        random_scale_loss = torch.mean(torch.square(random_scale/random_scale.mean(0)))
        random_shift_loss = torch.mean(torch.square(random_shift/random_shift.std(0)))
        print(random_scale)
        
        return (random_scale_loss + random_shift_loss)/2

In [None]:
class UnitNaturalLanguageInferenceRandomSlopes(NaturalLanguageInference):
    
    # TODO: implement
    def _initialize_random_effects(self):
        self.random_effects = torch.randn(self.n_participants, 2)
        
    def _random_effects(self, participant):
        pass

In [151]:
nli = UnitNaturalLanguageInference(embedding_dim=768, n_predictor_layers=2, output_dim=1, n_participants=10)
nli(torch.cat([emb, emb]), [0, 1])

AttributeError: 'list' object has no attribute 'shape'

In [153]:
nli(torch.cat([emb, emb]))

torch.Size([1, 23, 768])

In [223]:
class NaturalLanguageInferenceTrainer:
    
    def __init__(self, n_participants: int, 
                 embedding_dim: int = 768, 
                 n_predictor_layers: int = 2,
                 use_random_slopes: bool = False,
                 device=torch.device('cpu')):
        self.embedding_dim = embedding_dim
        self.n_predictor_layers = n_predictor_layers
        self.n_participants = n_participants
        self.device = device
        self.use_random_slopes = use_random_slopes,
        self.nli = self.MODEL_CLASS(embedding_dim, 
                                    n_predictor_layers,
                                    self.OUTPUT_DIM, 
                                    n_participants,
                                    use_random_slopes,
                                    device)
    
    def fit(self, data: pd.DataFrame, batch_size: int = 32, 
            n_epochs: int = 10, lr: float = 1e-2, verbosity: int = 10):
        
        optimizer = Adam(self.nli.parameters(),
                         lr=lr)
        lossfunc = self.LOSS_CLASS()
        
        self.nli.train()
        
        n_batches = np.ceil(data.shape[0]/batch_size)
        
        for epoch in range(n_epochs):
            # shuffle the data
            data = data.sample(frac=1).reset_index(drop=True)

            data['batch_idx'] = np.repeat(np.arange(n_batches), batch_size)[:data.shape[0]]
            
            loss_trace = []
            acc_trace = []
            best_trace = []
            
            for batch, items in data.groupby('batch_idx'):
                self.nli.zero_grad()
                
                participant = torch.LongTensor(items.participant.values).to(self.device)
                target = self.TARGET_TYPE(items.target.values)
                
                embedding = self.nli.embed(items)
                
                prediction, random_loss = self.nli(embedding, participant)
            
                fixed_loss = lossfunc(prediction, target)
                
                loss = fixed_loss + random_loss
                
                loss_trace.append(loss.item()-random_loss.item())
                
                if self.MODEL_CLASS is CategoricalNaturalLanguageInference:
                    acc = (prediction.argmax(1) == target).data.cpu().numpy().mean()
                    best = (items.modal_response==items.target).mean()
                    
                    acc_trace.append(acc)
                    best_trace.append(acc/best)
                
                elif self.MODEL_CLASS is CategoricalNaturalLanguageInferenceUntiedCovariance:
                    # Is the mean still the best possible performance in the random slopes case?
                    # Something's fishy here when use_random_slopes = True, since the "best" loss
                    # is often greater than the 
                    acc = (prediction.argmax(1) == target).data.cpu().numpy().mean()
                    best = (items.modal_response==items.target).mean()
                    
                    acc_trace.append(acc)
                    best_trace.append(acc/best)
                    
                elif self.MODEL_CLASS is UnitNaturalLanguageInference:
                    acc = loss_trace[-1]
                    best = -(items.target.values * np.log(items.modal_response.values) +\
                             (1-items.target.values) * np.log(1-items.modal_response.values)).mean()
                    
                    acc_trace.append(acc)
                    best_trace.append(1 - (acc-best)/best)
                
                if not (batch % verbosity):
                    print(f"fixed loss: {fixed_loss}")
                    print(f"random loss: {random_loss}")
                    print('epoch:              ', int(epoch))
                    print('batch:              ', int(batch))
                    print('mean loss:          ', np.round(np.mean(loss_trace), 2))
                    print('mean acc.:          ', np.round(np.mean(acc_trace), 2))
                    print('prop. best possible:', np.round(np.mean(best_trace), 2))
                    print()
                    
                    print()

                    loss_trace = []
                    acc_trace = []
                    best_trace = []
                
                loss.backward()
                optimizer.step()
                
        return self.nli.eval()
    
    def calibrate(self, data: pd.DataFrame,
                  trained_model: NaturalLanguageInference, 
                  calibrator_trainer_class: 'NaturalLanguageInferenceTrainer') -> NaturalLanguageInference:
        raise NotImplementedError
        
        n_participants = data.participant.unique().shape[0]
        
        trained_model.eval()
        
        calibrator_trainer = calibrator_trainer_class(n_participants=n_participants,
                                                      embedding_dim=trained_model.output_dim,
                                                      n_predictor_layers=self.n_predictor_layers)
        
        calibrator = calibrator_trainer.fit(data)

In [238]:
class CategoricalNaturalLanguageInferenceTrainer(NaturalLanguageInferenceTrainer):
    MODEL_CLASS = CategoricalNaturalLanguageInference
    LOSS_CLASS = CrossEntropyLoss
    TARGET_TYPE = torch.LongTensor
    OUTPUT_DIM = 3

n_ver_participants = ver.participant.unique().shape[0]
    
cnli_trainer = CategoricalNaturalLanguageInferenceTrainer(n_participants=n_ver_participants)
ver_model = cnli_trainer.fit(data=ver[ver.verb.isin(["know", 'think'])],
                             n_epochs=25, batch_size=128)

tensor([0.9914, 1.0290, 0.9344])
fixed loss: 1.40700101852417
random loss: 0.9980276823043823
epoch:               0
batch:               0
mean loss:           1.41
mean acc.:           0.34
prop. best possible: 0.59


tensor([0.9914, 1.0290, 0.9344])
tensor([0.9914, 1.0290, 0.9344])
fixed loss: 1.5120247602462769
random loss: 0.9980276823043823
epoch:               1
batch:               0
mean loss:           1.51
mean acc.:           0.47
prop. best possible: 0.76


tensor([0.9914, 1.0290, 0.9344])
tensor([0.9914, 1.0290, 0.9344])
fixed loss: 1.3643431663513184
random loss: 0.9980276823043823
epoch:               2
batch:               0
mean loss:           1.36
mean acc.:           0.36
prop. best possible: 0.56


tensor([0.9914, 1.0290, 0.9344])
tensor([0.9914, 1.0290, 0.9344])
fixed loss: 1.2718019485473633
random loss: 0.9980276823043823
epoch:               3
batch:               0
mean loss:           1.27
mean acc.:           0.41
prop. best possible: 0.71


tensor([0.9914,

KeyboardInterrupt: 

In [110]:
items = ver[(ver.verb.isin(['think', 'know']))&(ver.frame=='that_S')][['verb', 'frame', 'sentence', 'hypothesis']].drop_duplicates()
embedding = ver_model.embed(items)

list(zip(items.sentence.values, items.hypothesis.values, torch.softmax(ver_model(embedding)[0], 1).data.cpu().numpy()))

[('Someone thought that a particular thing happened',
  'That thing happened.',
  array([0.00310725, 0.8000219 , 0.19687088], dtype=float32)),
 ("Someone didn\\'t think that a particular thing happened",
  'That thing happened.',
  array([0.39985964, 0.5714444 , 0.02869605], dtype=float32)),
 ('Someone knew that a particular thing happened',
  'That thing happened.',
  array([6.890681e-07, 2.588863e-02, 9.741107e-01], dtype=float32)),
 ("Someone didn\\'t know that a particular thing happened",
  'That thing happened.',
  array([0.03967796, 0.44767112, 0.5126509 ], dtype=float32))]

In [236]:
class CategoricalNaturalLanguageInferenceUntiedCovarianceTrainer(NaturalLanguageInferenceTrainer):
    MODEL_CLASS = CategoricalNaturalLanguageInferenceUntiedCovariance
    LOSS_CLASS = CrossEntropyLoss
    TARGET_TYPE = torch.LongTensor
    OUTPUT_DIM = 3

n_ver_participants = ver.participant.unique().shape[0]
    
cnli_trainer = CategoricalNaturalLanguageInferenceUntiedCovarianceTrainer(n_participants=n_ver_participants, use_random_slopes=False)
ver_model = cnli_trainer.fit(data=ver[ver.verb.isin(["know", 'think'])],
                             n_epochs=25, batch_size=128)

tensor([[ 0.9461,  0.0122, -0.0895],
        [ 0.0122,  1.0064,  0.0334],
        [-0.0895,  0.0334,  1.0127]])
fixed loss: 1.4101446866989136
random loss: -4.230784893035889
epoch:               0
batch:               0
mean loss:           1.41
mean acc.:           0.32
prop. best possible: 0.53


tensor([[ 0.9461,  0.0122, -0.0895],
        [ 0.0122,  1.0064,  0.0334],
        [-0.0895,  0.0334,  1.0127]])
tensor([[ 0.9461,  0.0122, -0.0895],
        [ 0.0122,  1.0064,  0.0334],
        [-0.0895,  0.0334,  1.0127]])
fixed loss: 1.295589566230774
random loss: -4.230784893035889
epoch:               1
batch:               0
mean loss:           1.3
mean acc.:           0.43
prop. best possible: 0.75


tensor([[ 0.9461,  0.0122, -0.0895],
        [ 0.0122,  1.0064,  0.0334],
        [-0.0895,  0.0334,  1.0127]])
tensor([[ 0.9461,  0.0122, -0.0895],
        [ 0.0122,  1.0064,  0.0334],
        [-0.0895,  0.0334,  1.0127]])
fixed loss: 1.3057492971420288
random loss: -4.230784893035889
e

KeyboardInterrupt: 

In [None]:
items = ver[(ver.verb.isin(['think', 'know']))&(ver.frame=='that_S')][['verb', 'frame', 'sentence', 'hypothesis']].drop_duplicates()
embedding = ver_model.embed(items)

list(zip(items.sentence.values, items.hypothesis.values, torch.softmax(ver_model(embedding)[0], 1).data.cpu().numpy()))

In [240]:
class UnitNaturalLanguageInferenceTrainer(NaturalLanguageInferenceTrainer):
    MODEL_CLASS = UnitNaturalLanguageInference
    LOSS_CLASS = BCEWithLogitsLoss
    TARGET_TYPE = torch.FloatTensor
    OUTPUT_DIM = 1

n_neg_participants = neg.participant.unique().shape[0]

unli_trainer = UnitNaturalLanguageInferenceTrainer(n_participants=n_neg_participants)
neg_model = unli_trainer.fit(data=neg[neg.verb.isin(['think', 'know'])], 
                             n_epochs=100, batch_size=128, lr=1-1)

tensor([1.6621, 0.5234, 0.7755,  ..., 0.0270, 0.8010, 0.9642])
fixed loss: 0.8863546848297119
random loss: 1.948185682296753
epoch:               0
batch:               0
mean loss:           0.89
mean acc.:           0.89
prop. best possible: 0.57


tensor([1.6621, 0.5234, 0.7755,  ..., 0.0270, 0.8010, 0.9642])
tensor([1.6621, 0.5234, 0.7755,  ..., 0.0270, 0.8010, 0.9642])
tensor([1.6621, 0.5234, 0.7755,  ..., 0.0270, 0.8010, 0.9642])
fixed loss: 0.8896234631538391
random loss: 1.948185682296753
epoch:               1
batch:               0
mean loss:           0.89
mean acc.:           0.89
prop. best possible: 0.59


tensor([1.6621, 0.5234, 0.7755,  ..., 0.0270, 0.8010, 0.9642])


KeyboardInterrupt: 

In [None]:
items = neg[neg.verb.isin(['think', 'know'])][['verb', 'frame', 'sentence', 'hypothesis']].drop_duplicates()
embedding = neg_model.embed(items)

list(zip(items.sentence.values, items.hypothesis.values, torch.sigmoid(neg_model(embedding)[0]).data.cpu().numpy()))

In [None]:
neg_model.random_effects[:,0].square().mean()