In [1]:
import pandas as pd
import ipdb
from tqdm.notebook import trange, tqdm
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score as acc, precision_score as prec, recall_score as rec, matthews_corrcoef as mattcorr
import torch
from transformers import *
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
import random

In [2]:
# Reproducibility on a given machine: give every random number generator used in
# this notebook a fixed seed ...
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed(1)

# ... and make cuDNN use deterministic kernels with autotuning switched off.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# NOTE(review): this builds a separate generator object that is never stored or used;
# the global NumPy seed is the np.random.seed(1) call above. It only produces the cell's
# displayed value.
np.random.RandomState(0)

RandomState(MT19937) at 0x1A4ABC1678

In [3]:
def _read_split(path):
    '''Read one tab-separated annotation file (header on the first row) and lowercase its 'Sentence' column.'''
    frame = pd.read_csv(path, sep='\t', header=0)
    frame['Sentence'] = frame['Sentence'].map(lambda text: text.lower())
    return frame


# Sentence-level frames, one per split
train = _read_split('../annotated_data/train.tsv')
dev = _read_split('../annotated_data/dev.tsv')
test = _read_split('../annotated_data/test.tsv')

# Plain-Python sentence lists
train_sentences = train['Sentence'].tolist()
dev_sentences = dev['Sentence'].tolist()
test_sentences = test['Sentence'].tolist()

# Values of the 'DS_Label' column as NumPy arrays
train_labels_DS = train['DS_Label'].values
dev_labels_DS = dev['DS_Label'].values
test_labels_DS = test['DS_Label'].values

# Values of the 'Majority_label' column as NumPy arrays
train_labels_Maj = train['Majority_label'].values
dev_labels_Maj = dev['Majority_label'].values
test_labels_Maj = test['Majority_label'].values

The next cell creates new columns that might help later on with the sequence labelling task:
1. *Post.ID*: the Reddit post from which the replies were scraped.
2. *Reply.ID*: the ID number of the reply, as ordered by Reddit's "best" sorting algorithm (TODO: confirm this with Ben).
3. *Sent.Num*: the sentence number, in order, within the reply.

In [4]:
# IDs have the form '<post>-<reply>-<sentence>' (e.g. 'dbk05v-8-2'); expose each part as its
# own column on every split. The parts are kept as strings.
for split_frame in (train, dev, test):
    # After a first run of this cell `train` carries 'ID' as its index instead of a column.
    # Read the IDs from wherever they currently live so the cell can be re-run safely.
    if 'ID' in split_frame.columns:
        id_values = split_frame['ID']
    else:
        id_values = split_frame.index.to_series()

    # Split every ID once and assign positionally (.values) to avoid index alignment
    id_parts = id_values.str.split('-')
    split_frame['Post.ID'] = id_parts.str[0].values
    split_frame['Reply.ID'] = id_parts.str[1].values
    split_frame['Sent.Num'] = id_parts.str[2].values

# Only the training frame is indexed by ID; skip when that has already happened
if 'ID' in train.columns:
    train.set_index('ID', inplace=True)
train.head()

Unnamed: 0_level_0,Majority_label,DS_Label,Sentence,Post.ID,Reply.ID,Sent.Num
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dbk05v-8-2,1,1,if you had an office ( is studio the right wor...,dbk05v,8,2
8pf0i0-2-4,0,0,but i would not go to a strip club with my mom .,8pf0i0,2,4
cqeljh-4-0,0,0,i 'm not in your situation .,cqeljh,4,0
cb4rzb-1-5,1,1,"if your home is half way clean , oh yeah , the...",cb4rzb,1,5
7gvjhg-4-0,0,0,same as everyone else .,7gvjhg,4,0


In [5]:
# Sentence lengths in whitespace-separated tokens -- useful for setting MAX_LEN later.
# For train show the ten largest values, for dev and test only the maximum.
train_token_counts = sorted(len(sentence.split()) for sentence in train_sentences)
print(train_token_counts[-10:])
for split_sentences in (dev_sentences, test_sentences):
    print(max(len(sentence.split()) for sentence in split_sentences))

[91, 92, 92, 93, 112, 126, 126, 129, 139, 152]
91
108


The following models are the ones I need to look into for now

In [6]:
# Candidate encoders, each listed as (model class, tokenizer class, pretrained weights name).
# TODO: in addition, look into SentenceBERT.
MODELS = [
    (BertModel, BertTokenizer, 'bert-base-cased'),
    (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
    (RobertaModel, RobertaTokenizer, 'roberta-base'),
]

## Sequence Labelling Task

In [7]:
# Preview the training sentences grouped by post, then reply, then sentence.
# NOTE(review): 'Reply.ID' and 'Sent.Num' are strings produced by splitting the ID, so this
# ordering is lexicographic ('10' sorts before '2'); it is used for display only.
reading_order_columns = ['Post.ID', 'Reply.ID', 'Sent.Num']
train.sort_values(by=reading_order_columns).head()

Unnamed: 0_level_0,Majority_label,DS_Label,Sentence,Post.ID,Reply.ID,Sent.Num
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
30twyy-1-0,1,1,i would ask for a thyroid check .,30twyy,1,0
30twyy-1-1,0,0,no need for a specialist there .,30twyy,1,1
30twyy-1-2,1,1,"as for the teeth , that 's better brought up w...",30twyy,1,2
30twyy-1-3,0,0,i 'm 26 years old and i still have a baby toot...,30twyy,1,3
30twyy-1-4,0,0,does n't necessarily mean anything bad health ...,30twyy,1,4


In [8]:
def BIO_convert(array):
    '''
    Convert a sequence of 1s and 0s to BIO (Beginning-Inside-Outside) format.

    Arguments:
    ---------
    array: iterable of labels (list, tuple or array); 1 marks a tagged item, anything else is
           treated as outside

    Returns:
    -------
    list of 'B' / 'I' / 'O' strings with the same length as the input: the first 1 of every
    run of consecutive 1s becomes 'B', the remaining 1s of that run become 'I', and all
    other positions become 'O'.
    '''
    bio = []
    previous_was_one = False
    for label in array:
        is_one = (label == 1)
        if is_one:
            # A 1 continues the current run when the previous label was also 1,
            # no matter how long the run already is; otherwise it starts a new run.
            bio.append('I' if previous_was_one else 'B')
        else:
            bio.append('O')
        previous_was_one = is_one
    return bio

In [9]:
# Key that identifies a reply: '<post id>-<reply id>'
train['Post.Reply.ID'] = train['Post.ID'] + '-' + train['Reply.ID'].astype(str)

# 'Sent.Num' holds strings (it is cut out of the ID), so sorting it directly is lexicographic
# and would place sentence '10' before sentence '2'. Order the rows by its integer value
# instead (stable sort), then group them by reply once instead of re-filtering per reply.
train_sentence_order = np.argsort(train['Sent.Num'].astype(int).values, kind='mergesort')
train_by_reply = train.iloc[train_sentence_order].groupby('Post.Reply.ID', sort=False)

# One record per reply, in order of first appearance in `train`:
#   Sequence       -- the reply's sentences in order, each prefixed with a [CLS] marker
#   Maj_Seq_Labels -- list of 'Majority_label' values, one per sentence
#   DS_Seq_Labels  -- list of 'DS_Label' values, one per sentence
train_seq_records = []
for reply_id in train['Post.Reply.ID'].unique():
    reply_rows = train_by_reply.get_group(reply_id)
    train_seq_records.append({
        'Post.Reply.ID': reply_id,
        'Sequence': '[CLS] ' + ' [CLS] '.join(reply_rows['Sentence'].tolist()),
        'Maj_Seq_Labels': reply_rows['Majority_label'].tolist(),
        'DS_Seq_Labels': reply_rows['DS_Label'].tolist(),
    })

train_seq = pd.DataFrame(train_seq_records,
                         columns=['Post.Reply.ID', 'Sequence', 'Maj_Seq_Labels', 'DS_Seq_Labels'])

# train[train['Post.Reply.ID']=='30twyy-1'].sort_values(by=['Sent.Num'])['Sentence'].values.tolist()


In [10]:
## Convert Majority and DS Labels to BIO format
# train_seq['Maj_Seq_Labels'] = train_seq['Maj_Seq_Labels'].apply(lambda x: BIO_convert(x))
# train_seq['DS_Seq_Labels'] = train_seq['DS_Seq_Labels'].apply(lambda x: BIO_convert(x))

In [11]:
## Build the reply-level table for the dev set

# Key that identifies a reply: '<post id>-<reply id>'
dev['Post.Reply.ID'] = dev['Post.ID'] + '-' + dev['Reply.ID'].astype(str)

# 'Sent.Num' holds strings (it is cut out of the ID), so sorting it directly is lexicographic
# and would place sentence '10' before sentence '2'. Order the rows by its integer value
# instead (stable sort), then group them by reply once instead of re-filtering per reply.
dev_sentence_order = np.argsort(dev['Sent.Num'].astype(int).values, kind='mergesort')
dev_by_reply = dev.iloc[dev_sentence_order].groupby('Post.Reply.ID', sort=False)

# One record per reply, in order of first appearance in `dev`:
#   Sequence       -- the reply's sentences in order, each prefixed with a [CLS] marker
#   Maj_Seq_Labels -- list of 'Majority_label' values, one per sentence
#   DS_Seq_Labels  -- list of 'DS_Label' values, one per sentence
dev_seq_records = []
for reply_id in dev['Post.Reply.ID'].unique():
    reply_rows = dev_by_reply.get_group(reply_id)
    dev_seq_records.append({
        'Post.Reply.ID': reply_id,
        'Sequence': '[CLS] ' + ' [CLS] '.join(reply_rows['Sentence'].tolist()),
        'Maj_Seq_Labels': reply_rows['Majority_label'].tolist(),
        'DS_Seq_Labels': reply_rows['DS_Label'].tolist(),
    })

dev_seq = pd.DataFrame(dev_seq_records,
                       columns=['Post.Reply.ID', 'Sequence', 'Maj_Seq_Labels', 'DS_Seq_Labels'])

## Convert Majority and DS Labels to BIO format
# dev_seq['Maj_Seq_Labels'] = dev_seq['Maj_Seq_Labels'].apply(lambda x: BIO_convert(x))
# dev_seq['DS_Seq_Labels'] = dev_seq['DS_Seq_Labels'].apply(lambda x: BIO_convert(x))

In [12]:
## Build the reply-level table for the test set

# Key that identifies a reply: '<post id>-<reply id>'
test['Post.Reply.ID'] = test['Post.ID'] + '-' + test['Reply.ID'].astype(str)

# 'Sent.Num' holds strings (it is cut out of the ID), so sorting it directly is lexicographic
# and would place sentence '10' before sentence '2'. Order the rows by its integer value
# instead (stable sort), then group them by reply once instead of re-filtering per reply.
test_sentence_order = np.argsort(test['Sent.Num'].astype(int).values, kind='mergesort')
test_by_reply = test.iloc[test_sentence_order].groupby('Post.Reply.ID', sort=False)

# One record per reply, in order of first appearance in `test`:
#   Sequence       -- the reply's sentences in order, each prefixed with a [CLS] marker
#   Maj_Seq_Labels -- list of 'Majority_label' values, one per sentence
#   DS_Seq_Labels  -- list of 'DS_Label' values, one per sentence
test_seq_records = []
for reply_id in test['Post.Reply.ID'].unique():
    reply_rows = test_by_reply.get_group(reply_id)
    test_seq_records.append({
        'Post.Reply.ID': reply_id,
        'Sequence': '[CLS] ' + ' [CLS] '.join(reply_rows['Sentence'].tolist()),
        'Maj_Seq_Labels': reply_rows['Majority_label'].tolist(),
        'DS_Seq_Labels': reply_rows['DS_Label'].tolist(),
    })

test_seq = pd.DataFrame(test_seq_records,
                        columns=['Post.Reply.ID', 'Sequence', 'Maj_Seq_Labels', 'DS_Seq_Labels'])

## Convert Majority and DS Labels to BIO format
# test_seq['Maj_Seq_Labels'] = test_seq['Maj_Seq_Labels'].apply(lambda x: BIO_convert(x))
# test_seq['DS_Seq_Labels'] = test_seq['DS_Seq_Labels'].apply(lambda x: BIO_convert(x))

In [13]:
# (rows, columns) of each sentence-level frame followed by its reply-level counterpart
print(*(frame.shape for frame in (train, train_seq, dev, dev_seq, test, test_seq)))

(8623, 7) (2288, 4) (1309, 8) (336, 4) (1043, 8) (285, 4)


In [16]:
# Longest reply per split, counted in whitespace-separated tokens of the joined
# sequence (the inserted [CLS] markers are counted too)
for seq_frame in (train_seq, dev_seq, test_seq):
    print(max(len(sequence.split()) for sequence in seq_frame['Sequence']))

# Largest number of sentence labels carried by a single reply, per split
for seq_frame in (train_seq, dev_seq, test_seq):
    print(max(len(labels) for labels in seq_frame['DS_Seq_Labels']))

212
163
161
16
15
11


Add a [CLS] token to the beginning of each sentence in a sequence (reply). Then extract the hidden state at each [CLS] position and do one of the following:

Put a **CRF** on top of a list of representations for each sentence - the representation could be CLS Token

Or put a simple **LSTM/GRU** on top

In [None]:
# Length, in tokenizer ids, to which every reply is padded or cut
MAX_LEN = 256
# Upper bound on sentences per reply (the training split printed a maximum of 16 above)
MAX_SEQ_LEN = 16

# NOTE(review): the sentences were lowercased when loaded, while this is the cased
# vocabulary -- confirm that this combination is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
def find_len(a):
    '''
    Number of leading entries of a zero-padded index vector that carry real data.

    a: 1-D tensor of [CLS] positions for one reply, padded with 0 on the right

    returns an integer: the position of the last non-zero entry plus one, or 1 when the
    vector has no non-zero entry (a reply whose only [CLS] sits at position 0). This is
    equivalent to the number of sentences in the reply.
    '''
    nonzero_positions = (a != 0).nonzero()

    # Nothing but zeros: fall back to a length of one
    if nonzero_positions.shape[0] == 0:
        return 1

    return nonzero_positions[-1].item() + 1

def _encode_replies(seq_frame):
    '''
    Turn one split's reply-level table into the padded tensors used by the model.

    seq_frame: DataFrame with a 'Sequence' column ([CLS]-prefixed sentences joined into one
               string per reply) and a 'DS_Seq_Labels' column (list of labels per reply)

    returns (input_ids, attention_masks, labels, cls_idxs, cls_lens):
        input_ids       -- tokenizer ids, one row per reply, padded/cut to MAX_LEN
        attention_masks -- float tensor, 1.0 for real tokens and 0.0 for padding
        labels          -- per-sentence labels, right-padded with -1
        cls_idxs        -- positions of the [CLS] ids in each row, right-padded with 0
        cls_lens        -- number of real entries per row of cls_idxs (see find_len)
    '''
    # NOTE(review): replies longer than MAX_LEN ids are cut by the tokenizer, which can drop
    # trailing [CLS] markers and leave fewer sentence positions than labels -- confirm that
    # MAX_LEN is large enough for the data.
    input_ids = torch.tensor([
        tokenizer.encode(seq, max_length=MAX_LEN, pad_to_max_length=True, add_special_tokens=False)
        for seq in seq_frame['Sequence'].tolist()
    ])

    # Mask of 1 for all input tokens and 0 for all padding tokens (one vectorised comparison)
    attention_masks = (input_ids != tokenizer.pad_token_id).float()

    # Indices of all [CLS] tokens, used later by the RNN/CRF on top
    cls_idxs = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(np.where(row == tokenizer.cls_token_id)[0]) for row in input_ids.numpy()],
        padding_value=0, batch_first=True)
    cls_lens = torch.tensor([find_len(row) for row in cls_idxs])

    # Pad the labels with -1 so padded positions can be told apart from the real labels
    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(a) for a in seq_frame['DS_Seq_Labels'].values],
        padding_value=-1, batch_first=True)

    return input_ids, attention_masks, labels, cls_idxs, cls_lens


(train_input_ids, train_attention_masks, train_labels_DS,
 train_cls_idxs, train_cls_lens) = _encode_replies(train_seq)

(dev_input_ids, dev_attention_masks, dev_labels_DS,
 dev_cls_idxs, dev_cls_lens) = _encode_replies(dev_seq)

print(train_input_ids.shape, train_attention_masks.shape, train_labels_DS.shape, train_cls_idxs.shape)
print(dev_input_ids.shape, dev_attention_masks.shape, dev_labels_DS.shape, dev_cls_idxs.shape)

In [None]:
# Sanity check on the first training reply: its padded [CLS] positions and their count
first_reply_cls_idxs = train_cls_idxs[0]
first_reply_cls_len = train_cls_lens[0]
print(first_reply_cls_idxs, "\n", first_reply_cls_len)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Wrap the tensors in datasets and serve them in mini-batches.
# Each batch is (input ids, attention mask, labels, [CLS] indices, [CLS] counts).
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels_DS,
                           train_cls_idxs, train_cls_lens)
# Training batches are drawn in random order
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

dev_data = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels_DS,
                         dev_cls_idxs, dev_cls_lens)
# Evaluation does not benefit from shuffling; a fixed order keeps the dev batches
# identical from one epoch (and one run) to the next.
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)


In [None]:
## Pick the device that the model and the batches are moved to

# torch.cuda.device(0) is a context manager for selecting the current GPU, not a device
# object, so it cannot be passed to .to(); torch.device is what .to() expects.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
class SequentialModel(torch.nn.Module):
    '''
    Sentence tagger for a whole reply: a BERT encoder, a bidirectional GRU over the hidden
    states found at the [CLS] positions, and a stack of linear layers that ends in a
    log-softmax over two classes for every sentence position.
    '''

    def __init__(self, transformer_config, device, input_size, hidden_size, num_layers, dropout, layers):
        '''
        Setup the modules in the model - a transformer, followed by a GRU for the CLS hidden states,
        followed by Linear layers that output two numbers per sentence, followed by log-softmax

        Arguments:
        ---------
        transformer_config: name or path handed to BertModel.from_pretrained
        device: kept for interface compatibility; not used inside this class
        input_size: size of the transformer hidden states fed to the GRU (768 for bert-base-cased)
        hidden_size: GRU hidden size per direction
        num_layers: number of stacked GRU layers
        dropout: dropout probability used between GRU layers and before the later linear layers
        layers: iterable with the output sizes of the intermediate linear layers
        '''
        super(SequentialModel, self).__init__()

        # Setup the transformer and the GRU layer on top of the CLS tokens
        self._transformer = BertModel.from_pretrained(transformer_config)
        # Honour the requested depth (it used to be fixed at 2 regardless of the argument)
        self._rnn = torch.nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                                 batch_first=True, bidirectional=True, dropout=dropout)

        # Setup the linear layers on top of the GRU outputs
        output_size = 2
        self._linmaps = torch.nn.ModuleList([])
        last_size = hidden_size * 2  # forward and backward GRU states are concatenated
        for j in layers:
            self._linmaps.append(torch.nn.Linear(last_size, j))
            last_size = j
        self._linmaps.append(torch.nn.Linear(last_size, output_size))

        self._activation = 'relu'
        self._dropout = torch.nn.Dropout(p=dropout)
        self._logsoftmax = torch.nn.LogSoftmax(dim=2)

    def nonlinearity(self, x):
        '''
        Applies relu or tanh activation on tensor, depending on self._activation.

        Raises ValueError for any other setting instead of silently returning None.
        '''
        if self._activation == 'relu':
            return torch.nn.functional.relu(x)
        elif self._activation == 'tanh':
            return torch.tanh(x)
        raise ValueError('Unsupported activation: {!r}'.format(self._activation))

    def forward(self, input_ids, input_mask, input_cls_idxs, input_cls_lens, max_seq_len=None):
        '''
        Runs forward pass on neural network

        Arguments:
        ---------
        input_ids: the tokenized, bert wordpiece IDs. (batch_size, MAX_LEN)
        input_mask: the masking to be done on input_ids due to padding. (batch_size, MAX_LEN)
        input_cls_idxs: the indices of the CLS tokens for each sequence. (batch_size, max_num_seq)
        input_cls_lens: the number of real CLS indices per sequence (for packing before the GRU)
        max_seq_len: length the output is padded to along the sentence axis; None pads to the
                     longest sequence in the batch

        Returns:
        -------
        log-probabilities of shape (batch_size, max_seq_len, 2)
        '''
        # Forward pass through transformer. The first element of its output is the last
        # hidden state; indexing (rather than unpacking a pair) does not depend on how many
        # further values the transformer returns.
        last_hidden_state = self._transformer(input_ids, token_type_ids=None,
                                              attention_mask=input_mask)[0]

        # last_hidden_state is of shape batch_size x MAX_LEN x hidden
        # Pick the hidden state at every CLS index of every sequence:
        # cls_hidden_states is of shape batch_size x max_num_of_sequences x hidden
        batch_rows = torch.arange(last_hidden_state.size(0),
                                  device=input_cls_idxs.device).unsqueeze(1)
        cls_hidden_states = last_hidden_state[batch_rows, input_cls_idxs]

        # Else put in something here to take the mean of the whole sequence instead

        # Pack the padded sequences; the lengths are moved to the CPU for packing
        lengths = torch.as_tensor(input_cls_lens).cpu()
        x = torch.nn.utils.rnn.pack_padded_sequence(cls_hidden_states, lengths, enforce_sorted=False,
                                                    batch_first=True)

        # Run it through the GRU
        x, _ = self._rnn(x)

        # Unpack packed sequence
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True, total_length=max_seq_len)

        # Run through linear layers, with activation and dropout before every layer but the first
        for i, linmap in enumerate(self._linmaps):
            if i:
                x = self.nonlinearity(x)
                x = self._dropout(x)
            x = linmap(x)

        x = self._logsoftmax(x)

        return x

In [None]:
# Optimisation hyper-parameters
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Linear learning-rate schedule without warm-up, spanning every optimisation step
num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs

# BERT encoder + 2-layer bidirectional GRU + linear layers of sizes 64 and 16
model = SequentialModel(
    transformer_config='bert-base-cased',
    device=device,
    input_size=768,
    hidden_size=256,
    num_layers=2,
    dropout=0.2,
    layers=(64,16),
)
model = model.to(device)

### In Transformers, the optimizer and the schedule are separate objects
# To reproduce BertAdam specific behavior set correct_bias=False
optimizer = AdamW(model.parameters(), lr=lr, eps=adam_epsilon, correct_bias=False)
# PyTorch scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
def loss_fn(y, y_pred):
    '''
    Calculates the mean negative log-likelihood over the real (non-padded) labels

    Arguments:
    =========
    y: actual labels, padded with -1. (batch_size x max_seq_len)
    y_pred: model's predicted log-probabilities. (batch_size x max_seq_len x num_classes)

    Returns:
    =======
    loss: scalar tensor; the negative log-probability assigned to the true class, averaged
          over every position whose label is not -1. Zero (still attached to the graph) when
          the batch contains padding only.
    '''
    # Flatten the true and predicted labels
    targets = y.reshape(-1)
    log_probs = y_pred.reshape(-1, y_pred.shape[-1])

    # A batch with nothing but padding would average over zero items and give NaN;
    # return a zero that keeps the autograd graph intact instead.
    if not (targets > -1).any():
        return log_probs.sum() * 0.0

    # nll_loss picks the log-probability of the true class at every position, skips the
    # positions labelled -1 and divides by the number of positions it kept.
    return F.nll_loss(log_probs, targets, ignore_index=-1)

# Store the per-epoch training loss and learning rate for plotting
train_loss_set = []
learning_rate = []

# Gradients gets accumulated by default
model.zero_grad()

for epoch in range(1, epochs + 1):
    print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")

    # Running sum of the batch losses for this epoch
    batch_loss = 0

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    for step, batch in enumerate(tqdm(train_dataloader)):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_cls_idxs, b_cls_lens = batch

        # Forward pass; pad the output to the width of the label tensor so that the
        # predictions line up with the labels position by position
        b_outputs = model(input_ids=b_input_ids, input_mask=b_input_mask, input_cls_idxs=b_cls_idxs,
                          input_cls_lens=b_cls_lens, max_seq_len=b_labels.shape[1])

        loss = loss_fn(b_labels, b_outputs)

        # Backward pass
        loss.backward()

        # Clip the norm of the gradients to 1.0
        # Gradient clipping is not in AdamW anymore
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Update learning rate schedule
        scheduler.step()

        # Clear the previous accumulated gradients
        optimizer.zero_grad()

        # Update tracking variables
        batch_loss += loss.item()

    # Calculate the average loss over the training data.
    avg_train_loss = batch_loss / len(train_dataloader)

    # Store the current learning rate
    for param_group in optimizer.param_groups:
        print("\n\tCurrent Learning rate: ",param_group['lr'])
        learning_rate.append(param_group['lr'])

    train_loss_set.append(avg_train_loss)
    print(F'\n\tAverage Training loss: {avg_train_loss}')

    # Put model in evaluation mode to evaluate on the validation set
    model.eval()

    # Predictions and labels of every real (non-padded) sentence in the dev set
    dev_pred_chunks, dev_label_chunks = [], []

    for batch in dev_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_cls_idxs, b_cls_lens = batch

        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate log-probabilities (batch x sentences x classes)
            b_outputs = model(input_ids=b_input_ids, input_mask=b_input_mask, input_cls_idxs=b_cls_idxs,
                              input_cls_lens=b_cls_lens, max_seq_len=b_labels.shape[1])

        b_outputs = b_outputs.to('cpu').numpy()
        label_ids = b_labels.to('cpu').numpy()

        # The predicted class is the argmax over the last (class) axis, not over the sentences
        pred_flat = np.argmax(b_outputs, axis=2).flatten()
        labels_flat = label_ids.flatten()

        # Drop padded positions (label -1) outright: left in place they would count as a
        # third class for the metrics below
        is_real = labels_flat > -1
        dev_pred_chunks.append(pred_flat[is_real])
        dev_label_chunks.append(labels_flat[is_real])

    dev_preds = np.concatenate(dev_pred_chunks)
    dev_true = np.concatenate(dev_label_chunks)

    # Compute each metric once over the whole dev set; F1 and MCC averaged over batches
    # are not the same as their value on the full set
    eval_accuracy = acc(dev_true, dev_preds)
    eval_mcc_accuracy = mattcorr(dev_true, dev_preds)
    eval_f1_score = f1_score(dev_true, dev_preds)

    print(F'\n\tValidation Accuracy: {eval_accuracy}')
    print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy}')
    print(F'\n\tValidation F1 Score: {eval_f1_score}')