In [2]:
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score as acc, precision_score as prec, recall_score as rec
import torch
from transformers import BertForSequenceClassification, BertTokenizer, XLNetForSequenceClassification, XLNetTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import random

In [3]:
# Seed every random number generator this notebook relies on so that results
# are repeatable on a given machine.
SEED = 1

# cuDNN settings: turn off benchmark mode and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

torch.manual_seed(SEED)
# Seed all GPUs rather than only the current device; this call is safe when
# CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
random.seed(SEED)
# np.random.seed seeds NumPy's global generator. A RandomState instance that
# is constructed but never stored would not seed anything, so none is created.
np.random.seed(SEED)

RandomState(MT19937) at 0x1A3DF07570

In [4]:
# Directory holding the annotated train/dev/test TSV files.
DATA_DIR = '../annotated_data'


def load_split(name, data_dir=DATA_DIR):
    """Read one annotated split and lower-case its sentences.

    Parameters
    ----------
    name : str
        Split name; the file read is ``<data_dir>/<name>.tsv``.
    data_dir : str
        Directory containing the TSV files.

    Returns
    -------
    tuple
        ``(frame, sentences, ds_labels, majority_labels)`` where ``frame`` is
        the DataFrame with its 'Sentence' column lower-cased, ``sentences`` is
        that column as a list, and the two label items are the 'DS_Label' and
        'Majority_label' columns as NumPy arrays.
    """
    frame = pd.read_csv(f'{data_dir}/{name}.tsv', sep='\t', header=0)
    # Vectorised string method instead of a per-row Python lambda.
    frame['Sentence'] = frame['Sentence'].str.lower()
    return (
        frame,
        frame['Sentence'].tolist(),
        frame['DS_Label'].values,
        frame['Majority_label'].values,
    )


train, train_sentences, train_labels_DS, train_labels_Maj = load_split('train')
dev, dev_sentences, dev_labels_DS, dev_labels_Maj = load_split('dev')
test, test_sentences, test_labels_DS, test_labels_Maj = load_split('test')

Create new columns that may help later with the sequence-labelling task:
1. *Post.ID*: the Reddit post from which the replies were scraped.
2. *Reply.ID*: the ID number of the reply, as ordered by Reddit's "best" algorithm (TODO: confirm this ordering with Ben).
3. *Sent.Num*: the sentence number, in order, within the reply.

In [5]:
# Columns that receive the 1st, 2nd and 3rd '-'-separated field of 'ID'.
ID_PART_COLUMNS = ('Post.ID', 'Reply.ID', 'Sent.Num')


def add_id_parts(frame):
    """Split the hyphen-separated 'ID' column into one column per field.

    The first three '-'-separated fields of every ID are stored, as strings,
    in 'Post.ID', 'Reply.ID' and 'Sent.Num' respectively. IDs with fewer than
    three fields yield missing values in the corresponding columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a string 'ID' column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    id_fields = frame['ID'].str.split('-')
    for position, column in enumerate(ID_PART_COLUMNS):
        frame[column] = id_fields.str[position]
    return frame


for split_frame in (train, dev, test):
    # When this cell is re-run, `train` already has 'ID' as its index (and the
    # part columns filled in), so there is no 'ID' column left to split.
    if 'ID' in split_frame.columns:
        add_id_parts(split_frame)

# Index only the training frame by ID; the guard keeps the cell re-runnable.
if 'ID' in train.columns:
    train = train.set_index('ID')

train.head()

Unnamed: 0_level_0,Majority_label,DS_Label,Sentence,Post.ID,Reply.ID,Sent.Num
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dbk05v-8-2,1,1,if you had an office ( is studio the right wor...,dbk05v,8,2
8pf0i0-2-4,0,0,but i would not go to a strip club with my mom .,8pf0i0,2,4
cqeljh-4-0,0,0,i 'm not in your situation .,cqeljh,4,0
cb4rzb-1-5,1,1,"if your home is half way clean , oh yeah , the...",cb4rzb,1,5
7gvjhg-4-0,0,0,same as everyone else .,7gvjhg,4,0


In [None]:
def BIO_convert(array):
    '''
    Convert a sequence of 1s and 0s to BIO (Beginning-Inside-Outside) tags.

    Each run of consecutive 1s becomes one 'B' followed by 'I' for every
    remaining element of the run; every other element becomes 'O'.

    Parameters
    ----------
    array : iterable
        Sequence of 0/1 labels (list, tuple, NumPy array, ...).

    Returns
    -------
    list of str
        One tag per input element, in order. Empty input gives an empty list.
    '''
    bio = []
    previous = 0
    for value in array:
        if value != 1:
            bio.append('O')
        elif previous == 1:
            # Continuing a run: the previous element was tagged 'B' or 'I'.
            bio.append('I')
        else:
            bio.append('B')
        previous = value
    return bio

In [11]:
# Disabled registry mapping a short model key to a
# (model class, tokenizer class, pretrained checkpoint name) triple.
# NOTE(review): the BERT and XLNet entries name cased checkpoints, while the
# data-loading cell lower-cases every sentence — confirm that is intended.
# MODELS = {'bert': (BertForSequenceClassification, BertTokenizer,
#                  'bert-base-cased'),
#           'xlnet': (XLNetForSequenceClassification, XLNetTokenizer,
#                    'xlnet-base-cased'),
#           'roberta': (RobertaForSequenceClassification, RobertaTokenizer,
#                       'roberta-base')}

## Normal Sequence Classification with Fine-tuned BERT (DS labels)

In [17]:
# Disabled setup cell: sequence length, batch size, device, label count,
# tokenizer and model.
# # Set the maximum sequence length to the length of the longest sentence
# # across train, dev and test.
# # NOTE(review): the length is counted in whitespace-separated words; if it
# # is later used as a tokenizer max_length (counted in subword tokens plus
# # special tokens), longer sentences may be truncated — TODO confirm.
# MAX_LEN = max(max([len(a.split()) for a in train_sentences]),
#               max([len(a.split()) for a in dev_sentences]),
#               max([len(a.split()) for a in test_sentences]))

# # Select a batch size for training. For fine-tuning xlnet on a specific task, the authors recommend a batch size of 16 or 32
# batch_size = 32

# # Choose gpu or cpu
# if torch.cuda.is_available():
#     device = torch.device('cuda:0')
# else:
#     device = torch.device('cpu')
    
# # Number of distinct DS labels in the training data
# num_labels = np.unique(train_labels_DS).shape[0]

# # Load the tokenizer and the sequence-classification model for the 'xlnet'
# # entry of MODELS.
# # NOTE(review): MODELS is itself commented out in the cell above, so that
# # cell must be re-enabled before this one can run.
# tokenizer = MODELS['xlnet'][1].from_pretrained(MODELS['xlnet'][2])
# model = MODELS['xlnet'][0].from_pretrained(MODELS['xlnet'][2], num_labels=num_labels).to(device)

In [20]:
# Disabled attempt to load XLNet with a higher dropout rate.
# NOTE(review): this call raised the TypeError recorded in the cell output
# ("unexpected keyword argument 'hidden_dropout_prob'"). XLNet's config
# presumably exposes its dropout rate under a different name (`dropout`) —
# TODO confirm against the installed transformers version.
# model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2, 
#                                                       hidden_dropout_prob=0.5).to(device)

TypeError: __init__() got an unexpected keyword argument 'hidden_dropout_prob'

In [None]:
# Disabled cell: encode each sentence into token ids with `tokenizer`,
# adding special tokens and padding to MAX_LEN.
# train_input_ids = [tokenizer.encode(sent, add_special_tokens=True,max_length=MAX_LEN,pad_to_max_length=True) 
#                    for sent in train_sentences]
# train_input_ids = torch.tensor(train_input_ids)

# # Create a mask of 1 for all input tokens and 0 for all padding tokens
# # NOTE(review): `i>0` treats token id 0 as padding. That presumably holds
# # for BERT's tokenizer but may not for XLNet's — TODO confirm the pad id of
# # whichever tokenizer is in use.
# train_attention_masks = [[float(i>0) for i in seq] for seq in train_input_ids]
# train_attention_masks = torch.tensor(train_attention_masks)
# # NOTE(review): rebinds the label array loaded earlier to a tensor, so the
# # original NumPy array is no longer reachable under this name.
# train_labels_DS = torch.tensor(train_labels_DS)

# # Same for dev
# dev_input_ids = [tokenizer.encode(sent, add_special_tokens=True,max_length=MAX_LEN,pad_to_max_length=True) 
#                  for sent in dev_sentences]
# dev_input_ids = torch.tensor(dev_input_ids)
# dev_attention_masks = [[float(i>0) for i in seq] for seq in dev_input_ids]
# dev_attention_masks = torch.tensor(dev_attention_masks)
# dev_labels_DS = torch.tensor(dev_labels_DS)

# # Sanity check: print the shapes of the id and mask tensors
# print(train_input_ids.shape, dev_input_ids.shape, train_attention_masks.shape, dev_attention_masks.shape)

In [None]:
# Disabled cell: wrap the id, mask and label tensors in a TensorDataset and
# iterate over them in batches of `batch_size` with a DataLoader.
# NOTE(review): the tensors are built in full before this point, so the
# DataLoader provides batching and shuffling rather than lazy loading.
# NOTE(review): the `[:32]` slices keep only the first 32 examples of each
# split — this looks like a debugging leftover; confirm before re-enabling.
# train_data = TensorDataset(train_input_ids[:32],train_attention_masks[:32],train_labels_DS[:32])
# train_sampler = RandomSampler(train_data)
# train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

# # NOTE(review): the dev set is also drawn with RandomSampler; the imported
# # SequentialSampler would give a fixed evaluation order.
# dev_data = TensorDataset(dev_input_ids[:32],dev_attention_masks[:32],dev_labels_DS[:32])
# dev_sampler = RandomSampler(dev_data)
# dev_dataloader = DataLoader(dev_data,sampler=dev_sampler,batch_size=batch_size)

In [None]:
# Disabled cell: load BertForSequenceClassification (pretrained BERT with a
# single linear classification layer on top), then build the optimizer and
# learning-rate schedule.
# model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2, 
#                                                       hidden_dropout_prob=0.5).to(device)

# # Parameters:
# # NOTE(review): 2e-7 is a very small learning rate for fine-tuning —
# # confirm this is not a typo for a larger value such as 2e-5.
# lr = 2e-7
# adam_epsilon = 1e-8

# # Number of training epochs (authors recommend between 2 and 4)
# # NOTE(review): the value below is outside the range quoted above.
# epochs = 6

# # No warm-up; the schedule spans one step per training batch per epoch.
# num_warmup_steps = 0
# num_training_steps = len(train_dataloader)*epochs

# ### In Transformers, the optimizer and the schedule are split and instantiated separately like this:
# optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

### Training/Fine-tuning step

In [None]:
# Disabled cell: fine-tune `model` for `epochs` epochs and, after each epoch,
# report accuracy and F1 on the dev dataloader.
# Per-epoch history: average training loss, learning rate, validation accuracy
# train_loss_set = []
# learning_rate = []
# validation_acc = [0]

# # Gradients accumulate by default, so clear them before training starts
# model.zero_grad()

# # NOTE(review): `_` is used as the epoch counter and is read in the banner
# # below; a descriptive name would be clearer.
# for _ in range(1,epochs+1):
#     print("<" + "="*22 + F" Epoch {_} "+ "="*22 + ">")
    
#     # Running sum of the batch losses for this epoch
#     batch_loss = 0
    
#     for step, batch in enumerate(tqdm(train_dataloader)):
#         # Set our model to training mode (as opposed to evaluation mode)
#         model.train()
        
#          # Add batch to GPU
#         batch = tuple(t.to(device) for t in batch)
        
#         # Unpack the inputs from our dataloader
#         b_input_ids, b_input_mask, b_labels = batch
#         # NOTE(review): leftover debugger breakpoint — it would pause on
#         # every batch; remove before re-enabling this cell.
#         from IPython.core.debugger import set_trace;set_trace()
#         # Forward pass
#         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
#         loss = outputs[0]

#         # Backward pass
#         loss.backward()
        
#         # Clip the norm of the gradients to 1.0
#         # Gradient clipping is not in AdamW anymore
#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
#         # Update parameters and take a step using the computed gradient
#         optimizer.step()
    
#         # Update learning rate schedule
#         scheduler.step()

#         # Clear the previous accumulated gradients
#         optimizer.zero_grad()
        
#         # Update tracking variables
#         batch_loss += loss.item()
        
#     # Calculate the average loss over the training data.
#     avg_train_loss = batch_loss / len(train_dataloader)

#     # Print and record the current learning rate of every parameter group
#     for param_group in optimizer.param_groups:
#         print("\n\tCurrent Learning rate: ",param_group['lr'])
#         learning_rate.append(param_group['lr'])

#     train_loss_set.append(avg_train_loss)
#     print(F'\n\tAverage Training loss: {avg_train_loss}')

#     # Put model in evaluation mode before scoring the validation set
#     model.eval()

#     # Running sums of per-batch accuracy and F1, and the batch count
#     eval_accuracy, eval_f1_score, nb_eval_steps = 0, 0, 0

#     for batch in dev_dataloader:
#         # Add batch to GPU
#         batch = tuple(t.to(device) for t in batch)
#         # Unpack the inputs from our dataloader
#         b_input_ids, b_input_mask, b_labels = batch
#         # Telling the model not to compute or store gradients, saving memory and speeding up validation
#         with torch.no_grad():
#             # Forward pass, calculate logit predictions
#             # NOTE(review): the 1-tuple unpacking assumes the model returns
#             # exactly one output when no labels are passed — TODO confirm
#             # for the installed transformers version.
#             (logits,) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

#         # Move logits and labels to CPU
#         logits = logits.to('cpu').numpy()
#         label_ids = b_labels.to('cpu').numpy()

#         # Predicted class = index of the largest logit in each row
#         pred_flat = np.argmax(logits, axis=1).flatten()
#         labels_flat = label_ids.flatten()
#         # NOTE(review): arguments are passed as (pred, true) here but as
#         # (true, pred) to f1_score below; accuracy is symmetric, so the
#         # value is unaffected.
#         tmp_eval_accuracy = acc(pred_flat, labels_flat)
#         tmp_eval_f1_score = f1_score(labels_flat, pred_flat)

#         # NOTE(review): the reported scores are means of per-batch scores;
#         # a mean of per-batch F1 generally differs from F1 computed over
#         # the whole dev set.
#         eval_accuracy += tmp_eval_accuracy
#         eval_f1_score += tmp_eval_f1_score
#         nb_eval_steps += 1
    
#     validation_acc.append(eval_accuracy/nb_eval_steps)
#     print(F'\n\tValidation Accuracy: {eval_accuracy/nb_eval_steps}')
# #     print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy/nb_eval_steps}')
#     print(F'\n\tValidation F1 Score: {eval_f1_score/nb_eval_steps}')
# #     NOTE(review): `valid_acc` is not defined in this cell; the history list
# #     above is named `validation_acc`.
# #     if valid_acc[-1] < valid_acc[-2]:
# #         break