In [0]:
!pip install transformers
!pip install jsonlines

In [0]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch
import transformers as ppb
from google.colab import drive
import warnings

warnings.filterwarnings('ignore')
drive.mount('/content/drive')

In [0]:
%cd drive/My \Drive/
!ls

In [0]:
!wc -l 'train_output_accurate.jsonl'
!head 'train_output_accurate.jsonl'

In [0]:
# Load pre-trained model and tokenizer 
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
#define example and feature classes
class InputExample(object):
  def __init__(self, claim, ev, label=None):
    self.claim = claim
    self.ev = ev
    self.label = label

class InputFeatures(object):
    def __init__(self, claim_ids, ev_ids, label_id):
        self.claim_ids = claim_ids
        self.ev_ids = ev_ids
        self.label_id = label_id

In [0]:
#convert training data to InputExamples
import jsonlines
import re
def data_to_examples(data_file):
  examples = []
  data = jsonlines.open(data_file)
  for line in data:
    label = line['label']
#    label = 'null'
    claim = line['claim']
    ev=[]
    if line['evidence'] != 'null':
      for item in line['evidence']:
        #remove nonsense from sents
        clean = re.sub('[a-zA-Z0-9]*\t[a-zA-Z0-9]*','',item)
        ev.append(clean)
    else: #if evidence = null
      ev.append('null')
    examples.append(InputExample(claim,ev,label))
  return examples

In [0]:
train_ex = data_to_examples('train_output_accurate.jsonl')

In [0]:
print(train_ex[11].label)
print(train_ex[11].claim)
print(train_ex[11].ev)

#label, claim, list of evidence text

In [0]:
def example_to_features(example):
  #tokenize and add [CLS]/[SEP] tokens
  claim_tokens = tokenizer.encode(example.claim, add_special_tokens=True)
  ev_tokens = []
  if example.ev ==[]:
    ev_tokens.append([101])
  for item in example.ev:
    ev_tokens.append(tokenizer.encode(item,add_special_tokens=True))
  
  return InputFeatures(claim_tokens,ev_tokens,example.label)

In [0]:
feat_list = []
for example in train_ex[:15000]:
  features = example_to_features(example)
  feat_list.append(features)

In [0]:
print(feat_list[11].claim_ids)
print(feat_list[11].ev_ids)
print(feat_list[11].label_id)
print(len(feat_list))

In [0]:
#make a list of all sents (to find max_len)
claims = [x.claim_ids for x in feat_list]
evs = [item for x in feat_list for item in x.ev_ids]

#padding
max_len = 0
for sent in claims+evs:
    if len(sent) > max_len:
        max_len = len(sent)
print(max_len)

#max_len = 327

In [0]:
#####BOTTLENECK STEP
#create BERT representations for each example
BERT_examples = []
round = 0
for item in feat_list:
  #padding
  padded_claim = np.array(item.claim_ids + [0]*(max_len-len(item.claim_ids)))
  padded_claim = np.reshape(padded_claim,(1,max_len))
  padded_ev = np.array([sent + [0]*(max_len-len(sent)) for sent in item.ev_ids])
  
  #mask padded tokens
  claim_mask = np.where(padded_claim != 0,1,0)
  ev_mask = np.where(padded_ev != 0,1,0)

  #create matrix: each row represents one sentence
  padded_total = np.concatenate((padded_claim,padded_ev),axis=0)
  mask_total = np.concatenate((claim_mask,ev_mask))

  #convert to tensors
  input_ids = torch.tensor(padded_total)
  attention_mask = torch.tensor(mask_total)

  #get BERT embeddings
  with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

  #extract [CLS] embeddings and labels for input to classifier
  features = last_hidden_states[0][:,0,:].numpy()
  concatenated = features.flatten()
  BERT_examples.append(list(concatenated))
  round += 1
  print(round)

8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383


In [0]:
print(padded_claim)
print(padded_ev)

In [0]:
print(len(BERT_examples))

In [0]:
#pad each example for uniform input size
max_len = 0
for example in BERT_examples:
  if len(example) > max_len:
    max_len = len(example)

padded_BERT = []
for example in BERT_examples:
  padded_BERT.append(example + [0]*(max_len-len(example)))

print(max_len)
print(len(padded_BERT[40]))

In [0]:
train_labels = [item.label_id for item in feat_list[:len(BERT_examples)]]
train_inputs = np.array([np.array(ex) for ex in padded_BERT])
print(train_inputs.shape)

#feat_list[:10000] model: max_len = 109824
#feat_list[:15000] model: max_len = 109824

In [0]:
#train classifier on embeddings/labels
log_reg = LogisticRegression()
log_reg.fit(train_inputs, train_labels)

In [0]:
import pickle

# save the model to drive
filename = 'new_trained_model_15k.sav'
pickle.dump(log_reg, open(filename, 'wb'))

In [0]:
# load the trained model from drive
loaded_model = pickle.load(open('new_trained_model_15k.sav', 'rb'))

In [0]:
#test classifier accuracy
loaded_model.score(train_inputs, train_labels)

In [0]:
#10k train accuracy = 0.9306
#15k train accuracy = 0.9412