In [1]:
import os
import glob
import pandas as pd
import pickle
from collections import defaultdict
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from transformers import BertForSequenceClassification, BertConfig

In [71]:
import pickle
from tqdm import tqdm
import json
import torch
import pandas as pd
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from tensorboardX import SummaryWriter
import numpy as np
import time
import datetime
import random
from collections import defaultdict
import argparse
import os
import scipy
import sklearn
import math

# True when at least one CUDA device is visible to torch.
CUDA = (torch.cuda.device_count() > 0)

SEED = 1234
# Seed every RNG this notebook can draw from (Python, NumPy, torch) so that
# dropout during training and any sampler-driven shuffling are repeatable
# after a kernel restart. SEED is also passed to train_test_split.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if CUDA:
    torch.cuda.manual_seed_all(SEED)

WORKING_DIR = '.'
# TensorBoard event files are written to <WORKING_DIR>/events.
writer = SummaryWriter(WORKING_DIR + '/events')

LEARN_RATE=2e-5
# NOTE(review): `model` and `train_dataloader` are not created in this cell.
# The cells that load the model and build the dataloaders must have been
# executed before this one (the execution counts are out of order).
optimizer = AdamW(model.parameters(), lr=LEARN_RATE, eps=1e-8)

NUM_EPOCHS=3
# The scheduler is stepped once per batch in the training loop, so the total
# number of scheduler steps is (#batches per epoch) * (#epochs).
total_steps = len(train_dataloader) * NUM_EPOCHS

# Linear decay of the learning rate over all steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [7]:
# Class names, positioned so that CLASSES[i] is the name of class index i.
CLASSES = ['for', 'against', 'neutral']
NUM_LABELS = len(CLASSES)


def get_pred_label(res_, to_str=False):
    """Return the highest-scoring class for one list of per-class scores.

    Args:
        res_: list of scores, one per class (must support ``.index``).
        to_str: when true, return the class name from CLASSES instead of
            the integer class index.

    Returns:
        The index of the first maximal score, or the matching CLASSES entry
        when ``to_str`` is set.
    """
    best_ix = res_.index(max(res_))
    return CLASSES[best_ix] if to_str else best_ix

In [30]:
# Root directory of previously trained classifier checkpoints.
PRETRAINED_MODELS_DIR = '../BERT/trained_models'

# Identifiers of the dataset / base model variant used in this run.
DATA_NAME = 'mturk_windowed_1_downsampled'
BASE_MOD = 'uncased_LM'
CASING = 'uncased'

# The train/test .tsv files are read from this directory.
DATA_DIR = os.path.join('../data_creation/scripts/save', DATA_NAME)

# Checkpoint that is loaded below. A checkpoint under
# os.path.join(PRETRAINED_MODELS_DIR, DATA_NAME, BASE_MOD, CASING)
# was used here previously and is currently disabled.
model_path = '../BERT/LM_finetuned/uncased_LM_cc_output'

print(DATA_DIR)
print(model_path)

../data_creation/scripts/save/mturk_windowed_1_downsampled
../BERT/LM_finetuned/uncased_LM_cc_output


In [37]:
# Load model
# The classification head is sized from NUM_LABELS; the weights, tokenizer and
# vocabulary all come from the checkpoint directory `model_path`.
config = BertConfig.from_pretrained(model_path, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                          config=config)

# vocab[i] is the token string for token id i (one token per line in the file).
# The vocabulary contains non-ASCII tokens (e.g. typographic quotes and
# dashes), so read it explicitly as UTF-8 instead of relying on the platform
# default encoding.
with open(os.path.join(model_path,'vocab.txt'),'r', encoding='utf-8') as f:
    vocab = [l.strip() for l in f]

In [32]:
# Load data for prediction/eval
eval_set = 'train' # can also be 'test'

# The split is stored as a header-less, tab-separated file named <split>.tsv
# inside DATA_DIR.
eval_path = os.path.join(DATA_DIR, eval_set + '.tsv')
eval_data = pd.read_csv(eval_path, sep='\t', header=None)
eval_data.columns = ['text', 'label']#,'outlet']

In [42]:
# Display the loaded evaluation frame (columns: text, label).
eval_data

Unnamed: 0,text,label
0,"[SEP] In recent years, some scientists have be...",0
1,[SEP] I’m writing a series of posts building o...,0
2,"[SEP] FILE - In this Jan. 8, 2018 file photo t...",0
3,[SEP] As the effects of global climate change ...,0
4,[SEP] About the record-breaking intensity of H...,0
...,...,...
818,"[SEP] Her name is Naomi Seibt, she’s 19 years ...",2
819,"[SEP] They are simply being used as ""human shi...",2
820,[SEP] “It’s a rare event that we’re still tryi...,2
821,"[SEP] Also, warmer temperatures bring longer g...",2


In [44]:
# Keep only the rows at positions 818, 819 and 820 for a quick spot check.
# NOTE(review): this rebinds `eval_data`, so the cell is not idempotent --
# running it a second time indexes positions 818-820 of the already reduced
# frame and fails. Reload `eval_data` before re-running.
eval_data = eval_data.iloc[[818,819,820]]

In [72]:
def format_time(elapsed):
    """Render a duration given in seconds as a ``h:mm:ss`` string.

    The value is rounded to whole seconds first and then formatted by
    ``datetime.timedelta`` (so durations of a day or more are rendered as
    e.g. ``'1 day, 1:00:00'``).
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

def build_dataloader(*args, sampler='random', batch_size=1):
    """Wrap parallel array-likes in a torch ``DataLoader``.

    Args:
        *args: array-likes with the same first dimension (e.g. input ids,
            labels, attention masks). Each one is converted with
            ``torch.tensor`` and becomes one element of every batch, in the
            order given.
        sampler: ``'random'`` draws the examples in a random order
            (``RandomSampler``); any other value iterates them in their
            original order (``SequentialSampler``).
        batch_size: number of examples per batch (default 1).

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of the converted tensors.
    """
    dataset = TensorDataset(*(torch.tensor(x) for x in args))

    # Honour the `sampler` argument: previously it was accepted but ignored,
    # so every loader iterated sequentially regardless of the request.
    if sampler == 'random':
        data_sampler = RandomSampler(dataset)
    else:
        data_sampler = SequentialSampler(dataset)

    return DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)

def get_out_data(dat_path,max_seq_length=500):
    """Tokenize a labelled text dataset into fixed-length model inputs.

    Args:
        dat_path: either the path of a header-less, tab-separated file whose
            two columns are (text, label), or an already loaded
            ``pandas.DataFrame`` that has ``text`` and ``label`` columns
            (other cells of this notebook pass the frame directly).
        max_seq_length: every id sequence is padded with 0 / truncated at
            the end to exactly this length.

    Returns:
        A ``defaultdict(list)`` with the parallel entries
        ``input_ids`` (2-D integer array, one row per example),
        ``attention_mask`` and ``token_type_ids`` (lists of int lists),
        ``sentences`` (the raw texts) and ``label`` (the raw labels).

    Uses the module-level ``tokenizer``.
    """
    if isinstance(dat_path, pd.DataFrame):
        # Already loaded: use the frame as is (it is only read, not modified).
        data = dat_path
    else:
        data = pd.read_csv(dat_path,
                           sep='\t',header=None)
        data.columns = ['text','label']#,'outlet']

    out = defaultdict(list)

    print('Number of examples to predict:',len(data))
    to_predict = data['text'].values
    true = data['label'].values

    for sent, label in zip(to_predict, true):
        encoded_sent = tokenizer.encode(sent,add_special_tokens=True)
        out['input_ids'].append(encoded_sent)
        out['sentences'].append(sent)
        out['label'].append(label)

    # Pad with id 0 and cut over-long sequences at the end.
    # NOTE(review): truncating at the end presumably also drops the closing
    # special token of over-long inputs -- confirm this is acceptable.
    out['input_ids'] = pad_sequences(
            out['input_ids'],
            maxlen=max_seq_length,
            dtype="long",
            value=0,
            truncating="post",
            padding="post")

    print('Adding attention masks...')
    # Attention mask: 1 for every non-zero (i.e. non-padding) id, 0 otherwise.
    # Token type ids: all zeros, every input is treated as a single segment.
    for sent in out['input_ids']:
        out['attention_mask'].append([int(tok_id > 0) for tok_id in sent])
        out['token_type_ids'].append([0] * len(sent))

    print('Preparing input examples for prediction...')

    return out

In [None]:
# Tokenizing the training file is slow, so the result is cached as a pickle in
# WORKING_DIR and reused on later runs.
# NOTE(review): the cache is keyed by file name only -- delete data.cache.pkl
# after changing DATA_DIR, the tokenizer or max_seq_length, otherwise stale
# inputs are loaded. Only load pickles you created yourself; unpickling an
# untrusted file can execute arbitrary code.
cache_path = WORKING_DIR + "/data.cache.pkl"
if os.path.exists(cache_path):
    # `with` closes the file handle even if unpickling fails.
    with open(cache_path, 'rb') as cache_file:
        data = pickle.load(cache_file)
else:
    data = get_out_data(os.path.join(DATA_DIR, 'train.tsv'))
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(data, cache_file)

In [None]:
# Hold out 10% of the examples for validation. train_test_split returns a
# (train, test) pair for each array it is given, in the order the arrays
# were passed: input ids, labels, attention masks.
(train_inputs, test_inputs,
 train_labels, test_labels,
 train_masks, test_masks) = train_test_split(
    data['input_ids'], data['label'], data['attention_mask'],
    test_size=0.1, random_state=SEED)

# Every batch is an (input_ids, labels, masks) triple, matching the unpacking
# done in the training loop.
train_dataloader = build_dataloader(train_inputs, train_labels, train_masks)
test_dataloader = build_dataloader(
    test_inputs, test_labels, test_masks, sampler='order')

In [None]:
# Fine-tune for NUM_EPOCHS epochs; after each epoch evaluate on the held-out
# split, write a per-example log to <WORKING_DIR>/epoch<N>.log and record the
# metrics in TensorBoard through `writer`.
for epoch_i in range(0, NUM_EPOCHS):

    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, NUM_EPOCHS))
    print('Training...')

    # Feed every batch to whatever device the model currently lives on.
    # (Calling .cuda() on the batches whenever a GPU is visible fails if the
    # model itself was never moved to the GPU.)
    device = next(model.parameters()).device

    losses = []
    t0 = time.time()
    model.train()
    for step, batch in enumerate(train_dataloader):

        # Progress report every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. Loss: {:.2f}'.format(
                step, len(train_dataloader), elapsed, float(np.mean(losses))))

        # Batch layout follows build_dataloader(inputs, labels, masks).
        input_ids, labels, masks = (x.to(device) for x in batch)
        model.zero_grad()

        outputs = model(
            input_ids,
            attention_mask=masks,
            labels=labels)

        # With `labels` supplied, element 0 of the output is the loss.
        # Indexing (instead of tuple unpacking) does not depend on how many
        # extra elements the model returns.
        loss = outputs[0]
        losses.append(loss.item())

        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = float(np.mean(losses))
    writer.add_scalar('train/loss', avg_loss, epoch_i)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    losses = []
    all_preds = []
    all_labels = []
    # `with` guarantees the log file is closed even if evaluation fails.
    with open(WORKING_DIR + '/epoch%d.log' % epoch_i, 'w') as log:
        for step, batch in enumerate(test_dataloader):

            input_ids, labels, masks = (x.to(device) for x in batch)

            with torch.no_grad():
                outputs = model(
                    input_ids,
                    attention_mask=masks,
                    labels=labels)
            # Element 0 is the loss, element 1 the per-class logits.
            loss, logits = outputs[0], outputs[1]

            losses.append(loss.item())

            labels = labels.cpu().numpy()
            input_ids = input_ids.cpu().numpy()
            # Per-class probabilities for every example in the batch.
            preds = scipy.special.softmax(logits.cpu().numpy(), axis=1)
            input_toks = [
                tokenizer.convert_ids_to_tokens(s) for s in input_ids
            ]

            for seq, label, pred in zip(input_toks, labels, preds):
                # A '+' rule marks a correct prediction, '-' a wrong one.
                sep_char = '+' if np.argmax(pred) == label else '-'
                log.write(sep_char * 40 + '\n')
                log.write(' '.join(seq) + '\n')
                log.write('label: ' + str(label) + '\n')
                log.write('pred: ' + str(np.argmax(pred)) + '\n')
                log.write('dist: ' + str(pred) + '\n')
                log.write('\n\n')

                all_preds.append(pred)
                all_labels.append(label)
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    avg_loss = float(np.mean(losses))
    pred_labels = np.argmax(all_preds, axis=1)
    n_classes = all_preds.shape[1]

    acc = sklearn.metrics.accuracy_score(all_labels, pred_labels)
    if n_classes == 2:
        # Binary task: F1 / AUC of the positive class (class 1).
        f1 = sklearn.metrics.f1_score(all_labels, pred_labels)
        auc = sklearn.metrics.roc_auc_score(all_labels, all_preds[:, 1])
    else:
        # Multi-class task: the binary defaults raise here, so use the
        # macro-averaged F1 and the one-vs-rest AUC over all classes.
        f1 = sklearn.metrics.f1_score(all_labels, pred_labels, average='macro')
        try:
            auc = sklearn.metrics.roc_auc_score(
                all_labels, all_preds, multi_class='ovr',
                labels=np.arange(n_classes))
        except ValueError:
            # AUC is undefined when a class has no example in the held-out
            # split; report NaN rather than aborting the run.
            auc = float('nan')

    writer.add_scalar('eval/acc', acc, epoch_i)
    writer.add_scalar('eval/auc', auc, epoch_i)
    writer.add_scalar('eval/f1', f1, epoch_i)
    # Log the validation loss here (the F1 score was logged under this tag
    # before).
    writer.add_scalar('eval/loss', avg_loss, epoch_i)

    print("  Loss: {0:.2f}".format(avg_loss))
    print("  Accuracy: {0:.2f}".format(acc))
    print("  F1: {0:.2f}".format(f1))
    print("  AUC: {0:.2f}".format(auc))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Done!")

In [21]:
# feed processed 'out' to model for prediction

def batch_predict_(out_objs,pred_to_str=False):
    """Predict a class for every prepared example with the module-level model.

    Args:
        out_objs: either
            * a dict of parallel columns as returned by ``get_out_data``
              (``input_ids`` plus, when present, ``attention_mask`` and
              ``token_type_ids``, one row per example), or
            * a sequence of per-example keyword dicts that can be passed to
              the model directly (``model(**example)``), each holding a
              batch of one.
        pred_to_str: passed to ``get_pred_label``; when true, class names
            are returned instead of class indices.

    Returns:
        A list with one predicted label per example.
    """
    print('Doing predictions...')

    if isinstance(out_objs, dict):
        # Columnar format: build one batch-of-one keyword dict per row.
        # (Indexing the dict with 0, 1, ... as done for the sequence format
        # would look up keys, not examples.)
        tensor_keys = [key for key in ('input_ids', 'attention_mask', 'token_type_ids')
                       if key in out_objs]
        examples = [
            {key: torch.as_tensor(out_objs[key][ix], dtype=torch.long).unsqueeze(0)
             for key in tensor_keys}
            for ix in range(len(out_objs['input_ids']))
        ]
    else:
        examples = [out_objs[ix] for ix in range(len(out_objs))]

    # Run on the device the model lives on, in eval mode and without building
    # autograd graphs; the previous train/eval mode is restored afterwards.
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    predicted_labels = []
    try:
        with torch.no_grad():
            for example in examples:
                example = {key: (val.to(device) if torch.is_tensor(val) else val)
                           for key, val in example.items()}
                # Element 0 of the model output holds the logits when no
                # labels are passed.
                logits = model(**example)[0]
                probs = torch.softmax(logits, dim=1).tolist()[0]
                predicted_labels.append(get_pred_label(probs, pred_to_str))
    finally:
        model.train(was_training)
    return predicted_labels

In [22]:
def batch_predict(eval_dat):
    """Prepare `eval_dat` with get_out_data and predict a label per example.

    Returns whatever batch_predict_ returns for the prepared inputs (class
    indices, since pred_to_str is left at its default).
    """
    prepared = get_out_data(eval_dat)
    return batch_predict_(prepared)

In [69]:
# Inspect element 0 of the model output for the first prepared example.
# NOTE(review): `out_data` is assigned in a cell further down the notebook,
# so this cell only works after that cell has been run.
model(**out_data[0])[0]

(tensor([[ 1.0975,  0.2420, -0.1305]], grad_fn=<AddmmBackward>),)

In [46]:
# Predict a label for every row currently held in eval_data.
dev_preds = batch_predict(eval_data)

Number of examples to predict: 3
Adding attention masks...
Preparing input examples for prediction...
Doing predictions...


In [47]:
# Show the predicted labels.
dev_preds

[0, 0, 0]

In [50]:
# Show the raw text of the selected examples.
eval_data.text.values

array(['[SEP] Her name is Naomi Seibt, she’s 19 years old, but unlike some teenage activists we could mention she is most definitely not welcome at the UN’s COP25 climate conference. [SEP] [CLS] The global warming scare is a massive hoax. [SEP] Worse — and being a German, she should know — Naomi believes that her Chancellor’s green policies are steering her country inexorably towards the kind of totalitarianism it last experienced in the 1930s and the 1940s.',
       '[SEP] They are simply being used as "human shields" for adult climate activists who recognize that the climate scare will soon lose credibility as global warming Armageddon fails to materialize as forecast. [SEP] [CLS] The world may have already begun to cool in response to a weakening Sun, a phenomenon far more dangerous than any possible human-induced warming. [SEP] In our August 20 America Out Loud article, The Disgraceful Use of Children to Promote the Climate Change Delusion, we explained: At the center of Baal was t

In [60]:
# Build the model inputs for the selected rows so they can be inspected in
# the cells below (the DataFrame itself is passed, not a file path).
out_data = get_out_data(eval_data)

Number of examples to predict: 3
Adding attention masks...
Preparing input examples for prediction...


In [61]:
# Inspect the first prepared example.
out_data[0]

{'input_ids': tensor([[  101,   102,  2014,  2171,  2003, 12806,  7367, 12322,  2102,  1010,
           2016,  1521,  1055,  2539,  2086,  2214,  1010,  2021,  4406,  2070,
           9454, 10134,  2057,  2071,  5254,  2016,  2003,  2087,  5791,  2025,
           6160,  2012,  1996,  4895,  1521,  1055,  8872, 17788,  4785,  3034,
           1012,   102,   101,  1996,  3795, 12959, 12665,  2003,  1037,  5294,
          28520,  1012,   102,  4788,  1517,  1998,  2108,  1037,  2446,  1010,
           2016,  2323,  2113,  1517, 12806,  7164,  2008,  2014,  7306,  1521,
           1055,  2665,  6043,  2024,  9602,  2014,  2406,  1999, 10288,  6525,
           6321,  2875,  1996,  2785,  1997,  2561, 25691,  2964,  2009,  2197,
           5281,  1999,  1996,  5687,  1998,  1996,  7675,  1012,   102,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [62]:
# Token ids of the first prepared example as plain Python ints.
[int(tok_id) for tok_id in out_data[0]['input_ids'][0]]

[101,
 102,
 2014,
 2171,
 2003,
 12806,
 7367,
 12322,
 2102,
 1010,
 2016,
 1521,
 1055,
 2539,
 2086,
 2214,
 1010,
 2021,
 4406,
 2070,
 9454,
 10134,
 2057,
 2071,
 5254,
 2016,
 2003,
 2087,
 5791,
 2025,
 6160,
 2012,
 1996,
 4895,
 1521,
 1055,
 8872,
 17788,
 4785,
 3034,
 1012,
 102,
 101,
 1996,
 3795,
 12959,
 12665,
 2003,
 1037,
 5294,
 28520,
 1012,
 102,
 4788,
 1517,
 1998,
 2108,
 1037,
 2446,
 1010,
 2016,
 2323,
 2113,
 1517,
 12806,
 7164,
 2008,
 2014,
 7306,
 1521,
 1055,
 2665,
 6043,
 2024,
 9602,
 2014,
 2406,
 1999,
 10288,
 6525,
 6321,
 2875,
 1996,
 2785,
 1997,
 2561,
 25691,
 2964,
 2009,
 2197,
 5281,
 1999,
 1996,
 5687,
 1998,
 1996,
 7675,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,

In [63]:
# Map the token ids of the first prepared example back to vocabulary strings
# (padding shows up as '[PAD]').
[vocab[int(tok_id)] for tok_id in out_data[0]['input_ids'][0]]

['[CLS]',
 '[SEP]',
 'her',
 'name',
 'is',
 'naomi',
 'se',
 '##ib',
 '##t',
 ',',
 'she',
 '’',
 's',
 '19',
 'years',
 'old',
 ',',
 'but',
 'unlike',
 'some',
 'teenage',
 'activists',
 'we',
 'could',
 'mention',
 'she',
 'is',
 'most',
 'definitely',
 'not',
 'welcome',
 'at',
 'the',
 'un',
 '’',
 's',
 'cop',
 '##25',
 'climate',
 'conference',
 '.',
 '[SEP]',
 '[CLS]',
 'the',
 'global',
 'warming',
 'scare',
 'is',
 'a',
 'massive',
 'hoax',
 '.',
 '[SEP]',
 'worse',
 '—',
 'and',
 'being',
 'a',
 'german',
 ',',
 'she',
 'should',
 'know',
 '—',
 'naomi',
 'believes',
 'that',
 'her',
 'chancellor',
 '’',
 's',
 'green',
 'policies',
 'are',
 'steering',
 'her',
 'country',
 'in',
 '##ex',
 '##ora',
 '##bly',
 'towards',
 'the',
 'kind',
 'of',
 'total',
 '##itarian',
 '##ism',
 'it',
 'last',
 'experienced',
 'in',
 'the',
 '1930s',
 'and',
 'the',
 '1940s',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 