In [1]:
from __future__ import absolute_import, division, print_function

import argparse
import collections
import logging
import os
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, SequentialSampler
from tqdm import tqdm, trange

import tokenization
from modeling import BertConfig, BertForSequenceClassification
from optimization import BERTAdam
from processor import (Semeval_NLI_B_Processor, Semeval_NLI_M_Processor,
                       Semeval_QA_B_Processor, Semeval_QA_M_Processor,
                       Semeval_single_Processor, Sentihood_NLI_B_Processor,
                       Sentihood_NLI_M_Processor, Sentihood_QA_B_Processor,
                       Sentihood_QA_M_Processor, Sentihood_single_Processor,
                       Covid_NLI_B_Processor)

In [2]:

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    features = []
    for (ex_index, example) in enumerate(tqdm(examples)):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append("[CLS]")
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[example.label]

        features.append(
                InputFeatures(
                        input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_id))
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


In [6]:
class Arg:
    def __init__(self):
        self.task_name = 'Covid_NLI_B'
        self.data_dir = 'data/covid/bert-pair'
        self.vocab_file = 'uncased_L-12_H-768_A-12/vocab.txt '
        self.bert_config_file = 'uncased_L-12_H-768_A-12/bert_config.json '
        self.init_checkpoint = 'uncased_L-12_H-768_A-12/pytorch_model.bin '
        self.eval_test = True
        self.do_lower_case = True
        self.max_seq_length = 512 
        self.no_cuda = False
        self.train_batch_size = 24 
        self.learning_rate = 2e-5 
        self.warmup_proportion = 0.1
        self.num_train_epochs = 0 
        self.eval_batch_size = 8
        self.accumulate_gradients = 1
        self.output_dir = 'results/covid/NLI_B_EVAL_NB'
        self.seed = 42
        self.local_rank = -1
        self.init_eval_checkpoint = 'results/covid/NLI_B_savemodel/model_ep_1.bin'
args = Arg()

In [10]:
if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

if args.accumulate_gradients < 1:
    raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                        args.accumulate_gradients))

args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients)

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

bert_config = BertConfig.from_json_file(args.bert_config_file)

if args.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
        args.max_seq_length, bert_config.max_position_embeddings))

if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(args.output_dir, exist_ok=True)


# prepare dataloaders
processors = {
    "Covid_NLI_B":Covid_NLI_B_Processor,
    "sentihood_single":Sentihood_single_Processor,
    "sentihood_NLI_M":Sentihood_NLI_M_Processor,
    "sentihood_QA_M":Sentihood_QA_M_Processor,
    "sentihood_NLI_B":Sentihood_NLI_B_Processor,
    "sentihood_QA_B":Sentihood_QA_B_Processor,
    "semeval_single":Semeval_single_Processor,
    "semeval_NLI_M":Semeval_NLI_M_Processor,
    "semeval_QA_M":Semeval_QA_M_Processor,
    "semeval_NLI_B":Semeval_NLI_B_Processor,
    "semeval_QA_B":Semeval_QA_B_Processor,
}

processor = processors[args.task_name]()
label_list = processor.get_labels()
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

12/11/2021 13:57:17 - INFO - __main__ -   device cuda n_gpu 1 distributed training False


In [11]:
# training set
train_examples = None
num_train_steps = None
train_examples = processor.get_train_examples(args.data_dir)
num_train_steps = int(
    len(train_examples) / args.train_batch_size * args.num_train_epochs)

train_features = convert_examples_to_features(
    train_examples, label_list, args.max_seq_length, tokenizer)
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", args.train_batch_size)
logger.info("  Num steps = %d", num_train_steps)

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
if args.local_rank == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

# test set
test_examples = processor.get_test_examples(args.data_dir)
test_features = convert_examples_to_features(
test_examples, label_list, args.max_seq_length, tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
test_dataloader = DataLoader(test_data, batch_size=args.eval_batch_size, shuffle=False)



0
guid= train-0
text_a= unrelated - politics
label= 1
1000
guid= train-1000
text_a= unrelated - foreign
label= 1
2000
guid= train-2000
text_a= unrelated - situation
label= 1
3000
guid= train-3000
text_a= unrelated - racism
label= 1
4000
guid= train-4000
text_a= unrelated - politics
label= 1
5000
guid= train-5000
text_a= unrelated - foreign
label= 1
6000
guid= train-6000
text_a= unrelated - situation
label= 1
7000
guid= train-7000
text_a= unrelated - racism
label= 1
8000
guid= train-8000
text_a= unrelated - politics
label= 1
9000
guid= train-9000
text_a= unrelated - foreign
label= 1
10000
guid= train-10000
text_a= unrelated - situation
label= 1
11000
guid= train-11000
text_a= unrelated - racism
label= 1
12000
guid= train-12000
text_a= unrelated - politics
label= 1
13000
guid= train-13000
text_a= unrelated - foreign
label= 1
14000
guid= train-14000
text_a= unrelated - situation
label= 1
15000
guid= train-15000
text_a= unrelated - racism
label= 1
16000
guid= train-16000
text_a= unrelated 

100%|████████████████████████████████████████████████████████████████████████| 161120/161120 [01:47<00:00, 1497.75it/s]
12/11/2021 13:59:17 - INFO - __main__ -   ***** Running training *****
12/11/2021 13:59:17 - INFO - __main__ -     Num examples = 161120
12/11/2021 13:59:17 - INFO - __main__ -     Batch size = 24
12/11/2021 13:59:17 - INFO - __main__ -     Num steps = 0


0
guid= test-0
text_a= unrelated - politics
label= 1
1000
guid= test-1000
text_a= unrelated - foreign
label= 1
2000
guid= test-2000
text_a= unrelated - situation
label= 1
3000
guid= test-3000
text_a= unrelated - racism
label= 1
4000
guid= test-4000
text_a= unrelated - politics
label= 1
5000
guid= test-5000
text_a= unrelated - foreign
label= 1
6000
guid= test-6000
text_a= unrelated - situation
label= 0
7000
guid= test-7000
text_a= unrelated - racism
label= 1
8000
guid= test-8000
text_a= unrelated - politics
label= 1


100%|████████████████████████████████████████████████████████████████████████████| 8512/8512 [00:05<00:00, 1423.80it/s]


In [12]:
# model and optimizer
model = BertForSequenceClassification(bert_config, len(label_list))
if args.init_eval_checkpoint is not None:
    model.load_state_dict(torch.load(args.init_eval_checkpoint, map_location='cpu'))
elif args.init_checkpoint is not None:
    model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))
model.to(device)

if args.local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                      output_device=args.local_rank)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

no_decay = ['bias', 'gamma', 'beta']
optimizer_parameters = [
     {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
     {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
     ]

optimizer = BERTAdam(optimizer_parameters,
                     lr=args.learning_rate,
                     warmup=args.warmup_proportion,
                     t_total=num_train_steps)


In [39]:
model.eval()
test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0
for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():
        tmp_test_loss, logits = model(input_ids, segment_ids, input_mask, label_ids)
    logits = F.softmax(logits, dim=-1)
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
#     print(label_ids)
    print(logits)
    outputs = np.argmax(logits, axis=1)
    
    for output_i in range(len(outputs)):
#         print(logits)
        print(f"logits: ({str(logits[output_i][0])[:5]},{str(logits[output_i][1])[:5]})  Argmaxed:{outputs[output_i]}")


           
    print('prediction:',outputs)
    print('labels:    ',label_ids)
    break
    tmp_test_accuracy=np.sum(outputs == label_ids)

    test_loss += tmp_test_loss.mean().item()
    test_accuracy += tmp_test_accuracy

    nb_test_examples += input_ids.size(0)
    nb_test_steps += 1

test_loss = test_loss / nb_test_steps
test_accuracy = test_accuracy / nb_test_examples


[[8.85171667e-02 9.11482811e-01]
 [9.92075920e-01 7.92405941e-03]
 [8.90031457e-01 1.09968565e-01]
 [9.98793125e-01 1.20683329e-03]
 [9.92757548e-03 9.90072370e-01]
 [9.94671762e-01 5.32828923e-03]
 [9.98390675e-01 1.60935021e-03]
 [9.99854088e-01 1.45898186e-04]]
logits: (0.088,0.911)  Argmaxed:1
logits: (0.992,0.007)  Argmaxed:0
logits: (0.890,0.109)  Argmaxed:0
logits: (0.998,0.001)  Argmaxed:0
logits: (0.009,0.990)  Argmaxed:1
logits: (0.994,0.005)  Argmaxed:0
logits: (0.998,0.001)  Argmaxed:0
logits: (0.999,0.000)  Argmaxed:0
prediction: [1 0 0 0 1 0 0 0]
labels:     [1 0 0 0 1 0 0 0]


ZeroDivisionError: division by zero

In [None]:
[[8.85171667e-02 9.11482811e-01]
 [9.92075920e-01 7.92405941e-03]
 [8.90031457e-01 1.09968565e-01]
 [9.98793125e-01 1.20683329e-03]
 [9.92757548e-03 9.90072370e-01]
 [9.94671762e-01 5.32828923e-03]
 [9.98390675e-01 1.60935021e-03]
 [9.99854088e-01 1.45898186e-04]]

In [128]:
import pandas as pd
df = pd.read_csv('.\\data\\covid\\bert-pair\\test_NLI_B.csv',sep='\t',header=None)
df[1]

0       1
1       0
2       0
3       0
4       1
       ..
8507    0
8508    0
8509    1
8510    0
8511    0
Name: 1, Length: 8512, dtype: int64

In [133]:
df_pred = pd.read_csv('.\\results\\covid\\NLI_B_EVAL\\test_ep_0.txt',sep=' ',header=None)
df_pred[0]

0       1
1       0
2       0
3       0
4       1
       ..
8507    0
8508    0
8509    0
8510    0
8511    0
Name: 0, Length: 8512, dtype: int64

In [134]:
(df[1]==df_pred[0]).sum()/len(df)

0.9339755639097744

In [141]:
df['pred'] = df_pred[0]
df.head()

Unnamed: 0,0,1,2,3,pred
0,1229853968957833216,1,unrelated - politics,Self-driving technology + road roller! #CSCEC ...,1
1,1229853968957833216,0,neutral - politics,Self-driving technology + road roller! #CSCEC ...,0
2,1229853968957833216,0,negative - politics,Self-driving technology + road roller! #CSCEC ...,0
3,1229853968957833216,0,positive - politics,Self-driving technology + road roller! #CSCEC ...,0
4,1229853968957833216,1,unrelated - economy,Self-driving technology + road roller! #CSCEC ...,1


In [145]:
df[3]

0       Self-driving technology + road roller! #CSCEC ...
1       Self-driving technology + road roller! #CSCEC ...
2       Self-driving technology + road roller! #CSCEC ...
3       Self-driving technology + road roller! #CSCEC ...
4       Self-driving technology + road roller! #CSCEC ...
                              ...                        
8507    This very crafty Beijing taxi driver construct...
8508    This very crafty Beijing taxi driver construct...
8509    This very crafty Beijing taxi driver construct...
8510    This very crafty Beijing taxi driver construct...
8511    This very crafty Beijing taxi driver construct...
Name: 3, Length: 8512, dtype: object

In [146]:
df=df[[0,1,'pred',2,3]]

In [147]:
df

Unnamed: 0,0,1,pred,2,3
0,1229853968957833216,1,1,unrelated - politics,Self-driving technology + road roller! #CSCEC ...
1,1229853968957833216,0,0,neutral - politics,Self-driving technology + road roller! #CSCEC ...
2,1229853968957833216,0,0,negative - politics,Self-driving technology + road roller! #CSCEC ...
3,1229853968957833216,0,0,positive - politics,Self-driving technology + road roller! #CSCEC ...
4,1229853968957833216,1,1,unrelated - economy,Self-driving technology + road roller! #CSCEC ...
...,...,...,...,...,...
8507,1233983941692076032,0,0,positive - racism,This very crafty Beijing taxi driver construct...
8508,1233983941692076032,0,0,unrelated - overall,This very crafty Beijing taxi driver construct...
8509,1233983941692076032,1,0,neutral - overall,This very crafty Beijing taxi driver construct...
8510,1233983941692076032,0,0,negative - overall,This very crafty Beijing taxi driver construct...


In [148]:
df.to_csv('pred_target.csv',index=False)

In [None]:
df_pred = pd.read_csv('.\\results\\covid\\NLI_B_EVAL\\test_ep_0.txt',sep=' ',header=None)

In [1]:
df

NameError: name 'df' is not defined