evaluate.py

"""Evaluate the model"""
import argparse
import random
import logging
import os
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import trange
from sequence_tagger import BertOnlyForSequenceTagging as BertForSequenceTagging
from seqeval.metrics import f1_score, classification_report
from data_loader import DataLoader
import utils
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', default='conll',
                    help="Directory containing the dataset")
parser.add_argument('--seed', type=int, default=23,
                    help="Random seed for initialization")
parser.add_argument('--multi_gpu', default=False, action='store_true',
                    help="Whether to use multiple GPUs if available")
parser.add_argument('--fp16', default=False, action='store_true',
                    help="Whether to use 16-bit float precision instead of 32-bit")


def evaluate(model, data_iterator, params, mark='Eval', verbose=False):
    """Evaluate the model on `params.eval_steps` batches."""
    # set model to evaluation mode
    model.eval()

    idx2tag = params.idx2tag

    true_tags = []
    pred_tags = []

    # a running average object for loss
    loss_avg = utils.RunningAverage()
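    # `utils.RunningAverage` is not shown in this file; a minimal sketch of the
    # interface assumed here (update() accumulates a value, calling the object
    # returns the mean so far) would be:
    #
    #   class RunningAverage:
    #       def __init__(self):
    #           self.steps, self.total = 0, 0.0
    #       def update(self, val):
    #           self.total += val
    #           self.steps += 1
    #       def __call__(self):
    #           return self.total / float(self.steps)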
    one_epoch = trange(params.eval_steps)
    for step, batch in zip(one_epoch, data_iterator):
        # fetch the next evaluation batch
        input_ids, label_ids, attention_mask, sentence_ids, label_mask = batch

        with torch.no_grad():
            loss, logits, labels = model(input_ids, token_type_ids=sentence_ids,
                                         attention_mask=attention_mask,
                                         labels=label_ids, label_masks=label_mask)
        if params.n_gpu > 1 and params.multi_gpu:
            # average the per-GPU losses when running under DataParallel
            loss = loss.mean()
        loss_avg.update(loss.item())

        # greedy decoding: take the most probable tag at each position
        batch_output = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        batch_output = batch_output.detach().cpu().numpy()
        batch_tags = labels.to('cpu').numpy()

        # drop padded positions (label id -1) and map ids back to tag strings
        batch_true_tags = [
            [idx2tag.get(idx) for idx in indices[np.where(indices != -1)]]
            for indices in batch_tags]
        batch_pred_tags = [
            [idx2tag.get(idx) for idx in indices[np.where(batch_tags[i] != -1)]]
            for i, indices in enumerate(batch_output)]

        true_tags.extend(batch_true_tags)
        pred_tags.extend(batch_pred_tags)

        one_epoch.set_postfix(eval_loss='{:05.3f}'.format(loss_avg()))

    assert len(pred_tags) == len(true_tags)

    # logging loss, f1 and report
    metrics = {}
    f1 = f1_score(true_tags, pred_tags)
    metrics['loss'] = loss_avg()
    metrics['f1'] = f1
    metrics_str = "; ".join("{}: {:05.4f}".format(k, v)
                            for k, v in metrics.items())
    logging.info("- {} metrics: ".format(mark) + metrics_str)

    if verbose:
        report = classification_report(true_tags, pred_tags)
        logging.info(report)
    return metrics
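

# A worked example of the -1 masking convention used above: positions whose
# gold label id is -1 (padding or trailing word-piece tokens) are removed from
# both the true and the predicted sequence before scoring, e.g. with the
# hypothetical mapping idx2tag = {0: 'O', 1: 'B-PER', 2: 'I-PER'}:
#
#   gold = np.array([1, 2, -1, 0])   ->  ['B-PER', 'I-PER', 'O']
#   pred = np.array([1, 0,  2, 0])   ->  ['B-PER', 'O', 'O']
#
# Both lists keep only the positions where gold != -1, so seqeval always
# compares sequences of equal length.
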
if __name__ == '__main__':
    args = parser.parse_args()

    tagger_model_dir = 'experiments/' + args.dataset

    # Load the parameters from the json file
    json_path = os.path.join(tagger_model_dir, 'params.json')
    assert os.path.isfile(
        json_path), "No json configuration file found at {}".format(json_path)
    params = utils.Params(json_path)

    # Use GPUs if available
    params.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    params.n_gpu = torch.cuda.device_count()
    params.multi_gpu = args.multi_gpu

    # Set the random seed for reproducible experiments
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if params.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)  # set random seed for all GPUs
    params.seed = args.seed
    # Set the logger
    utils.set_logger(os.path.join(tagger_model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Loading the dataset...")

    # Initialize the DataLoader
    data_dir = 'data/' + args.dataset
    if args.dataset in ["conll"]:
        bert_model_dir = 'pretrained_bert_models/bert-base-cased/'
    elif args.dataset in ["msra"]:
        bert_model_dir = 'pretrained_bert_models/bert-base-chinese/'
    else:
        # fail early instead of hitting a NameError on bert_model_dir below
        raise ValueError("Unknown dataset: {}".format(args.dataset))

    data_loader = DataLoader(data_dir, bert_model_dir, params)

    # Load data
    test_data = data_loader.load_data('test')

    # Specify the test set size
    params.test_size = len(test_data)
    params.eval_steps = params.test_size // params.batch_size
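    # Note that floor division means a final partial batch is never evaluated.
    # For example (hypothetical sizes), with test_size = 3453 and
    # batch_size = 32, eval_steps = 107 and the last 29 examples are skipped.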
    test_data_iterator = data_loader.data_iterator(test_data, shuffle=False)
    logging.info("- done.")

    # Define the model
    # config_path = os.path.join(args.bert_model_dir, 'config.json')
    # config = BertConfig.from_json_file(config_path)
    # model = BertForTokenClassification(config, num_labels=len(params.tag2idx))
    # model = BertForSequenceTagging(config)
    model = BertForSequenceTagging.from_pretrained(tagger_model_dir)
    model.device = params.device
    model.to(params.device)

    if args.fp16:
        model.half()

    if params.n_gpu > 1 and args.multi_gpu:
        model = torch.nn.DataParallel(model)

    logging.info("Starting evaluation...")
    test_metrics = evaluate(model, test_data_iterator,
                            params, mark='Test', verbose=True)
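
# The paths hard-coded above imply a working directory laid out roughly like
# this (reconstructed from the code, not from the repository itself):
#
#   data/conll/                          # dataset consumed by data_loader
#   experiments/conll/params.json        # hyperparameters (batch_size, ...)
#   experiments/conll/                   # fine-tuned tagger checkpoint
#   pretrained_bert_models/bert-base-cased/
#
# evaluate.log is written next to params.json by utils.set_logger.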