In [1]:
# Select the GPU this notebook may use (set before torch is imported below).
import os

os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '6',
})

In [2]:
from collections import defaultdict
import json
import math
import os
import sys

import numpy as np
from sklearn.metrics import confusion_matrix, f1_score

import torch
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertForSequenceClassification, BertConfig, RobertaForSequenceClassification, RobertaConfig,RobertaTokenizer 
from util import parse_tagname, f1_score_multilabel

In [3]:
from dataset import (
    collate_fn,
    #CTA
    CTASingleColumnDataset,
    CTAAllTableDataset,
    #CPA
    CPASingleColumnDataset,
    CPAAllTableDataset,
)
from model import BertForMultiOutputClassification, BertMultiPairPooler
from util import f1_score_multilabel, set_seed

In [4]:
# Prefer CUDA when PyTorch reports it as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [6]:
# Run configuration. All three values are interpolated into the checkpoint
# file name that the evaluation cell loads.
#
# Task setup to evaluate: 'cta', 'cpa' or 'doduo' (multi-task).
# Any value other than 'cta' / 'cpa' makes the next cell evaluate both tasks.
method = 'doduo'
# Table serialization strategy: 'single-column' or 'all-table'
# (extend the lookup tables below when adding new strategies).
serialization = 'all-table'
# Pretrained language model identifier, e.g. 'bert-base-uncased' or
# 'roberta-base'. A name containing 'roberta' selects the RoBERTa classes.
model_name = 'bert-base-uncased'

In [7]:
# Single-task methods evaluate only themselves; every other method value
# (e.g. 'doduo') evaluates both the CTA and the CPA task.
tasks = [method] if method in ('cta', 'cpa') else ['cta', 'cpa']

In [8]:
# Model parameters.
# Both values are part of the checkpoint file name built in the evaluation
# cell; max_length is also passed to the dataset and batch_size to the
# DataLoader.
max_length = 32
batch_size = 16
# Number of classes per task (passed as `num_labels` when building the model).
task_num_class_dict = {
        "cta": 91,
        "cpa": 176
    }
# Per-task path handed to the dataset class as `filepath`.
filepaths_task_dict = {
    "cta": "data/CTA/cta_lm.pkl",
    "cpa": "data/CPA/cpa_lm.pkl",
}
# Dataset class to instantiate, looked up as [task][serialization].
# Register new serialization strategies here.
serialization_method_dict = {
    "cta": {
        "single-column": CTASingleColumnDataset,
        "all-table": CTAAllTableDataset
    },
    "cpa": {
        "single-column": CPASingleColumnDataset,
        "all-table": CPAAllTableDataset
    }
}

In [9]:
# Choose the tokenizer class from the model name, then record which model
# family ('roberta' or 'bert') the dataset classes should assume.
tokenizer = (RobertaTokenizer if 'roberta' in model_name else BertTokenizer).from_pretrained(
    model_name, use_fast=True
)
base_model = 'roberta' if 'roberta' in model_name else 'bert'

In [10]:
def _one_hot_rows(logits):
    """Turn a 2-D score tensor into one-hot prediction rows.

    Args:
        logits: tensor indexed as (row, class).

    Returns:
        A list with one list of ints per row, holding 1 at that row's
        arg-max class and 0 everywhere else.
    """
    num_classes = logits.shape[1]
    rows = []
    for winner in logits.argmax(dim=-1).tolist():
        row = [0] * num_classes
        row[winner] = 1
        rows.append(row)
    return rows


# Create the output directory once instead of re-checking it for every task.
if not os.path.exists('eval/'):
    print("{} not exist. Created.".format('eval/'))
    os.makedirs('eval/', exist_ok=True)

# Evaluate one fine-tuned checkpoint per task and report micro / macro F1.
for task in tasks:
    model_path = "model/{}_{}_{}_{}-bs{}-ml-{}.pt".format(
        method, task, serialization, model_name, batch_size, max_length)
    print(model_path)

    # Dataset class implementing the chosen serialization for this task.
    dataset_serialization = serialization_method_dict[task][serialization]

    if serialization == 'single-column':
        # One sequence per column: a plain sequence classifier built from the
        # config only (its weights come from the checkpoint loaded below).
        if 'roberta' in model_name:
            model_config = RobertaConfig.from_pretrained(model_name, num_labels=task_num_class_dict[task])
            model = RobertaForSequenceClassification(model_config).to(device)
        else:
            model_config = BertConfig.from_pretrained(model_name, num_labels=task_num_class_dict[task])
            model = BertForSequenceClassification(model_config).to(device)

    # Add more conditions when adding new serialization methods
    else:
        if 'roberta' in model_name:
            # NOTE(review): RobertaForMultiOutputClassification is not among
            # the names imported from `model` in the cells shown here; unless
            # it is defined elsewhere this branch raises NameError.
            model = RobertaForMultiOutputClassification.from_pretrained(
                    model_name,
                    num_labels=task_num_class_dict[task],
                    output_attentions=False,
                    output_hidden_states=False,
                ).to(device)
        else:
            model = BertForMultiOutputClassification.from_pretrained(
                    model_name,
                    num_labels=task_num_class_dict[task],
                    output_attentions=False,
                    output_hidden_states=False,
                ).to(device)

        # CPA replaces the usual pooler with the multi-pair pooler.
        if task == "cpa":
            print("Use column-pair pooling")
            # Change for Roberta!!! (BertConfig and `model.bert` are BERT-specific)
            # Use column pair embeddings
            config = BertConfig.from_pretrained(model_name)
            model.bert.pooler = BertMultiPairPooler(config).to(device)

    # Load test dataset and data loader
    test_dataset = dataset_serialization(filepath=filepaths_task_dict[task],
                                         split="test",
                                         tokenizer=tokenizer,
                                         max_length=max_length,
                                         bert=base_model,
                                         device=device)

    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 collate_fn=collate_fn)

    model.load_state_dict(torch.load(model_path, map_location=device))
    # Modules constructed directly (the config-built classifiers and the
    # replacement pooler above) start in training mode; switch the whole model
    # to eval mode so dropout cannot perturb the test predictions.
    model.eval()

    test_predictions = []
    test_labels = []
    eval_dict = {}

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_dataloader):
            if serialization == 'single-column':
                batch_input_ids = batch["data"].T.to(device)
                batch_mask = batch["attention"].T.to(device)
                # For cross-entropy loss labels should not be vectors
                batch_labels = torch.tensor(
                    [label.tolist().index(1) for label in batch["label"]]).to(device)

                loss, logits = model(batch_input_ids,
                                     token_type_ids=None,
                                     attention_mask=batch_mask,
                                     labels=batch_labels,
                                     return_dict=False)

                test_predictions += _one_hot_rows(logits)
                test_labels += batch["label"].cpu().detach().numpy().tolist()
            else:
                logits, = model(input_ids=batch["data"].T)

                # Align the tensor shape when the size is 1
                if len(logits.shape) == 2:
                    logits = logits.unsqueeze(0)

                # Keep only the logits at the positions of the CLS tokens.
                cls_indexes = torch.nonzero(batch["data"].T == tokenizer.cls_token_id)
                filtered_logits = logits[cls_indexes[:, 0], cls_indexes[:, 1], :]

                if task == 'cta':
                    test_predictions += _one_hot_rows(filtered_logits)
                    test_labels += batch["label"].cpu().detach().numpy().tolist()
                else:
                    all_preds = _one_hot_rows(filtered_logits)
                    all_labels = batch["label"].cpu().detach().numpy()
                    # Ignore the very first CLS token: keep only rows that
                    # carry at least one positive label. The same mask filters
                    # predictions and labels so the two lists stay aligned.
                    has_label = (all_labels > 0).any(axis=1)
                    test_predictions += [pred for pred, keep in zip(all_preds, has_label) if keep]
                    test_labels += all_labels[has_label].tolist()

    ts_micro_f1, ts_macro_f1, ts_class_f1, ts_conf_mat = f1_score_multilabel(test_labels, test_predictions)

    # Store the metrics as plain lists so the dict is JSON-serialisable.
    eval_dict["ts_micro_f1"] = ts_micro_f1
    if not isinstance(ts_class_f1, list):
        ts_class_f1 = ts_class_f1.tolist()
    eval_dict["ts_class_f1"] = ts_class_f1
    if not isinstance(ts_conf_mat, list):
        ts_conf_mat = ts_conf_mat.tolist()
    eval_dict["confusion_matrix"] = ts_conf_mat

    print("test_macro_f1={:.4f} test_micro_f1={:.4f} "
        .format(ts_macro_f1, ts_micro_f1))

    # TODO: `output_filepath` is not defined in the cells shown; define it
    # before re-enabling the dump below.
#     with open(output_filepath, "w") as fout:
#         json.dump(eval_dict, fout)
    
    

model/doduo_cta_all-table_bert-base-uncased-bs16-ml-32.pt


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiOutputClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiOutputClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiOutputClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiOutputClassification were not initialized from the model c

Loading already processed test dataset
test_macro_f1=0.8433 test_micro_f1=0.8538 
model/doduo_cpa_all-table_bert-base-uncased-bs16-ml-32.pt


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultiOutputClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForMultiOutputClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultiOutputClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultiOutputClassification were not initialized from the model c

Use column-pair pooling
Loading already processed test dataset
test_macro_f1=0.7758 test_micro_f1=0.8037 
