In [None]:
import csv
import json
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertPreTrainedModel, RobertaConfig
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaModel

# Expand '~' once, here: open() does not perform tilde expansion, so a path
# that still starts with '~' would be looked up literally and not be found.
# The trailing slash is kept because these values are joined by plain string
# concatenation further down.
ROOT_DIR = os.path.expanduser('~/NLI-KB/')
CACHE_DIR = os.path.expanduser('~/.cache/')
# CommonsenseQA JSONL file, relative to ROOT_DIR.
CSQA_PATH = 'datasets/csqa.jsonl'

In [None]:
# helper function: read and dump data
def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON lines file, one object per line.

    Args:
        data: iterable of JSON-serializable objects. Any iterable is accepted
            (list, tuple, generator, ...); it does not need to support len().
        output_path: path of the file to write.
        append: when True, records are added to the end of an existing file;
            otherwise the file is overwritten.
    """
    mode = 'a' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            # Count while writing instead of calling len(data), which fails
            # for generators after the file has already been written.
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines are skipped rather than passed to the JSON parser.

    Args:
        input_path: path of the JSON lines file to read.

    Returns:
        The parsed objects, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Plain strip(): the previous rstrip('\n|\r') also removed a
            # trailing '|' because str.rstrip takes a character set, not a
            # regular expression.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [None]:
from dataclasses import dataclass
      
# eq=False keeps the default identity-based __eq__/__hash__, so instances
# remain hashable and compare exactly as they did before the decorator was
# added; the decorator only contributes a keyword constructor and a repr.
@dataclass(eq=False)
class CommonsenseqaSentence(object):
    """One CommonsenseQA item: a question, five answer choices and the answer.

    Can be built empty and filled attribute by attribute, or constructed
    directly with keyword arguments.
    """
    # Index of the correct choice; the loader in this file stores 0-4 for A-E.
    answerKey: int = 1
    # Text of answer choices A through E.
    choicesA: str = None
    choicesB: str = None
    choicesC: str = None
    choicesD: str = None
    choicesE: str = None
    # Question text (the loader fills this from the record's "stem").
    question: str = None
    # Identifier of the question (the loader fills this from the record's "id").
    qid: str = None

def load_commonsenseqa_from_path(filepath: str):
    """Load a CommonsenseQA JSON lines file into CommonsenseqaSentence objects.

    Every record must provide 'id', 'answerKey' (a letter A-E) and a
    'question' dict holding 'stem' plus a 'choices' list with at least five
    entries, each carrying a 'text' field. Choices are taken by position:
    the first entry becomes choicesA, the second choicesB, and so on.

    Args:
        filepath: path of the JSON lines file.

    Returns:
        A list with one CommonsenseqaSentence per record, in file order;
        answerKey is stored as an index 0-4.
    """
    letters = 'ABCDE'
    letter_to_index = {letter: position for position, letter in enumerate(letters)}

    sentences = []
    for record in load_jsonl(filepath):
        item = CommonsenseqaSentence()
        item.qid = record['id']
        question = record['question']
        item.question = question['stem']
        options = question['choices']
        for position, letter in enumerate(letters):
            # Fills choicesA .. choicesE from the first five options.
            setattr(item, 'choices' + letter, options[position]['text'])
        item.answerKey = letter_to_index[record['answerKey']]
        sentences.append(item)
    return sentences

@dataclass
class ExpConfig(object):
    """Settings for a single experiment run, plus seeding/device helpers."""

    # Path of the JSONL dataset to evaluate on
    dataset_path: str = ""

    # Paths for additional datasets; nothing in the visible code reads them
    winowhy_dataset_path: str = ""

    atomic_dataset_path: str = ""

    conceptnet_dataset_path: str = ""

    # Name of the dataset in use
    dataset: str = "winowhy"
    # Free-text task description (printed in the experiment banner)
    task_name: str = ""
    # Only a single GPU is used. Note: set_gpu_if_possible() takes its own
    # gpu_id argument and does not read this field.
    gpu_id: int = 0
    # Default seed applied by set_seed()
    seed: int = 42
    # 'cpu', 'cuda' (or 'cuda:<id>' after set_gpu_if_possible(<id>))
    device: str = 'cpu'
    # "roberta-base", "roberta-large"
    model_name: str = ""
    # If model_path is not None or not empty, load model from model_path instead of transformers' pretrained ones
    model_path: str = ""
    # Learning rate for training the classifier layer
    learning_rate: float = 1e-3
    # Total number of training epochs
    num_training_epochs: int = 15
    # Maximum sequence length (inputs are truncated/padded to this size)
    max_seq_len: int = 128

    batch_size: int = 1

    def set_seed(self, new_seed = None):
        """Seed the Python, NumPy and PyTorch random number generators.

        Uses new_seed when one is given, otherwise self.seed. The stored
        self.seed is left untouched either way. CUDA generators are seeded
        as well when CUDA is available.
        """
        chosen = new_seed if new_seed is not None else self.seed
        for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
            seed_fn(chosen)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(chosen)

    def set_gpu_if_possible(self, gpu_id = None):
        """Point self.device at CUDA when it is available, else at the CPU.

        With CUDA available the device becomes 'cuda', or 'cuda:<gpu_id>'
        when gpu_id is given.
        """
        if not torch.cuda.is_available():
            self.device = 'cpu'
            return
        self.device = 'cuda' if gpu_id is None else 'cuda:{}'.format(gpu_id)
            
class RobertaClassificationHead(nn.Module):
    """Classification head applied to the first token of a sequence.

    Pipeline: dropout -> linear (hidden -> hidden) -> tanh -> dropout ->
    linear (hidden -> num_labels).
    """

    def __init__(self, config):
        """Build the layers from config.hidden_size, config.hidden_dropout_prob
        and config.num_labels."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return logits computed from features[:, 0, :].

        features is indexed with three axes; position 0 of the second axis
        is the <s> token (equivalent to [CLS]). Extra keyword arguments are
        accepted and ignored.
        """
        first_token = features[:, 0, :]
        hidden = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(hidden))
    
class RobertaOnlyClassificationHead(BertPreTrainedModel):
    """Pretrained-model wrapper whose forward pass runs only the classifier.

    A RobertaModel is instantiated and stored as self.roberta, but forward()
    never calls it: callers pass in an already computed sequence output and
    receive the classification logits for it.
    """
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        """Create the RoBERTa encoder and the classification head from config."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    def forward(self, sequence_output):
        """Return the classifier logits for the given sequence output."""
        return self.classifier(sequence_output)


# Use RoBERTa

In [None]:
def _build_masked_choice_example(tokenizer, context_tokens, choice_text, max_seq_len):
    """Encode the shared context followed by one fully masked answer choice.

    Returns three Python lists of exactly max_seq_len entries each:
    input ids, attention mask and labels. The labels carry the choice's own
    token ids at the masked positions and -100 everywhere else (-100 is the
    value this code uses for positions that must not contribute to the loss).
    """
    choice_tokens = tokenizer.tokenize(choice_text, add_prefix_space=True)
    tokens = context_tokens + [tokenizer.mask_token] * len(choice_tokens) + [tokenizer.sep_token]

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    labels = [-100] * len(context_tokens) + tokenizer.convert_tokens_to_ids(choice_tokens) + [-100]

    # Both lists have the same length by construction, so truncating and
    # padding them together keeps every label aligned with its mask token.
    input_ids = input_ids[:max_seq_len]
    labels = labels[:max_seq_len]
    attention_mask = [1] * len(input_ids)

    pad_len = max_seq_len - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * pad_len
    labels += [-100] * pad_len
    attention_mask += [0] * pad_len
    return input_ids, attention_mask, labels


def roberta_cross_entropy_for_csqa(cs_sentence: CommonsenseqaSentence, model: RobertaForMaskedLM, tokenizer: RobertaTokenizer, max_seq_len: int, device: str='cpu') -> int:
    """Pick the answer choice whose tokens the masked LM predicts best.

    For each of the five choices the input is
    ``<cls> Q: <question> <sep> A: <mask>*k <sep>`` where k is the number of
    tokens in that choice, and the labels are that choice's own token ids at
    the masked positions. The first element of the model output (the loss,
    given that labels are supplied) is used as the score of the choice.

    Args:
        cs_sentence: item providing question and choicesA .. choicesE.
        model: masked language model; put into eval mode by this function.
        tokenizer: tokenizer matching the model.
        max_seq_len: every input is truncated/padded to this length.
        device: device the input tensors are moved to; the model is expected
            to live on the same device already.

    Returns:
        Index (0-4, for A-E) of the choice with the lowest loss; ties resolve
        to the earliest choice.
    """
    question_tokens = tokenizer.tokenize(" Q: " + cs_sentence.question, add_prefix_space=True)
    answer_prefix = tokenizer.tokenize(" A: ", add_prefix_space=True)
    context_tokens = [tokenizer.cls_token] + question_tokens + [tokenizer.sep_token] + answer_prefix

    choice_texts = [
        cs_sentence.choicesA,
        cs_sentence.choicesB,
        cs_sentence.choicesC,
        cs_sentence.choicesD,
        cs_sentence.choicesE,
    ]

    # NOTE(review): if the context alone reaches max_seq_len, the answer
    # tokens are truncated away and no labelled position remains; the loss is
    # then undefined (likely NaN) and the ranking is meaningless. Confirm
    # max_seq_len is large enough for the dataset.
    losses = []
    model.eval()
    with torch.no_grad():
        for choice_text in choice_texts:
            # Each choice is scored against its OWN tokens; previously the
            # labels of B-E were all built from choice A.
            input_ids, attention_mask, labels = _build_masked_choice_example(
                tokenizer, context_tokens, choice_text, max_seq_len
            )
            # `masked_lm_labels` is the keyword this notebook's transformers
            # version uses; newer releases call it `labels` - TODO confirm
            # against the installed version before upgrading.
            outputs = model(
                torch.tensor([input_ids]).to(device),
                attention_mask=torch.tensor([attention_mask]).to(device),
                masked_lm_labels=torch.tensor([labels]).to(device),
            )
            losses.append(outputs[0].to('cpu').item())

    return losses.index(min(losses))


def roberta_for_csqa(config: ExpConfig, roberta_cross_entropy_for_csqa):
    """Evaluate a RoBERTa masked LM on CommonsenseQA and return its accuracy.

    Args:
        config: experiment settings. Reads dataset_path, model_path,
            model_name, device and max_seq_len.
        roberta_cross_entropy_for_csqa: callable invoked as
            ``fn(sentence, model, tokenizer, max_seq_len, device)`` that
            returns the predicted choice index for one sentence.

    Returns:
        Fraction of sentences whose predicted index equals answerKey, or 0.0
        when the dataset contains no sentences.
    """
    csqa_sentences = load_commonsenseqa_from_path(config.dataset_path)

    # A configured model_path wins; otherwise load by name. The name comes
    # from the config itself rather than from a notebook-level global, so the
    # function also works when called outside the driver loop.
    model_source = config.model_path if config.model_path else config.model_name
    tokenizer = RobertaTokenizer.from_pretrained(model_source)
    model = RobertaForMaskedLM.from_pretrained(model_source)
    model.eval()
    model.to(config.device)

    correct = 0
    for cs in csqa_sentences:
        pred = roberta_cross_entropy_for_csqa(cs, model, tokenizer, config.max_seq_len, config.device)
        if pred == cs.answerKey:
            correct += 1

    total = len(csqa_sentences)
    print(total)
    # Guard against an empty dataset instead of raising ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total

In [None]:
# Evaluate each RoBERTa size on CommonsenseQA, one after the other.
for model_name in ['roberta-base', 'roberta-large']:
    # Release cached GPU memory before the next model is loaded.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    robertaconfig = ExpConfig(
        dataset='commonsenseqa',
        dataset_path=ROOT_DIR + CSQA_PATH,
        task_name='Test on CommonsenseQA',
        model_name=model_name,
        model_path=CACHE_DIR + model_name,
    )
    robertaconfig.set_seed()
    robertaconfig.set_gpu_if_possible(0)

    print('\n================================')
    print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

    # `scores` holds the accuracy returned by roberta_for_csqa.
    scores = roberta_for_csqa(robertaconfig, roberta_cross_entropy_for_csqa)

    print('Scores: {}'.format(scores))
    print('================================')