In [None]:
import csv
import json
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertPreTrainedModel, RobertaConfig
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaModel

# Project root and local model cache. open() does not expand "~", so the home
# directory is resolved once here; the trailing slash is kept because every
# path below is built by plain string concatenation (ROOT_DIR + ...).
ROOT_DIR = os.path.expanduser('~/NLI-KB/')
CACHE_DIR = os.path.expanduser('~/.cache/')
# Data files, relative to ROOT_DIR.
ATOMIC_PATH= 'datasets/atomic.csv'
CN_PATH = 'datasets/conceptnet.csv'
CSQA_PATH = 'datasets/csqa.jsonl'
# Model names iterated by the extraction cells; each is loaded from CACHE_DIR + name.
MODEL_LIST = ['roberta-base-qnli', 'roberta-base-mnli', 'roberta-large-qnli', 'roberta-large-mnli']

In [None]:
# helper function: read and dump data
def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of JSON-serialisable objects to a JSON lines file.

    Args:
        data: iterable of objects; each one becomes one line. Any iterable is
            accepted (lists as well as generators), since records are counted
            while they are written instead of calling ``len(data)``.
        output_path: destination file path.
        append: when True the records are appended to an existing file,
            otherwise the file is overwritten.
    """
    mode = 'a+' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Blank (or whitespace-only) lines are skipped, so a trailing empty line does
    not abort the load.

    Args:
        input_path: path of a UTF-8 encoded file with one JSON value per line.

    Returns:
        List with the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            if not record:
                continue
            data.append(json.loads(record))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [None]:
from dataclasses import dataclass
      
class CommonsenseqaSentence:
    """Plain record for one CommonsenseQA item: question, five choices, answer index."""

    # Identifier and question text.
    qid: str = None
    question: str = None
    # Text of the five answer choices.
    choicesA: str = None
    choicesB: str = None
    choicesC: str = None
    choicesD: str = None
    choicesE: str = None
    # Position of the correct choice; the loader stores 0-4 for labels 'A'-'E'.
    answerKey: int = 1

def load_commonsenseqa_from_path(filepath: str):
    """
    Load a CommonsenseQA JSON lines file into CommonsenseqaSentence objects.

    Each record supplies ``id``, ``question.stem``, the ``text`` of the first
    five entries of ``question.choices`` and ``answerKey`` (a letter A-E that
    is stored as the index 0-4).

    Args:
        filepath: path handed to ``load_jsonl``.

    Returns:
        List of CommonsenseqaSentence, one per record, in file order.
    """
    letters = 'ABCDE'
    letter_to_index = {letter: position for position, letter in enumerate(letters)}

    sentences = []
    for record in load_jsonl(filepath):
        item = CommonsenseqaSentence()
        item.qid = record['id']
        body = record['question']
        item.question = body['stem']
        # Choices are taken by list position, not by their own label field.
        for position, letter in enumerate(letters):
            setattr(item, 'choices' + letter, body['choices'][position]['text'])
        item.answerKey = letter_to_index[record['answerKey']]
        sentences.append(item)
    return sentences

@dataclass
class ExpConfig(object):
    """Settings for one experiment run: data paths, model choice, device and hyper-parameters."""

    # Path of the JSONL dataset to evaluate on
    dataset_path: str = ""

    winowhy_dataset_path: str = ""

    atomic_dataset_path: str = ""

    conceptnet_dataset_path: str = ""

    dataset: str = "winowhy"
    # Free-text description of the task
    task_name: str = ""
    # Single-GPU setups only
    gpu_id: int = 0
    # Seed applied by set_seed()
    seed: int = 42
    # Either 'cpu' or 'cuda'
    device: str = 'cpu'
    # e.g. "roberta-base", "roberta-large"
    model_name: str = ""
    # When non-empty, the model is loaded from this path rather than from transformers' pretrained ones
    model_path: str = ""
    # Learning rate for training the classifier layer
    learning_rate: float = 1e-3
    # Total number of epochs
    num_training_epochs: int = 15
    # Maximum sequence length
    max_seq_len: int = 128

    batch_size: int = 1

    def set_seed(self, new_seed = None):
        """Seed Python's random, NumPy and torch (plus all CUDA devices when available).

        Uses ``new_seed`` when given, otherwise ``self.seed``.
        """
        chosen = new_seed if new_seed is not None else self.seed
        for seeder in (random.seed, np.random.seed, torch.manual_seed):
            seeder(chosen)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(chosen)

    def set_gpu_if_possible(self, gpu_id = None):
        """Point ``self.device`` at CUDA when available, else at the CPU.

        With CUDA present the device becomes 'cuda', or 'cuda:<gpu_id>' when
        ``gpu_id`` is given.
        """
        if not torch.cuda.is_available():
            self.device = 'cpu'
            return
        self.device = 'cuda' if gpu_id is None else 'cuda:{}'.format(gpu_id)
            
class RobertaClassificationHead(nn.Module):
    """Sentence-level classification head: dropout -> dense -> tanh -> dropout -> projection."""

    def __init__(self, config):
        """Build the layers from ``config.hidden_size``, ``config.hidden_dropout_prob`` and ``config.num_labels``."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return logits computed from position 0 along dim 1 of ``features``.

        Extra keyword arguments are accepted and ignored.
        """
        # Position 0 holds the <s> token (equivalent to [CLS]).
        first_token = features[:, 0, :]
        hidden = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(hidden))
    
class RobertaOnlyClassificationHead(BertPreTrainedModel):
    """Pretrained-model wrapper whose forward pass runs only the classification head.

    A ``RobertaModel`` is instantiated under the ``roberta`` prefix, but
    ``forward`` does not call it: the caller passes already-computed hidden
    states.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        """Create the encoder and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    def forward(self, sequence_output):
        """Return the classifier logits for ``sequence_output``."""
        return self.classifier(sequence_output)


## Extract sentences with TopK-similarity from ATOMIC (full set)

In [None]:
def model_dataloader(atomic_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 36):
    """
    Tokenize knowledge-base sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        atomic_set: iterable of sentence strings.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
            Its ``pad_token_id`` is used for padding when it is set.
        batch_size: number of rows per batch.
        max_len: fixed row length; longer encodings are truncated and shorter
            ones are padded (default 36, the previously hard-coded length).

    Returns:
        DataLoader over a TensorDataset holding a single id tensor with one
        row per sentence, iterated sequentially so the input order is kept.
    """
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1  # historical hard-coded pad id

    rows = []
    for sen in atomic_set:
        # NOTE(review): the text already contains literal <s>/</s> and
        # add_special_tokens=True presumably adds a second pair - confirm this
        # is intended before changing it, since it affects the embeddings.
        token_ids = tokenizer.encode("<s> " + sen + " </s>", add_special_tokens=True)[:max_len]
        rows.append(token_ids + [pad_id] * (max_len - len(token_ids)))

    data = TensorDataset(torch.tensor(rows))
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)


def roberta_for_KB(config: ExpConfig):
    """
    Retrieve, for every CommonsenseQA question/choice pair, the 50 most similar ATOMIC sentences.

    Steps:
      1. Load the CommonsenseQA records from ``config.dataset_path``.
      2. Load tokenizer and ``RobertaModel`` from ``config.model_path`` when it
         is set, otherwise from ``config.model_name``.
      3. Embed column 1 of every row of ``ROOT_DIR + ATOMIC_PATH`` and every
         "question </s> choice" string by averaging the model's first output
         over dim 1.
      4. L2-normalise the embeddings, so the matrix product is the cosine
         similarity, and keep the 50 best KB rows per question.

    Args:
        config: experiment settings; ``dataset_path``, ``model_path``,
            ``model_name``, ``device`` and ``batch_size`` are read.

    Returns:
        Tuple of five numpy index arrays (choices A, B, C, D, E), one row per
        question and 50 columns, holding row positions in the ATOMIC file
        sorted by decreasing similarity.

    Raises:
        ValueError: if the dataset or the knowledge base yields no sentences.
    """
    top_k = 50  # number of nearest KB sentences kept per question/choice
    choice_labels = ('A', 'B', 'C', 'D', 'E')

    csqa_sentences = load_commonsenseqa_from_path(config.dataset_path)
    if not csqa_sentences:
        raise ValueError('No CommonsenseQA records loaded from {}'.format(config.dataset_path))

    if config.model_path is not None and config.model_path != "":
        tokenizer = RobertaTokenizer.from_pretrained(config.model_path)
        model = RobertaModel.from_pretrained(config.model_path)
    else:
        tokenizer = RobertaTokenizer.from_pretrained(config.model_name)
        model = RobertaModel.from_pretrained(config.model_name)
    model.eval()
    model.to(config.device)

    # The context manager closes the CSV even if a row is malformed.
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        atomic_set = [line[1] for line in csv.reader(f)]
    if not atomic_set:
        raise ValueError('No sentences read from {}'.format(ROOT_DIR + ATOMIC_PATH))

    # Batches are collected in a list and concatenated once; growing one
    # tensor with torch.cat inside the loop re-copies everything each step.
    atomic_chunks = []
    embedded = 0
    with torch.no_grad():
        atomic_dataloader = model_dataloader(atomic_set, tokenizer, batch_size=config.batch_size)
        for batch in atomic_dataloader:
            atomic_input_ids = batch[0].to(config.device)
            # NOTE(review): no attention_mask is passed, so padding positions
            # presumably contribute to the mean - confirm this is intended.
            chunk = model(atomic_input_ids)[0].mean(dim=1).cpu()
            atomic_chunks.append(chunk)
            embedded += chunk.size(0)
            if embedded % 10240 == 0:
                print(torch.Size([embedded, chunk.size(1)]))
    atomic = torch.cat(atomic_chunks, 0)
    print(atomic.size())

    choice_chunks = {label: [] for label in choice_labels}
    with torch.no_grad():
        for cs in csqa_sentences:
            for label in choice_labels:
                choice_text = getattr(cs, 'choices' + label)
                cs_sentence = '<s> ' + cs.question + ' </s> ' + choice_text + ' </s>'
                cs_input = torch.tensor(
                    tokenizer.encode(cs_sentence, add_special_tokens=True)
                ).unsqueeze(0).to(config.device)
                choice_chunks[label].append(model(cs_input)[0].mean(dim=1).cpu())
    csqa = {label: torch.cat(choice_chunks[label], 0) for label in choice_labels}
    print(*(csqa[label].size() for label in choice_labels))

    # Unit-length rows on both sides make the dot product a cosine similarity.
    kb_norm_t = (atomic / atomic.norm(dim=1)[:, None]).transpose(0, 1)
    top_indices = []
    for label in choice_labels:
        query_norm = csqa[label] / csqa[label].norm(dim=1)[:, None]
        _, res_index = torch.mm(query_norm, kb_norm_t).topk(top_k, dim=1, largest=True, sorted=True)
        top_indices.append(res_index)

    print(*(res_index.size() for res_index in top_indices))
    return tuple(res_index.numpy() for res_index in top_indices)

In [None]:
# For every model: retrieve the top ATOMIC rows for each CommonsenseQA
# question/choice, save the raw row indices, then save the matching sentences.
for model_name in MODEL_LIST:
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    robertaconfig = ExpConfig()
    robertaconfig.set_seed()
    robertaconfig.batch_size = 1024
    robertaconfig.set_gpu_if_possible(0)
    robertaconfig.dataset = 'commonsenseqa'
    robertaconfig.dataset_path = ROOT_DIR + CSQA_PATH
    robertaconfig.task_name = 'Test on CommonsenseQA'
    robertaconfig.model_name = model_name
    robertaconfig.model_path = CACHE_DIR + model_name

    print('\n================================')
    print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

    res_A, res_B, res_C, res_D, res_E = roberta_for_KB(robertaconfig)
    final_dict = {
        'Overall': {
            'A': res_A.tolist(),
            'B': res_B.tolist(),
            'C': res_C.tolist(),
            'D': res_D.tolist(),
            'E': res_E.tolist(),
        }
    }

    KB_DIR = "kb_extract/csqa_atomic/"
    with open(ROOT_DIR + KB_DIR + model_name + "-atomic-index.json", 'w', encoding='utf-8') as f:
        json.dump(final_dict, f)

    # Sentences are read in file order so the saved row indices address this list.
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        Overall = [line[1] for line in csv.reader(f)]
    print(len(Overall))
    print('------------')

    # The index dict is reused directly; reading the file just written back
    # from disk would yield the same data.
    print(len(final_dict['Overall']['A']))
    final = {}
    for i in range(len(final_dict['Overall']['A'])):
        final[str(i)] = {
            'Overall': {
                answerIndex: [Overall[kb_row] for kb_row in final_dict['Overall'][answerIndex][i]]
                for answerIndex in ['A', 'B', 'C', 'D', 'E']
            }
        }

    with open(ROOT_DIR + KB_DIR + model_name + "-atomic.json", 'w', encoding='utf-8') as f:
        json.dump(final, f)

## Extract sentences with TopK-similarity from ATOMIC (five categories)

In [None]:
def model_dataloader(atomic_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 36):
    """
    Tokenize knowledge-base sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        atomic_set: iterable of sentence strings.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
            Its ``pad_token_id`` is used for padding when it is set.
        batch_size: number of rows per batch.
        max_len: fixed row length; longer encodings are truncated and shorter
            ones are padded (default 36, the previously hard-coded length).

    Returns:
        DataLoader over a TensorDataset holding a single id tensor with one
        row per sentence, iterated sequentially so the input order is kept.
    """
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1  # historical hard-coded pad id

    rows = []
    for sen in atomic_set:
        # NOTE(review): the text already contains literal <s>/</s> and
        # add_special_tokens=True presumably adds a second pair - confirm this
        # is intended before changing it, since it affects the embeddings.
        token_ids = tokenizer.encode("<s> " + sen + " </s>", add_special_tokens=True)[:max_len]
        rows.append(token_ids + [pad_id] * (max_len - len(token_ids)))

    data = TensorDataset(torch.tensor(rows))
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)


def roberta_for_KB(config: ExpConfig, first_category, second_category):
    """
    Retrieve, for every CommonsenseQA question/choice pair, the 50 most similar
    ATOMIC sentences of one category.

    Only CSV rows passing both filters are used: column 2 must equal
    ``first_category`` and column 3 must equal ``second_category``; an empty
    string disables the corresponding filter. Sentences (column 1) and each
    "question </s> choice" string are embedded by averaging the model's first
    output over dim 1, L2-normalised, and compared by matrix product (cosine
    similarity).

    Args:
        config: experiment settings; ``dataset_path``, ``model_path``,
            ``model_name``, ``device`` and ``batch_size`` are read.
        first_category: required value of CSV column 2, or '' for no filter.
        second_category: required value of CSV column 3, or '' for no filter.

    Returns:
        Tuple of five numpy index arrays (choices A, B, C, D, E), one row per
        question and 50 columns, holding positions within the filtered
        sentence list (file order), sorted by decreasing similarity.

    Raises:
        ValueError: if the dataset or the filtered knowledge base is empty.
    """
    top_k = 50  # number of nearest KB sentences kept per question/choice
    choice_labels = ('A', 'B', 'C', 'D', 'E')

    csqa_sentences = load_commonsenseqa_from_path(config.dataset_path)
    if not csqa_sentences:
        raise ValueError('No CommonsenseQA records loaded from {}'.format(config.dataset_path))

    if config.model_path is not None and config.model_path != "":
        tokenizer = RobertaTokenizer.from_pretrained(config.model_path)
        # NOTE(review): num_labels is kept from the original call; the bare
        # encoder has no classification head, so it looks unnecessary - confirm.
        model = RobertaModel.from_pretrained(config.model_path, num_labels=3)
    else:
        # Use the name carried by the config, not the notebook-global
        # `model_name`, which only exists while a driver loop is running.
        tokenizer = RobertaTokenizer.from_pretrained(config.model_name)
        model = RobertaModel.from_pretrained(config.model_name)
    model.eval()
    model.to(config.device)

    atomic_set = []
    # The context manager closes the CSV even if a row is malformed.
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        for line in csv.reader(f):
            if first_category != '' and line[2] != first_category:
                continue
            if second_category != '' and line[3] != second_category:
                continue
            atomic_set.append(line[1])
    if not atomic_set:
        raise ValueError('No ATOMIC sentences match categories ({!r}, {!r})'.format(
            first_category, second_category))

    # Batches are collected in a list and concatenated once; growing one
    # tensor with torch.cat inside the loop re-copies everything each step.
    atomic_chunks = []
    embedded = 0
    with torch.no_grad():
        atomic_dataloader = model_dataloader(atomic_set, tokenizer, batch_size=config.batch_size)
        for batch in atomic_dataloader:
            atomic_input_ids = batch[0].to(config.device)
            # NOTE(review): no attention_mask is passed, so padding positions
            # presumably contribute to the mean - confirm this is intended.
            chunk = model(atomic_input_ids)[0].mean(dim=1).cpu()
            atomic_chunks.append(chunk)
            embedded += chunk.size(0)
            if embedded % 10240 == 0:
                print(torch.Size([embedded, chunk.size(1)]))
    atomic = torch.cat(atomic_chunks, 0)
    print(atomic.size())

    choice_chunks = {label: [] for label in choice_labels}
    with torch.no_grad():
        for cs in csqa_sentences:
            for label in choice_labels:
                choice_text = getattr(cs, 'choices' + label)
                cs_sentence = '<s> ' + cs.question + ' </s> ' + choice_text + ' </s>'
                cs_input = torch.tensor(
                    tokenizer.encode(cs_sentence, add_special_tokens=True)
                ).unsqueeze(0).to(config.device)
                choice_chunks[label].append(model(cs_input)[0].mean(dim=1).cpu())
    csqa = {label: torch.cat(choice_chunks[label], 0) for label in choice_labels}
    print(*(csqa[label].size() for label in choice_labels))

    # Unit-length rows on both sides make the dot product a cosine similarity.
    kb_norm_t = (atomic / atomic.norm(dim=1)[:, None]).transpose(0, 1)
    top_indices = []
    for label in choice_labels:
        query_norm = csqa[label] / csqa[label].norm(dim=1)[:, None]
        _, res_index = torch.mm(query_norm, kb_norm_t).topk(top_k, dim=1, largest=True, sorted=True)
        top_indices.append(res_index)

    print(*(res_index.size() for res_index in top_indices))
    return tuple(res_index.numpy() for res_index in top_indices)

In [None]:
# For every model and every ATOMIC category: retrieve the top rows for each
# CommonsenseQA question/choice, save the per-category row indices, then save
# the matching sentences.
for model_name in MODEL_LIST:
    first_level = ['Physical-Entity', 'Event-Centered']    # matched against CSV column 2
    second_level = ['MentalState', 'Persona', 'Behavior']  # matched against CSV column 3
    category_names = first_level + second_level
    answer_labels = ['A', 'B', 'C', 'D', 'E']

    final_dict = {}
    for cat in [(name, '') for name in first_level] + [('', name) for name in second_level]:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        robertaconfig = ExpConfig()
        robertaconfig.set_seed()
        robertaconfig.batch_size = 1024
        robertaconfig.set_gpu_if_possible(0)
        robertaconfig.dataset = 'commonsenseqa'
        robertaconfig.dataset_path = ROOT_DIR + CSQA_PATH
        robertaconfig.task_name = 'Test on CommonsenseQA'
        robertaconfig.model_name = model_name
        robertaconfig.model_path = CACHE_DIR + model_name

        print('\n================================')
        print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

        results = roberta_for_KB(robertaconfig, cat[0], cat[1])
        # Exactly one element of the pair is non-empty; it names the category.
        cat_name = cat[0] if cat[0] != '' else cat[1]
        final_dict[cat_name] = {
            answerIndex: res.tolist() for answerIndex, res in zip(answer_labels, results)
        }

    KB_DIR = "kb_extract/csqa_atomic/"
    with open(ROOT_DIR + KB_DIR + model_name + "-atomic-category-index.json", 'w', encoding='utf-8') as f:
        json.dump(final_dict, f)

    # Rebuild each category's sentence list in file order, using the same
    # column tests as the retrieval step, so the saved indices address it.
    kb_by_category = {name: [] for name in category_names}
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        for line in csv.reader(f):
            if line[2] in first_level:
                kb_by_category[line[2]].append(line[1])
            if line[3] in second_level:
                kb_by_category[line[3]].append(line[1])

    # The index dict is reused directly; reading the file just written back
    # from disk would yield the same data.
    final = {}
    for i in range(len(final_dict['Physical-Entity']['A'])):
        final[str(i)] = {
            name: {
                answerIndex: [kb_by_category[name][kb_row] for kb_row in final_dict[name][answerIndex][i]]
                for answerIndex in answer_labels
            }
            for name in category_names
        }

    with open(ROOT_DIR + KB_DIR + model_name + "-atomic-category.json", 'w', encoding='utf-8') as f:
        json.dump(final, f)

## Extract sentences with TopK-similarity from ConceptNet (full set)

In [None]:
def model_dataloader(conceptnet_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 36):
    """
    Tokenize knowledge-base sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        conceptnet_set: iterable of sentence strings.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
            Its ``pad_token_id`` is used for padding when it is set.
        batch_size: number of rows per batch.
        max_len: fixed row length; longer encodings are truncated and shorter
            ones are padded (default 36, the previously hard-coded length).

    Returns:
        DataLoader over a TensorDataset holding a single id tensor with one
        row per sentence, iterated sequentially so the input order is kept.
    """
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1  # historical hard-coded pad id

    rows = []
    for sen in conceptnet_set:
        # NOTE(review): the text already contains literal <s>/</s> and
        # add_special_tokens=True presumably adds a second pair - confirm this
        # is intended before changing it, since it affects the embeddings.
        token_ids = tokenizer.encode("<s> " + sen + " </s>", add_special_tokens=True)[:max_len]
        rows.append(token_ids + [pad_id] * (max_len - len(token_ids)))

    data = TensorDataset(torch.tensor(rows))
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)


def roberta_for_KB(config: ExpConfig, first_category):
    """
    Retrieve, for every CommonsenseQA question/choice pair, the 50 most similar ConceptNet sentences.

    Sentences (column 1 of every row of ``ROOT_DIR + CN_PATH``) and each
    "question </s> choice" string are embedded by averaging the model's first
    output over dim 1, L2-normalised, and compared by matrix product (cosine
    similarity).

    Args:
        config: experiment settings; ``dataset_path``, ``model_path``,
            ``model_name``, ``device`` and ``batch_size`` are read.
        first_category: accepted for call compatibility but not used here;
            the whole ConceptNet file is searched.

    Returns:
        Tuple of five numpy index arrays (choices A, B, C, D, E), one row per
        question and 50 columns, holding row positions in the ConceptNet file
        sorted by decreasing similarity.

    Raises:
        ValueError: if the dataset or the knowledge base yields no sentences.
    """
    top_k = 50  # number of nearest KB sentences kept per question/choice
    choice_labels = ('A', 'B', 'C', 'D', 'E')

    csqa_sentences = load_commonsenseqa_from_path(config.dataset_path)
    if not csqa_sentences:
        raise ValueError('No CommonsenseQA records loaded from {}'.format(config.dataset_path))

    if config.model_path is not None and config.model_path != "":
        tokenizer = RobertaTokenizer.from_pretrained(config.model_path)
        # NOTE(review): num_labels is kept from the original call; the bare
        # encoder has no classification head, so it looks unnecessary - confirm.
        model = RobertaModel.from_pretrained(config.model_path, num_labels=3)
    else:
        tokenizer = RobertaTokenizer.from_pretrained(config.model_name)
        model = RobertaModel.from_pretrained(config.model_name)
    model.eval()
    model.to(config.device)

    # The context manager closes the CSV even if a row is malformed.
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        conceptnet_set = [line[1] for line in csv.reader(f)]
    if not conceptnet_set:
        raise ValueError('No sentences read from {}'.format(ROOT_DIR + CN_PATH))

    # Batches are collected in a list and concatenated once; growing one
    # tensor with torch.cat inside the loop re-copies everything each step.
    conceptnet_chunks = []
    embedded = 0
    with torch.no_grad():
        conceptnet_dataloader = model_dataloader(conceptnet_set, tokenizer, batch_size=config.batch_size)
        for batch in conceptnet_dataloader:
            conceptnet_input_ids = batch[0].to(config.device)
            # NOTE(review): no attention_mask is passed, so padding positions
            # presumably contribute to the mean - confirm this is intended.
            chunk = model(conceptnet_input_ids)[0].mean(dim=1).cpu()
            conceptnet_chunks.append(chunk)
            embedded += chunk.size(0)
            if embedded % 10240 == 0:
                print(torch.Size([embedded, chunk.size(1)]))
    conceptnet = torch.cat(conceptnet_chunks, 0)
    print(conceptnet.size())

    choice_chunks = {label: [] for label in choice_labels}
    with torch.no_grad():
        for cs in csqa_sentences:
            for label in choice_labels:
                choice_text = getattr(cs, 'choices' + label)
                cs_sentence = '<s> ' + cs.question + ' </s> ' + choice_text + ' </s>'
                cs_input = torch.tensor(
                    tokenizer.encode(cs_sentence, add_special_tokens=True)
                ).unsqueeze(0).to(config.device)
                choice_chunks[label].append(model(cs_input)[0].mean(dim=1).cpu())
    csqa = {label: torch.cat(choice_chunks[label], 0) for label in choice_labels}
    print(*(csqa[label].size() for label in choice_labels))

    # Unit-length rows on both sides make the dot product a cosine similarity.
    kb_norm_t = (conceptnet / conceptnet.norm(dim=1)[:, None]).transpose(0, 1)
    top_indices = []
    for label in choice_labels:
        query_norm = csqa[label] / csqa[label].norm(dim=1)[:, None]
        _, res_index = torch.mm(query_norm, kb_norm_t).topk(top_k, dim=1, largest=True, sorted=True)
        top_indices.append(res_index)

    print(*(res_index.size() for res_index in top_indices))
    return tuple(res_index.numpy() for res_index in top_indices)

In [None]:
# For every model: retrieve the top ConceptNet rows for each CommonsenseQA
# question/choice, save the raw row indices, then save the matching sentences.
for model_name in MODEL_LIST:
    cat = 'Overall'
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    robertaconfig = ExpConfig()
    robertaconfig.set_seed()
    robertaconfig.batch_size = 1024
    robertaconfig.set_gpu_if_possible(0)
    robertaconfig.dataset = 'commonsenseqa'
    robertaconfig.dataset_path = ROOT_DIR + CSQA_PATH
    robertaconfig.task_name = 'Test on CommonsenseQA'
    robertaconfig.model_name = model_name
    robertaconfig.model_path = CACHE_DIR + model_name

    print('\n================================')
    print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

    res_A, res_B, res_C, res_D, res_E = roberta_for_KB(robertaconfig, cat)
    final_dict = {
        cat: {
            'A': res_A.tolist(),
            'B': res_B.tolist(),
            'C': res_C.tolist(),
            'D': res_D.tolist(),
            'E': res_E.tolist(),
        }
    }

    KB_DIR = "kb_extract/csqa_cn/"
    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet-index.json", 'w', encoding='utf-8') as f:
        json.dump(final_dict, f)

    # Sentences are read in file order so the saved row indices address this list.
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        Overall = [line[1] for line in csv.reader(f)]
    print(len(Overall))
    print('------------')

    # The index dict is reused directly; reading the file just written back
    # from disk would yield the same data.
    print(len(final_dict['Overall']['A']))
    final = {}
    for i in range(len(final_dict['Overall']['A'])):
        final[str(i)] = {
            'Overall': {
                answerIndex: [Overall[kb_row] for kb_row in final_dict['Overall'][answerIndex][i]]
                for answerIndex in ['A', 'B', 'C', 'D', 'E']
            }
        }

    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet.json", 'w', encoding='utf-8') as f:
        json.dump(final, f)

## Extract sentences with TopK-similarity from ConceptNet (four categories)

In [None]:
def model_dataloader(conceptnet_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 36):
    """
    Tokenize knowledge-base sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        conceptnet_set: iterable of sentence strings.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.
            Its ``pad_token_id`` is used for padding when it is set.
        batch_size: number of rows per batch.
        max_len: fixed row length; longer encodings are truncated and shorter
            ones are padded (default 36, the previously hard-coded length).

    Returns:
        DataLoader over a TensorDataset holding a single id tensor with one
        row per sentence, iterated sequentially so the input order is kept.
    """
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1  # historical hard-coded pad id

    rows = []
    for sen in conceptnet_set:
        # NOTE(review): the text already contains literal <s>/</s> and
        # add_special_tokens=True presumably adds a second pair - confirm this
        # is intended before changing it, since it affects the embeddings.
        token_ids = tokenizer.encode("<s> " + sen + " </s>", add_special_tokens=True)[:max_len]
        rows.append(token_ids + [pad_id] * (max_len - len(token_ids)))

    data = TensorDataset(torch.tensor(rows))
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)


def roberta_for_KB(config: ExpConfig, first_category, top_k: int = 50):
    """
    Rank the ConceptNet sentences of one category against every CommonsenseQA
    (question, choice) pair by cosine similarity of mean-pooled RoBERTa outputs.

    Args:
        config: experiment settings; reads ``dataset_path``, ``model_path``,
            ``model_name``, ``device`` and ``batch_size``.
        first_category: value that column 2 of the ConceptNet CSV must equal for a
            row to be used.
        top_k: number of best-matching sentences kept per pair (default 50, the
            previously hard-coded value).

    Returns:
        Tuple of five numpy index arrays (choices A, B, C, D, E), each of shape
        (num_questions, top_k), best match first. An index is the position of the
        sentence among the rows of ``first_category`` in CSV file order.

    Raises:
        ValueError: if no CSV row belongs to ``first_category`` or no CommonsenseQA
            example was loaded.
    """
    csqa_sentences = load_commonsenseqa_from_path(config.dataset_path)

    # Column 1 holds the sentence and column 2 its category. The context manager
    # closes the file even if a malformed row raises.
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        conceptnet_set = [line[1] for line in csv.reader(f) if line[2] == first_category]
    if not conceptnet_set:
        raise ValueError('No ConceptNet sentence found for category {!r}'.format(first_category))

    if config.model_path is not None and config.model_path != "":
        tokenizer = RobertaTokenizer.from_pretrained(config.model_path)
        model = RobertaModel.from_pretrained(config.model_path, num_labels=3)
    else:
        tokenizer = RobertaTokenizer.from_pretrained(config.model_name)
        model = RobertaModel.from_pretrained(config.model_name)
    model.eval()
    model.to(config.device)

    # ---- embed the knowledge sentences -------------------------------------------
    # Batches are collected and concatenated once; calling torch.cat inside the loop
    # recopies everything accumulated so far on every step.
    conceptnet_chunks = []
    n_embedded = 0
    with torch.no_grad():
        conceptnet_dataloader = model_dataloader(conceptnet_set, tokenizer, batch_size=config.batch_size)
        for batch in conceptnet_dataloader:
            conceptnet_input_ids = batch[0].to(config.device)
            # NOTE(review): no attention mask is passed, so padded positions take part
            # in the mean below; kept as-is so rankings stay comparable with files
            # produced earlier -- confirm whether masked pooling was intended.
            pooled = model(conceptnet_input_ids)[0].mean(dim=1).cpu()
            conceptnet_chunks.append(pooled)
            n_embedded += pooled.size(0)
            if n_embedded % 10240 == 0:
                # Progress report, same text as printing the running tensor's size.
                print(torch.Size((n_embedded, pooled.size(1))))
    conceptnet = torch.cat(conceptnet_chunks, 0)
    print(conceptnet.size())

    # ---- embed every (question, choice) pair -------------------------------------
    choice_fields = ('choicesA', 'choicesB', 'choicesC', 'choicesD', 'choicesE')
    per_choice = [[] for _ in choice_fields]
    with torch.no_grad():
        for cs in csqa_sentences:
            for bucket, field in zip(per_choice, choice_fields):
                cs_sentence = '<s> ' + cs.question + ' </s> ' + getattr(cs, field) + ' </s>'
                cs_input = torch.tensor(
                    tokenizer.encode(cs_sentence, add_special_tokens=True)
                ).unsqueeze(0).to(config.device)
                bucket.append(model(cs_input)[0].mean(dim=1).cpu())
    if not per_choice[0]:
        raise ValueError('No CommonsenseQA example loaded from {!r}'.format(config.dataset_path))
    csqa = [torch.cat(bucket, 0) for bucket in per_choice]
    print(*(emb.size() for emb in csqa))

    # ---- cosine similarity + top-k -----------------------------------------------
    # Rows are L2-normalised so the matrix product yields cosine similarities.
    second_norm = conceptnet / conceptnet.norm(dim=1)[:, None]
    kb_transposed = second_norm.transpose(0, 1)

    res_indices = []
    for emb in csqa:
        first_norm = emb / emb.norm(dim=1)[:, None]
        # One choice at a time: only a single (questions x sentences) matrix is alive.
        scores = torch.mm(first_norm, kb_transposed)
        _, res_index = scores.topk(top_k, dim=1, largest=True, sorted=True)
        res_indices.append(res_index)

    print(*(res_index.size() for res_index in res_indices))
    return tuple(res_index.numpy() for res_index in res_indices)

In [None]:
# For every checkpoint in MODEL_LIST, retrieve per ConceptNet category the best-matching
# knowledge sentences for each CommonsenseQA (question, choice) pair and write two files:
#   <model>-conceptnet-category-index.json : category -> choice -> per-question index rows
#   <model>-conceptnet-category.json       : question -> category -> choice -> sentences
KB_CATEGORIES = ['Physical-Entity', 'Event-Centered', 'Social-Interaction', 'Taxonomic-Lexical']
ANSWER_KEYS = ['A', 'B', 'C', 'D', 'E']

# The CSV does not depend on the model, so it is read once. Rows are bucketed with the
# same "column 2 == category" filter that roberta_for_KB applies and nothing is skipped
# up front, so a position in a bucket is exactly the index roberta_for_KB returns.
# A header row, if present, falls out because its third column is not a category name.
sentences_by_category = {cat: [] for cat in KB_CATEGORIES}
with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
    for line in csv.reader(f):
        bucket = sentences_by_category.get(line[2])
        if bucket is not None:
            bucket.append(line[1])

for model_name in MODEL_LIST:
    topk_index = {cat: {key: [] for key in ANSWER_KEYS} for cat in KB_CATEGORIES}
    for cat in KB_CATEGORIES:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        robertaconfig = ExpConfig()
        robertaconfig.set_seed()
        robertaconfig.batch_size = 1024
        robertaconfig.set_gpu_if_possible(0)
        robertaconfig.dataset = 'commonsenseqa'
        robertaconfig.dataset_path = ROOT_DIR + CSQA_PATH
        robertaconfig.task_name = 'Test on CommonsenseQA'
        robertaconfig.model_name = model_name
        robertaconfig.model_path = CACHE_DIR + model_name

        print('\n================================')
        print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

        res_A, res_B, res_C, res_D, res_E = roberta_for_KB(robertaconfig, cat)
        for key, res in zip(ANSWER_KEYS, (res_A, res_B, res_C, res_D, res_E)):
            topk_index[cat][key] = res.tolist()

    KB_DIR = 'kb_extract/csqa_cn/'
    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet-category-index.json", 'w', encoding='utf-8') as f:
        f.write(json.dumps(topk_index))

    # Resolve indices to sentences straight from memory; the index file written above
    # holds the same lists, so reading it back is unnecessary.
    # Each stored row is walked directly instead of assuming it has 50 entries.
    topk_sentences = {}
    for i in range(len(topk_index['Physical-Entity']['A'])):
        topk_sentences[str(i)] = {
            cat: {
                key: [sentences_by_category[cat][j] for j in topk_index[cat][key][i]]
                for key in ANSWER_KEYS
            }
            for cat in KB_CATEGORIES
        }

    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet-category.json", 'w', encoding='utf-8') as f:
        f.write(json.dumps(topk_sentences))