In [None]:
import csv
import json
import os
import random
import re

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertPreTrainedModel, RobertaConfig
from transformers import RobertaTokenizer, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaModel

# Base directories. '~' is expanded here because open() and os.path checks
# treat a leading '~' as a literal directory name, not the user's home.
ROOT_DIR = os.path.expanduser('~/NLI-KB/')
CACHE_DIR = os.path.expanduser('~/.cache/')
# Data files, relative to ROOT_DIR (the code below joins them by concatenation).
ATOMIC_PATH = 'datasets/atomic.csv'
CN_PATH = 'datasets/conceptnet.csv'
WINOWHY_PATH = 'datasets/winowhy/winowhy.jsonl'
# Checkpoint names; the experiment cells load each one from CACHE_DIR + name.
MODEL_LIST = ['roberta-base-qnli', 'roberta-base-mnli', 'roberta-large-qnli', 'roberta-large-mnli']

In [None]:
# helper function: read and dump data
def dump_jsonl(data, output_path, append=False):
    """
    Serialize every item of `data` as one JSON document per line.

    The target file is opened in 'a+' mode when `append` is true and in 'w'
    mode (truncating) otherwise. Non-ASCII characters are written as-is in
    UTF-8. A short summary is printed once all records are written.
    """
    file_mode = 'w'
    if append:
        file_mode = 'a+'
    with open(output_path, file_mode, encoding='utf-8') as handle:
        for record in data:
            handle.write(json.dumps(record, ensure_ascii=False) + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))

def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.

    Each non-blank line of `input_path` (UTF-8) is parsed with json.loads.
    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of raising a JSONDecodeError.

    Returns the parsed objects in file order and prints a short summary.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes the line terminator; the previous
            # rstrip('\n|\r') also stripped '|' characters by accident.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [None]:
from dataclasses import dataclass

@dataclass(eq=False)
class WinoWhySentence(object):
    """
    One record of the WinoWhy dataset.

    Field names match the keys that load_winowhy_from_path reads from each
    JSONL record. Declared as a dataclass so instances can be built with
    keyword arguments and have a readable repr; `eq=False` keeps the
    identity-based equality and hashing of the previous plain class, and
    `WinoWhySentence()` with no arguments still works.
    """
    sentence: str = None
    context: str = None
    wsc_sentence: str = None
    answer_reason: str = None
    reason: str = None
    label: int = 0
    wsc_id: int = 0
    fold_num: int = 1
    wsc_marked_sentence: str = None
    wsc_asked_sentence: str = None
        
def load_winowhy_from_path(filepath: str):
    """
    Load a WinoWhy JSONL file into a list of WinoWhySentence objects.

    Every record must contain all ten keys listed below; a missing key
    raises KeyError. Records are returned in file order.
    """
    field_names = (
        'sentence', 'context', 'wsc_sentence', 'answer_reason', 'reason',
        'label', 'wsc_id', 'fold_num', 'wsc_marked_sentence', 'wsc_asked_sentence',
    )
    winowhy_sentences = []
    for record in load_jsonl(filepath):
        item = WinoWhySentence()
        # Copy each JSON key onto the attribute of the same name.
        for name in field_names:
            setattr(item, name, record[name])
        winowhy_sentences.append(item)
    return winowhy_sentences

@dataclass
class ExpConfig:
    """Settings for one experiment run, plus helpers for seeding and device choice."""

    # Path to the JSONL dataset file
    dataset_path: str = ""

    winowhy_dataset_path: str = ""

    atomic_dataset_path: str = ""

    conceptnet_dataset_path: str = ""

    dataset: str = "winowhy"
    # Free-text description of the task
    task_name: str = ""
    # Only a single GPU is used
    gpu_id: int = 0
    # Seed for the random number generators
    seed: int = 42
    # 'cpu' or 'cuda'
    device: str = 'cpu'
    # e.g. "roberta-base", "roberta-large"
    model_name: str = ""
    # When non-empty, the model is loaded from this path instead of by model_name
    model_path: str = ""
    # Learning rate for training the classifier layer
    learning_rate: float = 1e-3
    # Total number of epochs
    num_training_epochs: int = 15
    # Maximum sequence length
    max_seq_len: int = 128

    batch_size: int = 1

    def set_seed(self, new_seed = None):
        """Seed random, numpy and torch (and all CUDA devices when available).

        Uses `new_seed` when given, otherwise `self.seed`; `self.seed` itself
        is not modified.
        """
        seed = new_seed if new_seed is not None else self.seed
        for seeder in (random.seed, np.random.seed, torch.manual_seed):
            seeder(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

    def set_gpu_if_possible(self, gpu_id = None):
        """Set `self.device` to CUDA when available, otherwise to 'cpu'.

        With CUDA available the device is 'cuda:<gpu_id>' when `gpu_id` is
        given and plain 'cuda' when it is None.
        """
        if not torch.cuda.is_available():
            self.device = 'cpu'
            return
        self.device = 'cuda' if gpu_id is None else 'cuda:{}'.format(gpu_id)
            
class RobertaClassificationHead(nn.Module):
    """Classification head applied to the first position of a sequence of features."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(hidden, config.num_labels)

    def forward(self, features, **kwargs):
        """Return logits computed from position 0 of `features` (the <s> token, equiv. to [CLS]).

        Pipeline: dropout -> dense -> tanh -> dropout -> out_proj.
        Extra keyword arguments are accepted and ignored.
        """
        first_token = features[:, 0, :]
        activated = torch.tanh(self.dense(self.dropout(first_token)))
        return self.out_proj(self.dropout(activated))
    
class RobertaOnlyClassificationHead(BertPreTrainedModel):
    """Pretrained-model wrapper whose forward pass runs only the classification head.

    A RobertaModel is still constructed as `self.roberta`, but forward() does
    not call it: callers pass already-computed sequence features.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    def forward(self, sequence_output):
        """Return the classifier logits for `sequence_output`."""
        return self.classifier(sequence_output)


## Extract sentences with TopK-similarity from ATOMIC (full set)

In [None]:
def model_dataloader(atomic_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 48):
    """
    Tokenize KB sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        atomic_set: iterable of sentence strings.
        tokenizer: object with an `encode(text, add_special_tokens=...)` method
            returning a list of token ids.
        batch_size: number of rows per batch.
        max_len: every row is truncated / right-padded to this many ids
            (default 48, the length previously hard-coded).

    Returns:
        A DataLoader iterating sequentially over a TensorDataset with one
        tensor of shape (len(atomic_set), max_len).
    """
    # Take the padding id from the tokenizer when it exposes one; otherwise
    # fall back to 1, the value that used to be hard-coded here.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1

    rows = []
    for sen in atomic_set:
        # NOTE(review): the text already carries literal '<s>'/'</s>' markers
        # and add_special_tokens=True is passed as well, which presumably
        # doubles the boundary tokens. Kept unchanged so embeddings stay
        # comparable with earlier runs; confirm this is intended.
        ids = tokenizer.encode('<s> ' + sen + ' </s>', add_special_tokens=True)[:max_len]
        rows.append(ids + [pad_id] * (max_len - len(ids)))

    if rows:
        inputs = torch.tensor(rows)
    else:
        # Keep a well-formed (0, max_len) integer tensor for an empty input.
        inputs = torch.empty((0, max_len), dtype=torch.long)

    data = TensorDataset(inputs)
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)


def roberta_for_KB(config: ExpConfig):
    """
    Rank ATOMIC sentences by cosine similarity to every WinoWhy example.

    Both sides are embedded by averaging the first output of a RobertaModel
    over dim 1 (assumes that output is (batch, seq, hidden) — TODO confirm).

    Args:
        config: supplies dataset_path (WinoWhy JSONL), model_path / model_name,
            device and batch_size.

    Returns:
        numpy array of shape (num WinoWhy examples, 50) holding, per example,
        the row positions (within the embedded ATOMIC rows) of the 50 most
        similar sentences, most similar first.

    Raises:
        ValueError: if the ATOMIC file or the WinoWhy file yields no rows.
    """
    winowhy_sentences = load_winowhy_from_path(config.dataset_path)
    if not winowhy_sentences:
        raise ValueError('No WinoWhy sentences loaded from {}'.format(config.dataset_path))

    # A non-empty model_path takes precedence over model_name.
    pretrained_source = config.model_path if config.model_path else config.model_name
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_source)
    model = RobertaModel.from_pretrained(pretrained_source)

    model.eval()
    model.to(config.device)

    # Column 1 of every CSV row is the sentence. The first row is not skipped,
    # matching the lookup list that the experiment cell builds from this file.
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        atomic_set = [line[1] for line in csv.reader(f)]
    if not atomic_set:
        raise ValueError('No ATOMIC sentences found in {}'.format(ROOT_DIR + ATOMIC_PATH))

    atomic_dataloader = model_dataloader(atomic_set, tokenizer, batch_size=config.batch_size)

    # Collect per-batch embeddings and concatenate once instead of growing a
    # tensor with torch.cat on every step.
    atomic_chunks = []
    n_rows = 0
    with torch.no_grad():
        for batch in atomic_dataloader:
            atomic_input_ids = batch[0].to(config.device)
            pooled = model(atomic_input_ids)[0].mean(dim=1).cpu()
            atomic_chunks.append(pooled)
            n_rows += pooled.size(0)
            # NOTE(review): embedding stops the first time the running row
            # count is a multiple of 10240, so only a prefix of the KB may be
            # used. Behaviour kept as-is; confirm whether this cap is intended.
            if n_rows % 10240 == 0:
                print(torch.Size((n_rows, pooled.size(1))))
                break
    atomic = torch.cat(atomic_chunks, 0)
    print(atomic.size())

    ww_chunks = []
    with torch.no_grad():
        for ws in winowhy_sentences:
            wsc_reason = "<s>" + ws.wsc_asked_sentence + "</s>" + ws.reason + "</s>"
            ww_input = torch.tensor(tokenizer.encode(wsc_reason)).unsqueeze(0).to(config.device)
            ww_chunks.append(model(ww_input)[0].mean(dim=1).cpu())
    ww = torch.cat(ww_chunks, 0)
    print(ww.size())

    # Cosine similarity: L2-normalise each row, then take all pairwise dot products.
    first_norm = ww / ww.norm(dim=1)[:, None]
    second_norm = atomic / atomic.norm(dim=1)[:, None]
    res = torch.mm(first_norm, second_norm.transpose(0, 1))

    _, res_index = res.topk(50, dim=1, largest=True, sorted=True)
    print(res_index.size())
    return res_index.numpy()

In [None]:
import json
# For every checkpoint: retrieve the top-50 ATOMIC rows for each WinoWhy
# example, save the row indices, then save the corresponding sentences.
for model_name in MODEL_LIST:
    KB_DIR = "kb_extract/winowhy_atomic/"
    # Create the output folder before the expensive embedding step so a
    # missing directory cannot discard a finished run.
    os.makedirs(ROOT_DIR + KB_DIR, exist_ok=True)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    robertaconfig = ExpConfig()
    robertaconfig.set_seed()
    robertaconfig.batch_size = 256
    # NOTE(review): this cell selects GPU 1 while the other experiment cells
    # use GPU 0 — confirm a second GPU is intended.
    robertaconfig.set_gpu_if_possible(1)
    robertaconfig.dataset = 'winowhy'
    robertaconfig.dataset_path = ROOT_DIR + WINOWHY_PATH
    robertaconfig.task_name = 'Test on WinoWhy'
    robertaconfig.model_name = model_name
    robertaconfig.model_path = CACHE_DIR + model_name

    print('\n================================')
    print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

    res = roberta_for_KB(robertaconfig)
    final = {'Overall': res.tolist()}
    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-atomic-index.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

    # Sentences in file order; the saved indices are positions in this list.
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        Overall = [line[1] for line in csv.reader(f)]
    print(len(Overall))
    print('------------')

    # The index data is already in memory, so it is not re-read from disk.
    final_dict = final
    final = {}
    for i, top_indices in enumerate(final_dict['Overall']):
        final[str(i)] = {'Overall': [Overall[idx] for idx in top_indices[:50]]}

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-atomic.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

## Extract sentences with TopK-similarity from ATOMIC (five categories)

In [None]:
def model_dataloader(atomic_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 48):
    """
    Tokenize KB sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        atomic_set: iterable of sentence strings.
        tokenizer: object with an `encode(text, add_special_tokens=...)` method
            returning a list of token ids.
        batch_size: number of rows per batch.
        max_len: every row is truncated / right-padded to this many ids
            (default 48, the length previously hard-coded).

    Returns:
        A DataLoader iterating sequentially over a TensorDataset with one
        tensor of shape (len(atomic_set), max_len).
    """
    # Take the padding id from the tokenizer when it exposes one; otherwise
    # fall back to 1, the value that used to be hard-coded here.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1

    rows = []
    for sen in atomic_set:
        # NOTE(review): the text already carries literal '<s>'/'</s>' markers
        # and add_special_tokens=True is passed as well, which presumably
        # doubles the boundary tokens. Kept unchanged so embeddings stay
        # comparable with earlier runs; confirm this is intended.
        ids = tokenizer.encode('<s> ' + sen + ' </s>', add_special_tokens=True)[:max_len]
        rows.append(ids + [pad_id] * (max_len - len(ids)))

    if rows:
        inputs = torch.tensor(rows)
    else:
        # Keep a well-formed (0, max_len) integer tensor for an empty input.
        inputs = torch.empty((0, max_len), dtype=torch.long)

    data = TensorDataset(inputs)
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)


def roberta_for_KB(config: ExpConfig, first_category, second_category):
    """
    Rank the ATOMIC sentences of one category by cosine similarity to every WinoWhy example.

    Both sides are embedded by averaging the first output of a RobertaModel
    over dim 1 (assumes that output is (batch, seq, hidden) — TODO confirm).

    Args:
        config: supplies dataset_path (WinoWhy JSONL), model_path / model_name,
            device and batch_size.
        first_category: when non-empty, keep only CSV rows whose column 2
            equals this value.
        second_category: when non-empty, keep only CSV rows whose column 3
            equals this value.

    Returns:
        numpy array of shape (num WinoWhy examples, 50) holding, per example,
        the positions (within the embedded, filtered ATOMIC rows) of the 50
        most similar sentences, most similar first.

    Raises:
        ValueError: if no ATOMIC row matches the filter or the WinoWhy file
            yields no rows.
    """
    winowhy_sentences = load_winowhy_from_path(config.dataset_path)
    if not winowhy_sentences:
        raise ValueError('No WinoWhy sentences loaded from {}'.format(config.dataset_path))

    # A non-empty model_path takes precedence over model_name.
    pretrained_source = config.model_path if config.model_path else config.model_name
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_source)
    model = RobertaModel.from_pretrained(pretrained_source)

    model.eval()
    model.to(config.device)

    # Column 1 is the sentence; columns 2 and 3 hold the category labels.
    atomic_set = []
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        for line in csv.reader(f):
            if first_category != '' and line[2] != first_category:
                continue
            if second_category != '' and line[3] != second_category:
                continue
            atomic_set.append(line[1])
    if not atomic_set:
        raise ValueError('No ATOMIC sentences match categories ({!r}, {!r})'.format(first_category, second_category))

    atomic_dataloader = model_dataloader(atomic_set, tokenizer, batch_size=config.batch_size)

    # Collect per-batch embeddings and concatenate once instead of growing a
    # tensor with torch.cat on every step.
    atomic_chunks = []
    n_rows = 0
    with torch.no_grad():
        for batch in atomic_dataloader:
            atomic_input_ids = batch[0].to(config.device)
            pooled = model(atomic_input_ids)[0].mean(dim=1).cpu()
            atomic_chunks.append(pooled)
            n_rows += pooled.size(0)
            # NOTE(review): embedding stops the first time the running row
            # count is a multiple of 3200, so only a prefix of the category
            # may be used. Behaviour kept as-is; confirm this cap is intended.
            if n_rows % 3200 == 0:
                print(torch.Size((n_rows, pooled.size(1))))
                break
    atomic = torch.cat(atomic_chunks, 0)
    print(atomic.size())

    ww_chunks = []
    with torch.no_grad():
        for ws in winowhy_sentences:
            wsc_reason = "<s>" + ws.wsc_asked_sentence + "</s>" + ws.reason + "</s>"
            ww_input = torch.tensor(tokenizer.encode(wsc_reason)).unsqueeze(0).to(config.device)
            ww_chunks.append(model(ww_input)[0].mean(dim=1).cpu())
    ww = torch.cat(ww_chunks, 0)
    print(ww.size())

    # Cosine similarity: L2-normalise each row, then take all pairwise dot products.
    first_norm = ww / ww.norm(dim=1)[:, None]
    second_norm = atomic / atomic.norm(dim=1)[:, None]
    res = torch.mm(first_norm, second_norm.transpose(0, 1))

    _, res_index = res.topk(50, dim=1, largest=True, sorted=True)
    print(res_index.size())
    return res_index.numpy()

In [None]:
import json
# For every checkpoint and each of the five ATOMIC categories: retrieve the
# top-50 rows for each WinoWhy example, save the indices, then save the
# corresponding sentences.
for model_name in MODEL_LIST:
    KB_DIR = "kb_extract/winowhy_atomic/"
    # Create the output folder before the expensive embedding step so a
    # missing directory cannot discard a finished run.
    os.makedirs(ROOT_DIR + KB_DIR, exist_ok=True)

    final = {'Physical-Entity': [], 'Event-Centered': [], 'MentalState': [], 'Persona': [], 'Behavior': []}
    # Each pair is (first_category, second_category); exactly one is non-empty.
    for cat in [('Physical-Entity', ''), ('Event-Centered', ''), ('', 'MentalState'), ('', 'Persona'), ('', 'Behavior')]:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        robertaconfig = ExpConfig()
        robertaconfig.set_seed()
        robertaconfig.batch_size = 1024
        robertaconfig.set_gpu_if_possible(0)
        robertaconfig.dataset = 'winowhy'
        robertaconfig.dataset_path = ROOT_DIR + WINOWHY_PATH
        robertaconfig.task_name = 'Test on WinoWhy, predicting KB'
        robertaconfig.model_name = model_name
        robertaconfig.model_path = CACHE_DIR + model_name

        print('\n================================')
        print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

        res = roberta_for_KB(robertaconfig, cat[0], cat[1])
        final[cat[0] if cat[0] != '' else cat[1]] = res.tolist()

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-atomic-category-index.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

    # Per-category sentence lists in file order; the saved indices are
    # positions within these lists.
    PE, EC, MS, Per, Be = [], [], [], [], []
    with open(ROOT_DIR + ATOMIC_PATH, 'r', encoding='UTF-8') as f:
        reader = csv.reader(f)
        # Drop the first row (presumably a header — confirm against the CSV).
        next(reader, None)
        for line in reader:
            if line[2] == 'Physical-Entity':
                PE.append(line[1])
            if line[2] == 'Event-Centered':
                EC.append(line[1])
            if line[3] == 'MentalState':
                MS.append(line[1])
            if line[3] == 'Persona':
                Per.append(line[1])
            if line[3] == 'Behavior':
                Be.append(line[1])

    kb_by_category = {
        'Physical-Entity': PE,
        'Event-Centered': EC,
        'MentalState': MS,
        'Persona': Per,
        'Behavior': Be,
    }

    # The index data is already in memory, so it is not re-read from disk.
    final_dict = final
    final = {}
    for i in range(len(final_dict['Physical-Entity'])):
        final[str(i)] = {
            category: [sentences[idx] for idx in final_dict[category][i][:50]]
            for category, sentences in kb_by_category.items()
        }

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-atomic-category.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

## Extract sentences with TopK-similarity from ConceptNet (full set)

In [None]:
def model_dataloader(conceptnet_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 48):
    """
    Tokenize KB sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        conceptnet_set: iterable of sentence strings.
        tokenizer: object with an `encode(text, add_special_tokens=...)` method
            returning a list of token ids.
        batch_size: number of rows per batch.
        max_len: every row is truncated / right-padded to this many ids
            (default 48, the length previously hard-coded).

    Returns:
        A DataLoader iterating sequentially over a TensorDataset with one
        tensor of shape (len(conceptnet_set), max_len).
    """
    # Take the padding id from the tokenizer when it exposes one; otherwise
    # fall back to 1, the value that used to be hard-coded here.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1

    rows = []
    for sen in conceptnet_set:
        # NOTE(review): the text already carries literal '<s>'/'</s>' markers
        # and add_special_tokens=True is passed as well, which presumably
        # doubles the boundary tokens. Kept unchanged so embeddings stay
        # comparable with earlier runs; confirm this is intended.
        ids = tokenizer.encode('<s> ' + sen + ' </s>', add_special_tokens=True)[:max_len]
        rows.append(ids + [pad_id] * (max_len - len(ids)))

    if rows:
        inputs = torch.tensor(rows)
    else:
        # Keep a well-formed (0, max_len) integer tensor for an empty input.
        inputs = torch.empty((0, max_len), dtype=torch.long)

    data = TensorDataset(inputs)
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)

def roberta_for_KB(config: ExpConfig, first_category):
    """
    Rank ConceptNet sentences by cosine similarity to every WinoWhy example.

    Both sides are embedded by averaging the first output of a RobertaModel
    over dim 1 (assumes that output is (batch, seq, hidden) — TODO confirm).

    Args:
        config: supplies dataset_path (WinoWhy JSONL), model_path / model_name,
            device and batch_size.
        first_category: accepted for signature compatibility; this variant
            does not filter, every CSV row is used.

    Returns:
        numpy array of shape (num WinoWhy examples, 50) holding, per example,
        the row positions (within the embedded ConceptNet rows) of the 50 most
        similar sentences, most similar first.

    Raises:
        ValueError: if the ConceptNet file or the WinoWhy file yields no rows.
    """
    winowhy_sentences = load_winowhy_from_path(config.dataset_path)
    if not winowhy_sentences:
        raise ValueError('No WinoWhy sentences loaded from {}'.format(config.dataset_path))

    # A non-empty model_path takes precedence over model_name.
    pretrained_source = config.model_path if config.model_path else config.model_name
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_source)
    model = RobertaModel.from_pretrained(pretrained_source)

    model.eval()
    model.to(config.device)

    # Column 1 of every CSV row is the sentence. The first row is not skipped,
    # matching the lookup list that the experiment cell builds from this file.
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        conceptnet_set = [line[1] for line in csv.reader(f)]
    if not conceptnet_set:
        raise ValueError('No ConceptNet sentences found in {}'.format(ROOT_DIR + CN_PATH))

    conceptnet_dataloader = model_dataloader(conceptnet_set, tokenizer, batch_size=config.batch_size)

    # Collect per-batch embeddings and concatenate once instead of growing a
    # tensor with torch.cat on every step.
    conceptnet_chunks = []
    n_rows = 0
    with torch.no_grad():
        for batch in conceptnet_dataloader:
            conceptnet_input_ids = batch[0].to(config.device)
            pooled = model(conceptnet_input_ids)[0].mean(dim=1).cpu()
            conceptnet_chunks.append(pooled)
            n_rows += pooled.size(0)
            # NOTE(review): embedding stops the first time the running row
            # count is a multiple of 10240, so only a prefix of the KB may be
            # used. Behaviour kept as-is; confirm whether this cap is intended.
            if n_rows % 10240 == 0:
                print(torch.Size((n_rows, pooled.size(1))))
                break
    conceptnet = torch.cat(conceptnet_chunks, 0)
    print(conceptnet.size())

    ww_chunks = []
    with torch.no_grad():
        for ws in winowhy_sentences:
            wsc_reason = "<s>" + ws.wsc_asked_sentence + "</s>" + ws.reason + "</s>"
            ww_input = torch.tensor(tokenizer.encode(wsc_reason, add_special_tokens=True)).unsqueeze(0).to(config.device)
            ww_chunks.append(model(ww_input)[0].mean(dim=1).cpu())
    ww = torch.cat(ww_chunks, 0)
    print(ww.size())

    # Cosine similarity: L2-normalise each row, then take all pairwise dot products.
    first_norm = ww / ww.norm(dim=1)[:, None]
    second_norm = conceptnet / conceptnet.norm(dim=1)[:, None]
    res = torch.mm(first_norm, second_norm.transpose(0, 1))

    _, res_index = res.topk(50, dim=1, largest=True, sorted=True)
    print(res_index.size())
    return res_index.numpy()

In [None]:
import json
# For every checkpoint: retrieve the top-50 ConceptNet rows for each WinoWhy
# example, save the row indices, then save the corresponding sentences.
for model_name in MODEL_LIST:
    KB_DIR = "kb_extract/winowhy_cn/"
    # Create the output folder before the expensive embedding step so a
    # missing directory cannot discard a finished run.
    os.makedirs(ROOT_DIR + KB_DIR, exist_ok=True)

    final = {'Overall': []}
    for cat in ['Overall']:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        robertaconfig = ExpConfig()
        robertaconfig.set_seed()
        robertaconfig.batch_size = 1024
        robertaconfig.set_gpu_if_possible(0)
        robertaconfig.dataset = 'winowhy'
        robertaconfig.dataset_path = ROOT_DIR + WINOWHY_PATH
        robertaconfig.task_name = 'Test on WinoWhy'
        robertaconfig.model_name = model_name
        robertaconfig.model_path = CACHE_DIR + model_name

        print('\n================================')
        print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

        res = roberta_for_KB(robertaconfig, cat)
        final[cat] = res.tolist()

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet-index.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

    # Sentences in file order; the saved indices are positions in this list.
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        Overall = [line[1] for line in csv.reader(f)]
    print(len(Overall))
    print('------------')

    # The index data is already in memory, so it is not re-read from disk.
    final_dict = final
    final = {}
    for i, top_indices in enumerate(final_dict['Overall']):
        final[str(i)] = {'Overall': [Overall[idx] for idx in top_indices[:50]]}

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

## Extract sentences with TopK-similarity from ConceptNet (four categories)

In [None]:
def model_dataloader(conceptnet_set, tokenizer: "RobertaTokenizer", batch_size: int, max_len: int = 48):
    """
    Tokenize KB sentences into fixed-length id rows and wrap them in a DataLoader.

    Args:
        conceptnet_set: iterable of sentence strings.
        tokenizer: object with an `encode(text, add_special_tokens=...)` method
            returning a list of token ids.
        batch_size: number of rows per batch.
        max_len: every row is truncated / right-padded to this many ids
            (default 48, the length previously hard-coded).

    Returns:
        A DataLoader iterating sequentially over a TensorDataset with one
        tensor of shape (len(conceptnet_set), max_len).
    """
    # Take the padding id from the tokenizer when it exposes one; otherwise
    # fall back to 1, the value that used to be hard-coded here.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 1

    rows = []
    for sen in conceptnet_set:
        # NOTE(review): the text already carries literal '<s>'/'</s>' markers
        # and add_special_tokens=True is passed as well, which presumably
        # doubles the boundary tokens. Kept unchanged so embeddings stay
        # comparable with earlier runs; confirm this is intended.
        ids = tokenizer.encode('<s> ' + sen + ' </s>', add_special_tokens=True)[:max_len]
        rows.append(ids + [pad_id] * (max_len - len(ids)))

    if rows:
        inputs = torch.tensor(rows)
    else:
        # Keep a well-formed (0, max_len) integer tensor for an empty input.
        inputs = torch.empty((0, max_len), dtype=torch.long)

    data = TensorDataset(inputs)
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)

def roberta_for_KB(config: ExpConfig, first_category):
    """
    Rank the ConceptNet sentences of one category by cosine similarity to every WinoWhy example.

    Both sides are embedded by averaging the first output of a RobertaModel
    over dim 1 (assumes that output is (batch, seq, hidden) — TODO confirm).

    Args:
        config: supplies dataset_path (WinoWhy JSONL), model_path / model_name,
            device and batch_size.
        first_category: only CSV rows whose column 2 equals this value are used.

    Returns:
        numpy array of shape (num WinoWhy examples, 50) holding, per example,
        the positions (within the embedded, filtered ConceptNet rows) of the
        50 most similar sentences, most similar first.

    Raises:
        ValueError: if no ConceptNet row matches the category or the WinoWhy
            file yields no rows.
    """
    winowhy_sentences = load_winowhy_from_path(config.dataset_path)
    if not winowhy_sentences:
        raise ValueError('No WinoWhy sentences loaded from {}'.format(config.dataset_path))

    # A non-empty model_path takes precedence over model_name.
    pretrained_source = config.model_path if config.model_path else config.model_name
    tokenizer = RobertaTokenizer.from_pretrained(pretrained_source)
    model = RobertaModel.from_pretrained(pretrained_source)

    model.eval()
    model.to(config.device)

    # Column 1 is the sentence, column 2 the category label.
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        conceptnet_set = [line[1] for line in csv.reader(f) if line[2] == first_category]
    if not conceptnet_set:
        raise ValueError('No ConceptNet sentences match category {!r}'.format(first_category))

    conceptnet_dataloader = model_dataloader(conceptnet_set, tokenizer, batch_size=config.batch_size)

    # Collect per-batch embeddings and concatenate once instead of growing a
    # tensor with torch.cat on every step.
    conceptnet_chunks = []
    n_rows = 0
    with torch.no_grad():
        for batch in conceptnet_dataloader:
            conceptnet_input_ids = batch[0].to(config.device)
            pooled = model(conceptnet_input_ids)[0].mean(dim=1).cpu()
            conceptnet_chunks.append(pooled)
            n_rows += pooled.size(0)
            # NOTE(review): embedding stops the first time the running row
            # count is a multiple of 10240, so only a prefix of the category
            # may be used. Behaviour kept as-is; confirm this cap is intended.
            if n_rows % 10240 == 0:
                print(torch.Size((n_rows, pooled.size(1))))
                break
    conceptnet = torch.cat(conceptnet_chunks, 0)
    print(conceptnet.size())

    ww_chunks = []
    with torch.no_grad():
        for ws in winowhy_sentences:
            wsc_reason = "<s>" + ws.wsc_asked_sentence + "</s>" + ws.reason + "</s>"
            ww_input = torch.tensor(tokenizer.encode(wsc_reason, add_special_tokens=True)).unsqueeze(0).to(config.device)
            ww_chunks.append(model(ww_input)[0].mean(dim=1).cpu())
    ww = torch.cat(ww_chunks, 0)
    print(ww.size())

    # Cosine similarity: L2-normalise each row, then take all pairwise dot products.
    first_norm = ww / ww.norm(dim=1)[:, None]
    second_norm = conceptnet / conceptnet.norm(dim=1)[:, None]
    res = torch.mm(first_norm, second_norm.transpose(0, 1))

    _, res_index = res.topk(50, dim=1, largest=True, sorted=True)
    print(res_index.size())
    return res_index.numpy()

In [None]:
import json
# For every checkpoint and each of the four ConceptNet categories: retrieve
# the top-50 rows for each WinoWhy example, save the indices, then save the
# corresponding sentences.
for model_name in MODEL_LIST:
    KB_DIR = "kb_extract/winowhy_cn/"
    # Create the output folder before the expensive embedding step so a
    # missing directory cannot discard a finished run.
    os.makedirs(ROOT_DIR + KB_DIR, exist_ok=True)

    final = {'Physical-Entity': [], 'Event-Centered': [], 'Social-Interaction': [], 'Taxonomic-Lexical': []}
    for cat in ['Physical-Entity', 'Event-Centered', 'Social-Interaction', 'Taxonomic-Lexical']:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        robertaconfig = ExpConfig()
        robertaconfig.set_seed()
        robertaconfig.batch_size = 1024
        robertaconfig.set_gpu_if_possible(0)
        robertaconfig.dataset = 'winowhy'
        robertaconfig.dataset_path = ROOT_DIR + WINOWHY_PATH
        robertaconfig.task_name = 'Test on WinoWhy'
        robertaconfig.model_name = model_name
        robertaconfig.model_path = CACHE_DIR + model_name

        print('\n================================')
        print('Experiment: {} using {}'.format(robertaconfig.task_name, robertaconfig.model_name))

        res = roberta_for_KB(robertaconfig, cat)
        final[cat] = res.tolist()

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet-category-index.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)

    # Per-category sentence lists in file order; the saved indices are
    # positions within these lists.
    PE, EC, SI, TL = [], [], [], []
    with open(ROOT_DIR + CN_PATH, 'r', encoding='UTF-8') as f:
        for line in csv.reader(f):
            if line[2] == 'Physical-Entity':
                PE.append(line[1])
            elif line[2] == 'Event-Centered':
                EC.append(line[1])
            elif line[2] == 'Taxonomic-Lexical':
                TL.append(line[1])
            elif line[2] == 'Social-Interaction':
                SI.append(line[1])

    kb_by_category = {
        'Physical-Entity': PE,
        'Event-Centered': EC,
        'Social-Interaction': SI,
        'Taxonomic-Lexical': TL,
    }

    # The index data is already in memory, so it is not re-read from disk.
    final_dict = final
    final = {}
    for i in range(len(final_dict['Physical-Entity'])):
        final[str(i)] = {
            category: [sentences[idx] for idx in final_dict[category][i][:50]]
            for category, sentences in kb_by_category.items()
        }

    final_dumps = json.dumps(final)
    with open(ROOT_DIR + KB_DIR + model_name + "-conceptnet-category.json", 'w', encoding='utf-8') as f:
        f.write(final_dumps)