In [1]:
"""Imports"""
import pandas as pd
import numpy as np
import random
import json
import csv
# importing the itertools library
import itertools
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
"""Load models &  resources"""
resources_dir = '../inputs/resources_dir' 
mapping_csv = resources_dir+'/mapping.csv'
mapping_specific_csv = resources_dir+'/mapping_specific.csv'

model_dir = '../outputs/models_dbpedia'
category_model_dir = model_dir +'/BERT_model_5classes_epoch3'
resource_top_model_dir = model_dir + '/BERT_model_top_epoch3/'
resource_bottom_model_dir = model_dir + '/BERT_model_bottom_epoch4/'

In [3]:
def get_label_and_id(mapping_csv):
    label_to_id = {}
    id_to_label = {}
    with open(mapping_csv) as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            label_to_id[row[0]] = row[1]
            id_to_label[row[1]] = row[0]
            
    return label_to_id,id_to_label

top_label_to_id,top_id_to_label = get_label_and_id(mapping_csv)
bottom_label_to_id,bottom_id_to_label = get_label_and_id(mapping_specific_csv)

# print(len(top_label_to_id))
# top_label_to_id

In [4]:
dbpedia_types = pd.read_csv('../inputs/dbpedia_types.tsv', sep='\t')
hierarchy_json = resources_dir+'/dbpedia_hierarchy.json'

hierarchy = {}

def get_hierarchy(dbpedia_types, hierarchy_json):
    hierarchy = {}
    with open(hierarchy_json) as json_file:
        hierarchy = json.load(json_file)
        
    for i,row in dbpedia_types.iterrows():
        parent = row['Parent']
        child = row['Type']
        if parent not in hierarchy:
            hierarchy[parent] ={}
            hierarchy[parent]['children'] =[]
        hierarchy[parent]['children'].append(child)

    hierarchy['dbo:Location'] = hierarchy['dbo:Place']
    hierarchy['dbo:Location']['children'].append('dbo:Place')
    hierarchy['dbo:MedicalSpecialty'] = {'children':['dbo:MedicalSpecialty']}
    hierarchy['dbo:PublicService'] = {'children':['dbo:PublicService']}
    
    hierarchy['dbo:Location']['level'] = 1
    hierarchy['dbo:MedicalSpecialty']['level']  = 1
    hierarchy['dbo:PublicService']['level'] = 1
    
    return hierarchy

hierarchy = get_hierarchy(dbpedia_types, hierarchy_json)

In [5]:
# test_df= pd.read_json('../inputs/smarttask_dbpedia_test.json')
# test_df= pd.read_json('../inputs/task1_dbpedia_test.json')
test_df= pd.read_json('../inputs/dataset/smart2021-AT_Answer_Type_Prediction/dbpedia/2021_dbpedia_0.1test.json')
print('test size: ',len(test_df))
test_df = test_df[test_df['question'].notna()]
print('After: ',len(test_df))
# test_df.category.value_counts()

test size:  3667
After:  3667


In [6]:
def classify_category(q):
    input_ids = torch.tensor(category_tokenizer.encode(q, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

    input_ids= input_ids.to(device) 
    labels= labels.to(device)
    with torch.no_grad():
        outputs = category_model(input_ids, labels=labels)
    logits = outputs[1]
    result = np.argmax(logits.cpu().numpy(),axis=1)[0]
    if result == 0:
        categoryLabel = 'resource'
    elif result == 1:
        categoryLabel = 'boolean'
    elif result == 2:
        categoryLabel = 'string'
    elif result == 3:
        categoryLabel = 'number'
    elif result == 4:
        categoryLabel = 'date'
    else:
        print('wrong prediction!!!')
    return categoryLabel

In [7]:
category_tokenizer = BertTokenizer.from_pretrained(category_model_dir)
category_model = BertForSequenceClassification.from_pretrained(category_model_dir,num_labels=5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# send model to device
category_model.to(device) 
print ('Success', device)
test_df['pred_category'] = test_df.apply(lambda x: classify_category(x['question']), axis=1)

Success cuda


In [8]:
test_df['pred_category2'] = test_df['pred_category']

for idx in test_df.index:
    if (test_df['pred_category2'][idx] in ['string','number','date']):
        test_df['pred_category2'][idx] = 'literal'
classes = test_df['pred_category2'].value_counts()
print(classes)
test_df.sample(5)

resource    3013
literal      409
boolean      245
Name: pred_category2, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred_category2'][idx] = 'literal'


Unnamed: 0,id,question,category,type,pred_category,pred_category2
2842,40264,what artist is on perry como love letters?,resource,"[dbo:Person, dbo:Artist, dbo:Band, dbo:Musical...",resource,resource
880,10833,What was the first Queen album?,resource,"[dbo:Album, dbo:MusicalWork, dbo:Work]",resource,resource
2709,38569,where is annet artani originally from,resource,"[dbo:Organisation, dbo:Country, dbo:Person, db...",resource,resource
2569,36980,Where did the second battle of auburn take place?,resource,"[dbo:Place, dbo:Location, dbo:AdministrativeRe...",resource,resource
1667,25746,what time in china hong kong,resource,[dbo:SportsEvent],resource,resource


In [9]:
# true_category_df = test_df[test_df['category'] == test_df['pred_category2']]
# error_category_df = test_df[test_df['category'] != test_df['pred_category2']]
# error_category_df.sample(5)
# print(len(test_df))
# print(len(error_category_df))

In [10]:
boolean_df = test_df[test_df['pred_category2'] == 'boolean']
boolean_df['answer_types'] = boolean_df.apply(lambda x: [x['pred_category']], axis=1)
print(len(boolean_df))
# boolean_df.head()

245


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boolean_df['answer_types'] = boolean_df.apply(lambda x: [x['pred_category']], axis=1)


In [11]:
literal_df = test_df[test_df['pred_category2'] == 'literal']
literal_df['answer_types'] = literal_df.apply(lambda x: [x['pred_category']], axis=1)
print(len(literal_df))
# literal_df.head()

409


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  literal_df['answer_types'] = literal_df.apply(lambda x: [x['pred_category']], axis=1)


In [12]:
resource_df = test_df[test_df['pred_category2'] == 'resource']
print('resource size: ',len(resource_df))

resource_top_tokenizer = BertTokenizer.from_pretrained(resource_top_model_dir)
resource_top_model = BertForSequenceClassification.from_pretrained(resource_top_model_dir,num_labels=len(top_label_to_id))

resource_tokenizer_sp = BertTokenizer.from_pretrained(resource_bottom_model_dir)
resource_model_sp = BertForSequenceClassification.from_pretrained(resource_bottom_model_dir,num_labels=len(bottom_label_to_id))

resource_top_model.to(device) 
resource_model_sp.to(device) 
print ('Success', device)

resource size:  3013
Success cuda


In [13]:
for i, row in dbpedia_types.iterrows():
    c= row['Type']
    if c in hierarchy:
        hierarchy[c]['parent'] =row['Parent']

In [31]:
import torch
import torch.nn.functional as F

def classify_resource_top_level(row):
    sent = str('[CLS]') + row['question'] 
    input_ids = torch.tensor(resource_top_tokenizer.encode(sent, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    
    input_ids= input_ids.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        outputs = resource_top_model(input_ids, labels=labels)
    
    logits = outputs[1]
    l_array = logits.cpu().numpy()[0]
    #normalize logits so that max is 1
    norm = [float(i)/max(l_array) for i in l_array]
    result_before = np.argsort(norm)[::-1]
    initial_top_index = np.argmax(norm)
    top_types ={}
    for i in result_before[:5]:
        if norm[i] > 0.5:
            top_types[(top_id_to_label[str(i)])]= norm[i]
        
    #topk_types = [k for k,v in top_class.items() if v > 0.5]
    print('top_types: ',top_types)
        
    return top_types


def classify_resource(row, top_types):
    #q = str('[CLS]') + row['question'] + str('[SEP]') + top_level
    sent = str('[CLS]') + row['question'] 
    input_ids = torch.tensor(resource_tokenizer_sp.encode(sent, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    
    input_ids= input_ids.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        outputs = resource_model_sp(input_ids, labels=labels)
    
    logits = outputs[1]
    l_array = logits.cpu().numpy()[0]
    #normalize logits so that max is 1
    norm = [float(i)/max(l_array) for i in l_array]
    # Arrange the elements in norm in descending, return corresponding indexes
    result_before = np.argsort(norm)[::-1]
    
    specific_types ={}
    for i in result_before[:11]:
        specific_types[(bottom_id_to_label[str(i)])]= norm[i]
    print('specific types: ',specific_types)
   
    #print('initial_top: ',top_level)
    for top_level in top_types:
        initial_top = hierarchy[top_level]
        if top_level in bottom_label_to_id:
            initial_top_index = int(bottom_label_to_id[top_level])
            norm[initial_top_index] = norm[initial_top_index] + top_types[top_level]*int(initial_top['level'])/6
        #reward sub classes of top class
        initial_top_children = initial_top['children']
        for c in initial_top_children:
            if c in bottom_label_to_id:
                if c not in hierarchy:
                    continue
                norm[int(bottom_label_to_id[c])] = norm[int(bottom_label_to_id[c])] + top_types[top_level]*int(hierarchy[c]['level'])/6
                
    #classes in descending order
    result = np.argsort(norm)[::-1]

    result_mapped = {}
    cTypes = []
    for r in result[:9]:
        result_mapped[bottom_id_to_label[str(r)]]= norm[r] 

    combi = itertools.combinations(result_mapped.keys(),2)
    # printing all combinations in output using for loop
    #print (result_mapped)
    m_arr = []
    for m in combi: 
        c1 = m[0]
        c2 = m[1]
        if c1 not in hierarchy or c2 not in hierarchy: continue
        if (hierarchy[c1]['parent'] == hierarchy[c2]['parent']):
            if hierarchy[c1]['parent'] not in m_arr: # Avoid inserting duplicate elements
                #print (hierarchy[c1]['parent'])
                score= (norm[int(bottom_label_to_id[c1])]+norm[int(bottom_label_to_id[c2])])/2.0
                if hierarchy[c1]['parent'] not in result_mapped:
                    result_mapped[hierarchy[c1]['parent']] = score
                elif score > result_mapped[hierarchy[c1]['parent']]:
                      result_mapped[hierarchy[c1]['parent']] = score
    print('result_mapped: ',result_mapped)
    
    return result_mapped

#id_to_label_sp
def classify_sample(row):
    top_class = classify_resource_top_level(row)
    topk_types = [k for k,v in top_class.items() if v > 0.5]
    specific_classes = classify_resource(row,top_class)
    
    for k, v in sorted(specific_classes.items(), key=lambda item: item[1], reverse=True):
        if len(topk_types)< 10:
            if k not in topk_types:
                topk_types.append(k)
                
    print('topk_types: ',topk_types)
    
    return topk_types


In [15]:
# def jaccard_func(y_true, y_pred):
#     inter = set(y_true).intersection(set(y_pred))
#     union = set(y_true).union(set(y_pred))
    
#     return float(len(inter)/len(union))

# idx = 100
# y_true = resource_df.iloc[idx]['type']
# y_pred = resource_df.iloc[idx]['answer_types']
# y_true = np.array(y_true)
# y_pred = np.array(y_pred)
# jaccard_func(y_true, y_pred)


In [16]:
# i = 0 
# for idx in resource_df.index:
#     y_true = resource_df.loc[idx]['type']
#     y_pred = resource_df.loc[idx]['answer_types']
#     y_true = np.array(y_true)
#     y_pred = np.array(y_pred)
#     jac = jaccard_func(y_true, y_pred)
#     if jac > 0.3 and jac < 0.5:
#         print(resource_df.iloc[idx]['question'])
#         print(y_true)
#         print(y_pred)
#         print('\n')
#         i += 1
# print(i)
    

In [32]:
idx = 2000 #1000 #666#102
print(resource_df.iloc[idx]['question'])
print(resource_df.iloc[idx]['pred_category2'])
print(resource_df.iloc[idx]['type'])
classify_sample(resource_df.iloc[idx])

what is an album by usher?
resource
['dbo:MusicalWork', 'dbo:Album', 'dbo:Work']
top_types:  {'dbo:Work': 1.0}
specific types:  {'dbo:MusicalWork': 1.0, 'dbo:Album': 0.5235952710937145, 'dbo:Single': 0.3637721816227973, 'dbo:Country': 0.23041263864720296, 'dbo:Company': 0.2295354938126019, 'dbo:Book': 0.2139558411928312, 'dbo:Person': 0.21301649111620163, 'dbo:Software': 0.19302640375334798, 'dbo:Media': 0.19268253285380177, 'dbo:Band': 0.1835506639585454, 'dbo:Film': 0.17312388882837806}
result_mapped:  {'dbo:MusicalWork': 1.6666666666666665, 'dbo:Album': 1.0235952710937144, 'dbo:Single': 0.8637721816227972, 'dbo:Software': 0.8596930704200145, 'dbo:Film': 0.8397905554950447, 'dbo:TelevisionShow': 0.7587037980235546, 'dbo:Artwork': 0.7210577302792855, 'dbo:Book': 0.7139558411928312, 'dbo:WrittenWork': 0.632377457504609, 'dbo:Work': 1.2631798685433404}
topk_types:  ['dbo:Work', 'dbo:MusicalWork', 'dbo:Album', 'dbo:Single', 'dbo:Software', 'dbo:Film', 'dbo:TelevisionShow', 'dbo:Artwork',

['dbo:Work',
 'dbo:MusicalWork',
 'dbo:Album',
 'dbo:Single',
 'dbo:Software',
 'dbo:Film',
 'dbo:TelevisionShow',
 'dbo:Artwork',
 'dbo:Book',
 'dbo:WrittenWork']

In [18]:
resource_df.head()

Unnamed: 0,id,question,category,type,pred_category,pred_category2
3,61,Who gave the {location of work} of {proprietor...,resource,"[dbo:City, dbo:Settlement, dbo:PopulatedPlace,...",resource,resource
6,90,Who discovered Europa and Callisto?,resource,"[dbo:Scientist, dbo:Person, dbo:Agent]",resource,resource
9,99,What award did Ilya Mehnikov win where the cha...,resource,[dbo:Award],resource,resource
11,118,Which book is written in Hebrew.,resource,"[dbo:Book, dbo:WrittenWork, dbo:Work]",resource,resource
12,122,What is a chemical compound encoded by CGU wit...,resource,"[dbo:ChemicalCompound, dbo:ChemicalSubstance]",resource,resource


In [19]:
resource_df['answer_types']= resource_df.apply(lambda x: classify_sample(x), axis=1)
resource_df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resource_df['answer_types']= resource_df.apply(lambda x: classify_sample(x), axis=1)


Unnamed: 0,id,question,category,type,pred_category,pred_category2,answer_types
3248,45715,which artist released jimi plays berkeley?,resource,"[dbo:Person, dbo:Artist, dbo:MusicalArtist, db...",resource,resource,"[dbo:Agent, dbo:Person, dbo:MusicalArtist, dbo..."
1534,23697,where is sony ericsson based,resource,"[dbo:Place, dbo:Location, dbo:City, dbo:Settle...",resource,resource,"[dbo:Location, dbo:Agent, dbo:Place, dbo:Libra..."
1684,25944,what currency does singapore use,resource,[dbo:Currency],resource,resource,"[dbo:Currency, owl:Thing, dbo:Settlement, dbo:..."
1974,29877,Name a famous saxophone player,resource,"[dbo:Person, dbo:Artist, dbo:MusicalArtist, db...",resource,resource,"[dbo:Agent, dbo:Person, dbo:MusicalArtist, dbo..."
3206,45135,what is the nationality of shin kishida?,resource,"[dbo:Place, dbo:Location, dbo:MusicalArtist, d...",resource,resource,"[dbo:Location, dbo:Agent, dbo:Place, dbo:Settl..."


In [20]:
predicte_df = pd.concat([boolean_df, literal_df, resource_df],axis=0,ignore_index=False)
predicte_df.sample(5)

Unnamed: 0,id,question,category,type,pred_category,pred_category2,answer_types
278,3498,Who lives on the shore of Richard Dawkins?,resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N...",resource,resource,"[dbo:Agent, dbo:Deity, dbo:River, dbo:Military..."
2013,30381,which italian city did majorian die in,resource,"[dbo:Place, dbo:Location, dbo:Settlement, dbo:...",resource,resource,"[dbo:Location, dbo:Agent, dbo:Place, dbo:Settl..."
1852,28398,What is charles trudeau's nationality,resource,"[dbo:Place, dbo:Location, dbo:MusicalArtist, d...",resource,resource,"[dbo:Location, dbo:Agent, dbo:Organisation, db..."
2777,39472,"who is prince arthur, duke of connaught and st...",resource,"[dbo:Person, dbo:Royalty, dbo:Agent]",resource,resource,"[dbo:Agent, dbo:Person, dbo:MusicalArtist, dbo..."
324,4140,Which administrative headquarters of the Pulau...,resource,"[dbo:Country, dbo:PopulatedPlace, dbo:Place, d...",resource,resource,"[dbo:Location, dbo:Country, dbo:City, dbo:Popu..."


In [21]:
save_predict_df = predicte_df[['id','question','pred_category2','answer_types']]

save_predict_df.rename(columns={'pred_category2': 'category', 'answer_types': 'type'},inplace=True)
save_predict_df.to_json('../outputs/2021_dbpedia_test_pred.json', orient='records')
# save_predict_df.to_json('../outputs/2021_dbpedia_0.1test_pred.json', orient='records')
len(save_predict_df)
save_predict_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id,question,category,type
0,13,Did Steve Sampson manage a club of Santa Clara...,boolean,[boolean]
5,76,Did Raymond Picard take birth in Paris?,boolean,[boolean]
10,114,Is the female population of Bommadasanahalli l...,boolean,[boolean]
19,151,Is the installed capacity of Kraftwerk Nussdor...,boolean,[boolean]
22,214,Is Tirana the largest city of Albania?,boolean,[boolean]


# Prediction 

In [22]:
import evaluate
type_hierarchy, max_depth = evaluate.load_type_hierarchy('../inputs/dbpedia_types.tsv')

Loading type hierarchy from ../inputs/dbpedia_types.tsv... 762 types loaded (max depth: 7)


In [23]:
ground_truth_fname = '../inputs/dataset/smart2021-AT_Answer_Type_Prediction/dbpedia/2021_dbpedia_0.1test.json'
# ground_truth_fname = '../inputs/smarttask_dbpedia_test.json'
ground_truth = evaluate.load_ground_truth(ground_truth_fname, type_hierarchy)

Loading ground truth from ../inputs/dataset/smart2021-AT_Answer_Type_Prediction/dbpedia/2021_dbpedia_0.1test.json... 
   3667 questions loaded


In [24]:
system_output = evaluate.load_system_output('../outputs/2021_dbpedia_0.1test_pred.json')

Loading system predictions from ../outputs/2021_dbpedia_0.1test_pred.json... 
   3667 predictions loaded


In [25]:
# RF
evaluate.evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 3667 questions)
  Accuracy: 0.983
Type ranking (based on 3368 questions)
  NDCG@5:  0.690
  NDCG@10: 0.660


In [26]:
# RF
evaluate.evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 3667 questions)
  Accuracy: 0.983
Type ranking (based on 3368 questions)
  NDCG@5:  0.690
  NDCG@10: 0.660


In [27]:
# RF
evaluate.evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 3667 questions)
  Accuracy: 0.983
Type ranking (based on 3368 questions)
  NDCG@5:  0.690
  NDCG@10: 0.660


In [28]:
# RF
evaluate.evaluate(system_output, ground_truth, type_hierarchy, max_depth)



Evaluation results:
-------------------
Category prediction (based on 3667 questions)
  Accuracy: 0.983
Type ranking (based on 3368 questions)
  NDCG@5:  0.690
  NDCG@10: 0.660
