In [1]:
"""Imports"""
import pandas as pd
import numpy as np
import random
import json
import csv
# importing the itertools library
import itertools
import torch
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
"""Load models &  resources"""
resources_dir = '../inputs/resources_dir' 
mapping_csv = resources_dir+'/mapping.csv'
mapping_specific_csv = resources_dir+'/mapping_specific.csv'
hierarchy_json = resources_dir+'/dbpedia_hierarchy.json'
test_json  = resources_dir+'/test.json'

model_dir2 = '../outputs/models36_dbpedia2'
category_model_dir = model_dir2 +'/BERT_model_5classes'
model_dir = '../outputs/models36_dbpedia'
resource_top_model_dir = model_dir + '/BERT_resources_top/'
resource_bottom_model_dir = model_dir + '/BERT_resources_bottom/'

In [18]:
id_to_label = {}
label_to_id = {}
with open(mapping_csv) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        id_to_label[row[1]] = row[0]
        label_to_id[row[0]] = row[1]
        
id_to_label_sp = {}
label_to_id_sp = {}
with open(mapping_specific_csv) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        id_to_label_sp[row[1]] = row[0]
        label_to_id_sp[row[0]] = row[1]

dbpedia_types = pd.read_csv('../inputs/dbpedia_types.tsv', sep='\t')

hierarchy = {}
with open(hierarchy_json) as json_file:
    hierarchy = json.load(json_file)

for i,row in dbpedia_types.iterrows():
    parent = row['Parent']
    child = row['Type']
    if parent not in hierarchy:
        hierarchy[parent] = {}
        hierarchy[parent]['children'] = []
    hierarchy[parent]['children'].append(child)

hierarchy['dbo:Location'] = hierarchy['dbo:Place']
hierarchy['dbo:Location']['children'].append('dbo:Place')
hierarchy['dbo:MedicalSpecialty'] = {'children':['dbo:MedicalSpecialty']}
hierarchy['dbo:PublicService'] = {'children':['dbo:PublicService']}
hierarchy['dbo:Location']['level'] = 1
hierarchy['dbo:MedicalSpecialty']['level']  = 1
hierarchy['dbo:PublicService']['level'] = 1


In [19]:
test_df= pd.read_json('../inputs/smarttask_dbpedia_test.json')
# test_df= pd.read_json('../inputs/task1_dbpedia_test.json')
print('before: ',len(test_df))
test_df = test_df[test_df['question'].notna()]
print('After: ',len(test_df))

before:  4381
After:  4381


In [20]:
resource_df = test_df[test_df['category']=='resource']
# print(len(resource_df))

In [7]:
# resource_df.head()

In [28]:
def types_to_top_type(type_list):
    generic = type_list[0]
    for anst in type_list:
        for top_t in label_to_id:
            if anst == top_t or anst in  hierarchy[top_t]['children']:
                  return  top_t
    return generic

resource_df['gt_top'] = resource_df.type.apply(types_to_top_type)


def types_to_specific_type(type_list):
    if len(type_list) == 0:
        return None 
    return type_list[0]
resource_df['gt_bottom'] = resource_df.type.apply(types_to_specific_type)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resource_df['gt_top'] = resource_df.type.apply(types_to_top_type)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resource_df['gt_bottom'] = resource_df.type.apply(types_to_specific_type)


In [9]:
# resource_df.head()

In [10]:
# resource_top_tokenizer = BertTokenizer.from_pretrained(resource_top_model_dir)
# resource_top_model = BertForSequenceClassification.from_pretrained(resource_top_model_dir,num_labels=len(label_to_id))

resource_tokenizer_sp = BertTokenizer.from_pretrained(resource_bottom_model_dir)
resource_model_sp = BertForSequenceClassification.from_pretrained(resource_bottom_model_dir,num_labels=len(label_to_id_sp))


In [22]:
# DBPedia hierarchy provided by the challenge organizers
dbpedia_types = pd.read_csv('../inputs/dbpedia_types.tsv', sep='\t')

In [23]:
for i, row in dbpedia_types.iterrows():
    c= row['Type']
    if c in hierarchy:
        hierarchy[c]['parent'] =row['Parent']

In [24]:
hierarchy['dbo:Location']['parent'] = 'owl:Thing'
hierarchy['dbo:MedicalSpecialty']['parent']  = 'owl:Thing'
hierarchy['dbo:PublicService']['parent'] = 'owl:Thing'

In [25]:
import torch
import torch.nn.functional as F


def classify_resource(row, top_level):
    q = str('[CLS]') + row['question'] + str('[SEP]') + top_level
    input_ids = torch.tensor(resource_tokenizer_sp.encode(q, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    
    with torch.no_grad():
        outputs = resource_model_sp(input_ids, labels=labels)
    
    logits = outputs[1]
    l_array = logits.detach().numpy()[0]
    #normalize logits so that max is 1
    norm = [float(i)/max(l_array) for i in l_array]
    result_before = np.argsort(norm)[::-1]
    
#     print('result_before: ')
#     for i in result_before[:15]:
#         print(id_to_label_sp[str(i)], norm[i])
#     print('\n')

    # reward
    initial_top_index = np.argmax(norm)
    #print('initial_top_index: ',initial_top_index)
    initial_top = hierarchy[top_level]
    #print('initial_top: ',initial_top)
    if initial_top != {}:
        norm[initial_top_index] = norm[initial_top_index] + int(initial_top['level'])/6
        
        #reward sub classes of top class
        initial_top_children = initial_top['children']
        for c in initial_top_children:
            if c in label_to_id_sp:
                if c not in hierarchy:
                    continue
                norm[int(label_to_id_sp[c])] = norm[int(label_to_id_sp[c])] + int(hierarchy[c]['level'])/6
                
    #classes in descending order
    result = np.argsort(norm)[::-1]
#     print('after reward:  ')
#     for r in result[:15]:
#         print(id_to_label_sp[str(r)], norm[r])
        
    result_mapped = {}
    cTypes = []
    #m_arr = []
    for r in result[:9]:
        result_mapped[id_to_label_sp[str(r)]]= norm[r]
        #print(r)
        if id_to_label_sp[str(r)] in hierarchy.keys():
#             # rule 1:
#             if hierarchy[id_to_label_sp[str(r)]]['level'] in [3,4,5]:
                #print('rule 1:',id_to_label_sp[str(r)])
            cTypes.append(id_to_label_sp[str(r)])
             # rule 2:
#             if (hierarchy[top_level]['children'] == hierarchy[id_to_label_sp[str(r)]]['parent']):
#                 print('rule 2:',id_to_label_sp[str(r)])
#                 cTypes.append(id_to_label_sp[str(r)])
            # rule 2:
            #print(id_to_label_sp[str(r)])
            parent= hierarchy[id_to_label_sp[str(r)]]['parent']
#             print('errr:  ',parent)
            if ( parent in hierarchy and  'parent' in hierarchy[parent] and hierarchy[parent]['parent'] == top_level):
                #print('rule 2:',parent)
                score= norm[r]
                if parent not in result_mapped:
                    result_mapped[parent] = score
                elif score > result_mapped[parent]:
                      result_mapped[parent] = score
                #print (parent,score)
     
    # calculate possible combinations with letter 
    combi = itertools.combinations(cTypes, 2)
    # printing all combinations in output using for loop
    #print (result_mapped)
    m_arr = []
    for m in combi: 
        c1 = m[0]
        c2 = m[1]
        if (hierarchy[c1]['parent'] == hierarchy[c2]['parent']):
            if hierarchy[c1]['parent'] not in m_arr: # Avoid inserting duplicate elements
                #print (hierarchy[c1]['parent'])
                score= (norm[int(label_to_id_sp[c1])]+norm[int(label_to_id_sp[c2])])/2.0
                if hierarchy[c1]['parent'] not in result_mapped:
                    result_mapped[hierarchy[c1]['parent']] = score
                elif score > result_mapped[hierarchy[c1]['parent']]:
                      result_mapped[hierarchy[c1]['parent']] = score
    
    
    return result_mapped

#id_to_label_sp
def classify_sample(row):
    top_class= row['gt_top']
    if top_class =='dbo:Location':
        top_class='dbo:Place'
    specific_classes = classify_resource(row,top_class)
    topk_types = [top_class]
    for k, v in sorted(specific_classes.items(), key=lambda item: item[1], reverse=True):
        if len(topk_types)< 10:
            if k not in topk_types:
                topk_types.append(k)
    return topk_types


In [26]:
resource_df['bottom']= resource_df.apply(lambda x: classify_sample(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resource_df['bottom']= resource_df.apply(lambda x: classify_sample(x), axis=1)


In [39]:
# resource_df.sample(5)
resource_df.iloc[67]

id                                                 dbpedia_183
question     What is a sovereign state for office held by t...
category                                              resource
type         [dbo:Country, dbo:City, dbo:State, dbo:Settlem...
gt_top                                            dbo:Location
bottom       [dbo:Place, dbo:Mountain, dbo:NaturalPlace, db...
gt_bottom                                          dbo:Country
Name: 121, dtype: object

In [30]:
true_top_df = resource_df[resource_df['bottom'] == resource_df['gt_bottom']]
error_top_df = resource_df[resource_df['bottom'] != resource_df['gt_bottom']]

true_num = len(true_top_df)
error_num = len(error_top_df)
total_num = len(resource_df)

print('true: ',true_num)
print('error: ',error_num)
print('total: ',total_num)

true:  0
error:  2445
total:  2445


In [26]:
acc_top = (true_num / total_num) * 100
print('acc of top: %2f'%acc_top,'%')

acc of top: 84.417178 %


In [28]:
error_top_df.sample(10)

Unnamed: 0,id,question,category,type,top,gt_top
2283,dbpedia_14033,When was Nicolas Sarkozy elected president?,resource,[dbo:Activity],dbo:Agent,dbo:Activity
81,dbpedia_20499,List the websites which the authors of Tornado...,resource,"[dbo:Company, dbo:Organisation, dbo:Agent]",dbo:Work,dbo:Agent
534,dbpedia_2916,Which are the studies for toxin?,resource,[dbo:MedicalSpecialty],dbo:TopicalConcept,dbo:MedicalSpecialty
1036,dbpedia_11924,What is the Hypertext Transfer Protocol document,resource,[dbo:Media],dbo:Work,dbo:Media
2328,dbpedia_7584,What internet presence does the voice actor Je...,resource,"[dbo:Film, dbo:Work]",dbo:Agent,dbo:Work
351,dbpedia_5602,What works did the sculptor Ebenezer Scrooge do?,resource,"[dbo:Book, dbo:WrittenWork, dbo:Work]",dbo:Agent,dbo:Work
2723,dbpedia_19577,Which non fictional subject of Thud is the ath...,resource,"[dbo:Game, dbo:Sport, dbo:Activity]",dbo:TopicalConcept,dbo:Activity
43,dbpedia_2904,What is the genetic association of the regulat...,resource,[dbo:Disease],dbo:Biomolecule,dbo:Disease
3905,dbpedia_6700,Which is the anthem for Kimi Ga Yo?,resource,"[dbo:Country, dbo:State, dbo:PopulatedPlace, d...",dbo:Work,dbo:Location
1260,dbpedia_5963,"VAT rate of 27%,",resource,"[dbo:Country, dbo:State, dbo:PopulatedPlace, d...",dbo:Disease,dbo:Location


In [34]:
error_top = error_num / len(resource_df)
error_top

0.16605316973415132

In [None]:
for indexs in resource_df.index:
    if (resource_df['question'][indexs] == 'NA'):
        print(predicte_df['question'][indexs])


error_df = predicte_df[predicte_df['category'] != predicte_df['pred_cateory2']]
print(len(error_df))
error_df