# Read GraphDB

In [1]:
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
import pandas as pd
import json
import os

In [2]:
from gastrodon import _parseQuery
from SPARQLWrapper import SPARQLWrapper, N3
from rdflib import Graph
def describe(self, sparql:str):
    """Run a DESCRIBE-style query and return the resulting graph as N3 text.

    Parameters
    ----------
    sparql : str
        Query text, forwarded unchanged to ``self._describe``.

    Returns
    -------
    str
        The N3 serialization of the graph returned by ``self._describe``.
    """
    serialized = self._describe(sparql).serialize(format='n3')
    # Older rdflib releases hand back bytes from serialize(); newer ones hand
    # back str, on which an unconditional .decode() raises AttributeError.
    if isinstance(serialized, bytes):
        return serialized.decode()
    return serialized

def _describe(self, sparql:str):
    """Send `sparql` to this endpoint and parse the N3 response into a Graph.

    Parameters
    ----------
    sparql : str
        Query text; the endpoint's namespace declarations are prepended via
        ``self._prepend_namespaces`` before it is sent.

    Returns
    -------
    rdflib.Graph
        A new graph populated from the N3 payload returned by the server.
    """
    # Work on the instance the method was called on. The previous version read
    # the notebook-level `endpoint` global, so the method could not be used
    # before that global existed, and any other RemoteEndpoint instance would
    # silently query the global one instead of itself.
    that = self._wrapper()
    that.setQuery(self._prepend_namespaces(sparql, _parseQuery))
    that.setReturnFormat(N3)
    results = that.query().convert()
    g = Graph()
    g.parse(data=results, format="n3")
    return g

# Attach both helpers to gastrodon's RemoteEndpoint class so that endpoint
# instances expose .describe() and ._describe().
RemoteEndpoint.describe = describe
RemoteEndpoint._describe = _describe

In [3]:
# Turtle @prefix declarations. The text is parsed with gastrodon's inline()
# when the endpoint is created and supplies the prefixes (aida:, ldcOnt:, ...)
# that the SPARQL queries in this notebook rely on.
namespaces_str = """
@prefix : <https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/AidaDomainOntologiesCommon#> .
@prefix aida: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/InterchangeOntology#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix domainOntology: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/SeedlingOntology> .
@prefix ldc: <https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/LdcAnnotations#> .
@prefix ldcOnt: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/LDCOntology#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

### Params

In [4]:
# Base URL under which the repositories are served; `repo` is appended to it
# when the endpoint is created.
endpoint_url = 'http://gaiadev01.isi.edu:7200/repositories'
repo = 'eval-gaia2ta1'
# Suffix embedded in every output file name (e.g. entity_all_<version>.h5).
version = '001'
store_data_dir = 'store_data/' + repo # output directory; created by the next cell when missing
add_origin = False # set True to also compute the 'origin' / 'originLabel' columns (slow; only needed by TA3)

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs(store_data_dir, exist_ok=True)

In [None]:
# Connect to the selected repository and register the notebook's prefixes so
# queries can use them without repeating PREFIX declarations.
endpoint = RemoteEndpoint(
    url='/'.join((endpoint_url, repo)),
    prefixes=inline(namespaces_str).graph,
)

In [5]:
# List every distinct value that appears as the object of aida:system.
system_list = endpoint.select('select distinct ?sys where {?s aida:system ?sys}')
system_list

Unnamed: 0,sys
0,http://www.rpi.edu
1,http://www.isi.edu/compoundJustificationWrapper
2,http://www.rpi.edu/fileType
3,http://www.columbia.edu/AIDA/DVMM/Systems/Grou...
4,http://www.rpi.edu/coreference
5,http://www.rpi.edu-projectToSingleton
6,http://www.isi.edu/promoteConfidencesToTypeAss...
7,http://www.isi.edu/clusters/identicalRelations
8,http://www.columbia.edu/Columbia_Sentiment
9,http://www.columbia.edu/Columbia_Sentiment-pro...


In [6]:
def to_int(s):
    """Coerce an offset value to ``int``, using -1 as the "missing" marker.

    Parameters
    ----------
    s : object
        A value taken from a query result column.

    Returns
    -------
    int
        * the parsed number for numeric strings,
        * the value itself for integers (including integer types that
          implement ``__index__``, such as numpy integers),
        * the whole number for floats without a fractional part,
        * -1 for everything else (None, NaN, non-numeric text, ...).
    """
    if isinstance(s, str):
        try:
            return int(s)
        except ValueError:
            # Non-numeric text is treated like a missing offset instead of
            # aborting the whole column conversion.
            return -1
    if isinstance(s, float):
        # A column that mixes integers with missing values is typically held
        # as floats; keep the whole numbers. is_integer() is False for NaN/inf.
        return int(s) if s.is_integer() else -1
    if hasattr(s, '__index__'):
        return int(s)
    return -1

## Entities

In [7]:
# Print the N3 description of a single entity attributed to <http://www.rpi.edu>
# to inspect which properties are available.
print(endpoint.describe("""
DESCRIBE ?e {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu>
}
LIMIT 1
"""))

@prefix ns1: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/InterchangeOntology#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://www.isi.edu/gaia/assertions/fab95870-e721-4c41-a6ff-3281a5cc1c3b> rdf:subject <http://www.isi.edu/gaia/entities/a471d7ab-df33-48c7-ac4f-d49d0581cfe6> .

<http://www.isi.edu/gaia/entities/a471d7ab-df33-48c7-ac4f-d49d0581cfe6-cluster-projectedFromSingleton> ns1:prototype <http://www.isi.edu/gaia/entities/a471d7ab-df33-48c7-ac4f-d49d0581cfe6> .

<http://www.isi.edu/gaia/entities/a471d7ab-df33-48c7-ac4f-d49d0581cfe6> a ns1:Entity ;
    ns1:informativeJustification _:fb86ab39cc06d4a1ba9965f6803dc45e1b2 ;
    ns1:justifiedBy [ a ns1:TextJustification ;
            ns1:confidence [ a ns1:Confidence ;
                    ns1:confidenceValue 1e+00 ;
                    ns1:system 

Issues:

1. We don't have `prefLabel` any more.
2. Some entities contain more than one external link. For example, entity `<http://www.isi.edu/gaia/entities/0007bd9f-dfc2-4c5c-ab44-18a86c7be089>` contains two external links: `m.07t21` ("Ukraine") and `m.05vz3zq` ("Soviet Union").

In [8]:
# Fetch the distinct aida:Entity nodes attributed to <http://www.rpi.edu>;
# the shape gives the number of such entities.
entities = endpoint.select("""
SELECT DISTINCT ?e {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu> ;
}
""")

entities.shape

(134968, 1)

## Entity Freebase IDs

In [9]:
def getFBIDs(s):
    """Unpack a Freebase-link JSON string into parallel tuples of ids and scores.

    Parameters
    ----------
    s : str or None
        JSON text whose ``freebase_link`` member maps each Freebase id to an
        object carrying ``average_score`` and ``max_score``. A falsy value
        (None, empty string) means "no links".

    Returns
    -------
    pandas.Series
        Three entries -- ``fbid``, ``fbid_score_avg``, ``fbid_score_max`` --
        each a tuple with one element per Freebase id, in the order the ids
        appear in the JSON object. All three tuples are empty when there are
        no links.
    """
    links = json.loads(s).get('freebase_link') if s else None
    # `freebase_link` can be absent or null; treat that as "no links" instead
    # of failing on None.keys().
    links = links or {}

    fbids = tuple(links)
    # A null per-id entry yields None scores rather than an AttributeError.
    details = [links[fbid] or {} for fbid in fbids]
    return pd.Series({
        'fbid': fbids,
        'fbid_score_avg': tuple(d.get('average_score') for d in details),
        'fbid_score_max': tuple(d.get('max_score') for d in details),
    })
    
    
# Fetch, for every rpi.edu entity, the JSON private-data blob written by the
# <http://www.rpi.edu/EDL_Freebase> system.
df = endpoint.select("""
SELECT DISTINCT ?e ?fbid {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu> ;
       aida:privateData [
            aida:jsonContent ?fbid ;
            aida:system <http://www.rpi.edu/EDL_Freebase>
        ]
}
""")

# Replace the raw JSON in 'fbid' by the tuple of ids and add the two parallel
# score columns produced by getFBIDs.
df[['fbid', 'fbid_score_avg', 'fbid_score_max']] = df.fbid.apply(getFBIDs)
df = df.astype({
    'e': str, 'fbid': object, 'fbid_score_avg': object, 'fbid_score_max': object
})
rpi_external = df
rpi_external.head()

Unnamed: 0,e,fbid,fbid_score_avg,fbid_score_max
0,http://www.isi.edu/gaia/entities/5d3b85bf-639e...,"(m.03bzmt4,)","(0.5912981629,)","(0.8028140068,)"
1,http://www.isi.edu/gaia/entities/c5c03d4e-532a...,"(m.0276pfz,)","(0.4116017173,)","(0.4981036484,)"
2,http://www.isi.edu/gaia/entities/b8dd6b88-1c85...,"(m.034q1f, m.0phpy)","(0.411047241, 0.5485107243)","(0.9146501422, 0.55425632)"
3,http://www.isi.edu/gaia/entities/6c924596-17a2...,"(m.0d05w3, m.02z2cdj)","(0.6664679868, 0.5492341876)","(1.0, 1.0)"
4,http://www.isi.edu/gaia/entities/b835abbd-3ec8...,"(m.010gp6mj,)","(0.9551087022,)","(0.9551087022,)"


### Find unique freebase IDs

In [10]:
# Flatten the per-entity id tuples into the set of distinct Freebase ids and
# wrap it in a one-column frame.
fbidss = rpi_external.fbid.tolist()
fbids = {fbid for fbid_list in fbidss for fbid in fbid_list}
d = {'fbid': list(fbids)}
rpi_fbid = pd.DataFrame(data=d)
rpi_fbid.head()

Unnamed: 0,fbid
0,m.09x4kl
1,m.0261m
2,m.0d3k14
3,m.0h3ngjt
4,m.01l8x2


## Entity Targets

In [11]:
# One row per (entity, link target) pair; `score` is the highest
# aida:confidenceValue found among the links to that target.
df = endpoint.select("""
select ?e ?target ?score {
SELECT distinct ?e ?target (MAX(?cv) as ?score)  {
    ?e a aida:Entity .
    ?e aida:system <http://www.rpi.edu> .
    ?e aida:link ?link .
    ?link aida:linkTarget ?target .
    ?link aida:confidence ?conf .
    ?conf aida:confidenceValue ?cv .
} group by ?e ?target
}
""")

def merge_targets(table):
    """Fold the rows of one entity into a pair of parallel tuples.

    Parameters
    ----------
    table : pandas.DataFrame
        Rows with a ``target`` and a ``score`` column.

    Returns
    -------
    pandas.Series
        ``targets`` and ``target_scores``: tuples holding the column values in
        row order (both empty when `table` has no rows).
    """
    if not len(table.index):
        return pd.Series({'targets': (), 'target_scores': ()})
    return pd.Series({
        'targets': tuple(table['target'].to_list()),
        'target_scores': tuple(table['score'].to_list()),
    })
# Collapse the (entity, target, score) rows into one row per entity.
# The columns are selected with a list: the tuple form `['target', 'score']`
# on a GroupBy was deprecated and is rejected by pandas >= 2.0.
df = df.groupby('e')[['target', 'score']].apply(merge_targets).reset_index()
df = df.astype({
    'e': str, 'targets': object, 'target_scores': object
})
df_target = df
df_target.head()


Unnamed: 0,e,targets,target_scores
0,http://www.isi.edu/gaia/entities/0000a56f-2b48...,"(LDC2019E43:11315381, LDC2019E43:1280071, LDC2...","(0.0755, 0.1509434, 0.1887)"
1,http://www.isi.edu/gaia/entities/0001e867-1054...,"(LDC2019E43:3796638, LDC2019E43:5168839, LDC20...","(0.00304414, 0.0002, 1.0, 0.0003015863, 0.0015..."
2,http://www.isi.edu/gaia/entities/000414b7-eff2...,"(LDC2019E43:1622619, LDC2019E43:588053, LDC201...","(0.0976, 0.1707317, 0.2439024)"
3,http://www.isi.edu/gaia/entities/000b99ad-1000...,"(LDC2019E43:20000170, LDC2019E43:20000153)","(0.0001038206, 1.0)"
4,http://www.isi.edu/gaia/entities/000d9ef9-d70e...,"(LDC2019E43:2510769, LDC2019E43:10297235, LDC2...","(0.9995611, 0.001, 0.0001)"


## Entity Justifications

In [12]:
# One row per (entity, type assertion, justification). Everything except the
# entity and its type is OPTIONAL, so label / target / source / start / end /
# justificationType may be missing on any given row.
df = endpoint.select("""
SELECT DISTINCT ?e ?type ?label ?target ?source ?start ?end ?justificationType {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu> ;
       ^rdf:subject [
        a rdf:Statement ;
        rdf:predicate rdf:type ;
        rdf:object ?type ;
        aida:justifiedBy ?justification ]
    OPTIONAL { ?justification aida:privateData [
            aida:jsonContent ?label ;
            aida:system <http://www.rpi.edu/EDL_Translation> ]}
    OPTIONAL { ?e aida:link/aida:linkTarget ?target }
    OPTIONAL { ?justification aida:source ?source }
    OPTIONAL { ?justification aida:startOffset ?start }
    OPTIONAL { ?justification aida:endOffsetInclusive ?end }
    OPTIONAL { ?justification aida:privateData [ 
            aida:system <http://www.rpi.edu> ;
            aida:jsonContent ?justificationType ] }
}
""")
def _json_member(raw, key):
    """Return member `key` of the JSON object in `raw`, or None when `raw` is not a non-empty string."""
    if not isinstance(raw, str) or not raw:
        return None
    return json.loads(raw).get(key)


def _translation_label(raw):
    """Return the 'translation' list of a JSON blob as a tuple, or None when it is unavailable."""
    translation = _json_member(raw, 'translation')
    return tuple(translation) if translation is not None else None


# Offsets become plain ints, with -1 standing in for a missing value.
df.start = df.start.apply(to_int)
df.end = df.end.apply(to_int)
# ?justificationType is OPTIONAL in the query, so a row may carry no JSON at
# all; such rows now yield None instead of making json.loads raise.
df.justificationType = df.justificationType.apply(lambda s: _json_member(s, 'justificationType'))
df.label = df.label.apply(_translation_label)
df = df.astype({
    'e': str, 'type': str, 'target': str, 'source': str, 'start': int, 'end': int, 'justificationType': str
})
rpi_entity_with_justification = df
rpi_entity_with_justification.head()

Unnamed: 0,e,type,label,target,source,start,end,justificationType
0,http://www.isi.edu/gaia/entities/c1318fca-7fb7...,ldcOnt:ORG.Government,,,HC00002Z8,1457,1463,nominal_mention
1,http://www.isi.edu/gaia/entities/c1318fca-7fb7...,ldcOnt:ORG.Government,,,HC00002Z8,1519,1525,nominal_mention
2,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,,LDC2019E43:80000078,HC00002Z8,7877,7890,mention
3,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,,LDC2019E43:80000090,HC00002Z8,7877,7890,mention
4,http://www.isi.edu/gaia/entities/a16d4e70-dd96...,ldcOnt:VAL.Number.Number,,,HC00002Z8,5180,5181,mention


## Entity Types

In [13]:
# One row per (entity, type, source document), with the optional name, text
# value and link target of the entity.
df = endpoint.select("""
SELECT DISTINCT ?e ?type ?name ?text ?source ?target {
    ?e a aida:Entity ;
       aida:justifiedBy/aida:source ?source ;
       aida:system <http://www.rpi.edu> .
    ?statement a rdf:Statement ;
               rdf:subject ?e ;
               rdf:predicate rdf:type ;
               rdf:object ?type .
    OPTIONAL { ?e aida:hasName ?name }
    OPTIONAL { ?e aida:textValue ?text }
    OPTIONAL { ?e aida:link/aida:linkTarget ?target }
}
""")
# Every column is cast to str, including the OPTIONAL ones.
# NOTE(review): a missing value presumably becomes the literal text 'None'
# here (a later cell compares 'name' against 'None') -- confirm.
df = df.astype({
    'e': str, 'type': str, 'name': str, 'source': str, 'text': str, 'target': str
})
rpi_entity_valid = df
rpi_entity_valid.head()

Unnamed: 0,e,type,name,text,source,target
0,http://www.isi.edu/gaia/entities/c1318fca-7fb7...,ldcOnt:ORG.Government,,,HC00002Z8,
1,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,Ukrainian Army,,HC00002Z8,LDC2019E43:80000078
2,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,Ukrainian Army,,HC00002Z8,LDC2019E43:80000090
3,http://www.isi.edu/gaia/entities/a16d4e70-dd96...,ldcOnt:VAL.Number.Number,,34.0,HC00002Z8,
4,http://www.isi.edu/gaia/entities/ba1c195e-992d...,ldcOnt:VEH.WheeledVehicle,,,HC00002Z8,


## Relations

In [14]:
# One row per (relation, type, justification span); source and offsets are
# OPTIONAL.
df = endpoint.select("""
SELECT DISTINCT ?e ?type ?source ?start ?end {
    ?e a aida:Relation ;
       aida:system <http://www.rpi.edu> .
    ?statement a rdf:Statement ;
               rdf:subject ?e ;
               rdf:predicate rdf:type ;
               rdf:object ?type ;
               aida:justifiedBy ?justification 
    OPTIONAL { ?justification aida:source ?source }
    OPTIONAL { ?justification aida:startOffset ?start }
    OPTIONAL { ?justification aida:endOffsetInclusive ?end }
}
""")
# Normalise the offsets through to_int before the int cast below.
df.start = df.start.apply(to_int)
df.end = df.end.apply(to_int)
df = df.astype({
    'e': str, 'type': str, 'source': str, 'start': int, 'end': int
})
rpi_relation = df
rpi_relation.head()

Unnamed: 0,e,type,source,start,end
0,http://www.isi.edu/gaia/relations/8c92ec18-b53...,ldcOnt:PartWhole.Subsidiary.NationalityCitizen,HC00002Z8,7523,7642
1,http://www.isi.edu/gaia/relations/6f96d9fd-055...,ldcOnt:OrganizationAffiliation.EmploymentMembe...,HC00002Z8,7877,8006
2,http://www.isi.edu/gaia/relations/38038f7c-f4b...,ldcOnt:OrganizationAffiliation.EmploymentMembe...,HC00002Z8,878,1009
3,http://www.isi.edu/gaia/relations/80112f4a-896...,ldcOnt:Measurement.Size.Count,HC00002Z8,4109,4319
4,http://www.isi.edu/gaia/relations/fde2bf74-f57...,ldcOnt:OrganizationAffiliation.EmploymentMembe...,HC00002Z8,559,689


In [15]:
# Every reified statement about a relation other than its rdf:type assertion:
# ?p is the statement's predicate and ?o its object.
df = endpoint.select("""
SELECT DISTINCT ?e ?p ?o {
    ?e a aida:Relation ;
       aida:system <http://www.rpi.edu> .
    ?statement a rdf:Statement ;
               rdf:subject ?e ;
               rdf:predicate ?p ;
               rdf:object ?o 
    FILTER (?p != rdf:type)
}
""")
df = df.astype({
    'e': str, 'p': str, 'o': str
})
rpi_relation_roles = df
rpi_relation_roles.head()

Unnamed: 0,e,p,o
0,http://www.isi.edu/gaia/relations/8c92ec18-b53...,ldcOnt:PartWhole.Subsidiary.NationalityCitizen...,http://www.isi.edu/gaia/entities/a471d7ab-df33...
1,http://www.isi.edu/gaia/relations/8c92ec18-b53...,ldcOnt:PartWhole.Subsidiary.NationalityCitizen...,http://www.isi.edu/gaia/entities/084233cc-ae23...
2,http://www.isi.edu/gaia/relations/6f96d9fd-055...,ldcOnt:OrganizationAffiliation.EmploymentMembe...,http://www.isi.edu/gaia/entities/22a89f59-8477...
3,http://www.isi.edu/gaia/relations/6f96d9fd-055...,ldcOnt:OrganizationAffiliation.EmploymentMembe...,http://www.isi.edu/gaia/entities/d433e6bb-5284...
4,http://www.isi.edu/gaia/relations/38038f7c-f4b...,ldcOnt:OrganizationAffiliation.EmploymentMembe...,http://www.isi.edu/gaia/entities/961f822f-bfc7...


## Documents

In [16]:
# Map each source document to the JSON blob written by the
# <http://www.rpi.edu/fileType> system on its text justifications.
df = endpoint.select("""
SELECT DISTINCT ?source ?fileType {
    ?justification a aida:TextJustification ;
                   aida:system <http://www.rpi.edu> ;
                   aida:source ?source ;
                   aida:privateData ?filePrivate .
    ?filePrivate aida:system <http://www.rpi.edu/fileType> ;
                 aida:jsonContent ?fileType
}
""")
# The blob's 'fileType' member is stored in a column named 'lang'; the raw
# JSON column is dropped afterwards.
df['lang'] = df.fileType.apply(lambda s: json.loads(s).get('fileType'))
df = df.drop(columns='fileType')
df = df.astype({
    'source': str, 'lang': str
})
document_types = df
document_types.head()

Unnamed: 0,source,lang
0,HC00002Z8,en
1,HC00007OM,ru
2,HC00007OS,ru
3,HC00007P2,ru
4,HC00007P6,ru


In [21]:
# Write every extracted frame to its own HDF5 file (key 'entity'), then read
# each file back once to check that it can be loaded.
for frame, stem in (
    (rpi_entity_with_justification, '/entity_with_labels_'),
    (rpi_entity_valid, '/entity_valid_'),
    (rpi_relation, '/relation_'),
    (rpi_relation_roles, '/relation_roles_'),
    (document_types, '/document_'),
):
    frame.to_hdf(store_data_dir + stem + version + '.h5', 'entity', mode='w', format='fixed')

for stem in ('/entity_with_labels_', '/entity_valid_', '/relation_', '/relation_roles_', '/document_'):
    _ = pd.read_hdf(store_data_dir + stem + version + '.h5')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['e', 'type', 'label', 'target', 'source', 'justificationType']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Transform Entities

### Entities

In [22]:
# NOTE: `df` is the same object as rpi_entity_valid, so the assignment to
# df['name'] below also rewrites the 'name' column of rpi_entity_valid.
df = rpi_entity_valid
# Turn the literal text 'None' back into a real None.
df['name'] = df.apply(lambda r: r['name'] if r['name'] != 'None' else None, axis=1)
df = df.drop(columns='text')
df = df.drop_duplicates()
# Keep the first (e, type, source) row of each entity and attach the tuple of
# all 'name' values found on that entity's rows (repeats are kept).
df = df[['e', 'type', 'source']].groupby('e').head(1).join(df.groupby('e')['name'].apply(tuple), on='e')
# Only the first element of the tuple is inspected: a falsy first name turns
# the whole tuple into None.
df['name'] = df['name'].apply(lambda s: s if s[0] else None)
df_names = df
df.head()

Unnamed: 0,e,type,source,name
0,http://www.isi.edu/gaia/entities/c1318fca-7fb7...,ldcOnt:ORG.Government,HC00002Z8,
1,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,HC00002Z8,"(Ukrainian Army, Ukrainian Army)"
3,http://www.isi.edu/gaia/entities/a16d4e70-dd96...,ldcOnt:VAL.Number.Number,HC00002Z8,
4,http://www.isi.edu/gaia/entities/ba1c195e-992d...,ldcOnt:VEH.WheeledVehicle,HC00002Z8,
5,http://www.isi.edu/gaia/entities/921c9071-2e37...,ldcOnt:VAL.Number.Number,HC00002Z8,


### Origin - Slow, omit if not needed by TA3

In [23]:
from model.source import LTFSourceContext

def query_context(source, start, end):
    """Look up the context around a character span of a source document.

    Returns None when either offset is the -1 "missing" marker or when
    ``LTFSourceContext(source).doc_exists()`` is false; otherwise returns the
    result of ``LTFSourceContext(source).query_context(start, end)``.
    """
    span_is_known = not (start == -1 or end == -1)
    if not span_is_known:
        return None
    extractor = LTFSourceContext(source)
    return extractor.query_context(start, end) if extractor.doc_exists() else None
    
def query_label(source, start, end):
    """Look up the text of a character span of a source document.

    Returns None when either offset is the -1 "missing" marker or when
    ``LTFSourceContext(source).doc_exists()`` is false; otherwise returns the
    result of ``LTFSourceContext(source).query(start, end)``.
    """
    span_is_known = not (start == -1 or end == -1)
    if not span_is_known:
        return None
    extractor = LTFSourceContext(source)
    if not extractor.doc_exists():
        return None
    return extractor.query(start, end)

In [None]:
# Optional (add_origin): look up, row by row, the surrounding context and the
# span text of each justification. Every row triggers two LTFSourceContext
# lookups, one per new column.
if add_origin:
    rpi_entity_with_justification['origin'] = rpi_entity_with_justification.apply(lambda r: query_context(r.source, r.start, r.end), axis=1)
    rpi_entity_with_justification['originLabel'] = rpi_entity_with_justification.apply(lambda r: query_label(r.source, r.start, r.end), axis=1)
    rpi_entity_with_justification.head()

In [None]:
if add_origin:
    # The 'origin' column lives on rpi_entity_with_justification. The generic
    # `df` name refers to the entity-names frame at this point, which has no
    # such column, so reading it from `df` raised a KeyError.
    df_origin = rpi_entity_with_justification[['e', 'origin']].groupby('e')['origin'].apply(tuple).to_frame()
    # A tuple whose first element is empty is replaced by None.
    df_origin['origin'] = df_origin['origin'].apply(lambda s: s if s[0] else None)
    df_origin.head()

In [None]:
if add_origin:
    # The 'originLabel' column lives on rpi_entity_with_justification. The
    # generic `df` name refers to the entity-names frame at this point, which
    # has no such column, so reading it from `df` raised a KeyError.
    df_origin_label = rpi_entity_with_justification[['e', 'originLabel']].groupby('e')['originLabel'].apply(tuple).to_frame()
    # Result is a Series named 'originLabel' indexed by entity; a tuple whose
    # first element is empty is replaced by None.
    df_origin_label = df_origin_label['originLabel'].apply(lambda s: s if s[0] else None)
    df_origin_label.head()

### Justification Type

In [25]:
# Drop the rows whose justification is a nominal or a pronominal mention.
df = rpi_entity_with_justification
df = df[~((df['justificationType'] == 'nominal_mention') | (df['justificationType'] == 'pronominal_mention'))]
rpi_entity_with_justification_filtered = df
rpi_entity_with_justification_filtered.head()

Unnamed: 0,e,type,label,target,source,start,end,justificationType
2,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,,LDC2019E43:80000078,HC00002Z8,7877,7890,mention
3,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,,LDC2019E43:80000090,HC00002Z8,7877,7890,mention
4,http://www.isi.edu/gaia/entities/a16d4e70-dd96...,ldcOnt:VAL.Number.Number,,,HC00002Z8,5180,5181,mention
6,http://www.isi.edu/gaia/entities/921c9071-2e37...,ldcOnt:VAL.Number.Number,,,HC00002Z8,4329,4330,mention
7,http://www.isi.edu/gaia/entities/86ae3b62-df67...,ldcOnt:VAL.Number.Number,,,HC00002Z8,6660,6661,mention


### Labels

In [29]:
# Gather the distinct translation labels of each entity into one tuple.
df_label = rpi_entity_with_justification_filtered[['e', 'label']].drop_duplicates()
df_label = df_label.groupby('e')['label'].apply(tuple).to_frame()
# Drop the empty entries and fall back to None only when nothing is left.
# Looking at the first element alone discarded every label of an entity whose
# first justification happened to have no translation.
df_label = df_label['label'].apply(lambda s: tuple(label for label in s if label) or None)
df_label.head()

e
http://www.isi.edu/gaia/entities/0000a56f-2b48-47c5-bfec-f108475f8b6f        ((SAR ,),)
http://www.isi.edu/gaia/entities/0000bb73-7bad-44f2-85cc-b884f07d7c47              None
http://www.isi.edu/gaia/entities/0001e867-1054-4637-a53e-4993d0e691e5    ((Russian ,),)
http://www.isi.edu/gaia/entities/0002c7d3-0d0f-461c-b50e-85bfce67e34e              None
http://www.isi.edu/gaia/entities/00036981-808c-43e6-8a53-7467367889a0              None
Name: label, dtype: object

### Wikidata from freebase ID

In [30]:
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from rdflib.namespace import Namespace, RDFS, SKOS
from rdflib import URIRef, Literal

# SPARQL endpoint used for the Freebase-id -> Wikidata lookups; the public
# Wikidata endpoint is kept as a commented-out alternative.
wikidata_sparql = SPARQLStore("http://sitaware.isi.edu:8080/bigdata/namespace/wdq/sparql")
# wikidata_sparql = SPARQLStore("https://query.wikidata.org/sparql")
WDT = Namespace('http://www.wikidata.org/prop/direct/')
# Prefixes handed to every query. 'rdfs' is bound explicitly because the label
# lookups use rdfs:label; without it the queries only work on servers that
# predefine that prefix.
namespaces = {'wdt': WDT, 'skos': SKOS, 'rdfs': RDFS}

def link_wikidata(fbid):
    """Resolve a Freebase id such as ``m.0261m`` to a Wikidata entity URI.

    Returns None for an empty id, for an id containing 'NIL', and when the
    endpoint reports no entity whose wdt:P646 value equals the id; otherwise
    returns the first matching entity as a string.
    """
    if not fbid:
        return None
    if 'NIL' in fbid:
        return None

    # The id is looked up in its slash-separated form: 'm.0261m' -> '/m/0261m'.
    fbid = '/' + fbid.replace('.', '/')
    query = "SELECT ?qid WHERE { ?qid wdt:P646 ?freebase } LIMIT 1"
    # Single-line progress indicator, overwritten on every call.
    print('\r', query, fbid, end='')
    matches = wikidata_sparql.query(query, namespaces, {'freebase': Literal(fbid)})
    for (entity,) in matches:
        return str(entity)
    return None
    
def get_labels(pred, lang):
    """Build a lookup function returning an entity's `pred` values in language `lang`.

    Parameters
    ----------
    pred : str
        Predicate written into the query text (e.g. ``'rdfs:label'``).
    lang : str
        Language tag the returned literals must carry (e.g. ``'en'``).

    Returns
    -------
    callable
        ``f(qid)`` -> tuple of strings, or None when `qid` is empty.
    """
    # The query text depends only on (pred, lang), so it is assembled once;
    # ?qid is bound separately on every lookup.
    label_query = """
        SELECT ?label 
        WHERE { 
            ?qid pred ?label
            FILTER (lang(?label) = "language") }
        """.replace('pred', pred).replace('language', lang)

    def get_labels_for_entity(qid):
        if not qid:
            return None
        rows = wikidata_sparql.query(label_query, namespaces, {'qid': URIRef(qid)})
        return tuple(str(label) for (label,) in rows)

    return get_labels_for_entity



In [31]:
# Resolve every distinct Freebase id to a Wikidata entity; link_wikidata is
# called once per row and queries the SPARQL endpoint each time.
df_fbid = rpi_fbid[['fbid']].drop_duplicates()
df_fbid['wikidata'] = df_fbid.fbid.apply(link_wikidata)
df_fbid.head()

 SELECT ?qid WHERE { ?qid wdt:P646 ?freebase } LIMIT 1 /m/05f66lq

Unnamed: 0,fbid,wikidata
0,m.09x4kl,http://www.wikidata.org/entity/Q887046
1,m.0261m,http://www.wikidata.org/entity/Q664609
2,m.0d3k14,http://www.wikidata.org/entity/Q9696
3,m.0h3ngjt,http://www.wikidata.org/entity/Q4483762
4,m.01l8x2,http://www.wikidata.org/entity/Q164706


In [32]:
# Add one column per (kind, language) pair: wiki_label_* come from rdfs:label
# and wiki_alias_* from skos:altLabel, each in English, Russian and Ukrainian.
for column_kind, predicate in (('label', 'rdfs:label'), ('alias', 'skos:altLabel')):
    for language in ('en', 'ru', 'uk'):
        df_fbid['wiki_' + column_kind + '_' + language] = df_fbid['wikidata'].apply(get_labels(predicate, language))
# Represent every missing cell as None.
df_fbid = df_fbid.where(pd.notnull(df_fbid), None)
df_fbid.head()

Unnamed: 0,fbid,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk
0,m.09x4kl,http://www.wikidata.org/entity/Q887046,"(Kristen Maloney,)",(),"(Крістін Мелоні,)",(),(),()
1,m.0261m,http://www.wikidata.org/entity/Q664609,"(Caribbean,)","(Карибы,)","(Кариби,)","(The Caribbean,)","(Вест-Индия, Карибский регион)",()
2,m.0d3k14,http://www.wikidata.org/entity/Q9696,"(John F. Kennedy,)","(Джон Фицджеральд Кеннеди,)","(Джон Фітцджеральд Кеннеді,)","(John Kennedy, Kennedy, Jack Kennedy, JFK, Joh...","(John F. Kennedy, Кеннеди, Джон Фицджеральд)","(Джон Кеннеді,)"
3,m.0h3ngjt,http://www.wikidata.org/entity/Q4483762,"(Borys Filatov,)","(Филатов, Борис Альбертович,)","(Філатов Борис Альбертович,)",(),"(Борис Альбертович Филатов, Филатов Борис Альб...",()
4,m.01l8x2,http://www.wikidata.org/entity/Q164706,"(The Watchtower,)","(Сторожевая башня,)","(Вартова башта оголошує Царство Єгови,)","(The Watchtower Announcing Jehovah's Kingdom,)","(Журнал Сторожевая Башня, Журнал Сторожевая ба...",()


## Combine columns

In [33]:
# Show the columns of every frame that takes part in the joins of the next cell.
print(df_fbid.columns)
print(rpi_entity_with_justification.columns)
print(rpi_entity_valid.columns)
print(document_types.columns)

Index(['fbid', 'wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk',
       'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk'],
      dtype='object')
Index(['e', 'type', 'label', 'target', 'source', 'start', 'end',
       'justificationType'],
      dtype='object')
Index(['e', 'type', 'name', 'text', 'source', 'target'], dtype='object')
Index(['source', 'lang'], dtype='object')


In [34]:
# Start from the distinct (entity, type, source) rows that survived the
# justification-type filter and attach the per-entity frames on 'e'.
df = rpi_entity_with_justification_filtered[['e', 'type', 'source']].drop_duplicates()
if add_origin:
    df = df.join(df_origin, on='e').join(df_origin_label, on='e')
df = df.join(df_label, on='e')
df = df.join(df_target.set_index('e'), on='e')
df = df.join(rpi_external.set_index('e'), on='e')

def add_wd(fbids):
    """Collect the Wikidata information of a sequence of Freebase ids.

    Each id is looked up in the notebook-level ``df_fbid`` frame.

    Parameters
    ----------
    fbids : sequence of str, or None / NaN
        Freebase ids of one entity.

    Returns
    -------
    pandas.Series
        Entries ``wikidata``, ``wiki_label_{en,ru,uk}`` and
        ``wiki_alias_{en,ru,uk}``. Every entry is None when `fbids` is empty
        or missing. Otherwise every entry is a tuple with one element per id:
        the Wikidata URI, the first label in that language (or the empty
        value stored in ``df_fbid``), and the full alias tuple. An id that has
        no row in ``df_fbid`` contributes None to every tuple.
    """
    label_columns = ('wiki_label_en', 'wiki_label_ru', 'wiki_label_uk')
    alias_columns = ('wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk')

    # A missing cell can reach this function as None or as a float NaN; NaN is
    # truthy, so it has to be excluded explicitly before iterating.
    if fbids is None or isinstance(fbids, float) or not fbids:
        return pd.Series(dict.fromkeys(('wikidata',) + label_columns + alias_columns))

    wikidata = []
    labels = {column: [] for column in label_columns}
    aliases = {column: [] for column in alias_columns}

    for fbid in fbids:
        row_df = df_fbid.loc[df_fbid.fbid == fbid]
        # An unknown id yields None values instead of an IndexError.
        found = len(row_df.index) > 0

        def cell(column):
            return row_df[column].values[0] if found else None

        wikidata.append(cell('wikidata'))
        for column in label_columns:
            value = cell(column)
            # Keep only the first label; an empty or missing value is passed
            # through unchanged.
            labels[column].append(value[0] if value else value)
        for column in alias_columns:
            aliases[column].append(cell(column))

    result = {'wikidata': tuple(wikidata)}
    for column in label_columns:
        result[column] = tuple(labels[column])
    for column in alias_columns:
        result[column] = tuple(aliases[column])
    return pd.Series(result)
# Represent every missing cell as None so add_wd sees None for entities
# without Freebase ids.
df = df.where(pd.notnull(df), None)
df[['wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk']] = df['fbid'].apply(add_wd)

# Attach the document language (by source) and the entity names (by entity).
df = (
    df.join(document_types.set_index('source'), on='source')
      .join(df_names[['e', 'name']].set_index('e'), on='e')
)

# Final column order; the two origin columns are present only when they were
# computed.
df = df[[
    'e', 'type', 'name', 'source', 'targets', 'target_scores',
    'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata',
    'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk',
    'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk',
    *(['origin', 'originLabel'] if add_origin else []),
    'lang', 'label',
]]

df_all = df
df_all.head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label
2,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,"(Ukrainian Army, Ukrainian Army)",HC00002Z8,"(LDC2019E43:80000078, LDC2019E43:80000090)","(0.2857143, 0.9158879)","(m.0261hds, m.07wh1)","(0.5795250386, 0.8517600451)","(0.5834840387, 0.9600124955)","(http://www.wikidata.org/entity/Q682152, http:...","(Ukrainian Ground Forces, United States Army)","(Сухопутные войска Украины, Армия США)","(Сухопутні війська Збройних Сил України, Армія...","((Ukrainian Army,), (USA, American Army, Groun...","((), ())","((15-й полк реактивної артилерії, Сухопутні Ві...",en,
4,http://www.isi.edu/gaia/entities/a16d4e70-dd96...,ldcOnt:VAL.Number.Number,,HC00002Z8,,,,,,,,,,,,,en,
6,http://www.isi.edu/gaia/entities/921c9071-2e37...,ldcOnt:VAL.Number.Number,,HC00002Z8,,,,,,,,,,,,,en,
7,http://www.isi.edu/gaia/entities/86ae3b62-df67...,ldcOnt:VAL.Number.Number,,HC00002Z8,,,,,,,,,,,,,en,
8,http://www.isi.edu/gaia/entities/8c94600f-74ca...,ldcOnt:VAL,,HC00002Z8,,,,,,,,,,,,,en,


## Write out dataframe

In [37]:
# Write the combined frame to entity_all_<version>.h5 (key 'entity') and read
# it back once to check that the file can be loaded.
df_all.to_hdf(store_data_dir + '/entity_all_' + version + '.h5', 'entity', mode='w', format='fixed')
_ = pd.read_hdf(store_data_dir + '/entity_all_' + version + '.h5')
df_all.head()

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['e', 'type', 'name', 'source', 'targets', 'target_scores', 'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk', 'lang', 'label']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label
2,http://www.isi.edu/gaia/entities/d433e6bb-5284...,ldcOnt:ORG.MilitaryOrganization,"(Ukrainian Army, Ukrainian Army)",HC00002Z8,"(LDC2019E43:80000078, LDC2019E43:80000090)","(0.2857143, 0.9158879)","(m.0261hds, m.07wh1)","(0.5795250386, 0.8517600451)","(0.5834840387, 0.9600124955)","(http://www.wikidata.org/entity/Q682152, http:...","(Ukrainian Ground Forces, United States Army)","(Сухопутные войска Украины, Армия США)","(Сухопутні війська Збройних Сил України, Армія...","((Ukrainian Army,), (USA, American Army, Groun...","((), ())","((15-й полк реактивної артилерії, Сухопутні Ві...",en,
4,http://www.isi.edu/gaia/entities/a16d4e70-dd96...,ldcOnt:VAL.Number.Number,,HC00002Z8,,,,,,,,,,,,,en,
6,http://www.isi.edu/gaia/entities/921c9071-2e37...,ldcOnt:VAL.Number.Number,,HC00002Z8,,,,,,,,,,,,,en,
7,http://www.isi.edu/gaia/entities/86ae3b62-df67...,ldcOnt:VAL.Number.Number,,HC00002Z8,,,,,,,,,,,,,en,
8,http://www.isi.edu/gaia/entities/8c94600f-74ca...,ldcOnt:VAL,,HC00002Z8,,,,,,,,,,,,,en,
