# Read GraphDB

In [64]:
from gastrodon import RemoteEndpoint,QName,ttl,URIRef,inline
import pandas as pd
import json
import os

In [65]:
from gastrodon import _parseQuery
from SPARQLWrapper import SPARQLWrapper, N3
from rdflib import Graph
def describe(self, sparql:str):
    """Run a SPARQL DESCRIBE query and return the result as N3 text.

    Args:
        sparql: the DESCRIBE query; it is forwarded unchanged to ``self._describe``.

    Returns:
        The graph returned by ``self._describe`` serialized as an N3 ``str``.
    """
    serialized = self._describe(sparql).serialize(format='n3')
    # Older rdflib returns bytes from serialize(); rdflib >= 6 returns str,
    # on which .decode() would raise AttributeError.
    if isinstance(serialized, bytes):
        return serialized.decode()
    return serialized

def _describe(self, sparql:str):
    that = endpoint._wrapper()
    that.setQuery(endpoint._prepend_namespaces(sparql, _parseQuery))
    that.setReturnFormat(N3)
    results = that.query().convert()
    g = Graph()
    g.parse(data=results, format="n3")
    return g

# Monkey-patch the two helpers onto gastrodon's RemoteEndpoint so that
# DESCRIBE queries can be issued as methods of an endpoint instance.
RemoteEndpoint.describe = describe
RemoteEndpoint._describe = _describe

In [66]:
# Turtle prefix declarations; parsed with inline(...) and handed to the
# RemoteEndpoint as its `prefixes` when the endpoint is created below.
# NOTE(review): the domainOntology IRI has no trailing '#', unlike the other
# ontology prefixes — confirm that this is intended.
namespaces_str = """
@prefix : <https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/AidaDomainOntologiesCommon#> .
@prefix aida: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/InterchangeOntology#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix domainOntology: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/SeedlingOntology> .
@prefix ldc: <https://tac.nist.gov/tracks/SM-KBP/2018/ontologies/LdcAnnotations#> .
@prefix ldcOnt: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/LDCOntology#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

### Params

In [None]:
# Notebook parameters — edit these before running.
endpoint_url = 'http://gaiadev01.isi.edu:7200/repositories'  # base URL; '/' + repo is appended when the endpoint is built
repo = 'jchen-test-ta1'
version = '001'  # embedded in the name of every .h5 file written by this notebook
store_data_dir = 'store_data/' + repo # output directory; the next cell creates it if it is missing
add_origin = False # Set True if origin and origin labels are needed by TA3 (slow)
wikidata_sparql_endpoint = "https://dsbox02.isi.edu:8888/bigdata/namespace/wdq/sparql"

In [68]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this a no-op when the directory is already there and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(store_data_dir, exist_ok=True)

In [69]:
# Connect to the selected repository; the prefixes parsed from namespaces_str
# are supplied to the endpoint as its `prefixes` graph.
endpoint = RemoteEndpoint(url= endpoint_url + '/' + repo,
                          prefixes=inline(namespaces_str).graph)

In [70]:
# List every distinct aida:system value present in the repository.
system_list = endpoint.select('select distinct ?sys where {?s aida:system ?sys}')
system_list

Unnamed: 0,sys
0,http://www.rpi.edu
1,http://www.isi.edu/promoteConfidencesToTypeAss...
2,http://www.rpi.edu-projectToSingleton
3,http://www.isi.edu/gaia/propagateLinksToClusters
4,http://www.isi.edu/clusters/identicalRelations
5,http://www.rpi.edu/fileType
6,http://www.rpi.edu/coreference
7,http://www.rpi.edu/EDL_Freebase
8,http://www.rpi.edu/EDL_FineGrained
9,http://www.rpi.edu/EDL_LORELEI_maxPool


In [71]:
def to_int(s):
    """Convert an offset value to ``int``, using -1 as the "missing" marker.

    Args:
        s: the raw cell value (string, integer, float, or a missing value).

    Returns:
        The integer value for integers (including numpy integer scalars),
        integer-valued floats and numeric strings; -1 for everything else
        (``None``, NaN, non-integral floats, unparseable strings).
    """
    import numbers

    if isinstance(s, numbers.Integral):
        return int(s)
    if isinstance(s, float):
        # A column that contains missing values is promoted to float; keep the
        # real offsets (e.g. 9838.0) instead of collapsing them all to -1.
        return int(s) if s.is_integer() else -1
    if isinstance(s, str):
        try:
            return int(s)
        except ValueError:
            return -1
    return -1

## Entities

In [72]:
# Print the DESCRIBE result for the entity pattern below (LIMIT 1) so the raw
# triples can be inspected; uses the describe() helper patched onto RemoteEndpoint.
print(endpoint.describe("""
DESCRIBE ?e {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu>
}
LIMIT 1
"""))

@prefix ns1: <https://tac.nist.gov/tracks/SM-KBP/2019/ontologies/InterchangeOntology#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://www.isi.edu/gaia/assertions/00134707-ce4c-4a2f-910c-96a1e1614b49> rdf:subject <http://www.isi.edu/gaia/entities/2d26abd5-94c7-4028-a4df-3ce5d630d41c> .

<http://www.isi.edu/gaia/assertions/8491415b-27c8-4651-822a-1d0eb0d83415> rdf:subject <http://www.isi.edu/gaia/entities/2d26abd5-94c7-4028-a4df-3ce5d630d41c> .

<http://www.isi.edu/gaia/entities/2d26abd5-94c7-4028-a4df-3ce5d630d41c-cluster-projectedFromSingleton> ns1:prototype <http://www.isi.edu/gaia/entities/2d26abd5-94c7-4028-a4df-3ce5d630d41c> .

<http://www.isi.edu/gaia/entities/2d26abd5-94c7-4028-a4df-3ce5d630d41c> a ns1:Entity ;
    ns1:hasName "contractor" ;
    ns1:informativeJustification _:f8dea7f740dc143f480f3

Issues:

1. We don't have `prefLabel` any more.
2. Some entities contain more than one external link. E.g. entity `<http://www.isi.edu/gaia/entities/0007bd9f-dfc2-4c5c-ab44-18a86c7be089>` contains two external links: one is `m.07t21` ("Ukraine") and the other is `m.05vz3zq` ("Soviet Union").

In [73]:
# Select every distinct aida:Entity whose aida:system is <http://www.rpi.edu>.
entities = endpoint.select("""
SELECT DISTINCT ?e {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu> ;
}
""")

# Display (rows, columns) of the result.
entities.shape

(739, 1)

## Entity Freebase IDs

In [74]:
def getFBIDs(s):
    """Unpack a Freebase-link JSON string into parallel tuples.

    Args:
        s: JSON text whose ``'freebase_link'`` entry maps each Freebase id to
            a dict carrying ``'average_score'`` and ``'max_score'``. May be
            empty or missing.

    Returns:
        A ``pd.Series`` with keys ``'fbid'``, ``'fbid_score_avg'`` and
        ``'fbid_score_max'``; each value is a tuple, and the three tuples are
        aligned by position. All three are empty when ``s`` is empty, is not
        text (e.g. ``None``/NaN), or has no ``'freebase_link'`` entry.
    """
    links = {}
    if s and isinstance(s, (str, bytes, bytearray)):
        # `or {}` covers both a missing key and an explicit JSON null.
        links = json.loads(s).get('freebase_link') or {}
    fbids = tuple(links)
    return pd.Series({
        'fbid': fbids,
        'fbid_score_avg': tuple(links[fbid].get('average_score') for fbid in fbids),
        'fbid_score_max': tuple(links[fbid].get('max_score') for fbid in fbids),
    })
    
    
# Fetch, for each RPI entity, the JSON text stored in its EDL_Freebase
# privateData node; the raw JSON string is returned in the ?fbid column.
df = endpoint.select("""
SELECT DISTINCT ?e ?fbid {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu> ;
       aida:privateData [
            aida:jsonContent ?fbid ;
            aida:system <http://www.rpi.edu/EDL_Freebase>
        ]
}
""")

# Expand each JSON string into three aligned tuple columns; 'fbid' is
# overwritten with the tuple of ids.
df[['fbid', 'fbid_score_avg', 'fbid_score_max']] = df.fbid.apply(getFBIDs)
df = df.astype({
    'e': str, 'fbid': object, 'fbid_score_avg': object, 'fbid_score_max': object
})
rpi_external = df
rpi_external.head()

Unnamed: 0,e,fbid,fbid_score_avg,fbid_score_max
0,http://www.isi.edu/gaia/entities/2c120082-b098...,"(m.09c7w0,)","(0.4483456767,)","(0.9721111059,)"
1,http://www.isi.edu/gaia/entities/330d72d3-d391...,"(m.0840w,)","(0.7852166295,)","(0.7852166295,)"
2,http://www.isi.edu/gaia/entities/3523b0d0-a9e5...,"(m.0840w,)","(0.7079921365,)","(0.7079921365,)"
3,http://www.isi.edu/gaia/entities/46cee7f7-c961...,"(m.0g9zsyz,)","(0.9074373543,)","(0.9074373543,)"
4,http://www.isi.edu/gaia/entities/52d7e9ed-db5e...,"(m.0840w,)","(0.8988551658,)","(0.9914882779,)"


### Find unique freebase IDs

In [75]:
# Flatten the per-entity fbid tuples into the set of distinct Freebase ids,
# then wrap that set in a one-column frame.
fbidss = rpi_external.fbid.tolist()
fbids = {fbid for fbid_list in fbidss for fbid in fbid_list}
d = {'fbid': list(fbids)}
rpi_fbid = pd.DataFrame(data=d)
rpi_fbid.head()

Unnamed: 0,fbid
0,m.0c2105
1,m.02gjcx
2,m.061456
3,m.02wz7x
4,m.0j0k


## Entity Targets

In [76]:
# For every (entity, link target) pair keep the maximum confidence value
# found on the entity's links to that target.
df = endpoint.select("""
select ?e ?target ?score {
SELECT distinct ?e ?target (MAX(?cv) as ?score)  {
    ?e a aida:Entity .
    ?e aida:system <http://www.rpi.edu> .
    ?e aida:link ?link .
    ?link aida:linkTarget ?target .
    ?link aida:confidence ?conf .
    ?conf aida:confidenceValue ?cv .
} group by ?e ?target
}
""")

def merge_targets(table):
    """Collapse the rows of one group into aligned tuples.

    Args:
        table: a frame with 'target' and 'score' columns.

    Returns:
        A ``pd.Series`` with 'targets' and 'target_scores', each a tuple in
        row order; both tuples are empty when ``table`` has no rows.
    """
    if len(table.index) == 0:
        return pd.Series({'targets': tuple([]), 'target_scores': tuple([])})
    merged = {
        'targets': tuple(table['target'].tolist()),
        'target_scores': tuple(table['score'].tolist()),
    }
    return pd.Series(merged)
# One row per entity, holding its targets and their scores as aligned tuples.
# The columns are selected with a list (double brackets): selecting them with
# a bare tuple on a groupby is deprecated and rejected by recent pandas.
df = df.groupby('e')[['target', 'score']].apply(merge_targets).reset_index()
df = df.astype({
    'e': str, 'targets': object, 'target_scores': object
})
df_target = df
df_target.head()

Unnamed: 0,e,targets,target_scores
0,http://www.isi.edu/gaia/entities/00029697-0740...,"(LDC2019E43:2971316, LDC2019E43:6252001, LDC20...","(0.0003, 1.0, 0.0005066223)"
1,http://www.isi.edu/gaia/entities/021ed99f-8060...,"(LDC2019E43:8054100, LDC2019E43:1213721, LDC20...","(0.0001, 0.0076, 1.0, 0.0001)"
2,http://www.isi.edu/gaia/entities/0c80ef93-559a...,"(LDC2019E43:4187204, LDC2019E43:1668284, LDC20...","(0.0003028468, 0.03174603, 0.0005028437, 0.996..."
3,http://www.isi.edu/gaia/entities/15703b9b-2267...,"(LDC2019E43:3577718, LDC2019E43:479441, LDC201...","(0.0004, 0.0004, 0.008547008, 1.0)"
4,http://www.isi.edu/gaia/entities/1d4273b1-b7a2...,"(LDC2019E43:1268737, LDC2019E43:1252558)","(0.0027, 0.9986)"


## Entity Justifications

In [77]:
# One row per (entity, type assertion, justification), with the optional
# translation label, link target, source span and justification type.
df = endpoint.select("""
SELECT DISTINCT ?e ?type ?label ?target ?source ?start ?end ?justificationType {
    ?e a aida:Entity ;
       aida:system <http://www.rpi.edu> ;
       ^rdf:subject [
        a rdf:Statement ;
        rdf:predicate rdf:type ;
        rdf:object ?type ;
        aida:justifiedBy ?justification ]
    OPTIONAL { ?justification aida:privateData [
            aida:jsonContent ?label ;
            aida:system <http://www.rpi.edu/EDL_Translation> ]}
    OPTIONAL { ?e aida:link/aida:linkTarget ?target }
    OPTIONAL { ?justification aida:source ?source }
    OPTIONAL { ?justification aida:startOffset ?start }
    OPTIONAL { ?justification aida:endOffsetInclusive ?end }
    OPTIONAL { ?justification aida:privateData [ 
            aida:system <http://www.rpi.edu> ;
            aida:jsonContent ?justificationType ] }
}
""")
# Missing offsets become -1 so the columns can be cast to int below.
df.start = df.start.apply(to_int)
df.end = df.end.apply(to_int)
# ?justificationType comes from an OPTIONAL pattern, so it can be unbound;
# guard the JSON decode the same way the label column does.
df.justificationType = df.justificationType.apply(
    lambda s: json.loads(s).get('justificationType') if pd.notnull(s) and s else None)
df.label = df.label.apply(lambda s: tuple(json.loads(s).get('translation')) if s else None)
df = df.astype({
    'e': str, 'type': str, 'target': str, 'source': str, 'start': int, 'end': int, 'justificationType': str
})
rpi_entity_with_justification = df
rpi_entity_with_justification.head()

Unnamed: 0,e,type,label,target,source,start,end,justificationType
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,,,HC00017P3,9838,9847,mention
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,,,HC00017P3,3280,3295,mention
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,,LDC2019E43:4699848,HC00017P3,9537,9542,mention
3,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,,LDC2019E43:1545739,HC00017P3,9537,9542,mention
4,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,,LDC2019E43:1269750,HC00017P3,9537,9542,mention


## Entity Types

In [78]:
# One row per entity / source / type assertion, with the optional name,
# text value and link target.
df = endpoint.select("""
SELECT DISTINCT ?e ?type ?name ?text ?source ?target {
    ?e a aida:Entity ;
       aida:justifiedBy/aida:source ?source ;
       aida:system <http://www.rpi.edu> .
    ?statement a rdf:Statement ;
               rdf:subject ?e ;
               rdf:predicate rdf:type ;
               rdf:object ?type .
    OPTIONAL { ?e aida:hasName ?name }
    OPTIONAL { ?e aida:textValue ?text }
    OPTIONAL { ?e aida:link/aida:linkTarget ?target }
}
""")
# NOTE: casting to str turns a missing value (None) into the literal string
# 'None'; the "Transform Entities" cell maps 'None' names back to None.
df = df.astype({
    'e': str, 'type': str, 'name': str, 'source': str, 'text': str, 'target': str
})
rpi_entity_valid = df
rpi_entity_valid.head()

Unnamed: 0,e,type,name,text,source,target
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,contractor,,HC00017P3,
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,Jayashree Lakhan,,HC00017P3,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,India,,HC00017P3,LDC2019E43:4699848
3,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,India,,HC00017P3,LDC2019E43:1545739
4,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,India,,HC00017P3,LDC2019E43:1269750


## Relations

In [79]:
# One row per relation / type assertion / justification, with the optional
# source document and character span of the justification.
df = endpoint.select("""
SELECT DISTINCT ?e ?type ?source ?start ?end {
    ?e a aida:Relation ;
       aida:system <http://www.rpi.edu> .
    ?statement a rdf:Statement ;
               rdf:subject ?e ;
               rdf:predicate rdf:type ;
               rdf:object ?type ;
               aida:justifiedBy ?justification 
    OPTIONAL { ?justification aida:source ?source }
    OPTIONAL { ?justification aida:startOffset ?start }
    OPTIONAL { ?justification aida:endOffsetInclusive ?end }
}
""")
# Convert the offsets with to_int so the columns can be cast to int below.
df.start = df.start.apply(to_int)
df.end = df.end.apply(to_int)
df = df.astype({
    'e': str, 'type': str, 'source': str, 'start': int, 'end': int
})
rpi_relation = df
rpi_relation.head()

Unnamed: 0,e,type,source,start,end
0,http://www.isi.edu/gaia/relations/f1e0222d-5ec...,ldcOnt:Physical.Resident.Resident,HC00017P3,2925,2977
1,http://www.isi.edu/gaia/relations/b4fd8acf-686...,ldcOnt:Physical.LocatedNear,HC00017P3,1552,1673
2,http://www.isi.edu/gaia/relations/6e53c7fa-d16...,ldcOnt:PersonalSocial.Unspecified,HC00017P3,1421,1549
3,http://www.isi.edu/gaia/relations/62dbcec8-068...,ldcOnt:Measurement.Size.Count,HC00017P3,4956,5148
4,http://www.isi.edu/gaia/relations/eb01c193-c7d...,ldcOnt:Physical.LocatedNear,HC00017P3,2081,2277


In [80]:
# Every non-type statement about a relation: ?p is the predicate and ?o the
# object of the reified statement whose subject is the relation.
df = endpoint.select("""
SELECT DISTINCT ?e ?p ?o {
    ?e a aida:Relation ;
       aida:system <http://www.rpi.edu> .
    ?statement a rdf:Statement ;
               rdf:subject ?e ;
               rdf:predicate ?p ;
               rdf:object ?o 
    FILTER (?p != rdf:type)
}
""")
df = df.astype({
    'e': str, 'p': str, 'o': str
})
rpi_relation_roles = df
rpi_relation_roles.head()

Unnamed: 0,e,p,o
0,http://www.isi.edu/gaia/relations/f1e0222d-5ec...,ldcOnt:Physical.Resident.Resident_Resident,http://www.isi.edu/gaia/entities/f44b5651-cd1a...
1,http://www.isi.edu/gaia/relations/f1e0222d-5ec...,ldcOnt:Physical.Resident.Resident_Place,http://www.isi.edu/gaia/entities/a3a208a9-a5df...
2,http://www.isi.edu/gaia/relations/b4fd8acf-686...,ldcOnt:Physical.LocatedNear_Place,http://www.isi.edu/gaia/entities/491e2d52-5772...
3,http://www.isi.edu/gaia/relations/b4fd8acf-686...,ldcOnt:Physical.LocatedNear_EntityOrFiller,http://www.isi.edu/gaia/entities/a3a208a9-a5df...
4,http://www.isi.edu/gaia/relations/6e53c7fa-d16...,ldcOnt:PersonalSocial.Unspecified_Person,http://www.isi.edu/gaia/entities/cb9a7794-88c2...


## Documents

In [81]:
# Distinct (source, fileType JSON) pairs taken from text justifications that
# carry a <http://www.rpi.edu/fileType> privateData node.
df = endpoint.select("""
SELECT DISTINCT ?source ?fileType {
    ?justification a aida:TextJustification ;
                   aida:system <http://www.rpi.edu> ;
                   aida:source ?source ;
                   aida:privateData ?filePrivate .
    ?filePrivate aida:system <http://www.rpi.edu/fileType> ;
                 aida:jsonContent ?fileType
}
""")
# The 'fileType' value inside the JSON is stored in a column named 'lang';
# the raw JSON column is then dropped.
df['lang'] = df.fileType.apply(lambda s: json.loads(s).get('fileType'))
df = df.drop(columns='fileType')
df = df.astype({
    'source': str, 'lang': str
})
document_types = df
document_types.head()

Unnamed: 0,source,lang
0,HC00017P3,en
1,HC00017UR,en
2,HC00017RX,en
3,HC00017RT,en
4,HC00017P5,en


In [82]:
# Persist the raw query results, one HDF5 file per frame, then read every
# file back as a smoke test that it was written and can be loaded.
# `key` is passed by keyword: passing it positionally to to_hdf is deprecated
# in recent pandas.
_hdf_outputs = [
    ('entity_with_labels', rpi_entity_with_justification),
    ('entity_valid', rpi_entity_valid),
    ('relation', rpi_relation),
    ('relation_roles', rpi_relation_roles),
    ('document', document_types),
]
for _stem, _frame in _hdf_outputs:
    _frame.to_hdf(store_data_dir + '/' + _stem + '_' + version + '.h5', key='entity', mode='w', format='fixed')
for _stem, _frame in _hdf_outputs:
    _ = pd.read_hdf(store_data_dir + '/' + _stem + '_' + version + '.h5')

# Transform Entities

### Entities

In [83]:
# Work on a copy so that rpi_entity_valid (already displayed and written to
# disk above) is not modified by this cell.
df = rpi_entity_valid.copy()
# The query cell cast 'name' to str, which turned missing names into the
# literal string 'None'; map those back to a real None. A column-wise map is
# used instead of a row-wise apply, which is slower and does not yield a
# Series on an empty frame.
df['name'] = df['name'].map(lambda v: None if v == 'None' else v)
df = df.drop(columns='text')
df = df.drop_duplicates()
# Keep the first (e, type, source) row per entity and attach the tuple of all
# of that entity's names.
df = df[['e', 'type', 'source']].groupby('e').head(1).join(df.groupby('e')['name'].apply(tuple), on='e')
# A tuple whose first element is empty/None is treated as "no name".
df['name'] = df['name'].apply(lambda s: s if s[0] else None)
df_names = df
df.head()

Unnamed: 0,e,type,source,name
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,HC00017P3,"(contractor, contractor)"
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,HC00017P3,"(Jayashree Lakhan,)"
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,HC00017P3,"(India, India, India, India, India's, India's,..."
14,http://www.isi.edu/gaia/entities/6329ec5d-f1a0...,ldcOnt:PER.MilitaryPersonnel,HC00017P3,"(director, director)"
15,http://www.isi.edu/gaia/entities/086bcfa2-fdc7...,ldcOnt:PER,HC00017P3,


### Origin - Slow, omit if not needed by TA3

In [84]:
from model.source import LTFSourceContext

def query_context(source, start, end):
    """Look up the context of a span via ``LTFSourceContext``.

    Args:
        source: identifier passed to ``LTFSourceContext``.
        start: start offset; -1 means "missing".
        end: end offset; -1 means "missing".

    Returns:
        Whatever ``LTFSourceContext(source).query_context(start, end)``
        returns, or ``None`` when either offset is -1 or the document does
        not exist.
    """
    has_span = not (start == -1 or end == -1)
    if has_span:
        extractor = LTFSourceContext(source)
        if extractor.doc_exists():
            return extractor.query_context(start, end)
    return None
    
def query_label(source, start, end):
    """Look up the text of a span via ``LTFSourceContext``.

    Args:
        source: identifier passed to ``LTFSourceContext``.
        start: start offset; -1 means "missing".
        end: end offset; -1 means "missing".

    Returns:
        Whatever ``LTFSourceContext(source).query(start, end)`` returns, or
        ``None`` when either offset is -1 or the document does not exist.
    """
    has_span = not (start == -1 or end == -1)
    if not has_span:
        return None
    extractor = LTFSourceContext(source)
    return extractor.query(start, end) if extractor.doc_exists() else None

In [85]:
if add_origin:
    # Adds two columns to rpi_entity_with_justification in place, one lookup
    # per row: 'origin' (query_context) and 'originLabel' (query_label).
    rpi_entity_with_justification['origin'] = rpi_entity_with_justification.apply(lambda r: query_context(r.source, r.start, r.end), axis=1)
    rpi_entity_with_justification['originLabel'] = rpi_entity_with_justification.apply(lambda r: query_label(r.source, r.start, r.end), axis=1)
    # NOTE(review): inside an `if` block this expression is not the cell's
    # last top-level expression, so it is presumably not displayed — confirm.
    rpi_entity_with_justification.head()

In [86]:
if add_origin:
    # Group the frame that actually carries the 'origin' column. The bare
    # `df` at this point is the names table from the Entities cell, which has
    # no 'origin' column and would raise KeyError.
    df_origin = rpi_entity_with_justification[['e', 'origin']].groupby('e')['origin'].apply(tuple).to_frame()
    # A tuple whose first element is empty/None is treated as "no origin".
    df_origin['origin'] = df_origin['origin'].apply(lambda s: s if s[0] else None)
    df_origin.head()

In [87]:
if add_origin:
    # Group the frame that actually carries the 'originLabel' column. The
    # bare `df` at this point is the names table from the Entities cell,
    # which has no 'originLabel' column and would raise KeyError.
    df_origin_label = rpi_entity_with_justification[['e', 'originLabel']].groupby('e')['originLabel'].apply(tuple).to_frame()
    # Result is a Series named 'originLabel', indexed by entity; a tuple whose
    # first element is empty/None is treated as "no label".
    df_origin_label = df_origin_label['originLabel'].apply(lambda s: s if s[0] else None)
    df_origin_label.head()

### Justification Type

In [88]:
# Drop the rows whose justification type is a nominal or pronominal mention.
df = rpi_entity_with_justification
_is_nominal = df['justificationType'] == 'nominal_mention'
_is_pronominal = df['justificationType'] == 'pronominal_mention'
df = df[~(_is_nominal | _is_pronominal)]
rpi_entity_with_justification_filtered = df
rpi_entity_with_justification_filtered.head()

Unnamed: 0,e,type,label,target,source,start,end,justificationType
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,,,HC00017P3,9838,9847,mention
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,,,HC00017P3,3280,3295,mention
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,,LDC2019E43:4699848,HC00017P3,9537,9542,mention
3,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,,LDC2019E43:1545739,HC00017P3,9537,9542,mention
4,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,,LDC2019E43:1269750,HC00017P3,9537,9542,mention


### Labels

In [89]:
# Distinct (entity, label) pairs from the filtered justifications.
df_label = rpi_entity_with_justification_filtered[['e', 'label']].drop_duplicates()
# Gather each entity's labels into one tuple.
df_label = df_label.groupby('e')['label'].apply(tuple).to_frame()
# Result is a Series named 'label' indexed by entity; a tuple that is empty or
# whose first element is empty/None is replaced by None.
df_label = df_label['label'].apply(lambda s: s if s and s[0] else None)
df_label.head()

e
http://www.isi.edu/gaia/entities/00029697-0740-4032-ae9b-3c264a8898ca    None
http://www.isi.edu/gaia/entities/00bd1aed-65a6-43e5-a9b4-a5ee33180584    None
http://www.isi.edu/gaia/entities/017b8275-edef-4bdb-a6ea-05022e9d9f4b    None
http://www.isi.edu/gaia/entities/01b01979-b405-4ab1-94c3-97d5b141d998    None
http://www.isi.edu/gaia/entities/01b37907-d715-4422-8fcd-1757364d77bd    None
Name: label, dtype: object

### Wikidata from freebase ID

In [94]:
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from rdflib.namespace import Namespace, RDFS, SKOS
from rdflib import URIRef, Literal

# SPARQL store used for all Wikidata lookups below (URL comes from the Params cell).
wikidata_sparql = SPARQLStore(wikidata_sparql_endpoint)
# Alternative: the public Wikidata query service.
# wikidata_sparql = SPARQLStore("https://query.wikidata.org/sparql")
WDT = Namespace('http://www.wikidata.org/prop/direct/')
# Prefix bindings passed with every Wikidata query.
# NOTE(review): get_labels is later called with 'rdfs:label', but no 'rdfs'
# prefix is bound here — presumably the endpoint predeclares it; confirm.
namespaces = {'wdt': WDT, 'skos': SKOS}

def link_wikidata(fbid):
    """Resolve a Freebase id to a Wikidata entity URI.

    Args:
        fbid: Freebase id in dotted form (e.g. ``m.0j0k``); it is rewritten to
            the slash form (``/m/0j0k``) before being matched against wdt:P646.

    Returns:
        The first matching entity as ``str``, or ``None`` when ``fbid`` is
        empty, contains ``'NIL'``, or the query yields no row.
    """
    if not fbid or 'NIL' in fbid:
        return None
    freebase_path = '/' + fbid.replace('.', '/')
    query = "SELECT ?qid WHERE { ?qid wdt:P646 ?freebase } LIMIT 1"
    # Single-line progress indicator: '\r' rewinds so each lookup overwrites the last.
    print('\r', query, freebase_path, end='')
    rows = wikidata_sparql.query(query, namespaces, {'freebase': Literal(freebase_path)})
    for (qid,) in rows:
        return str(qid)
    return None
    
def get_labels(pred, lang):
    """Build a lookup function for one predicate/language pair.

    Args:
        pred: prefixed predicate substituted into the query (e.g. ``rdfs:label``).
        lang: language tag the labels are filtered on (e.g. ``en``).

    Returns:
        A function taking an entity URI and returning the tuple of matching
        label strings, or ``None`` when the URI is empty/None.
    """
    def get_labels_for_entity(qid):
        if not qid:
            return None
        # The placeholders 'pred' and 'language' are filled by plain text replacement.
        template = """
        SELECT ?label 
        WHERE { 
            ?qid pred ?label
            FILTER (lang(?label) = "language") }
        """
        query = template.replace('pred', pred).replace('language', lang)
        rows = wikidata_sparql.query(query, namespaces, {'qid': URIRef(qid)})
        return tuple(str(label) for (label,) in rows)
    return get_labels_for_entity

In [95]:
# One row per distinct Freebase id; 'wikidata' holds the result of
# link_wikidata for that id (one remote query per row).
df_fbid = rpi_fbid[['fbid']].drop_duplicates()
df_fbid['wikidata'] = df_fbid.fbid.apply(link_wikidata)
df_fbid.head()

 SELECT ?qid WHERE { ?qid wdt:P646 ?freebase } LIMIT 1 /m/0djgt

Unnamed: 0,fbid,wikidata
0,m.0c2105,http://www.wikidata.org/entity/Q789889
1,m.02gjcx,http://www.wikidata.org/entity/Q1142402
2,m.061456,
3,m.02wz7x,http://www.wikidata.org/entity/Q422404
4,m.0j0k,http://www.wikidata.org/entity/Q48


In [96]:
# Add one column per (kind, language): labels via rdfs:label and aliases via
# skos:altLabel, each for en / ru / uk, in that order.
for _kind, _pred in (('label', 'rdfs:label'), ('alias', 'skos:altLabel')):
    for _lang in ('en', 'ru', 'uk'):
        df_fbid['wiki_' + _kind + '_' + _lang] = df_fbid['wikidata'].apply(get_labels(_pred, _lang))
# Replace null cells with None.
df_fbid = df_fbid.where(pd.notnull(df_fbid), None)
df_fbid.head()

Unnamed: 0,fbid,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk
0,m.0c2105,http://www.wikidata.org/entity/Q789889,"(Ministry of Foreign Affairs,)",(Министерство иностранных дел Российской Федер...,"(Міністерство закордонних справ Росії,)","(MFA Russia, Ministry of Foreign Affairs of th...","(МИД России, МИД Российской Федерации, МИД РФ,...","(МЗС Росії,)"
1,m.02gjcx,http://www.wikidata.org/entity/Q1142402,"(Konkan,)",(),(),(),(),()
2,m.061456,,,,,,,
3,m.02wz7x,http://www.wikidata.org/entity/Q422404,"(Government of Canada,)","(правительство Канады,)",(),"(Her Majesty's Government, federal government ...",(),()
4,m.0j0k,http://www.wikidata.org/entity/Q48,"(Asia,)","(Азия,)","(Азія,)",(),(),()


## Combine columns

In [97]:
# Show the columns of each frame that is combined in the next cell.
print(df_fbid.columns)
print(rpi_entity_with_justification.columns)
print(rpi_entity_valid.columns)
print(document_types.columns)

Index(['fbid', 'wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk',
       'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk'],
      dtype='object')
Index(['e', 'type', 'label', 'target', 'source', 'start', 'end',
       'justificationType'],
      dtype='object')
Index(['e', 'type', 'name', 'text', 'source', 'target'], dtype='object')
Index(['source', 'lang'], dtype='object')


In [98]:
# Start from the distinct (entity, type, source) rows of the filtered
# justifications and left-join the per-entity tables built above.
df = rpi_entity_with_justification_filtered[['e', 'type', 'source']].drop_duplicates()
if add_origin:
    df = df.join(df_origin, on='e').join(df_origin_label, on='e')
df = df.join(df_label, on='e')
df = df.join(df_target.set_index('e'), on='e')
# NOTE(review): the sample output shows the same entity on several rows after
# this join — presumably rpi_external holds more than one row per entity; confirm.
df = df.join(rpi_external.set_index('e'), on='e')

def add_wd(fbids):
    """Collect the Wikidata columns of ``df_fbid`` for a tuple of Freebase ids.

    Args:
        fbids: iterable of Freebase ids, each present in ``df_fbid.fbid``;
            may be empty or None.

    Returns:
        A ``pd.Series`` with keys 'wikidata', 'wiki_label_{en,ru,uk}' and
        'wiki_alias_{en,ru,uk}'. When ``fbids`` is empty/None every value is
        None. Otherwise every value is a tuple aligned with ``fbids``:
        'wikidata' holds the stored URI, each label column holds the first
        stored label (or the stored empty value unchanged), and each alias
        column holds the stored alias value as-is.
    """
    label_columns = ('wiki_label_en', 'wiki_label_ru', 'wiki_label_uk')
    alias_columns = ('wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk')
    all_columns = ('wikidata',) + label_columns + alias_columns

    if not fbids:
        return pd.Series({column: None for column in all_columns})

    collected = {column: [] for column in all_columns}
    for fbid in fbids:
        row_df = df_fbid.loc[df_fbid.fbid==fbid]
        collected['wikidata'].append(row_df['wikidata'].values[0])
        for column in label_columns:
            labels = row_df[column].values[0]
            # Keep only the first label; empty/None values pass through untouched.
            collected[column].append(labels[0] if labels else labels)
        for column in alias_columns:
            collected[column].append(row_df[column].values[0])
    return pd.Series({column: tuple(values) for column, values in collected.items()})
# Replace null cells with None so add_wd's truthiness test on 'fbid' treats
# missing values as "no ids".
df = df.where(pd.notnull(df), None)
# 'wikidata' and the label/alias columns are filled per row from the row's fbid tuple.
df[['wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk']] = df['fbid'].apply(add_wd)

# Attach the document language (by source) and the entity names (by entity).
df = df.join(document_types.set_index('source'), on='source')
df = df.join(df_names[['e', 'name']].set_index('e'), on='e')

# Fix the final column order; the origin columns exist only when add_origin is set.
if add_origin:
    df = df[['e', 'type', 'name', 'source', 'targets', 'target_scores', 'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata',
           'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en',
           'wiki_alias_ru', 'wiki_alias_uk', 'origin', 'originLabel', 'lang', 'label']]
else:
    df = df[['e', 'type', 'name', 'source', 'targets', 'target_scores', 'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata',
           'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en',
           'wiki_alias_ru', 'wiki_alias_uk', 'lang', 'label']]

df_all = df
df_all.head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,"(contractor, contractor)",HC00017P3,,,,,,,,,,,,,en,
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,"(Jayashree Lakhan,)",HC00017P3,,,,,,,,,,,,,en,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0, m.03rz4)","(0.469724013, 0.6634847224)","(0.9556255937, 0.6634847224)","(http://www.wikidata.org/entity/Q668, http://w...","(India, Indian Ocean)","(Индия, Индийский океан)","(Індія, Індійський океан)","((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,), ())","((Бгарат, Республіка Індія), ())",en,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0,)","(0.6578779092,)","(1.0,)","(http://www.wikidata.org/entity/Q668,)","(India,)","(Индия,)","(Індія,)","((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,),)","((Бгарат, Республіка Індія),)",en,
10,http://www.isi.edu/gaia/entities/6329ec5d-f1a0...,ldcOnt:PER.MilitaryPersonnel,"(director, director)",HC00017P3,,,,,,,,,,,,,en,


## Write out dataframe

In [99]:
# Write the combined table, then read it back as a smoke test.
# `key` is passed by keyword: passing it positionally to to_hdf is deprecated
# in recent pandas.
df_all.to_hdf(store_data_dir + '/entity_all_' + version + '.h5', key='entity', mode='w', format='fixed')
_ = pd.read_hdf(store_data_dir + '/entity_all_' + version + '.h5')
df_all.head()

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->['e', 'type', 'name', 'source', 'targets', 'target_scores', 'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk', 'lang', 'label']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label
0,http://www.isi.edu/gaia/entities/2d26abd5-94c7...,ldcOnt:TTL,"(contractor, contractor)",HC00017P3,,,,,,,,,,,,,en,
1,http://www.isi.edu/gaia/entities/ee5a9d74-0942...,ldcOnt:PER.Politician.HeadOfGovernment,"(Jayashree Lakhan,)",HC00017P3,,,,,,,,,,,,,en,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0, m.03rz4)","(0.469724013, 0.6634847224)","(0.9556255937, 0.6634847224)","(http://www.wikidata.org/entity/Q668, http://w...","(India, Indian Ocean)","(Индия, Индийский океан)","(Індія, Індійський океан)","((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,), ())","((Бгарат, Республіка Індія), ())",en,
2,http://www.isi.edu/gaia/entities/58fd6e68-88aa...,ldcOnt:GPE.ProvinceState.ProvinceState,"(India, India, India, India, India's, India's,...",HC00017P3,"(LDC2019E43:4699848, LDC2019E43:1545739, LDC20...","(0.001, 1.0, 1.0, 0.0438)","(m.03rk0,)","(0.6578779092,)","(1.0,)","(http://www.wikidata.org/entity/Q668,)","(India,)","(Индия,)","(Індія,)","((IND, in, Bharat, Bharatvarsh, Hindustan, IN,...","((Республика Индия,),)","((Бгарат, Республіка Індія),)",en,
10,http://www.isi.edu/gaia/entities/6329ec5d-f1a0...,ldcOnt:PER.MilitaryPersonnel,"(director, director)",HC00017P3,,,,,,,,,,,,,en,
