In [1]:
from rdflib import *

# Text file listing dataset identifiers, one per line; it is downloaded in a
# later cell, which discards the first line.
data_url = 'https://raw.githubusercontent.com/DataONEorg/semantic-query/master/lib/test_corpus_E_id_list.txt'
# Solr query template: %s is replaced by a dataset identifier. The query asks
# for the title, abstract and attribute fields as JSON.
dataset_service_url = 'https://cn.dataone.org/cn/v1/query/solr/?wt=json&fl=title,abstract,attribute&q=identifier:"%s"'
# Annotation service queried by extract_mentions; points at a local instance.
service_url = 'http://localhost:8080/annotate/annotate/'

# Root classes that create_class looks for among the superclasses of each
# mention returned by the annotation service.
measurement = URIRef('http://purl.dataone.org/odo/ECSO_00000039')
entity = URIRef('http://purl.dataone.org/odo/ECSO_00000525')
unit = URIRef('http://purl.obolibrary.org/obo/UO_0000000')
quality = URIRef('http://purl.obolibrary.org/obo/PATO_0000001')

# Namespaces used when building class definitions.
oboe = Namespace('http://ecoinformatics.org/oboe/oboe.1.0/oboe-core.owl#')
# NOTE(review): cmo is not referenced anywhere in the visible cells.
cmo = Namespace('http://purl.org/twc/ontologies/cmo.owl#')
skos = Namespace('http://www.w3.org/2004/02/skos/core#')
# Prefix under which canonicalize and create_class_uri mint new class URIs.
_prefix = Namespace('http://purl.dataone.org/odo/ECSTRA_')

In [6]:
import csv, urllib, json, urllib2
import pandas as pd
from rdflib.extras.infixowl import *
import collections
import json
import base64
import random
import datetime
import requests
from rdflib.compare import to_isomorphic

# Download the identifier list and split it into lines, dropping the first line.
# NOTE(review): presumably the first line is a header; if the file ends with a
# newline the last entry will be an empty string -- confirm.
datasets = urllib2.urlopen(data_url).read().split("\n")[1:]

def get_dataset_columns(identifier):
    """Return the ``attribute`` values the DataONE Solr index holds for a dataset.

    Parameters
    ----------
    identifier : str
        Dataset identifier. It is percent-encoded before being substituted
        into ``dataset_service_url``.

    Returns
    -------
    The ``attribute`` field of the first document in the Solr response.

    Raises
    ------
    IndexError
        If the query matches no document.
    KeyError
        If the first document has no ``attribute`` field.
    urllib2.URLError
        If the HTTP request fails.
    """
    # Identifiers can contain characters such as '&', '#', '+' or spaces that
    # would otherwise corrupt the query string, so encode everything.
    url = dataset_service_url % urllib.quote(identifier, safe='')
    response = urllib2.urlopen(url)
    try:
        payload = json.loads(response.read())
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    return payload['response']['docs'][0]['attribute']

# N-Triples dump that a later cell loads into `graph`.
# NOTE(review): absolute, user-specific path -- must be edited to run elsewhere.
nt_file = '/Users/jimmccusker/src/linkipedia/dataone-index/NTriple/merged.nt'
from rdflib import *

# Ontology graph used for label and superclass lookups, backed by rdflib's
# 'Sleepycat' store plugin.
graph = ConjunctiveGraph(store='Sleepycat')
# Open the store named 'ontology_db', creating it if it does not exist yet.
graph.open('ontology_db', create = True)

def extract_mentions(text):
    """Return the ontology terms the annotation service finds in ``text``.

    The service at ``service_url`` is asked for up to 20 results. Every
    annotation of every result contributes its score to the URL it names, so
    a URL annotated several times accumulates the sum of its scores.

    Parameters
    ----------
    text : str
        Free text to annotate (for example a dataset column description).

    Returns
    -------
    list of URIRef
        The distinct annotated URLs, ordered from highest to lowest total
        score, so that callers taking the first match get the best one.

    Raises
    ------
    Whatever ``requests`` raises on connection or JSON decoding problems, and
    ``KeyError`` if the response lacks the expected fields.
    """
    scores = collections.defaultdict(float)
    params = {
        'numResult':20,
        #'minScore':5,
        'query':text
    }
    response = requests.get(service_url, params=params).json()
    for r in response['results']:
        for annotation in r['annotations']:
            scores[annotation['url']] += float(annotation['score'])
    # Best match first: create_class picks element [0] of each group, which
    # with an ascending sort was the lowest-scoring mention.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [URIRef(url) for url, total in ranked]

def create_id():
    """Return a random URL-safe base64 token with any '=' padding stripped.

    The encoded value is the product of a random float in [0, 1) and the
    ordinal of the current date.
    """
    fraction = random.random()
    today_ordinal = datetime.datetime.now().toordinal()
    raw = bytes(fraction * today_ordinal)
    encoded = base64.urlsafe_b64encode(raw)
    return encoded.rstrip("=")

def canonicalize(resource):
    """Rename ``resource`` to a URI derived from the digest of its graph.

    The digest is ``to_isomorphic(resource.graph).graph_digest()``; the new
    URI is that digest appended to ``_prefix``. Every triple in the graph
    that has the old identifier as object or as subject is rewritten in place
    to use the new URI.

    Parameters
    ----------
    resource : rdflib Resource
        The node to rename; its ``graph`` is modified.

    Returns
    -------
    rdflib Resource
        The renamed node, bound to the same graph.
    """
    target_graph = resource.graph
    old_id = resource.identifier
    digest = to_isomorphic(target_graph).graph_digest()
    skolemized = _prefix[str(digest)]
    # Snapshot the matches before touching the graph: removing and adding
    # triples while the triples() generator is still being consumed is unsafe.
    for s, p, o in list(target_graph.triples((None, None, old_id))):
        target_graph.remove((s, p, o))
        target_graph.add((s, p, skolemized))
    for s, p, o in list(target_graph.triples((old_id, None, None))):
        target_graph.remove((s, p, o))
        target_graph.add((skolemized, p, o))
    return target_graph.resource(skolemized)
                
def create_class_uri():
    """Mint a class URI by appending a fresh ``create_id()`` token to ``_prefix``."""
    token = create_id()
    return _prefix[token]

def by_super_class(resources):
    """Group ``resources`` by every class reachable from them via rdfs:subClassOf.

    The superclass chain is looked up in the module-level ontology ``graph``.

    Returns
    -------
    collections.defaultdict(list)
        Maps each node yielded by ``graph.transitive_objects`` to the
        resources that reach it, in the order the resources were given.
    """
    grouped = collections.defaultdict(list)
    for member in resources:
        ancestors = graph.transitive_objects(URIRef(member), RDFS.subClassOf)
        for ancestor in ancestors:
            grouped[ancestor].append(member)
    return grouped

def _add_restricted_subclass(g, parent, build_restriction, label):
    """Define a subclass of ``parent`` through an OWL restriction and merge it into ``g``.

    The class is assembled in a scratch graph as a subclass of ``parent`` that
    is equivalent to ``parent AND <restriction>``, labelled with ``label``,
    renamed by ``canonicalize`` and finally copied into ``g``.

    ``build_restriction`` is called with the scratch graph and must return the
    restriction built in that graph. Returns the new class as a resource
    bound to ``g``.
    """
    subgraph = Graph()
    super_classes = [Class(parent, graph=subgraph)]
    c = Class(BNode(), graph=subgraph)
    c.subClassOf = super_classes
    equiv = Class(parent, graph=subgraph) & build_restriction(subgraph)
    subgraph.add((c.identifier, RDFS.label, Literal(label)))
    c.equivalentClass = [equiv]
    skolemized = canonicalize(subgraph.resource(c.identifier))
    g += subgraph
    return g.resource(skolemized.identifier)


def create_class(text, g, prefix=_prefix):
    """Derive a measurement class for a column description and record it in ``g``.

    The text is annotated with ``extract_mentions`` and the mentions are
    grouped by superclass. Starting from the generic ``measurement`` class:

    * a mention that is itself a measurement replaces the starting class;
      otherwise a mention that is a ``quality`` refines it through
      ``oboe:ofCharacteristic``;
    * a mention that is a ``unit`` refines it through ``oboe:hasUnit``;
    * a mention that is an ``entity`` refines it through
      ``oboe:measurementFor`` / ``oboe:ofEntity``.

    In each case the first mention of the group is used. ``text`` is attached
    to the resulting class as a ``skos:example``.

    Parameters
    ----------
    text : str
        Column description to classify.
    g : rdflib Graph
        Output graph; new class definitions and the example are added to it.
    prefix : Namespace
        Unused; kept for backward compatibility.

    Returns
    -------
    rdflib Resource bound to ``g``, or ``None`` when annotation failed.
    """
    try:
        resources = list(extract_mentions(text))
    except Exception:
        # Best effort: report and skip this text. A bare except would also
        # swallow KeyboardInterrupt and make a long run impossible to stop.
        print 'Error processing "', text, '".'
        return
    by_super = by_super_class(resources)
    result = graph.resource(measurement)

    if measurement in by_super:
        result = graph.resource(by_super[measurement][0])
    elif quality in by_super:
        characteristic = by_super[quality][0]
        result = _add_restricted_subclass(
            g, result.identifier,
            lambda sub: (Property(oboe.ofCharacteristic, graph=sub) | only |
                         Class(characteristic, graph=sub)),
            graph.label(characteristic))

    if unit in by_super:
        uom = by_super[unit][0]
        result = _add_restricted_subclass(
            g, result.identifier,
            lambda sub: (Property(oboe.hasUnit, graph=sub) | only |
                         Class(uom, graph=sub)),
            '%s in %s' % (result.label(), graph.label(uom)))

    if entity in by_super:
        entity_class = by_super[entity][0]
        result = _add_restricted_subclass(
            g, result.identifier,
            lambda sub: (Property(oboe.measurementFor, graph=sub) | only |
                         (Property(oboe.ofEntity, graph=sub) | only |
                          Class(entity_class, graph=sub))),
            '%s %s' % (graph.label(entity_class), result.label()))

    # Record the example in the output graph. When no refinement ran, `result`
    # is still bound to the ontology graph, so result.add() would have written
    # the triple into that store and left it out of g.
    g.add((result.identifier, skos.example, Literal(text)))
    return g.resource(result.identifier)

import ipywidgets 
from IPython.display import display

def extract(datasets):
    f = ipywidgets.FloatProgress(min=0, max=len(datasets))
    ftext = ipywidgets.Text(value='0', description='%')
    display(f)
    display(ftext)
    result_graph = Graph()
    result_graph.parse(data='''
    @prefix owl: <http://www.w3.org/2002/07/owl#>.
    <https://purl.org/dataone/ontologies/observation/ecstra.owl> a owl:Ontology;
        owl:imports <https://purl.org/dataone/ontologies/observation/d1-ECSO.owl>.''', format="turtle")
    i = 0
    for dataset in datasets:
        try:
            columns = get_dataset_columns(dataset)
        except:
            print "Problem processing the dataset '", dataset, "'."
            continue
        for column in columns:
            create_class(column, result_graph)
        i += 1
        ftext.value = str(100 * float(i)/len(datasets))
        f.value = i
    return result_graph

def extract_frequencies(datasets):
    frequencies = collections.defaultdict(int)
    f = ipywidgets.FloatProgress(min=0, max=len(datasets))
    ftext = ipywidgets.Text(value='0', description='%')
    display(f)
    display(ftext)
    i = 0
    for dataset in datasets:
        try:
            columns = get_dataset_columns(dataset)
        except:
            print "Problem processing the dataset '", dataset, "'."
            continue
        for column in columns:
            try:
                resources = list(extract_mentions(column))
                for resource in resources:
                    frequencies[resource] += 1
            except:
                print 'Error processing "', column, '".'
        i += 1
        ftext.value = str(100 * float(i)/len(datasets))
        f.value = i
    
    result = [(uri, r, graph.label(uri)) for uri, r in frequencies.items()]
    result = sorted(result, key=lambda x: x[1], reverse=True)
    return result

In [4]:
# Empty the ontology store, reload it from nt_file and commit the result.
# NOTE(review): this cell is numbered In [4] but sits after In [6], which
# defines `graph` and `nt_file` -- it cannot run first on a fresh kernel.
graph.remove((None,None,None))
# NOTE(review): the file has an .nt extension but is parsed with format="n3"
# -- confirm this is intended.
graph.load(nt_file, format="n3")
graph.commit()

In [7]:
# Show how many dataset identifiers were fetched.
print len(datasets)
# Build classes for the first 10 datasets only.
extracted_graph = extract(datasets[:10])
# Write the result as Turtle.
# NOTE(review): absolute, user-specific output path.
with open("/Users/jimmccusker/Dropbox/Public/ecstra.ttl",'wb') as out:
    out.write(extracted_graph.serialize(format='turtle'))

1217


In [34]:
# Scratch cell: an empty dict, used only by the next cell.
x = {}

In [35]:
# Scratch cell: items() of the empty dict, shown below as an empty list.
x.items()

[]

In [50]:
# Peek at the first dataset identifier.
datasets[0]

'https://pasta.lternet.edu/package/metadata/eml/ecotrends/5853/2'

In [17]:
# Count term mentions over every dataset; failures are printed below.
entity_frequency = extract_frequencies(datasets)

Problem processing the dataset ' 7f0ae582-586f-4ff9-a027-b5aa6053d435-20140826_15_PBR1.xml '.
Problem processing the dataset ' f513f938-8547-4738-a31c-024d265b3ce3-CLM4VIC_SG1_Monthly_SWnet.nc4.fgdc.xml '.
Problem processing the dataset ' 83318869-55b8-46d3-bd71-c42e29c69098-ISAM_SG1_Monthly_SWE.nc4.fgdc.xml '.
Problem processing the dataset ' e2543d1b-c3f1-43e8-91d3-70bf5f4df85d-DLEM_SG2_Monthly_TotalResp.nc4.fgdc.xml '.
Problem processing the dataset ' 0da25041-0bf6-4b0e-99dd-a999ff8bcbf8-BIOME-BGC_BG1_Monthly_TotLivBiom.nc4.fgdc.xml '.
Problem processing the dataset ' 1e1649b8-2c43-4947-afda-c2ccdc54229d-CLM4_SG2_Monthly_NPP.nc4.fgdc.xml '.
Problem processing the dataset ' 27221852-3258-4018-8ee3-eb11a9874adc-CLASS-CTEM-N_SG2_Monthly_AbvGrndWood.nc4.fgdc.xml '.
Problem processing the dataset ' 6e5a0065-dd8c-44de-9946-fb2cf67326bd-SiB3_SG1_Monthly_HeteroResp.nc4.fgdc.xml '.
Problem processing the dataset ' d51b3a7e-a957-42be-aa6c-7cad4506fa12-VISIT_SG3_Monthly_HeteroResp.nc4.fgdc.xml

In [21]:
# Print the 100 most frequent terms as "uri,count,label" lines.
# NOTE(review): values are joined with plain commas and not CSV-escaped, so a
# label containing a comma would add a column.
print '\n'.join(['%s,%s,%s'%(str(uri), str(frequency), str(label)) for uri, frequency, label in entity_frequency[:100]])

http://purl.obolibrary.org/obo/UO_0000008,2318,meter
http://purl.obolibrary.org/obo/ENVO_00002006,1780,water
http://purl.dataone.org/odo/ECSO_00000515,1706,depth
https://purl.org/dataone/ontologies/provenance/ProvONE/v1/owl/provone.owl#Data,1513,Data
http://ecoinformatics.org/oboe/oboe.1.0/oboe-ecology.owl#Sample,1478,Sample
http://purl.obolibrary.org/obo/ENVO_00000264,1163,seamount
http://purl.obolibrary.org/obo/ENVO_00000100,1136,valley
http://purl.obolibrary.org/obo/UO_0000036,1121,year
http://ecoinformatics.org/oboe/oboe.1.0/oboe-core.owl#Name,944,Name
http://purl.org/dc/terms/date,936,Date
http://purl.obolibrary.org/obo/PATO_0000146,931,temperature
http://purl.obolibrary.org/obo/UBERON_0000020,816,sense organ
http://ecoinformatics.org/oboe/oboe.1.0/oboe-core.owl#Measurement,747,Measurement
http://ecoinformatics.org/oboe/oboe.1.0/oboe-core.owl#Standard,731,Standard
http://purl.obolibrary.org/obo/ENVO_00000020,693,lake
http://purl.obolibrary.org/obo/UO_0000033,684,day
http://purl.da