In [1]:
from rdflib import *

bibliography_url = [
    'https://docs.google.com/spreadsheets/d/17XLYm1GCTGs-sz5XcNMOsKzlL8d9OSWsma9TWTC1-WI/export?exportFormat=csv&gid=1745429688',
    'https://docs.google.com/spreadsheets/d/17XLYm1GCTGs-sz5XcNMOsKzlL8d9OSWsma9TWTC1-WI/export?exportFormat=csv&gid=1199214376'
]

stopwords_url = 'https://docs.google.com/spreadsheets/d/17XLYm1GCTGs-sz5XcNMOsKzlL8d9OSWsma9TWTC1-WI/export?exportFormat=csv&gid=594525089'
service_url = 'http://localhost:8080/annotate/annotate/'
nt_file = '/Users/jimmccusker/src/linkipedia/hbgd-index/NTriple/hbgdki_imported.nt'
triple_store = 'hbgd_ontology_db'

np_service_url = 'https://textanalysis.p.mashape.com/textblob-noun-phrase-extraction'
mashape_key = 'E8w3jnMJ1ymshWdjDwp3zxESxlRGp1rJ9rRjsnspugsr8f7XXc'
hbgd = Namespace('https://hbgd.tw.rpi.edu/ns/')
cmo = Namespace('http://purl.org/twc/ontologies/cmo.owl#')
dc = Namespace('http://purl.org/dc/terms/')
skos = Namespace('http://www.w3.org/2004/02/skos/core#')
sio = Namespace("http://semanticscience.org/resource/")
prov  = Namespace("http://www.w3.org/ns/prov#")

In [2]:
graph = ConjunctiveGraph(store='Sleepycat')
graph.open(triple_store, create = True)

from rdflib.plugins.stores.sparqlstore import SPARQLStore
dbpedia_store = SPARQLStore('http://dbpedia.org/sparql')
dbpedia_graph = ConjunctiveGraph(dbpedia_store)



In [14]:
import csv, urllib, json, urllib2, re
import pandas as pd
from rdflib.extras.infixowl import *
import collections
import json
import base64
import random
import datetime
import unirest
from rdflib.compare import to_isomorphic
import requests

def extract_noun_phrases(text):
    response = unirest.post(np_service_url,
      headers={
        "X-Mashape-Key": mashape_key,
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "application/json"
      },
      params={"text": text})
    return response.body['noun_phrases']

def create_id():
    return base64.urlsafe_b64encode(bytes(random.random()*datetime.datetime.now().toordinal())).rstrip("=")

def get_stopwords():
    data = [x for x in csv.reader(urllib2.urlopen(stopwords_url))][1:]
    stopwords = set([word for word, concept in data if len(word) > 0])
    stopconcepts = set([concept for word, concept in data if len(concept) > 0])
    return stopwords, stopconcepts
    

def get_papers(url=None, key=None, sheet=None):
    if url == None:
        url = 'https://docs.google.com/spreadsheets/d/%s/export?exportFormat=csv&gid=%s' % (key, sheet)
    data = [x for x in csv.reader(urllib2.urlopen(url))]
    header = data[0]
    data = [dict([(header[i],value.strip()) for i, value in enumerate(row)]) for row in data[1:]]
    return data

def extract_mentions(text, context):
    urls = collections.defaultdict(float)
    params = {
        'numResult':20,
        'minScore':1,
        'query':text,
        'context': context
    }
    response = requests.get(service_url, params=params).json()
    return response['results']

def create_class(resource, altLabels=[], prefix=hbgd):
    local_part = resource.identifier.split('/')[-1]
    local_part = local_part.replace('_',' ')
    if not local_part.istitle():
        local_part = local_part.title()
    local_part = local_part.replace(' ','')
    uri = hbgd[local_part]
    g = Graph()
    for label in resource[RDFS.label]:
        g.add((uri, RDFS.label, label))
        g.add((resource.identifier, RDFS.label, label))
    
    for altLabel in altLabels:
        g.add((uri, skos.altLabel, Literal(altLabel)))

    defn = hbgd['definition/'+create_id()]
    g.add((uri, hbgd.hasDefinition, defn))
    g.add((defn, RDF.type, sio.definition))
    g.add((defn, RDF.type, hbgd.PreferredDefinition))
    
    for definition in resource[RDFS.comment]:
        g.add((defn, prov.value, definition))
        g.add((uri, skos.definition, definition))
        
    g.add((uri,RDF.type,hbgd.HBGDkiConcept))
    g.add((uri,RDF.type,OWL.Class))
    g.add((uri,cmo.hasPrimaryConcept, resource.identifier))
    g.add((defn, skos.editorialNote, Literal('Was quoted from Wikipedia via DBpedia.')))
    
    return g.resource(uri)

import ipywidgets 
from IPython.display import display

def get_classes(mention):
    urls = dict([(x['url'], x['score']) for x in mention['annotations']])
    urls = [x[0] for x in sorted(urls.items(), key=lambda x: x[1])]
    urls = [URIRef(url) for url in urls if (url.startswith('https://hbgd.tw.rpi.edu/ns/')
            or not url.startswith('https://hbgd.tw.rpi.edu/')) and url not in stopconcepts]
    return urls

def get_hbgd_concept(urls):
    hbgd_concepts = [url for url in urls if graph[url: RDF.type: OWL.Class]]
    if len(hbgd_concepts) > 0:
        return hbgd_concepts[0]
    return None

def load_lod(uri):
    g = Graph()
    try:
        g.load(uri)
    except:
        g = dbpedia_graph
    result = g.resource(uri)
    redirect = result.value(URIRef('http://dbpedia.org/ontology/wikiPageRedirects'))
    if redirect:
        return load_lod(redirect.identifier)
    elif result.value(URIRef('http://dbpedia.org/ontology/wikiPageDisambiguates')):
        return None
    else:
        return result

stopwords, stopconcepts = get_stopwords()

def extract(papers):
    #result_graph.parse(data='''
    #@prefix owl: <http://www.w3.org/2002/07/owl#>.
    #<https://purl.org/dataone/ontologies/observation/ecstra.owl> a owl:Ontology;
    #    owl:imports <https://purl.org/dataone/ontologies/observation/d1-ECSO.owl>.''', format="turtle")
    i = 0
    all_mentions = {}
    all_noun_phrases = set()
    for paper in papers:
        paper['Abstract'] = re.sub('([0-9],?-?)+','', paper['Abstract'])
        noun_phrases = set(extract_noun_phrases(paper['Abstract']))
        #if 'Keywords' in paper:
        #    noun_phrases.update([x.strip() for x in re.split('[,;]\s*',unicode(paper['Keywords']).encode('ascii',errors='ignore')) if len(x.strip()) > 0])
        all_noun_phrases.update(noun_phrases)
        for phrase in noun_phrases:
            mentions = extract_mentions(phrase, paper['Abstract'])
            for mention in mentions:
                if mention['entity_mention'] in stopwords:
                    continue
                m = mention
                if len(get_classes(mention)) == 0:
                    continue
                if mention['entity_mention'] in all_mentions:
                    m = all_mentions[mention['entity_mention']]
                else:
                    m['papers'] = set()
                    m['phrases'] = set()
                    all_mentions[m['entity_mention']] = m
                if paper['URI'] is not None and len(paper['URI']) > 0:
                    m['papers'].add(paper['URI'])
                m['phrases'].add(phrase)
        i += 1
        ftext.value = '%d/%d' % (i, len(papers))
        f.value = i
    return all_mentions.values(), all_noun_phrases

def construct_classes(mentions, all_noun_phrases):
    result_graph = Graph()
    f.max=len(mentions)
    i = 0
    f.value=0
    hbgd_hits = 0
    new_classes = 0
    for mention in mentions:
        urls = get_classes(mention)
        mention['urls'] = urls
        hbgd_concept = get_hbgd_concept(urls)
        if mention['entity_mention'].strip().lower() in all_noun_phrases:
            all_noun_phrases.remove(mention['entity_mention'].strip().lower())
            #print 'Full noun phrase extraction', mention['entity_mention']
        #print hbgd_concept
        if hbgd_concept is None:
            new_classes += 1
            if len(mention['urls']) == 0:
                print mention
            #print "Building suitable HBGDkiConcept for",mention['entity_mention'], 'using', best_concept
            best_concept = None
            while best_concept is None and len(mention['urls']) > 0:
                best_concept = mention['urls'][0]
                lod_concept = load_lod(best_concept)
                if lod_concept is None:
                    best_concept = None
                    mention['urls'] = mention['urls'][1:]
            if lod_concept is None:
                continue
            hbgd_concept_resource = create_class(lod_concept, altLabels=[mention['entity_mention']])
            result_graph += hbgd_concept_resource.graph
            hbgd_concept = hbgd_concept_resource.identifier
        else:
            hbgd_hits += 1
        for paper in mention['papers']:
            result_graph.add((URIRef(paper), dc.subject, hbgd_concept))
        i += 1
        ftext.value = '%d/%d (%d HBGD hits vs %d new classes)' % (i, len(mentions), hbgd_hits, new_classes)
        f.value = i
    print 'Missing noun phrases:'
    print '\n'.join(list(all_noun_phrases))
    return result_graph


In [20]:
graph.remove((None,None,None))
graph.load(nt_file, format="n3")
graph.commit()

In [15]:
papers = []
for url in bibliography_url:
    papers.extend([x for x in get_papers(url=url) if 'Abstract' in x and len(x['Abstract'].strip()) > 0])
print len(papers)
f = ipywidgets.FloatProgress(min=0, max=len(papers))
ftext = ipywidgets.Text(value='0', description='%')
display(f)
display(ftext)

mentions, all_noun_phrases = extract(papers)
extracted_graph = construct_classes(mentions, all_noun_phrases)
with open("/Users/jimmccusker/Dropbox/Public/hbgd-extra.ttl",'wb') as out:
    out.write(extracted_graph.serialize(format='turtle'))

321
Missing noun phrases:
subsequent weight gain
adult height
daily calories
anthropometric status
year-old children
neonatal nutrition
identify factors
total body water measurement
increase incomes
postnatal undernutrition
spontaneous intestinal hyperplasia
body protein mass
outcomes relate
normal wild types
bromodeoxyuridine assays
/- kcal/day
developmental velocity
previous analyses
weights ]
constant infusion
population-specific food list
q-q
excessive food intake
subsequent changes
panama nutrition trial cohort
gene level
alanyl-glutamine
innovative approaches
nine-item food access insecurity questionnaire
lesser concentrations
obese women
weight data
important anabolic hormone
carbon dioxide production data
who multicentre
markov
lean individuals
pre-set condition
necessary intervention magnitude
relative height > /= %
/- mg/l
organ weights
effective immune responses
ketone body utilization
fitc-dextran
height centiles
absorptive function
antenatal clinic
nec
unique study
econome

In [21]:
from json import dumps, loads, JSONEncoder, JSONDecoder
import pickle

class PythonObjectEncoder(JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (list, dict, str, unicode, int, float, bool, type(None))):
            return JSONEncoder.default(self, obj)
        elif isinstance(obj, set):
            return JSONEncoder.default(self,list(obj))
        return {'_python_object': pickle.dumps(obj)}

def as_python_object(dct):
    if '_python_object' in dct:
        return pickle.loads(str(dct['_python_object']))
    return dct

mention_out = open('/Users/jimmccusker/Dropbox/Public/hbgd_mentions.pkl','wb')
mention_out.write(pickle.dumps(mentions))
#mention_out.write(json.dumps(mentions, cls=PythonObjectEncoder))
mention_out.close()

In [31]:
response = extract_noun_phrases(papers[0]['Abstract'])

In [68]:
for np in response:
    mentions = extract_mentions(np)
    print mentions

[{u'entity_mention': u'postnatal', u'annotations': [{u'url': u'https://hbgd.tw.rpi.edu/ns/Postnatal', u'score': 41.71094}, {u'url': u'http://www.ebi.ac.uk/efo/EFO_0002948', u'score': 61.96747}]}]
[{u'entity_mention': u'gestational', u'annotations': [{u'url': u'https://hbgd.tw.rpi.edu/ns/SmallForGestationalAge', u'score': 8.844937}, {u'url': u'https://hbgd.tw.rpi.edu/publication/14', u'score': 8.451267}, {u'url': u'https://hbgd.tw.rpi.edu/ns/GestationalAge', u'score': 51.239044}]}, {u'entity_mention': u'gestational age', u'annotations': [{u'url': u'https://hbgd.tw.rpi.edu/ns/GestationalAge', u'score': 79.66369}]}, {u'entity_mention': u'age', u'annotations': [{u'url': u'https://hbgd.tw.rpi.edu/ns/Age', u'score': 28.190567}, {u'url': u'http://semanticscience.org/resource/age', u'score': 22.26794}]}]
[{u'entity_mention': u'sga', u'annotations': [{u'url': u'https://hbgd.tw.rpi.edu/question/SubQuestion84', u'score': 7.2412777}, {u'url': u'http://dbpedia.org/resource/SGA', u'score': 13.168457

In [74]:
mentions = extract_mentions(response[0])
urls = list(get_classes(mentions[0]))

In [79]:
print list(graph.triples((hbgd.GenomeScaleMetabolicModel,RDF.type,None)))


[]


In [90]:
print pd.DataFrame(list(graph.query('select * where {<%s> ?p ?o}'%urls[0])))

                                                   0  \
0    http://www.w3.org/2000/01/rdf-schema#subClassOf   
1                https://hbgd.tw.rpi.edu/ns/raisedBy   
2                https://hbgd.tw.rpi.edu/ns/raisedBy   
3                https://hbgd.tw.rpi.edu/ns/raisedBy   
4    http://www.w3.org/1999/02/22-rdf-syntax-ns#type   
5    http://www.w3.org/1999/02/22-rdf-syntax-ns#type   
6                  http://rdfs.org/ns/void#inDataset   
7   http://open.vocab.org/terms/subjectDiscriminator   
8            http://purl.org/dc/terms/isReferencedBy   
9            http://purl.org/dc/terms/isReferencedBy   
10     http://www.w3.org/2002/07/owl#equivalentClass   
11        http://www.w3.org/2000/01/rdf-schema#label   
12          https://hbgd.tw.rpi.edu/ns/hasDefinition   
13               http://purl.org/dc/terms/identifier   
14    http://www.w3.org/2004/02/skos/core#definition   

                                                    1  
0   http://semanticscience.org/resource/time-in

In [26]:
foo = {}

In [30]:
foo = 'τ'

In [41]:
urllib.quote(foo)

'%CF%84'

In [4]:
stopwords

{'similarity',
 'magnetic',
 'four',
 'asian',
 'integrity',
 'sch',
 'relationships',
 'calculate',
 'twenty two',
 'postmortem',
 'young women',
 'divergent',
 'kent',
 'activation',
 'rise',
 'replication',
 'rural communities',
 'frailty',
 'steady state',
 'site specific',
 'month',
 'vast',
 'sheep placenta',
 'progression',
 'solution',
 'solid',
 'systemic',
 'indices',
 'enhance',
 'markov',
 'pragmatic',
 'estimates',
 'likely',
 'follow',
 'rigorous',
 'cesar',
 'final analysis',
 'aim',
 'wellcome',
 'panama',
 'nec',
 'fatness',
 'causes',
 'establishes',
 'new',
 'net',
 'harbors',
 'asymmetry',
 'active',
 'dry',
 'gln',
 'oman',
 'glb',
 'reports',
 'dri',
 'changes',
 'sectoral',
 'diameter',
 'slices',
 'retards',
 'total',
 'tertiary',
 'insecurity',
 'arms',
 'lwt',
 'program',
 'until',
 'composite',
 'excellent',
 'ward',
 'electrical conductivity',
 'circumstances',
 'abundance',
 'mg',
 'mm',
 'ml',
 'setup',
 'work',
 'mi',
 'mj',
 'deficiencies',
 'era',
 'oni

In [9]:
summary_query = '''
prefix hbgd: <https://hbgd.tw.rpi.edu/ns/>
prefix skos: <http://www.w3.org/2004/02/skos/core>
prefix cmo: <http://purl.org/twc/ontologies/cmo.owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix prov: <http://www.w3.org/ns/prov#>
prefix dc: <http://purl.org/dc/terms/>

select distinct ?c ?label ?altLabel ?definition ?primaryConcept (group_concat(distinct ?paper ; separator = "; ") AS ?papers) where {
  ?c a hbgd:HBGDkiConcept;
      rdfs:label ?label.
FILTER ( lang(?label) = "en" ) 
  optional {
     ?c <http://www.w3.org/2004/02/skos/core#altLabel> ?altLabel.
  }
  optional {
     ?c <http://www.w3.org/2004/02/skos/core#definition> ?definition.
     FILTER ( lang(?definition) = "en" ) 
  }
  optional {
     ?c cmo:hasPrimaryConcept ?primaryConcept.
  }
  optional {
     ?paper dc:subject ?c.
  }
} group by ?c ?label ?altLabel ?definition ?primaryConcept'''

classes = pd.DataFrame(list(extracted_graph.query('''
prefix hbgd: <https://hbgd.tw.rpi.edu/ns/>
select count(?c) where {
  ?c a hbgd:HBGDkiConcept.
}
''')))

ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



ParseException: Expected "?" (at char 51), (line:3, col:8)