In [2]:
%%time
import requests
from rdflib import Graph
from rdflib import Namespace
from rdflib import URIRef
from rdflib import Literal
from rdflib.namespace import RDF, RDFS, OWL, DC, FOAF, XSD, SKOS
from bs4 import BeautifulSoup

# RDF namespaces used to build and match terms in this notebook.
# ODT: supplies the Similarity class and the dataset / concept / score
#      properties written by add_similarity_link.
ODT = Namespace('http://www.quaat.com/ontologies#')
# DCAT: namespace under w3.org/ns/dcat; DCAT.Dataset selects the dataset nodes
#       in each fetched graph.
DCAT = Namespace('http://www.w3.org/ns/dcat#')
# DCT: Dublin Core terms namespace (purl.org/dc/terms); source of the title and
#      description predicates read for every dataset.
DCT = Namespace('http://purl.org/dc/terms/')
# ODTX: not referenced by any of the visible cells.
#       NOTE(review): confirm whether this binding is still needed.
ODTX = Namespace('http://www.quaat.com/ontology/ODTX#')
# QEX: namespace in which add_similarity_link mints its similarity nodes.
QEX = Namespace('http://www.quaat.com/extended_skos#')

# Get a list of all datasets.
# NOTE(review): the /api/3/action/package_list path looks like the CKAN action
# API -- confirm against the server.
# The timeout stops the cell from hanging forever on an unreachable host, and
# raise_for_status() reports an HTTP error here instead of letting r.json()
# fail on an error page.
r = requests.get('http://78.91.98.234:5000/api/3/action/package_list', timeout=30)
r.raise_for_status()
# NOTE: `json` shadows the standard-library module of the same name; the name
# is kept because the dataset loop further down reads json['result'].
json = r.json()

def get_value(dict, key):
    """Look up `key` in the mapping `dict`; give back "" when it is absent.

    The parameter name shadows the builtin `dict` inside this function only.
    """
    return dict[key] if key in dict else ""

corpus = []       # the collection of texts, one entry per usable dataset
dataset_ref = []  # dataset nodes, index-aligned with `corpus`
nsets = 0         # every DCAT dataset seen, including those skipped below

# For each item in the list of datasets, fetch its RDF description and extract
# the text content.
for dataset in json['result']:
    # The timeout keeps one unresponsive request from stalling the whole run.
    r2 = requests.get('http://78.91.98.234:5000/dataset/' + dataset + '.rdf',
                      timeout=30)
    graph = Graph()
    graph.parse(format='xml', data=r2.text)

    # Extract the relevant text contents and append them to the corpus
    for sdataset in graph.subjects(RDF.type, DCAT.Dataset):
        # dict() keeps a single object per predicate: if a dataset carries
        # several values for one predicate, the last one yielded wins.
        d           = dict(graph.predicate_objects(sdataset))
        title       = get_value(d, DCT.title)
        description = get_value(d, DCT.description)
        nsets      += 1
        print ('processing {}'.format(sdataset))

        # Datasets lacking a title or a description are counted but not kept.
        if title and description:
            text = '.'.join([title, description])

            # Parse once and reuse the tree (the text used to be parsed twice:
            # once for the markup test and once to extract the plain text).
            soup = BeautifulSoup(text, "html5lib")
            if soup.find():
                text = soup.text
            dataset_ref.append(sdataset)
            corpus.append(text)

print ('num sets:', nsets)

processing http://78.91.98.234/dataset/5f7ec8a6-f727-44c8-aa8c-3cc4171c9d5c
processing http://78.91.98.234/dataset/a2f25723-7abc-4d9a-b1e4-1691e1f85839
processing http://78.91.98.234/dataset/d38ec913-559c-4eeb-8ee7-59fef63db90d
processing http://78.91.98.234/dataset/627b13f4-5f4b-4e44-b53e-e37dc6a70ac2
processing http://78.91.98.234/dataset/b618029c-3f30-4ea1-a2be-44387de5cb9d
processing http://78.91.98.234/dataset/f8bf6b37-c303-48b4-b7bd-61ebf94f9822
processing http://78.91.98.234/dataset/d46c7ad6-b2b1-4723-a31f-174070ab6401
processing http://78.91.98.234/dataset/efd73feb-7e7b-43fc-91b7-b9600579dbb6
processing http://78.91.98.234/dataset/ccf58a74-4079-402e-95bc-88baca35e981
processing http://78.91.98.234/dataset/5992974c-62b0-499b-9303-f54bf1f38b03
processing http://78.91.98.234/dataset/fa597582-7517-4428-8fb9-9057c8f818d8
processing http://78.91.98.234/dataset/e97fd128-4db5-4b0a-ab36-658c489b76eb
processing http://78.91.98.234/dataset/125f0f86-8cf5-4630-ae7e-c0fee28c0dac
processing h

processing http://78.91.98.234/dataset/6ae63c70-a463-46d2-ace3-c9f7f59b133e
processing http://78.91.98.234/dataset/f8a068b3-df7c-4293-8ff5-40ca9148b07f
processing http://78.91.98.234/dataset/a9f58cbd-1595-4b18-a2f0-d798b7735ead
processing http://78.91.98.234/dataset/d6ff4fee-bb3c-4dd2-bea0-d9200557147f
processing http://78.91.98.234/dataset/d1a3a50b-0566-48c1-acc0-15049da971b3
processing http://78.91.98.234/dataset/72f463a3-ee6c-484e-a3ef-2730710046b9
processing http://78.91.98.234/dataset/bc49d7f0-7bff-4b32-b9d2-f2077684e4e9
processing http://78.91.98.234/dataset/936edf52-1db0-4613-bd31-6f98290a6b11
processing http://78.91.98.234/dataset/6adb0d93-6cb3-4e4b-861f-489f3b9bd9b2
processing http://78.91.98.234/dataset/884a7a3f-adc2-4477-876d-6020dc727fbd
processing http://78.91.98.234/dataset/ab60e637-ef33-44ea-b1d4-0bb0fea81799
processing http://78.91.98.234/dataset/89cf5416-caf8-46d7-9d2d-7bf57ea9d1ac
processing http://78.91.98.234/dataset/482b30a8-1b35-4ed0-85ec-cc52381fa422
processing h

processing http://78.91.98.234/dataset/6370c400-1fd2-45b0-a018-1362b7303088
processing http://78.91.98.234/dataset/81a17167-add0-4a24-8acc-75f1d87d6d66
processing http://78.91.98.234/dataset/1b85ba90-b675-4831-87fd-4d0de893df18
num sets: 220
CPU times: user 13.9 s, sys: 293 ms, total: 14.2 s
Wall time: 34.7 s


In [3]:
from uuid import uuid4
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Dublin Core terms namespace; same IRI as the DCT binding made earlier in the
# notebook.
# NOTE(review): no statement in this cell reads DCT -- presumably a leftover;
# confirm before removing.
DCT = Namespace('http://purl.org/dc/terms/')

# Norwegian stop words, all lower case.
#
# The literal below is two word lists concatenated, so many words occur twice;
# dict.fromkeys() removes the repeats while keeping first-seen order, and the
# result is still a plain list.
#
# Four entries were mis-encoded ('û' where 'ø' belongs) and could never match
# real text.  They are repaired here: 'forsûke' -> 'forsøke',
# 'fûrst' -> 'først', 'gjûre' -> 'gjøre'.  'fûr' is dropped because its
# repaired form 'før' is already listed.
# NOTE(review): 'vöre' also looks mis-encoded (possibly 'våre'); it is left
# unchanged until the intended word is confirmed.
norwegian_stop_words = list(dict.fromkeys([
    'og', 'i', 'jeg', 'det', 'at', 'en', 'et', 'den', 'til', 'er', 'som',
    'på', 'de', 'med', 'han', 'av', 'ikke', 'ikkje', 'der', 'så', 'var', 'meg',
    'seg', 'men', 'ett', 'har', 'om', 'vi', 'min', 'mitt', 'ha', 'hadde', 'hun',
    'nå', 'over', 'da', 'ved', 'fra', 'du', 'ut', 'sin', 'dem', 'oss', 'opp',
    'man', 'kan', 'hans', 'hvor', 'eller', 'hva', 'skal', 'selv', 'sjøl', 'her', 'alle',
    'vil', 'bli', 'ble', 'blei', 'blitt', 'kunne', 'inn', 'når', 'være', 'kom', 'noen',
    'noe', 'ville', 'dere', 'som', 'deres', 'kun', 'ja', 'etter', 'ned', 'skulle', 'denne',
    'for', 'deg', 'si', 'sine', 'sitt', 'mot', 'å', 'meget', 'hvorfor', 'dette', 'disse',
    'uten', 'hvordan', 'ingen', 'din', 'ditt', 'blir', 'samme', 'hvilken', 'hvilke', 'sånn', 'inni',
    'mellom', 'vår', 'hver', 'hvem', 'vors', 'hvis', 'både', 'bare', 'enn', 'fordi', 'før',
    'mange', 'også', 'slik', 'vært', 'være', 'båe', 'begge', 'siden', 'dykk', 'dykkar', 'dei',
    'deira', 'deires', 'deim', 'di', 'då', 'eg', 'ein', 'eit', 'eitt', 'elles', 'honom',
    'hjå', 'ho', 'hoe', 'henne', 'hennar', 'hennes', 'hoss', 'hossen', 'ikkje', 'ingi', 'inkje',
    'korleis', 'korso', 'kva', 'kvar', 'kvarhelst', 'kven', 'kvi', 'kvifor', 'me', 'medan', 'mi',
    'mine', 'mykje', 'no', 'nokon', 'noka', 'nokor', 'noko', 'nokre', 'si', 'sia', 'sidan',
    'so', 'somt', 'somme', 'um', 'upp', 'vere', 'vore', 'verte', 'vort', 'varte', 'vart',
    'alle', 'andre', 'arbeid', 'at', 'av', 'bare', 'begge', 'ble', 'blei', 'bli', 'blir',
    'blitt', 'bort', 'bra', 'bruke', 'både', 'båe', 'da', 'de', 'deg', 'dei', 'deim',
    'deira', 'deires', 'dem', 'den', 'denne', 'der', 'dere', 'deres', 'det', 'dette', 'di',
    'din', 'disse', 'ditt', 'du', 'dykk', 'dykkar', 'då', 'eg', 'ein', 'eit', 'eitt',
    'eller', 'elles', 'en', 'ene', 'eneste', 'enhver', 'enn', 'er', 'et', 'ett', 'etter',
    'folk', 'for', 'fordi', 'forsøke', 'fra', 'få', 'før', 'først', 'gjorde', 'gjøre',
    'god', 'gå', 'ha', 'hadde', 'han', 'hans', 'har', 'hennar', 'henne', 'hennes', 'her',
    'hjå', 'ho', 'hoe', 'honom', 'hoss', 'hossen', 'hun', 'hva', 'hvem', 'hver', 'hvilke',
    'hvilken', 'hvis', 'hvor', 'hvordan', 'hvorfor', 'i', 'ikke', 'ikkje', 'ingen', 'ingi', 'inkje',
    'inn', 'innen', 'inni', 'ja', 'jeg', 'kan', 'kom', 'korleis', 'korso', 'kun', 'kunne',
    'kva', 'kvar', 'kvarhelst', 'kven', 'kvi', 'kvifor', 'lage', 'lang', 'lik', 'like', 'makt',
    'man', 'mange', 'me', 'med', 'medan', 'meg', 'meget', 'mellom', 'men', 'mens', 'mer',
    'mest', 'mi', 'min', 'mine', 'mitt', 'mot', 'mye', 'mykje', 'må', 'måte', 'navn',
    'ned', 'nei', 'no', 'noe', 'noen', 'noka', 'noko', 'nokon', 'nokor', 'nokre', 'ny',
    'nå', 'når', 'og', 'også', 'om', 'opp', 'oss', 'over', 'part', 'punkt', 'på',
    'rett', 'riktig', 'samme', 'sant', 'seg', 'selv', 'si', 'sia', 'sidan', 'siden', 'sin',
    'sine', 'sist', 'sitt', 'sjøl', 'skal', 'skulle', 'slik', 'slutt', 'so', 'som', 'somme',
    'somt', 'start', 'stille', 'så', 'sånn', 'tid', 'til', 'tilbake', 'tilstand', 'um', 'under',
    'upp', 'ut', 'uten', 'var', 'vart', 'varte', 'ved', 'verdi', 'vere', 'verte', 'vi',
    'vil', 'ville', 'vite', 'vore', 'vors', 'vort', 'vår', 'være', 'vært', 'vöre', 'å',
]))
    
#tf = TfidfVectorizer(stop_words=norwegian_stop_words, use_idf=True,ngram_range=(1,3))
#tfidf_matrix = tf.fit_transform(corpus)
#feature_names = tf.get_feature_names()
#
#def weighted_keywords(idx, n_results=10):
#    feature_index = tfidf_matrix[idx,:].nonzero()[1]
#    tfidf_scores = zip(feature_index, [tfidf_matrix[idx, x] for x in feature_index])
#    pairs = [(w, s) for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]]
#    for pair in sorted(pairs, key=lambda v: v[1], reverse=True)[:n_results]:        
#        yield pair



In [4]:
%%time
# Import the Norwegian WordNet "OrdVev" and create the instance that
# compute_score later queries through ordvev.sim_wup.
# Note that this is extremely slow: the recorded run of this cell took about
# seven and a half minutes of wall time.
from odt.ordvev import OrdVev
ordvev = OrdVev()

CPU times: user 7min 23s, sys: 4.31 s, total: 7min 27s
Wall time: 7min 29s


In [1]:
%%time
# Read ontology from database
import os
from urllib.parse import quote_plus

from odt.database import load_ontology

# Credentials come from the environment so that real secrets never have to be
# typed into, or saved with, the notebook.  The previous literals remain as
# fallbacks, so the configuration is unchanged when the variables are not set.
config = {}
config['DB_USERNAME'] = os.environ.get('DB_USERNAME', 'nims')
config['DB_PASSWD'] = os.environ.get('DB_PASSWD', '******')
config['ONTOLOGY_UUID'] = '5b2ab51401d5412566cf4e94'

# User name and password are percent-encoded: characters such as ':', '/' or
# '@' inside a credential would otherwise corrupt the connection URI.
# NOTE(review): assumes load_ontology hands the URI to a MongoDB driver that
# decodes percent-escapes -- confirm in odt.database.
uri = 'mongodb://{0}:{1}@ds119969.mlab.com:19969/ontodb'.format(
    quote_plus(config['DB_USERNAME']),
    quote_plus(config['DB_PASSWD']))
ontology_graph = load_ontology(uri, config['ONTOLOGY_UUID'])
# The serialised Turtle text is not assigned to anything; the call is kept
# as in the original cell.
ontology_graph.serialize(format='turtle')

CPU times: user 1.03 s, sys: 29.1 ms, total: 1.06 s
Wall time: 1.62 s


In [7]:
%%time
# Every subject typed as skos:Concept in the ontology
concepts = list(ontology_graph.subjects(RDF.type, SKOS.Concept))
print (concepts)

# For each concept, collect the values of its labels tagged with language 'nb':
# preferred labels first, then alternative labels.
lblmap = {
    concept: [
        label.value
        for predicate in (SKOS.prefLabel, SKOS.altLabel)
        for label in ontology_graph.objects(concept, predicate)
        if label.language == 'nb'
    ]
    for concept in concepts
}
                
def remove_stopwords(tokens, stop_words):
    """Return the tokens that are not stop words, keeping their order.

    Matching is exact and therefore case-sensitive: 'Og' is not removed by
    the stop word 'og'.

    Args:
        tokens: iterable of string tokens.
        stop_words: iterable of strings to filter out.

    Returns:
        A new list with every token that does not appear in `stop_words`.
    """
    # Build the lookup set once instead of scanning `stop_words` linearly for
    # every token.
    excluded = frozenset(stop_words)
    return [w for w in tokens if w not in excluded]
    
def normalize(tokens):
    """Lower-case the purely alphanumeric tokens and discard all others."""
    lowered = []
    for token in tokens:
        if token.isalnum():
            lowered.append(token.lower())
    return lowered

def compute_score(word, concept):
    """Return the highest ordvev.sim_wup score between `word` and any label
    stored for `concept` in `lblmap`.

    The result is never below 0.0, which is also what a concept without
    labels yields.
    """
    # Seed with the 0.0 floor so an empty label list still has a maximum.
    candidates = [0.0]
    candidates.extend(ordvev.sim_wup(word, lbl) for lbl in lblmap[concept])
    return max(candidates)

def add_similarity_link(graph, dataset, concept, score):
    """Link a dataset to a concept with a given similarity score.

    A node with a fresh random hex identifier is created in the QEX namespace,
    typed as ODT.Similarity, and connected to `dataset`, `concept` and `score`
    (stored as an xsd:double literal).  The triples are added to `graph` in
    place and the new node is returned.
    """
    link_node = URIRef(QEX[uuid4().hex])
    statements = (
        (RDF.type, ODT.Similarity),
        (ODT.dataset, dataset),
        (ODT.concept, concept),
        (ODT.score, Literal(score, datatype=XSD.double)),
    )
    for predicate, obj in statements:
        graph.add((link_node, predicate, obj))
    return link_node


# NOTE(review): word_tokenize is called with its default language; consider
# passing language='norwegian' if that tokenizer model is installed -- confirm.
wt = word_tokenize
#kolumbus_dataset = URIRef('http://78.91.98.234/dataset/0e3f86cf-2334-41f3-8762-935e5f83d638')
#index = dataset_ref.index(kolumbus_dataset)

semantic_threshold = 0.8 # minimum score for a concept to be linked to a dataset
min_concepts = 4 # The minimum number of linked concepts

similarity_graph = Graph()
similarity_graph.bind('odt', ODT)

num_datasets = len(dataset_ref)
for index, dataset in enumerate(dataset_ref):
    # Report a 1-based position so that the last line reads "N of N".
    print ('processing dataset {} of {}'. format(index + 1, num_datasets))

    # Lower-case BEFORE filtering: the stop-word list is all lower case and the
    # filter is case-sensitive, so capitalised stop words (for instance at the
    # start of a sentence) would otherwise slip through and be scored.
    tokens = remove_stopwords(normalize(wt(corpus[index])), norwegian_stop_words)
    # A repeated token cannot change a maximum, so score each distinct token
    # only once (first-seen order is kept).
    tokens = list(dict.fromkeys(tokens))
    #print (tokens)

    # Compute the relevance for each concept: the best score any token
    # achieves against that concept.
    scorelist = []
    for concept in concepts:
        score = 0.0
        for token in tokens:
            new_score = compute_score(token, concept)
            score = max(new_score, score)
        scorelist.append((concept, score))
    sorted_scores = sorted(scorelist, key=lambda x: x[1], reverse=True)
    filtered_score = [(c,s) for c,s in sorted_scores if s >= semantic_threshold]
    # Too few concepts reached the threshold: fall back to the top-scoring
    # ones.  The limit is min_concepts rather than a repeated literal 4.
    if len(filtered_score) < min_concepts:
        filtered_score = sorted_scores[:min_concepts]

    # Add the scores to a graph
    for concept, score in filtered_score:
        add_similarity_link(similarity_graph, dataset, concept, score)

similarity_graph.serialize(destination='autotag-no-v5.0.rdf', format='xml')

[rdflib.term.URIRef('http://www.quaat.com/ontologies#Sign'), rdflib.term.URIRef('http://www.quaat.com/ontologies#TransportMode'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Representation'), rdflib.term.URIRef('http://www.quaat.com/ontologies#CarSharing'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Schedule'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Company'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Traffic'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Intersection'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Cable'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Train'), rdflib.term.URIRef('http://www.quaat.com/ontologies#WeatherForecast'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Route'), rdflib.term.URIRef('http://www.quaat.com/ontologies#EnvironmentInformation'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Road'), rdflib.term.URIRef('http://www.quaat.com/ontologies#Region'), rdflib.term.URIRef('ht

processing dataset 41 of 204
processing dataset 42 of 204
processing dataset 43 of 204
processing dataset 44 of 204
processing dataset 45 of 204
processing dataset 46 of 204
processing dataset 47 of 204
processing dataset 48 of 204
processing dataset 49 of 204
processing dataset 50 of 204
processing dataset 51 of 204
processing dataset 52 of 204
processing dataset 53 of 204
processing dataset 54 of 204
processing dataset 55 of 204
processing dataset 56 of 204
processing dataset 57 of 204
processing dataset 58 of 204
processing dataset 59 of 204
processing dataset 60 of 204
processing dataset 61 of 204
processing dataset 62 of 204
processing dataset 63 of 204
processing dataset 64 of 204
processing dataset 65 of 204
processing dataset 66 of 204
processing dataset 67 of 204
processing dataset 68 of 204
processing dataset 69 of 204
processing dataset 70 of 204
processing dataset 71 of 204
processing dataset 72 of 204
processing dataset 73 of 204
processing dataset 74 of 204
processing dat