In [1]:
import json

commodities_raw = json.load(open('data/commoditiesV1.json','r')) # got the data from the GLOBALISE team, thx Kay!

In [2]:
commodities_raw['results']['bindings'][0]

{'concept': {'type': 'uri',
  'value': 'https://digitaalerfgoed.poolparty.biz/globalise/25e9792d-b945-447e-97e5-27726800de25'},
 'prefLabelNL': {'xml:lang': 'nl',
  'type': 'literal',
  'value': 'Verwerkte goederen, hoofdzakelijk ingedeeld naar gebruik'},
 'prefLabelEN': {'xml:lang': 'en',
  'type': 'literal',
  'value': 'Manufactured goods classified chiefly by use'},
 'altLabelsNL': {'type': 'literal', 'value': ''},
 'altLabelsEN': {'type': 'literal', 'value': ''},
 'concept_closeMatches': {'type': 'literal', 'value': ''},
 'definitions': {'type': 'literal',
  'value': '[int.]\n\nVerwerkte goederen die hoofdzakelijk naar gebruik zijn geclassificeerd.'},
 'concept_sources': {'type': 'literal',
  'value': 'United Nations Statistics Division. “Standard International Trade Classification, Revision 4,” August 18, 2008. https://unstats.un.org/unsd/publication/SeriesM/SeriesM_34rev4E.pdf.'}}

In [3]:
def convert_into_record(entry):
    uri = entry['concept']['value']
    label = entry['prefLabelNL']['value']
#     label_en = entry['prefLabelEN']['value']
    altlabels_nl = [lbl for lbl in entry['altLabelsNL']['value'].split(';') if lbl]
#     altlabels_en = [lbl for lbl in entry['altLabelsEN']['value'].split(';') if lbl]
    try:
        broad_uri = entry['concept_broader']['value']
        broad_label = entry['concept_broaderLabelNL']['value']
    except: # can't go higher
        broad_uri = ''
        broad_label = ''
    return {'uri': uri,
            'main_label': label,
           'all_labels': ';'.join([label]+altlabels_nl),
           'broader_uri':broad_uri,
           'broader_label':broad_label}

In [4]:
records = [convert_into_record(entry) for entry in commodities_raw['results']['bindings']]

In [5]:
import pandas as pd

In [6]:
df_commodities = pd.DataFrame.from_records(records)
df_commodities

Unnamed: 0,uri,main_label,all_labels,broader_uri,broader_label
0,https://digitaalerfgoed.poolparty.biz/globalis...,"Verwerkte goederen, hoofdzakelijk ingedeeld na...","Verwerkte goederen, hoofdzakelijk ingedeeld na...",,
1,https://digitaalerfgoed.poolparty.biz/globalis...,Niet elders geclasificeerde goederen en transa...,Niet elders geclasificeerde goederen en transa...,,
2,https://digitaalerfgoed.poolparty.biz/globalis...,"Ruwe materialen, oneetbaar, behalve brandstoffen","Ruwe materialen, oneetbaar, behalve brandstoffen",,
3,https://digitaalerfgoed.poolparty.biz/globalis...,"Chemicaliën en verwante producten, n.e.g.","Chemicaliën en verwante producten, n.e.g.",,
4,https://digitaalerfgoed.poolparty.biz/globalis...,"Minerale brandstoffen, smeermiddelen en gerela...","Minerale brandstoffen, smeermiddelen en gerela...",,
...,...,...,...,...,...
3759,https://digitaalerfgoed.poolparty.biz/globalis...,sestienes,sestienes;sestines,https://digitaalerfgoed.poolparty.biz/globalis...,zijde
3760,https://digitaalerfgoed.poolparty.biz/globalis...,sumongi,sumongi;sumongys; soumongij,https://digitaalerfgoed.poolparty.biz/globalis...,zijde
3761,https://digitaalerfgoed.poolparty.biz/globalis...,fluweel,fluweel,https://digitaalerfgoed.poolparty.biz/globalis...,zijde
3762,https://digitaalerfgoed.poolparty.biz/globalis...,Affuitzwalp,Affuitzwalp,https://digitaalerfgoed.poolparty.biz/globalis...,zwalp


In [31]:
df_commodities.to_csv('data/commodities_latest.tsv',sep='\t')

In [None]:
# Old code begins here, remove or refactor later

In [3]:
# the json data received at:
    # endpoint: https://digitaalerfgoed.poolparty.biz/PoolParty/sparql/globalise
    # query:

        '''
        PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
        SELECT DISTINCT ?Concept ?prefLabel ?altLabel
        WHERE
        { ?Concept ?x skos:Concept ; skos:inScheme <https://digitaalerfgoed.poolparty.biz/globalise/be873c02-658f-4764-a010-f00840f7f087> .
        { ?Concept skos:prefLabel ?prefLabel . }
        OPTIONAL { ?Concept skos:altLabel ?altLabel} 
        } GROUP BY ?Concept ?prefLabel ?altLabel
        '''
# NB: the endpoint is being updated now, maybe not available

IndentationError: unexpected indent (<ipython-input-3-abebfc15198b>, line 5)

In [4]:
len(commodities_raw['results']['bindings'])

3764

In [5]:
commodities_raw['results']['bindings']

[{'concept': {'type': 'uri',
   'value': 'https://digitaalerfgoed.poolparty.biz/globalise/25e9792d-b945-447e-97e5-27726800de25'},
  'prefLabelNL': {'xml:lang': 'nl',
   'type': 'literal',
   'value': 'Verwerkte goederen, hoofdzakelijk ingedeeld naar gebruik'},
  'prefLabelEN': {'xml:lang': 'en',
   'type': 'literal',
   'value': 'Manufactured goods classified chiefly by use'},
  'altLabelsNL': {'type': 'literal', 'value': ''},
  'altLabelsEN': {'type': 'literal', 'value': ''},
  'concept_closeMatches': {'type': 'literal', 'value': ''},
  'definitions': {'type': 'literal',
   'value': '[int.]\n\nVerwerkte goederen die hoofdzakelijk naar gebruik zijn geclassificeerd.'},
  'concept_sources': {'type': 'literal',
   'value': 'United Nations Statistics Division. “Standard International Trade Classification, Revision 4,” August 18, 2008. https://unstats.un.org/unsd/publication/SeriesM/SeriesM_34rev4E.pdf.'}},
 {'concept': {'type': 'uri',
   'value': 'https://digitaalerfgoed.poolparty.biz/glob

In [9]:
language_per_label = {}

for item in commodities_raw['results']['bindings']:
    prefLabel = item['prefLabel']['value']
    lang = item['prefLabel']['xml:lang']
    language_per_label[prefLabel] = lang
    if 'altLabel' in item:
        altLabel = item['altLabel']['value']
        lang = item['altLabel']['xml:lang']
        language_per_label[altLabel] = lang


KeyError: 'prefLabel'

In [None]:
records = [{'concept_uri':item['Concept']['value'],
           'prefLabel':item['prefLabel']['value'],
           'altLabel':item['altLabel']['value'] if 'altLabel' in item else ''}
            for item in commodities_raw['results']['bindings']
#                 if item['prefLabel']['xml:lang']=='nl'  # Dutch only
          ] # a nicer format, preparing for pandas
records

In [7]:
labels_by_uri = {}


for record in records:
    concept_uri, prefLabel, altLabel = record['concept_uri'], record['prefLabel'], record['altLabel']
    if concept_uri in labels_by_uri:
        labels_by_uri[concept_uri].add(prefLabel)
    else:
        labels_by_uri[concept_uri] = set([prefLabel])
        
    if altLabel:
        labels_by_uri[concept_uri].add(altLabel)

NameError: name 'records' is not defined

In [8]:
labels_by_uri

{}

In [60]:
labels_by_uri_clean = [{'concept_uri':key,
                        'labels': "; ".join(list(labels_by_uri[key]))
                       }
                        for key in labels_by_uri.keys()
                      ]

In [61]:
labels_by_uri_clean

[{'concept_uri': 'https://digitaalerfgoed.poolparty.biz/globalise/e990e5db-88f1-49ab-b49c-609644eed6ab',
  'labels': 'Voedsel en levende dieren; Food and live animals'},
 {'concept_uri': 'https://digitaalerfgoed.poolparty.biz/globalise/9c5f5586-a433-49bd-ae6c-e48824e3c382',
  'labels': "Dairy products and birds' eggs; Zuivelproducten en vogeleieren"},
 {'concept_uri': 'https://digitaalerfgoed.poolparty.biz/globalise/f9dd952e-f882-4929-91a7-a34f7d83e633',
  'labels': 'Dairy; Zuivel'},
 {'concept_uri': 'https://digitaalerfgoed.poolparty.biz/globalise/4087e492-2e06-427b-ad16-6297e0eea240',
  'labels': 'Butter; Boter'},
 {'concept_uri': 'https://digitaalerfgoed.poolparty.biz/globalise/b4bb6c13-4605-428d-96e9-b526a8ddf830',
  'labels': 'Live animals other than the marine category; Levende dieren buiten de categorie zeedieren'},
 {'concept_uri': 'https://digitaalerfgoed.poolparty.biz/globalise/48917f08-b48a-43d2-8ecc-d2c5eb5636f1',
  'labels': 'Zoogdier; Mammals'},
 {'concept_uri': 'https://

In [62]:
import pandas as pd

In [63]:
df = pd.DataFrame.from_records(labels_by_uri_clean, columns=['concept_uri','labels'])

In [64]:
df

Unnamed: 0,concept_uri,labels
0,https://digitaalerfgoed.poolparty.biz/globalis...,Voedsel en levende dieren; Food and live animals
1,https://digitaalerfgoed.poolparty.biz/globalis...,Dairy products and birds' eggs; Zuivelproducte...
2,https://digitaalerfgoed.poolparty.biz/globalis...,Dairy; Zuivel
3,https://digitaalerfgoed.poolparty.biz/globalis...,Butter; Boter
4,https://digitaalerfgoed.poolparty.biz/globalis...,Live animals other than the marine category; L...
...,...,...
3616,https://digitaalerfgoed.poolparty.biz/globalis...,zonnescherm
3617,https://digitaalerfgoed.poolparty.biz/globalis...,zool
3618,https://digitaalerfgoed.poolparty.biz/globalis...,zoopjesglas
3619,https://digitaalerfgoed.poolparty.biz/globalis...,zoopjeskelk


In [65]:
df.to_csv('data/commodities.tsv',sep='\t')

In [66]:
preferred_label_by_uri = {} 


for record in records:
    concept_uri, prefLabel = record['concept_uri'], record['prefLabel']
    if language_per_label[prefLabel] == 'nl' or concept_uri not in preferred_label_by_uri: 
        # either the Dutch label or the only label - eg. indigo
        preferred_label_by_uri[concept_uri] = prefLabel

In [67]:
json.dump(preferred_label_by_uri,open('preferred_label_by_uri.json','w'))

In [68]:
### Now, let's save broad labels to categorise the commodities

In [69]:
broad_labels_raw = json.load(open('all_broad_labels.json','r'))

In [None]:
# the json data received at:
    # endpoint: https://digitaalerfgoed.poolparty.biz/PoolParty/sparql/globalise
    # query:
    '''
    PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
    SELECT ?concept ?prefLabel ?broadLabel ?parent
    WHERE
    {
     ?concept skos:prefLabel ?prefLabel ;
        skos:broader ?parent .
     ?parent skos:prefLabel ?broadLabel .
    }
    '''

In [83]:

broad_label_by_uri = {}
for item in broad_labels_raw['results']['bindings']:
    uri = item['concept']['value']
    lang = item['broadLabel']['xml:lang']
    label = item['broadLabel']['value']
    if lang == 'nl' or uri not in broad_label_by_uri:
        broad_label_by_uri[uri] = label

In [84]:
json.dump(broad_label_by_uri, open('broad_label_by_uri.json','w'))

In [85]:
broad_label_by_name = {}
for item in broad_labels_raw['results']['bindings']:
    name = item['prefLabel']['value']
    lang = item['broadLabel']['xml:lang']
    label = item['broadLabel']['value']
    if lang == 'nl' or name not in broad_label_by_name:
        broad_label_by_name[name] = label

In [86]:
json.dump(broad_label_by_name, open('broad_label_by_name.json','w'))

In [91]:
broad_label_by_name['Kruidnagel']

'Specerijen'

In [92]:
broad_label_by_uri['https://digitaalerfgoed.poolparty.biz/globalise/511e4b40-26a7-4da2-b758-a57699c697d0']

'Machines en transportmiddelen'

In [None]:
tree_of_uris = {}
for item in broad_labels_raw['results']['bindings']:
    name = item['prefLabel']['value']
    label = item['broadLabel']['value']
    