In [29]:
# Script to build JSON-LD pages that provide multilingual labels and definitions for controlled vocabularies
# Steve Baskauf 2021-01-08 CC0

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# Base URL for raw files served from GitHub. The path below points at the "master" branch
# of the tdwg/ac repo; edit the branch segment to read from a different pushed branch.
github_base_url = 'https://raw.githubusercontent.com/tdwg/ac/master/views/code/'
database_name = 'subjectOrientation'

# The translations table is named <database_name>-translations.csv and sits next to the term table
translations_url = '{}{}-translations.csv'.format(github_base_url, database_name)

has_broader = True # set to True if any terms have skos:broader values
has_exactMatch = False # set to True if any terms have skos:exactMatch values

# Column naming in the translations table: <prefix><language code> for labels and definitions,
# plus the column that holds each term's local name
label_col_prefix = 'label_'
def_col_prefix = 'definition_'
localname_column_header = 'term_localName'

# ---------------
# Function definitions
# ---------------

# replace URL with link
#
def createLinks(text):
    """Replace each http:// or https:// URL in text with an HTML hyperlink.

    A URL is taken to run from the scheme up to (not including) the next whitespace
    character, comma, semicolon, closing parenthesis, or double quote. If the matched
    URL ends with a period, that period is treated as sentence punctuation and is
    placed after the closing </a> tag instead of inside the link.

    text: string that may contain URLs
    returns: the string with every URL rewritten as <a href="URL">URL</a>
    """
    def repl(match):
        url = match.group(1)
        trailing = ''
        # Only a single trailing period is moved outside the link
        if url.endswith('.'):
            url, trailing = url[:-1], '.'
        return '<a href="' + url + '">' + url + '</a>' + trailing

    # Raw string: in an ordinary string literal, \s and \) are invalid escape sequences
    # that newer Python versions warn about; the regex itself is unchanged.
    pattern = r'(https?://[^\s,;\)"]*)'
    return re.sub(pattern, repl, text)

In [30]:

namespace_iri = 'http://rs.tdwg.org/acviews/values/'

# Load the translations table; pd.read_csv raises if translations_url cannot be fetched.
# na_filter=False keeps empty cells as empty strings instead of converting them to NaN.
translations_frame = pd.read_csv(translations_url, na_filter=False)
columns = translations_frame.columns

# Extract the list of languages from the translations spreadsheet column headers.
# Only headers that START with the label prefix count, so an unrelated column whose name merely
# contains the prefix is not mistaken for a language. English is skipped here.
languages = [
    column_header[len(label_col_prefix):]
    for column_header in columns
    if column_header.startswith(label_col_prefix)
    and column_header[len(label_col_prefix):] != 'en'
]
print(languages)

HTTPError: HTTP Error 404: Not Found

In [24]:
# Build a nested lookup for the translated text of every term:
# term local name -> language code -> {'label': ..., 'definition': ...}
translations_dictionary = {}
for _, translation_row in translations_frame.iterrows():
    term_key = translation_row[localname_column_header]
    translations_dictionary[term_key] = {
        language: {
            'label': translation_row[label_col_prefix + language],
            'definition': translation_row[def_col_prefix + language]
        }
        for language in languages
    }
print(json.dumps(translations_dictionary, indent = 2))

{}


In [26]:
# Assemble one plain-dict record per row of the term table (<database_name>.csv)
term_info = []
term_dict = {}
frame = pd.read_csv(github_base_url + database_name + '.csv', na_filter=False)
for _, term_row in frame.iterrows():
    local_name = term_row['term_localName']

    # English text comes from the term table itself; other languages come from the translations lookup.
    # The lookup is only touched inside the comprehensions, so it is never indexed when languages is empty.
    labels = [{'language': 'en', 'value': term_row['label']}]
    labels.extend(
        {'language': language, 'value': translations_dictionary[local_name][language]['label']}
        for language in languages
    )
    definitions = [{'language': 'en', 'value': term_row['definition']}]
    definitions.extend(
        {'language': language, 'value': translations_dictionary[local_name][language]['definition']}
        for language in languages
    )

    scheme_name = term_row['skos_inScheme']
    term_dict = {
        'localname': local_name,
        'iri': namespace_iri + local_name,
        'label': labels,
        'definition': definitions,
        'cv_string': term_row['controlled_value_string'],
        # Local names are expanded to full IRIs; an empty cell stays an empty string
        'scheme': (namespace_iri + scheme_name) if scheme_name != '' else '',
        'type': term_row['type']
    }
    # The optional columns are read only when the matching configuration flag is set
    if has_broader:
        broader_name = term_row['skos_broader']
        term_dict['broader'] = (namespace_iri + broader_name) if broader_name != '' else ''
    if has_exactMatch:
        match_name = term_row['skos_exactMatch']
        term_dict['match'] = (namespace_iri + match_name) if match_name != '' else ''
    term_dict['value'] = term_row['controlled_value_string']
    term_info.append(term_dict)
print(json.dumps(term_info, indent = 2))

[
  {
    "localname": "p",
    "iri": "http://rs.tdwg.org/acviews/values/p",
    "label": [
      {
        "language": "en",
        "value": "subject part concept scheme"
      }
    ],
    "definition": [
      {
        "language": "en",
        "value": "a SKOS concept scheme for ac:subjectOrientation"
      }
    ],
    "cv_string": "",
    "scheme": "",
    "type": "http://www.w3.org/2004/02/skos/core#ConceptScheme",
    "broader": "",
    "value": ""
  },
  {
    "localname": "p0000",
    "iri": "http://rs.tdwg.org/acviews/values/p0000",
    "label": [
      {
        "language": "en",
        "value": "unspecified part"
      }
    ],
    "definition": [
      {
        "language": "en",
        "value": ""
      }
    ],
    "cv_string": "unspecifiedPart",
    "scheme": "http://rs.tdwg.org/acviews/values/p",
    "type": "http://www.w3.org/2004/02/skos/core#Concept",
    "broader": "",
    "value": "unspecifiedPart"
  },
  {
    "localname": "p0001",
    "iri": "http://rs.tdw

In [27]:
# Namespace abbreviations declared in the JSON-LD @context
context_dict = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
}

# Convert each term record into a JSON-LD node
graph_list = []
for term in term_info:
    term_dict = {'@id': term['iri'], '@type': term['type']}

    # Optional properties are emitted only when they carry a non-empty value
    if term['value'] != '':
        term_dict['rdf:value'] = term['value']
    if term['scheme'] != '':
        term_dict['skos:inScheme'] = term['scheme']
    if has_broader and term['broader'] != '':
        term_dict['skos:broader'] = term['broader']
    if has_exactMatch and term['match'] != '':
        term_dict['skos:exactMatch'] = term['match']

    # Language-tagged literals for the labels and definitions
    term_dict['skos:prefLabel'] = [
        {'@language': entry['language'], '@value': entry['value']}
        for entry in term['label']
    ]
    term_dict['skos:definition'] = [
        {'@language': entry['language'], '@value': entry['value']}
        for entry in term['definition']
    ]

    graph_list.append(term_dict)

output = {"@context": context_dict, "@graph": graph_list}
# ensure_ascii=False keeps non-ASCII characters as literal UTF-8 text;
# omit it to get \u-escaped characters instead
jsonld_output = json.dumps(output, indent = 2, ensure_ascii=False)
print(jsonld_output)

{
  "@context": {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
  },
  "@graph": [
    {
      "@id": "http://rs.tdwg.org/acviews/values/p",
      "@type": "http://www.w3.org/2004/02/skos/core#ConceptScheme",
      "skos:prefLabel": [
        {
          "@language": "en",
          "@value": "subject part concept scheme"
        }
      ],
      "skos:definition": [
        {
          "@language": "en",
          "@value": "a SKOS concept scheme for ac:subjectOrientation"
        }
      ]
    },
    {
      "@id": "http://rs.tdwg.org/acviews/values/p0000",
      "@type": "http://www.w3.org/2004/02/skos/core#Concept",
      "rdf:value": "unspecifiedPart",
      "skos:inScheme": "http://rs.tdwg.org/acviews/values/p",
      "skos:prefLabel": [
        {
          "@language": "en",
          "@value": "unspecified part"
    

In [28]:
# Write the same JSON-LD text to both a .json and a .jsonld file.
# The files are opened as UTF-8 because jsonld_output holds unescaped UTF-8 strings.
for out_filename in [database_name + '.json', database_name + '.jsonld']:
    # The with block closes the file even if the write raises
    with open(out_filename, 'w', encoding='utf-8') as outputObject:
        outputObject.write(jsonld_output)

print('done')

done
