In [None]:
# Script to build JSON-LD pages that provide multilingual labels and definitions for controlled vocabularies
# Steve Baskauf 2021-01-08 CC0

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# Base URL for the raw CSV source files on GitHub. File names such as
# "<database_name>.csv" are appended directly to this string, so it must end
# with a slash. The value below reads from the views/code/ directory of the
# "master" branch of the tdwg/ac repository.
github_base_url = 'https://raw.githubusercontent.com/tdwg/ac/master/views/code/'


# When a flag below is True, the corresponding column ("skos_broader" or
# "skos_exactMatch") is read from each term CSV file.
has_broader = True # set to True if any terms have skos:broader values
has_exactMatch = False # set to True if any terms have skos:exactMatch values

# Column headers used in the "<database_name>-translations.csv" files.
# Label and definition columns are named with these prefixes followed by a
# language code (for example "label_" + "fr").
label_col_prefix = 'label_'
def_col_prefix = 'definition_'
# Column of the translations file holding each term's local name.
localname_column_header = 'term_localName'

# Prepended to term local names to form term IRIs.
namespace_iri = 'http://rs.tdwg.org/acviews/values/'
# Prepended to collection local names to form SKOS collection IRIs.
collections_namespace = 'http://rs.tdwg.org/acviews/collections/'

# ---------------
# Function definitions
# ---------------

# main function
#
def _translation_languages(column_headers):
    """Return the non-English language codes found in label column headers.

    A header counts as a label column only when it *starts with*
    ``label_col_prefix``; the remainder of the header is the language code.
    English is skipped because the English label and definition are taken
    from the main term file rather than from the translations file.
    """
    languages = []
    for column_header in column_headers:
        if column_header.startswith(label_col_prefix):
            language_code = column_header[len(label_col_prefix):]
            if language_code != 'en':
                languages.append(language_code)
    return languages


def create_cv(database_name):
    """Build the JSON-LD description of one controlled vocabulary.

    Two CSV files are read from ``github_base_url``:

    * ``<database_name>-translations.csv`` -- one row per term with
      ``label_<lang>`` and ``definition_<lang>`` columns.
    * ``<database_name>.csv`` -- one row per term with the columns
      ``term_localName``, ``label``, ``definition``, ``type``,
      ``controlled_value_string`` and ``skos_inScheme`` (plus
      ``skos_broader`` / ``skos_exactMatch`` when the ``has_broader`` /
      ``has_exactMatch`` flags are set).

    The resulting document is written, with identical content, to
    ``<database_name>.json`` and ``<database_name>.jsonld`` in the current
    working directory.

    Args:
        database_name: Base name (without extension) of the source CSV files;
            also used as the base name of the output files.

    Returns:
        None.

    Raises:
        KeyError: If an expected column is missing, or if a term in the main
            file has no row in the translations file while at least one
            translation language exists.
    """
    translations_url = github_base_url + database_name + '-translations.csv'
    # na_filter=False keeps empty cells as '' rather than NaN, which the
    # emptiness tests below rely on.
    translations_frame = pd.read_csv(translations_url, na_filter=False)
    languages = _translation_languages(translations_frame.columns)

    # Map term local name -> language code -> {'label': ..., 'definition': ...}
    translations_dictionary = {}
    for _, row in translations_frame.iterrows():
        translations_dictionary[row[localname_column_header]] = {
            language: {
                'label': row[label_col_prefix + language],
                'definition': row[def_col_prefix + language],
            }
            for language in languages
        }

    context_dict = {
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "skos": "http://www.w3.org/2004/02/skos/core#",
        "xsd": "http://www.w3.org/2001/XMLSchema#"
    }

    # Build one JSON-LD node per term. Keys are inserted in the order in which
    # they should appear in the serialized output.
    # NOTE(review): skos:inScheme / skos:broader / skos:exactMatch values are
    # emitted as plain strings and the @context declares no type coercion for
    # them -- confirm whether consumers expect {"@id": ...} references instead.
    graph_list = []
    frame = pd.read_csv(github_base_url + database_name + '.csv', na_filter=False)
    for _, row in frame.iterrows():
        local_name = row['term_localName']
        node = {'@id': namespace_iri + local_name, '@type': row['type']}

        # Optional properties are left out entirely when their cell is empty.
        if row['controlled_value_string'] != '':
            node['rdf:value'] = row['controlled_value_string']
        if row['skos_inScheme'] != '':
            node['skos:inScheme'] = namespace_iri + row['skos_inScheme']
        if has_broader and row['skos_broader'] != '':
            node['skos:broader'] = namespace_iri + row['skos_broader']
        if has_exactMatch and row['skos_exactMatch'] != '':
            node['skos:exactMatch'] = namespace_iri + row['skos_exactMatch']

        # English comes first (from the main file), followed by the
        # translations in column order.
        label_list = [{'@language': 'en', '@value': row['label']}]
        label_list.extend(
            {'@language': language,
             '@value': translations_dictionary[local_name][language]['label']}
            for language in languages
        )
        node['skos:prefLabel'] = label_list

        def_list = [{'@language': 'en', '@value': row['definition']}]
        def_list.extend(
            {'@language': language,
             '@value': translations_dictionary[local_name][language]['definition']}
            for language in languages
        )
        node['skos:definition'] = def_list

        graph_list.append(node)

    output = {"@context": context_dict, "@graph": graph_list}
    # ensure_ascii=False writes non-ASCII characters as UTF-8 text instead of
    # \uXXXX escapes.
    jsonld_output = json.dumps(output, indent=2, ensure_ascii=False)

    # The same document is saved under both extensions; the context manager
    # guarantees each file is closed even if the write fails.
    for out_filename in [database_name + '.json', database_name + '.jsonld']:
        with open(out_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(jsonld_output)


In [None]:
# Generate the JSON-LD files for every controlled vocabulary listed here,
# reporting each one as it is finished.
database_names = [
    'subjectOrientation',
    'subjectPart',
]
for database_name in database_names:
    create_cv(database_name)
    created_file = database_name + '.json'
    print('Created', created_file)
print('done')

In [None]:
# Build the JSON-LD description of the SKOS collections and write it to
# views_collections.json and views_collections.jsonld.
#
# Inputs (read from github_base_url):
#   skos_collections.csv        -- one row per collection (term_localName, label)
#   part_collection_join.csv    -- one row per membership (collection_id, member_id)
database_name = 'part_collection_join'

# Create the context dictionary
# NOTE(review): the "tdwgutility" prefix used for tdwgutility:ofConcept below is
# not declared here -- confirm whether it should be added to the context.
context_dict = {
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "xsd": "http://www.w3.org/2001/XMLSchema#"
  }

# Load the data about the collections
frame = pd.read_csv(github_base_url + 'skos_collections.csv', na_filter=False)
collection_list = []
for index, term in frame.iterrows():
    # TODO: only the English label is emitted; multilingual labels still need
    # to be added here.
    collection_list.append({
        '@id': term['term_localName'],
        'skos:prefLabel': [{'@language': 'en', '@value': term['label']}],
    })

# Group the member local names by collection in a single pass over the join
# table (rather than re-scanning the table for every collection). Row order is
# preserved within each collection.
frame = pd.read_csv(github_base_url + database_name + '.csv', na_filter=False)
members_by_collection = {}
for index, term in frame.iterrows():
    members_by_collection.setdefault(term['collection_id'], []).append(term['member_id'])

# Create the list of collection descriptions; collections that have no members
# are left out of the output.
collections_info = []
for collection in collection_list:
    values_list = [
        namespace_iri + member_id
        for member_id in members_by_collection.get(collection['@id'], [])
    ]
    if not values_list:
        continue
    collections_info.append({
        '@id': collections_namespace + collection['@id'],
        '@type': 'http://www.w3.org/2004/02/skos/core#Collection',
        'tdwgutility:ofConcept': namespace_iri + collection['@id'],
        'skos:prefLabel': collection['skos:prefLabel'],
        'skos:member': values_list,
    })

output = {"@context": context_dict, "@graph": collections_info}
# ensure_ascii=False writes non-ASCII characters as UTF-8 text instead of
# \uXXXX escapes.
jsonld_output = json.dumps(output, indent=2, ensure_ascii=False)

# The same document is saved under both extensions; the context manager
# guarantees each file is closed even if the write fails.
filename = 'views_collections'
for out_filename in [filename + '.json', filename + '.jsonld']:
    with open(out_filename, 'w', encoding='utf-8') as output_file:
        output_file.write(jsonld_output)
print('done')