# In [None]:
# Script to build CSV template files in which translators enter multilingual labels and definitions for controlled vocabularies
# Steve Baskauf 2020 CC0

import re
import requests   # best library to manage HTTP transactions
import csv        # library to read/write/parse CSV files
import json       # library to convert JSON to Python data structures
import pandas as pd

# -----------------
# Configuration section
# -----------------

# Settings that change between runs live in config.json; only the list of
# vocabulary database names is read from it here.
with open('config.json', 'rt', encoding='utf-8') as file_object:
    config_text = file_object.read()
config = json.loads(config_text)
database_names = config['database_names']

# Language codes come from two CSV files, each of which must have an "alpha2"
# column. na_filter=False keeps every cell as the literal text in the file
# instead of converting some strings to NaN.
translations_frame = pd.read_csv('common-language-codes.csv', na_filter=False)
common_languages = translations_frame['alpha2']

translations_frame = pd.read_csv('rare-language-codes.csv', na_filter=False)
rare_languages = translations_frame['alpha2']

# Base URL for raw files on the master branch of the rs.tdwg.org GitHub repository.
github_base_url = 'https://raw.githubusercontent.com/tdwg/rs.tdwg.org/master/'

#translations_url = github_base_url + database_name + '/' + database_name +'-translations.csv'

# Column-name building blocks for the translation tables.
label_col_prefix = 'label_'
def_col_prefix = 'definition_'
localname_column_header = 'term_localName'

# ---------------
# Function definitions
# ---------------

def write_csv(filename, list_of_lists):
    """Write a table to a UTF-8 encoded CSV file, replacing any existing file.

    filename: path of the file to create.
    list_of_lists: iterable of rows, each row being a sequence of cell values.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(list_of_lists)

# English source tables already downloaded during this run, keyed by database name.
# generate_csv() is called once per language, and the source table is the same for
# every language of a vocabulary, so it only needs to be fetched once.
_source_frame_cache = {}

def _load_source_frame(database_name):
    """Return the English term table for database_name, downloading it at most once."""
    if database_name not in _source_frame_cache:
        source_url = github_base_url + database_name + '/' + database_name + '.csv'
        _source_frame_cache[database_name] = pd.read_csv(source_url, na_filter=False)
    return _source_frame_cache[database_name]

def generate_csv(database_name, language, path):
    """Write a translation template CSV for one vocabulary and one language.

    The output table has a translator header row, four placeholder rows, a
    translations header row, and then one row per term of the English source
    table with the English label and definition filled in and the cells for
    the target language left empty.

    database_name: name of the vocabulary database; used to locate the source
        CSV under github_base_url and as the first part of the output file name.
    language: language code appended to the label/definition column headers
        and to the output file name.
    path: subdirectory (with trailing slash) below the output root in which
        the file is written.
    """
    print('    ', language)

    # Header for translator data section
    translator_header = [
        'translator orcid',
        'translator name (Latin character set)',
        'translator name (local character set)',
        'affiliation (for acknowledgement) Latin character set',
        'affiliation local character set',
    ]
    output_table = [translator_header]

    # NOTE(review): each placeholder row holds its index (0-3) in the first cell
    # and has six cells although the header above has five columns. Kept exactly
    # as in the original output; confirm whether this is intentional.
    for blank_line in range(0, 4):
        output_table.append([str(blank_line), '', '', '', '', ''])

    # Header for translations data section, built from the shared column-name
    # settings so it stays in step with the rest of the configuration.
    output_table.append([
        localname_column_header,
        label_col_prefix + 'en',
        label_col_prefix + language,
        def_col_prefix + 'en',
        def_col_prefix + language,
    ])

    # One row per term: English text filled in, target-language cells left empty.
    frame = _load_source_frame(database_name)
    term_columns = zip(frame[localname_column_header], frame['label'], frame['definition'])
    for local_name, label, definition in term_columns:
        output_table.append([local_name, label, '', definition, ''])

    # Write the output for this language and controlled vocabulary
    write_csv('../Google Drive/My Drive/tdwg_cv/' + path + database_name + '_' + language + '.csv', output_table)

            
# ---------------
# Generate all of the CSVs
# ---------------

for database_name in database_names:
    print(database_name)

    # Common languages go directly in the vocabulary's folder; rare languages
    # go in its "rare" subfolder. Common languages are processed first.
    language_groups = (
        (common_languages, database_name + '/'),
        (rare_languages, database_name + '/rare/'),
    )
    for languages, path in language_groups:
        for language in languages:
            generate_csv(database_name, language, path)

print('done')