diff --git a/README.md b/README.md index f5e8ef5..aa74eb8 100644 --- a/README.md +++ b/README.md @@ -282,6 +282,94 @@ update_resource.update_resource(ckan_host=CKAN_HOST, resource_id=resource_id) ``` +### Check diff before updating a dataset + +#### CLI + +Running the `diff` command + +```bash +dpckan dataset diff --datapackage some-path/datapackage.json +Differences detected: + - On field title + - CKAN value A vowel letters dataset for tests CHANGED + - DataPackage value A vowel letters dataset for tests +Equal fields: version, url, license_id, owner_org, tags, notes + +``` + +#### Via Python code + +Using the python `diff_dataset` function + +```python +import os +from dpckan.diff_dataset import diff_dataset + +CKAN_HOST = os.environ.get('CKAN_HOST') +CKAN_KEY = os.environ.get('CKAN_KEY') +datapackage_path = 'local/path/para/datapackage.json' + +# A chamada de funções via código Python exige passagem de todos os argumentos +diffs, oks = diff_dataset( + ckan_host=CKAN_HOST, + ckan_key=CKAN_KEY, + datapackage=datapackage_path +) + +diffs +[{'field_name': 'title', 'ckan_value': 'A vowel letters dataset for tests CHANGED', 'datapackage_value': 'A vowel letters dataset for tests'}] + +oks +['version', 'url', 'license_id', 'owner_org', 'tags', 'notes'] + +``` + + +### Check diff for resources + +#### CLI + +Running the `diff` command for a resource + +```bash +dpckan resource diff --datapackage some-path/datapackage.json --resource-name="This is the actual data" +Differences detected: + - On field format + - CKAN value: CSV + - DataPackage value: csv +Equal fields: description +``` + +#### Via Python code + +Using the python `diff_dataset` function + +```python +import os +from dpckan.diff_resource import diff_resource + +CKAN_HOST = os.environ.get('CKAN_HOST') +CKAN_KEY = os.environ.get('CKAN_KEY') +datapackage_path = 'local/path/para/datapackage.json' +datapackage_path = 'dpckan/tests/data-samples/datapackage-example/datapackage.json' +resource_name = 'This is the 
def diff_dataset(ckan_host, ckan_key, datapackage):
    """
    Compare a local datapackage with its already-published CKAN dataset.

    Parameters:

    -------

    ckan_host: string

        Host (environment) of the CKAN instance that holds the dataset.
        Example: https://demo.ckan.org/

    ckan_key: string

        CKAN API key of the user for that environment.

    datapackage: string

        Local path to the datapackage.json file.

    Returns:

    -------

    The result of ``dataset_diff``: a ``(diffs, oks)`` pair, where ``diffs``
    is a list of dicts describing each differing field and ``oks`` is the
    list of field names whose values match.

    Raises:

    -------

    Exception
        When ``is_dataset_published`` reports that the dataset does not
        exist on the CKAN instance.
    """
    package = load_complete_datapackage(datapackage)
    run_validations(ckan_host, ckan_key, package)

    # A single client is enough: it serves both the existence check and the
    # comparison, so there is no need to build a second identical one.
    ckan_instance = RemoteCKAN(ckan_host, apikey=ckan_key)
    if not is_dataset_published(ckan_instance, package):
        raise Exception('Conjunto de dados nao existente.')

    return dataset_diff(ckan_instance, package)
@click.command(name='diff')
@click.option('--ckan-host', '-H', envvar='CKAN_HOST', required=True,
              help="Ckan host, exemplo: https://demo.ckan.org/")  # -H so that -h stays reserved for help
@click.option('--ckan-key', '-k', envvar='CKAN_KEY', required=True,
              help="Ckan key autorizando o usuário a realizar publicações/atualizações em datasets")
@click.option('--datapackage', '-dp', required=True, default='datapackage.json')
def diff_dataset_cli(ckan_host, ckan_key, datapackage):
    """
    Show the differences between a datapackage and its CKAN dataset.

    Parameters:

    ----------

    ckan_host: string (optional when the CKAN_HOST variable is set on the machine or in a .env file)

        Host (environment) of the CKAN instance that holds the dataset.
        Example: https://demo.ckan.org/

    ckan_key: string (optional when the CKAN_KEY variable is set on the machine or in a .env file)

        CKAN API key of the user for that environment.

    datapackage: string (optional when the command runs in the directory that contains datapackage.json)

        Local path to the datapackage.json file.

    Output:

    -------

    Prints every differing field with its CKAN and DataPackage values,
    followed by the list of fields that are equal.
    """
    diffs, oks = diff_dataset(ckan_host, ckan_key, datapackage)

    if diffs:
        click.echo("Differences detected:")
        for entry in diffs:
            click.echo(f" - On field {entry['field_name']}")
            click.echo(f" - CKAN value {entry['ckan_value']}")
            click.echo(f" - DataPackage value {entry['datapackage_value']}")
    else:
        click.echo("There are no differences")

    # Exactly one summary line is printed for the matching fields.
    summary = f"Equal fields: {', '.join(oks)}" if oks else "No equal field found"
    click.echo(summary)
def diff_resource(ckan_host, ckan_key, datapackage, resource_name):
    """
    Compare one datapackage resource with the matching CKAN resource.

    Parameters:

    -------

    ckan_host: string

        Host (environment) of the CKAN instance that holds the dataset.
        Example: https://demo.ckan.org/

    ckan_key: string

        CKAN API key of the user for that environment.

    datapackage: string

        Local path to the datapackage.json file.

    resource_name: string

        Name of the resource to compare; it is forwarded unchanged to
        ``resource_diff``.

    Returns:

    -------

    The result of ``resource_diff``: a ``(diffs, oks)`` pair, where ``diffs``
    is a list of dicts describing each difference (or lookup error) and
    ``oks`` is the list of field names whose values match.

    Raises:

    -------

    Exception
        When ``is_dataset_published`` reports that the dataset does not
        exist on the CKAN instance.
    """
    package = load_complete_datapackage(datapackage)
    run_validations(ckan_host, ckan_key, package)

    # A single client is enough: it serves both the existence check and the
    # comparison, so there is no need to build a second identical one.
    ckan_instance = RemoteCKAN(ckan_host, apikey=ckan_key)
    if not is_dataset_published(ckan_instance, package):
        raise Exception('Conjunto de dados nao existente.')

    return resource_diff(ckan_instance, package, resource_name)
@click.command()
@click.option('--ckan-host', '-H', envvar='CKAN_HOST', required=True,
              help="Ckan host, exemplo: https://demo.ckan.org/")  # -H so that -h stays reserved for help
@click.option('--ckan-key', '-k', envvar='CKAN_KEY', required=True,
              help="Ckan key autorizando o usuário a realizar publicações/atualizações em datasets")
@click.option('--datapackage', '-dp', required=True, default='datapackage.json')
@click.option('--resource-name', '-rn', required=True)
def diff_resource_cli(ckan_host, ckan_key, datapackage, resource_name):
    """
    Show the differences between a datapackage resource and a CKAN resource.

    Parameters:

    ----------

    ckan_host: string (optional when the CKAN_HOST variable is set on the machine or in a .env file)

        Host (environment) of the CKAN instance that holds the dataset.
        Example: https://demo.ckan.org/

    ckan_key: string (optional when the CKAN_KEY variable is set on the machine or in a .env file)

        CKAN API key of the user for that environment.

    datapackage: string (optional when the command runs in the directory that contains datapackage.json)

        Local path to the datapackage.json file.

    resource_name: string

        Name of the resource, as present in datapackage.json, to compare.

    Output:

    -------

    Prints every difference (field values or lookup error), followed by the
    list of fields that are equal.
    """
    diffs, oks = diff_resource(ckan_host, ckan_key, datapackage, resource_name)

    if not diffs:
        click.echo("There are no differences")
    else:
        click.echo("Differences detected:")
        for diff in diffs:
            click.echo(f" - On field {diff['field_name']}")
            # Test for key presence rather than truthiness: a difference
            # where one side is None or an empty string is still a
            # difference, and hiding that side makes the report misleading.
            # Error entries simply do not carry these keys.
            if 'ckan_value' in diff:
                click.echo(f" - CKAN value: {diff['ckan_value']}")
            if 'datapackage_value' in diff:
                click.echo(f" - DataPackage value: {diff['datapackage_value']}")
            if diff.get('error'):
                click.echo(f" - Error: {diff['error']}")

    if not oks:
        click.echo("No equal field found")
    else:
        click.echo("Equal fields: {}".format(', '.join(oks)))
def resource_diff(ckan_instance, datapackage, resource_name):
    """
    Compare one resource of a datapackage with the same resource in CKAN.

    The datapackage resource is located by its ``title`` and the CKAN
    resource by its ``name``; both are matched against ``resource_name``.

    Parameters:

    -------

    ckan_instance:

        CKAN client; ``ckan_instance.action.package_show`` is called with the
        datapackage name to fetch the published dataset.

    datapackage:

        Loaded datapackage object; it is converted with ``f2c.package`` and
        its ``name`` attribute identifies the CKAN dataset.

    resource_name: string

        Title of the resource to compare.

    Returns:

    -------

    A ``(diffs, oks)`` pair. ``diffs`` is a list of dicts: either
    ``{'field_name', 'ckan_value', 'datapackage_value'}`` for a differing
    field, or ``{'field_name': 'ALL', 'error': ...}`` when the resource is
    missing on one side. ``oks`` lists the names of the matching fields and
    is empty when the resource could not be found.
    """
    dp_dataset = f2c.package(datapackage)
    ckan_dataset = ckan_instance.action.package_show(id=datapackage.name)

    # A resource without the (optional) lookup key cannot be the one we are
    # after; skip it instead of raising KeyError and aborting the search.
    dp_resource = next(
        (
            res for res in dp_dataset.get('resources', [])
            if 'title' in res and res['title'] == resource_name
        ),
        None,
    )

    # It is not easy to detect which is the right CKAN resource, because the
    # datapackage title is used as the CKAN resource name.
    ckan_resource = next(
        (
            res for res in ckan_dataset.get('resources', [])
            if 'name' in res and res['name'] == resource_name
        ),
        None,
    )

    diffs = []
    if ckan_resource is None or dp_resource is None:
        if ckan_resource is None:
            diffs.append({'field_name': 'ALL', 'error': 'Resource not found in CKAN'})
        if dp_resource is None:
            diffs.append({'field_name': 'ALL', 'error': 'Resource not found in DataPackage'})
        return diffs, []

    oks = []
    # TODO use hash functions for the resource file
    for field in ("format", "description"):
        ckan_value = ckan_resource.get(field)
        dp_value = dp_resource.get(field)
        if dp_value == ckan_value:
            oks.append(field)
        else:
            diffs.append(
                {
                    'field_name': field,
                    'ckan_value': ckan_value,
                    'datapackage_value': dp_value,
                }
            )

    return diffs, oks
def dataset_diff(ckan_instance, datapackage):
    """
    Compare the metadata of a datapackage with its published CKAN dataset.

    Compared fields, in order: ``title``, ``version``, ``url``,
    ``license_id``, ``owner_org``, ``tags`` and ``notes``.

    Parameters:

    -------

    ckan_instance:

        CKAN client; ``ckan_instance.action.package_show`` is called with the
        datapackage name to fetch the published dataset.

    datapackage:

        Loaded datapackage object; it is converted with
        ``frictionless_to_ckan`` and its ``name`` attribute identifies the
        CKAN dataset.

    Returns:

    -------

    A ``(diffs, oks)`` pair. ``diffs`` is a list of
    ``{'field_name', 'ckan_value', 'datapackage_value'}`` dicts, one per
    differing field; ``oks`` lists the names of the matching fields.
    """
    dp_dataset = frictionless_to_ckan(datapackage)
    ckan_dataset = ckan_instance.action.package_show(id=datapackage.name)

    diffs = []
    oks = []

    def check(field_name, ckan_value, datapackage_value, same=None):
        # Record one comparison; `same` overrides plain equality when the
        # values are compared in a normalized form but reported as stored.
        if same is None:
            same = ckan_value == datapackage_value
        if same:
            oks.append(field_name)
        else:
            diffs.append(
                {
                    'field_name': field_name,
                    'ckan_value': ckan_value,
                    'datapackage_value': datapackage_value,
                }
            )

    # TODO, add more
    for field in ("title", "version", "url", "license_id"):
        check(field, ckan_dataset.get(field), dp_dataset.get(field))

    # Organization: the CKAN dataset stores the ID in owner_org, while the
    # datapackage uses the organization name, so compare against the nested
    # organization name. A dataset without organization yields None instead
    # of crashing.
    ckan_org = (ckan_dataset.get('organization') or {}).get('name')
    check('owner_org', ckan_org, dp_dataset.get('owner_org'))

    # Tags: compare names only, ignoring order.
    dp_tags = sorted(t['name'] for t in dp_dataset.get('tags') or [])
    ckan_tags = sorted(t['name'] for t in ckan_dataset.get('tags') or [])
    check('tags', ckan_tags, dp_tags)

    # Notes: compare the notes themselves (not the tags) and ignore line
    # breaks, including the "\r\n" form CKAN returns for text edited in the
    # web UI. The values are reported as stored.
    dp_notes = dp_dataset.get('notes') or ''
    ckan_notes = ckan_dataset.get('notes') or ''

    def strip_newlines(text):
        return text.replace('\r', '').replace('\n', '')

    check(
        'notes',
        ckan_notes,
        dp_notes,
        same=strip_newlines(dp_notes) == strip_newlines(ckan_notes),
    )

    return diffs, oks
b/dpckan/tests/data-samples/data-package-as-dataset-before-import.json @@ -0,0 +1,32 @@ +{ + "name": "volwel-letters-dataset-test", + "resources": [{ + "name": "letters", + "profile": "tabular-data-resource", + "title": "This is the actual data", + "format": "csv", + "encoding": "UTF-8", + "description": "This file contains the actual data", + "url": "data/letters-vowels.csv" + }], + "title": "A vowel letters dataset for tests", + "version": "0.1.0", + "owner_org": "secretaria-de-estado-de-planejamento-e-gestao-seplag", + "notes": "This is a dataset that say if a given letter is a vowel", + "url": "https://github.com/fjuniorr", + "license_id": "CC0-1.0", + "license_title": "CC0 1.0", + "license_url": "https://creativecommons.org/publicdomain/zero/1.0/", + "tags": [{ + "name": "letters" + }, { + "name": "english grammar" + }], + "extras": [{ + "key": "profile", + "value": "tabular-data-package" + }, { + "key": "contributors", + "value": "[{'title': 'Gabriel Braico Dornas', 'role': 'publisher', 'organization': 'controladoria-geral-do-estado-cge'}]" + }] +} \ No newline at end of file diff --git a/dpckan/tests/data-samples/real-imported-dataset.json b/dpckan/tests/data-samples/real-imported-dataset.json new file mode 100644 index 0000000..48cbc63 --- /dev/null +++ b/dpckan/tests/data-samples/real-imported-dataset.json @@ -0,0 +1,106 @@ +{ + "help": "http://ckan:5000/api/3/action/help_show?name=package_show", + "success": true, + "result": { + "author": "", + "author_email": "", + "creator_user_id": "5636013e-922a-4f24-a507-868d3260cc76", + "id": "9401ec36-02f9-4475-a30e-4a5224264c2d", + "isopen": false, + "license_id": "CC0-1.0", + "license_title": "CC0-1.0", + "maintainer": "", + "maintainer_email": "", + "metadata_created": "2021-10-07T14:30:06.902161", + "metadata_modified": "2021-10-07T15:04:30.897894", + "name": "volwel-letters-dataset-test", + "notes": "This is a dataset that say if a given letter is a vowel\r\n# Testes para pacote dpckan \r\n\r\nOs testes 
implementados durante o desenvolvimento do pacote [dpckan](https://github.com/dados-mg/dpkgckanmg) s\u00e3o realizados com aux\u00edlio deste reposit\u00f3rio.\r\n\r\nCada arquivo de teste est\u00e1 ligado a um branch deste reposit\u00f3rio. \r\n", + "num_resources": 2, + "num_tags": 2, + "organization": { + "id": "7bc7f744-e279-4faa-8d23-2a707c2aabc2", + "name": "secretaria-de-estado-de-planejamento-e-gestao-seplag", + "title": "secretaria-de-estado-de-planejamento-e-gestao-seplag", + "type": "organization", + "description": "", + "image_url": "", + "created": "2021-10-07T14:29:55.881746", + "is_organization": true, + "approval_status": "approved", + "state": "active" + }, + "owner_org": "7bc7f744-e279-4faa-8d23-2a707c2aabc2", + "private": false, + "state": "active", + "title": "A vowel letters dataset for tests g", + "type": "dataset", + "url": "https://github.com/fjuniorr", + "version": "0.1.0", + "extras": [{ + "key": "contributors", + "value": "[{\"title\": \"Gabriel Braico Dornas\", \"role\": \"publisher\", \"organization\": \"controladoria-geral-do-estado-cge\"}]" + }, { + "key": "profile", + "value": "tabular-data-package" + }], + "resources": [{ + "cache_last_updated": null, + "cache_url": null, + "created": "2021-10-07T14:30:07.993648", + "datastore_active": false, + "description": null, + "format": "JSON", + "hash": "", + "id": "6fbbf5b0-37ef-4600-82d9-18d6c2a7b9cd", + "last_modified": "2021-10-07T14:30:07.960186", + "metadata_modified": "2021-10-07T14:30:08.442473", + "mimetype": "application/json", + "mimetype_inner": null, + "name": "datapackage.json", + "package_id": "9401ec36-02f9-4475-a30e-4a5224264c2d", + "position": 0, + "resource_type": null, + "size": 1872, + "state": "active", + "url": "http://ckan:5000/dataset/9401ec36-02f9-4475-a30e-4a5224264c2d/resource/6fbbf5b0-37ef-4600-82d9-18d6c2a7b9cd/download/datapackage.json", + "url_type": "upload" + }, { + "cache_last_updated": null, + "cache_url": null, + "created": "2021-10-07T14:30:08.458102", + 
"datastore_active": true, + "description": "This file contains the actual data", + "format": "CSV", + "hash": "", + "id": "f6c43186-b1e5-418f-863f-0a87db650d08", + "last_modified": "2021-10-07T14:30:08.417161", + "metadata_modified": "2021-10-07T14:30:08.442861", + "mimetype": "text/csv", + "mimetype_inner": null, + "name": "This is the actual data", + "package_id": "9401ec36-02f9-4475-a30e-4a5224264c2d", + "position": 1, + "resource_type": null, + "size": 77, + "state": "active", + "url": "http://ckan:5000/dataset/9401ec36-02f9-4475-a30e-4a5224264c2d/resource/f6c43186-b1e5-418f-863f-0a87db650d08/download/letters-vowels.csv", + "url_type": "upload" + }], + "tags": [{ + "display_name": "english grammar", + "id": "3aa886cb-3558-4145-a5b3-a9b5d83981b3", + "name": "english grammar", + "state": "active", + "vocabulary_id": null + }, { + "display_name": "letters", + "id": "c6feb233-d95e-4195-b17c-436c21bf877e", + "name": "letters", + "state": "active", + "vocabulary_id": null + }], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } +} \ No newline at end of file