# Using ckanapi

`ckanapi` is a Python module for consuming data published using CKAN, which `data.gov` happens to use.

In [None]:
from ckanapi import RemoteCKAN
import json

# Read-only client for the data.gov CKAN catalog. The catalog is reached over
# HTTPS rather than plaintext HTTP; get_only=True asks ckanapi to issue GET
# requests only, which is all an anonymous search needs.
data_gov = RemoteCKAN('https://catalog.data.gov/', get_only=True)

# With no arguments, package_search returns the first page of datasets together
# with the total match count.
response = data_gov.action.package_search()

In [3]:
# List the top-level fields of the package_search response.
response.keys()

dict_keys(['results', 'search_facets', 'count', 'facets'])

In [4]:
# Both facet dicts come back empty for this query; 'count' is the total number
# of matching datasets, which is far larger than the page of results returned.
response['facets'], response['search_facets'], response['count']

({}, {}, 193661)

In [5]:
# 'results' holds only the datasets returned in this page of the search,
# not all response['count'] matches.
results = response['results']
len(results)

10

Let's see what each result contains:

In [6]:
# Each result is a dict of dataset metadata; list the fields of the first one.
results[0].keys()

dict_keys(['private', 'title', 'version', 'tracking_summary', 'license_title', 'owner_org', 'id', 'type', 'isopen', 'author_email', 'name', 'author', 'maintainer', 'extras', 'relationships_as_object', 'groups', 'resources', 'relationships_as_subject', 'revision_timestamp', 'metadata_modified', 'num_tags', 'license_id', 'metadata_created', 'tags', 'url', 'organization', 'notes', 'state', 'num_resources', 'revision_id', 'maintainer_email'])

It looks like each result may have multiple resources:

In [8]:
# Number of resources attached to the first dataset.
results[0]['num_resources']

5

Produce a count of the different format types in the returned resources:

In [9]:
from collections import Counter

# Tally the 'format' field of every resource across all returned datasets.
# An empty string means the resource did not declare a format.
formats_counted = Counter(
    resource['format']
    for dataset in results
    for resource in dataset['resources']
)
formats_counted

Counter({'': 10,
         'CSV': 10,
         'Esri REST': 1,
         'Excel': 1,
         'HTML': 7,
         'JSON': 3,
         'KMZ': 1,
         'PDF': 1,
         'RDF': 2,
         'TAR': 2,
         'XML': 3,
         'api': 1,
         'zipped csv': 1})

But if we want to query only for resources in `csv` format, then `data.gov` breaks.

In [11]:
# This is a valid call, but data.gov answers it with a 503 error,
# so it is left commented out.
#response = data_gov.action.resource_search(query="format:csv")

We can instead use `requests` to directly query an older version of the CKAN API.

In [13]:
import requests

# Go with the legacy search API, since resource_search fails on data.gov.
LEGACY_SEARCH_URL = 'https://catalog.data.gov/api/search/dataset'

http_response = requests.get(
    LEGACY_SEARCH_URL,
    params={'all_fields': 1, 'res_format': 'CSV'},
    timeout=30,  # seconds; requests waits indefinitely when no timeout is given
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
http_response.raise_for_status()

# 'response' is the decoded JSON payload; 'results' is the returned page of
# datasets and 'count' the total number of matches.
response = http_response.json()
results = response['results']
response['count']

14243

In [14]:
# Titles of the datasets in the returned page of results.
[entry['title'] for entry in results]

['05to09 Iquery Hep Cchronic Data',
 '05to12 Iquery Crypto Data',
 '05to12 Iquery Hep AData',
 '05to12 Iquery Hep Bchronic Data',
 '05to12 Iquery Histo Data',
 '05to12 Iquery Leg Data',
 '05to12 Iquery List Data',
 '05to12 Iquery Lyme Data',
 '05to12 Iquery Mumps Data',
 '05to12 Iquery NMen Data']

The result structure is different from that of `CKAN 3`:

In [15]:
# For each dataset, find the position of 'CSV' in its 'res_format' list and take
# the URL at the same position in 'res_url' (the two lists are assumed to be
# parallel, one entry per resource).
[
    entry['res_url'][entry['res_format'].index('CSV')]
    for entry in results
]

['https://data.illinois.gov/api/views/rry2-usaj/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/5cng-jyjz/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/rjxh-tv66/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/pwfa-6r2g/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/h79e-pjdp/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/iyx8-kpar/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/udvf-7ip4/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/wcrd-n5pw/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/jshf-mnjc/rows.csv?accessType=DOWNLOAD',
 'https://data.illinois.gov/api/views/36yw-y8bn/rows.csv?accessType=DOWNLOAD']

In [16]:
# Fields of a legacy-API result: resources appear as flat 'res_url' /
# 'res_format' / 'res_description' entries rather than a 'resources' list.
results[0].keys()

dict_keys(['title', 'res_description', 'metadata_created', 'data_dict', 'capacity', 'id', 'site_id', '_version_', 'dataset_type', 'name', 'res_url', 'maintainer', 'extras', 'groups', 'res_format', 'index_id', 'metadata_modified', 'license_id', 'tags', 'indexed_ts', 'entity_type', 'notes', 'state', 'revision_id', 'maintainer_email'])