In [1]:
import os

from pymongo import MongoClient

from datatalker.ogdp import OGDProxy, DocumentAdapter

# Read the OGD API key from the environment instead of embedding it in the
# notebook, so the credential is not committed or shared along with the file.
# Export OGD_API_KEY before starting the kernel; a missing variable raises KeyError.
ogd = OGDProxy(api_key=os.environ["OGD_API_KEY"])
# MongoDB connection string; defaults to the local instance when MONGODB_URI is unset.
client = MongoClient(os.environ.get("MONGODB_URI", "mongodb://localhost:27017/"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# One-time MongoDB setup. These are shell commands, intentionally left commented
# out: run them in a terminal, not in the notebook.
# 1. Create a named Docker volume "mongodb_data" using the local driver with
#    device=/data102/.docker/volumes.
# sudo docker volume create --driver local --opt device=/data102/.docker/volumes --opt type=none  mongodb_data
# 2. Start a detached container named "mongodb", publishing port 27017 and
#    mounting the volume at /data/db.
# docker run --name mongodb -d -p 27017:27017 -v mongodb_data:/data/db mongodb/mongodb-community-server

In [2]:
# Handles to the "datatalker" database and its two collections.
db = client.get_database("datatalker")
ctlgs = db.get_collection("ogd_catalogs")   # catalog documents
rsrcs = db.get_collection("ogd_resources")  # resource documents

## Using Catalogs

Schema Note 1: 0 and 1 are used to represent boolean values for the following fields.
  - having_api
  - high_value_dataset
  - is_api_available
  - is_priced
  - is_webservice
  - from_api


Schema Note 2: ogdp_custom_field's value is always 'test'.


Schema Note 3: Some fields provide useful information for determining the spatial and temporal coverage of the dataset, and the jurisdiction of the data collecting government body.
  - field_asset_jurisdiction:name # administrative geography covered by the dataset
  - field_ds_govt_type # central government or state government
  - field_group_name:name # a convenient field to group multiple datasets under a title
  - field_ministry_department:name # central government ministry or department, only provided when government type is central
  - field_state_department:name # state department, only provided when the government type is state
  - data_time_period_from # starting point of the time period for which the data is available
  - data_time_period_to # ending point of the time period for which the data is available
  - frequency # the time interval at which the data is collected


Schema Note 4: Some fields provide a semantic description of the dataset.
  - title # title of the catalog
  - body:value # short description of the catalog
  - field_search_keywords # search optimization keywords for the dataset
  - keywords # similar to field_search_keywords
  - field_sector:name # A high level label to classify datasets by their information/problem domain.


Schema Note 4: Ignore all the keys which start with an underscore or 'search', such as _id or search_api_language. Additionally, it's okay to ignore the following fields as they are of little use.
  - ogdp_custom_field
  - ogpl_module_domain_access
  - ogpl_module_domain_name


Schema Note 5: Some fields provide optionally useful information from the point of view of a data maintainer or user analytics.
  - changed # timestamp when the dataset was last modified
  - created # timestamp when the dataset was created
  - api_request_count
  - high_value_dataset # features as a high value dataset on the data portal
  - published_date # date at which the dataset was made public
  - view_count # number of times users viewed this dataset


Schema Note 6: The following fields are only useful internally when working with the API, and should not become part of any user facing content.
  - uuid # Catalog UUID
  - nid
  - vid
  - node_alias # url path to navigate to the catalog webpage


API Usage Note 1: Records can only be fetched using the '/catalog' path when the catalog API is available.
// pseudocode for fetching catalog data
// if catalog.is_api_available[0] == 1:
//  data = fetch_records("https://api.data.gov.in/catalog/{catalog_uuid}") 


API Usage Note 2: To fetch the data for a catalog with no available API, look up the resources that point to that catalog.
// SELECT * FROM resources WHERE catalog_uuid={catalog_uuid};

In [4]:
# Catalog document keys to leave out when processing catalogs.
ctlg_ignore_keys = [
    "_freq_wise_not_updated",
    "_freq_wise_schedule",
    "_freq_wise_total_resources",
    "_freq_wise_updated",
    "_id",
    "api_request_count",
]

In [3]:
# Fetch a single sample catalog by its UUID to use as a worked example.
ctlg = ctlgs.find_one(
    filter={"uuid": "6141ea17-a69d-4713-b600-0a43c8fd9a6c"},
)

In [6]:
# Convert the raw catalog document with DocumentAdapter and display the result.
# For this sample the displayed value is a dict with 'type', 'website',
# 'interface', 'title' and 'content' entries.
DocumentAdapter.from_catalog(ctlg)

{'type': 'dataset',
 'website': 'https://data.gov.in/catalog/current-daily-price-various-commodities-various-markets-mandi',
 'interface': 'ogd:catalog',
 'title': 'Current daily price of various commodities from various markets (Mandi)',
 'content': "Title: Current daily price of various commodities from various markets (Mandi)\nDescription: The data refers to prices of various commodities. It has the wholesale maximum price, minimum price and modal price on daily basis. This dataset is generated through the AGMARKNET Portal (http://agmarknet.gov.in), which disseminates daily market information of various commodities.\nKeywords: ['mandi', 'Price', 'Agriculture', 'Commodity', 'Market', 'Rate']\nFrequency: Daily\nData Source: None\nOpen Government Data Site: ['data.gov.in']Government Type: ['Central']\nAsset Jurisdiction Name: ['All India']\nDs Govt Type: ['Central']\nGroup Name Name: ['Agricultural Marketing']\nMinistry Department Name: ['Ministry of Agriculture and Farmers Welfare', '

In [6]:
# Union of every top-level key that appears in any catalog document.
ctlg_unique_keys = {key for record in ctlgs.find() for key in record.keys()}
ctlg_unique_keys

{'_freq_wise_not_updated',
 '_freq_wise_schedule',
 '_freq_wise_total_resources',
 '_freq_wise_updated',
 '_id',
 'api_request_count',
 'body:value',
 'changed',
 'created',
 'data_source',
 'data_time_period_from',
 'data_time_period_to',
 'field_asset_jurisdiction:name',
 'field_ds_govt_type',
 'field_group_name:name',
 'field_ministry_department:name',
 'field_search_keywords',
 'field_sector:name',
 'field_state_department:name',
 'frequency',
 'frequency_data_series',
 'from_api',
 'govt_type',
 'having_api',
 'high_value_dataset',
 'is_api_available',
 'is_priced',
 'is_webservice',
 'keywords',
 'nid',
 'node_alias',
 'ogdp_custom_field',
 'ogdp_view_count',
 'ogpl_module_domain_access',
 'ogpl_module_domain_name',
 'published_date',
 'search_api_datasource',
 'search_api_id',
 'search_api_language',
 'title',
 'uuid',
 'vid',
 'view_count'}

## Using Resources

In [7]:
# Union of every top-level key that appears in any resource document.
rsrc_uniq_keys = {key for record in rsrcs.find() for key in record.keys()}
rsrc_uniq_keys

{'_id',
 'active',
 'aggregation',
 'catalog_uuid',
 'created',
 'created_date',
 'data_fetch_date',
 'default_sort',
 'desc',
 'doc',
 'domain',
 'external_ws',
 'external_ws_url',
 'field',
 'field_dependent',
 'field_exposed',
 'field_name',
 'index_name',
 'is_public',
 'order',
 'org',
 'org_type',
 'primary_field',
 'public_type',
 'query',
 'resource_uuid',
 'script',
 'sector',
 'source',
 'status',
 'target_bucket',
 'target_type',
 'timestamp',
 'title',
 'type',
 'updated',
 'updated_date',
 'user_id',
 'visualizable'}

In [17]:
# Count resources that have an "ai_long_text" field and whose source is not
# visualize.data.gov.in.
rsrcs.count_documents(
    filter={
        "ai_long_text": {"$exists": 1},
        "source": {"$ne": "visualize.data.gov.in"},
    },
)

17805