In [1]:
from pymongo import MongoClient

# Connect to the local MongoDB instance and grab handles to the two
# collections this notebook works with.
client = MongoClient("mongodb://localhost:27017/")

db = client.get_database("datatalker")
ctlgs = db.get_collection("ogd_catalogs")   # catalog documents
rsrcs = db.get_collection("ogd_resources")  # resource documents (carry a catalog_uuid)

In [28]:
# Inspect which distinct values are stored under "is_api_available".
# In the recorded run the result mixes ints (0, 1) and strings ('0', '1'),
# so the flag is not encoded consistently across catalogs.
ctlgs.distinct("is_api_available")

[0, 1, '0', '1']

In [6]:
# UUIDs of catalogs kept handy for spot checks.
agmarknet_ctlg_uuid = "6141ea17-a69d-4713-b600-0a43c8fd9a6c"
# Not referenced by any other cell shown in this notebook.
some_jjm_ctlg_uuid = "1f13a8ca-dd4b-4bf6-9d14-f86cff8ae49a"
# Load the AGMARKNET catalog document; `ctlg` is reused by later cells.
ctlg = ctlgs.find_one({"uuid": agmarknet_ctlg_uuid})
ctlg

{'_id': ObjectId('6813021260284fbf048a3df2'),
 'api_request_count': [2],
 'having_api': [1],
 'high_value_dataset': [1],
 'is_api_available': [0],
 'is_priced': [0],
 'is_webservice': [1],
 'node_alias': ['/catalog/current-daily-price-various-commodities-various-markets-mandi'],
 'ogdp_custom_field': ['test'],
 'ogdp_view_count': [1292388],
 'published_date': [1369368430],
 'body:value': ['The data refers to prices of various commodities. It has the wholesale maximum price, minimum price and modal price on daily basis. This dataset is generated through the AGMARKNET Portal (http://agmarknet.gov.in), which disseminates daily market information of various commodities.'],
 'changed': [1746068420],
 'created': [1369290300],
 'field_asset_jurisdiction:name': ['All India'],
 'field_ds_govt_type': ['Central'],
 'field_group_name:name': ['Agricultural Marketing'],
 'field_ministry_department:name': ['Ministry of Agriculture and Farmers Welfare',
  'Department of Agriculture and Farmers Welfare

In [4]:
# Instruction prompt for the catalog-description signature: task statement,
# numbered schema notes, and API usage hints. Field names listed here must
# match the keys of the catalog documents exactly, otherwise the model cannot
# relate a note to the JSON it is given.
catalog_schema_notes = """
Your job is to write a professional long description of the Catalog using the following notes on its schema. Highlight its content, spatio-temporal span, data collection methodology or body, and potential use cases.

Schema Note 1: 0 and 1 are used to represent booleans for the following fields. Values are wrapped in single-element lists and may be stored either as integers (0, 1) or as strings ('0', '1').
  - having_api
  - high_value_dataset
  - is_api_available
  - is_priced
  - is_webservice
  - from_api


Schema Note 2: ogdp_custom_field's value is always 'test'.


Schema Note 3: Some fields provide useful information for determining the spatial and temporal coverage of the dataset, and the jurisdiction of the data collecting government body.
  - field_asset_jurisdiction:name # administrative geography covered by the dataset
  - field_ds_govt_type # central government or state government
  - field_group_name:name # a convenient field to group multiple datasets under a title
  - field_ministry_department:name # central government ministry or department, only provided when government type is central
  - field_state_department:name # state department, only provided when the government type is state
  - data_time_period_from # starting point of the time period for which the data is available
  - data_time_period_to # ending point of the time period for which the data is available
  - frequency # the time interval at which the data is collected


Schema Note 4: Some fields provide a semantic description of the dataset.
  - title # title of the catalog
  - body:value # short description of the catalog
  - field_search_keywords # search optimization keywords for the dataset
  - keywords # similar to field_search_keywords
  - field_sector:name # A high level label to classify datasets by their information/problem domain.


Schema Note 5: Ignore all the keys which start with underscore or 'search' such as _id or search_api_language. Additionally, it's okay to ignore the following fields as they are of little use.
  - ogdp_custom_field
  - ogdp_module_domain_access
  - ogdp_module_domain_name


Schema Note 6: Some fields provide optionally useful information from the point of view of a data maintainer or user analytics.
  - changed # timestamp when the dataset was last modified
  - created # timestamp when the dataset was created
  - api_request_count
  - high_value_dataset # features as a high value dataset on the data portal
  - published_date # date at which the dataset was made public
  - ogdp_view_count # number of times users viewed this dataset


Schema Note 7: The following fields are only useful internally when working with the API, and should not become part of any user facing content.
  - uuid # Catalog UUID
  - nid
  - vid
  - node_alias # url path to navigate to the catalog webpage


API Usage Note 1: Records can only be fetched using the '/catalog' path when the catalog API is available.
// pseudocode for fetching catalog data
// if catalog.is_api_available[0] in (1, '1'):
//  data = fetch_records("https://api.data.gov.in/catalog/{catalog_uuid}")


API Usage Note 2: To fetch the data for catalog with no available API, one can dig into the resources pointing to the dataset.
// SELECT * FROM resources WHERE catalog_uuid={catalog_uuid};
"""

In [2]:
import dspy
# Point DSPy at the gemma3 chat model served by Ollama on localhost and
# register it as the LM used by subsequent DSPy calls.
lm = dspy.LM(model="ollama_chat/gemma3", api_base="http://localhost:11434")
dspy.configure(lm=lm)
# Smoke test: send a trivial prompt to confirm the model answers.
lm("hi!")

  from .autonotebook import tqdm as notebook_tqdm


['Hi there! How’s your day going so far? 😊 \n\nIs there anything you’d like to chat about, or were you just saying hello?']

In [5]:
# Signature: catalog JSON in, free-text description out. The schema notes
# serve as the signature's instructions.
WriteDescription = dspy.Signature(
    "catalog_json -> textual_description",
    instructions=catalog_schema_notes,
)

In [6]:
# Wrap the signature in a chain-of-thought module; the recorded Prediction
# carries a `reasoning` field alongside `textual_description`.
writer = dspy.ChainOfThought(WriteDescription)

In [56]:
import textwrap

# Compare the catalog's own short description with the generated long one,
# both wrapped to 100 columns for readability.
demo_lm = dspy.LM("ollama_chat/gemma3", api_base="http://localhost:11434")
with dspy.context(lm=demo_lm):
  original_blurb = ctlg["body:value"][0]
  print(textwrap.fill(original_blurb, width=100))
  gen = writer(catalog_json=ctlg)
  print()
  generated_blurb = gen.textual_description
  print(textwrap.fill(generated_blurb, width=100))
# Show the full Prediction as the cell's output.
gen

The data refers to prices of various commodities. It has the wholesale maximum price, minimum price
and modal price on daily basis. This dataset is generated through the AGMARKNET Portal
(http://agmarknet.gov.in), which disseminates daily market information of various commodities.

The Current Daily Price of Various Commodities from Various Markets (Mandi) catalog provides daily
price data for agricultural commodities across India. The data is sourced from the AGMARKNET Portal
and includes wholesale maximum, minimum, and modal prices. Updates are daily, and the geographical
scope is All India. The dataset is maintained by the Ministry of Agriculture and Farmers Welfare and
the Directorate of Marketing and Inspection (DMI). It’s a high-value dataset with 1,280,866 views.
The catalog is accessible via a webservice, but the API is not available. The data is updated daily
and is used to track price trends in agricultural markets.


Prediction(
    reasoning='The provided JSON data describes a catalog of daily commodity prices sourced from the AGMARKNET Portal. The catalog provides wholesale maximum, minimum, and modal prices for various agricultural commodities across different markets (mandi) in India. Key information includes the data source (AGMARKNET), the frequency of updates (Daily), the geographical scope (All India), the government type (Central), and the number of views (1,280,866). The catalog is considered a high-value dataset and is accessible via a webservice. The API is not available. The data is maintained by the Ministry of Agriculture and Farmers Welfare and the Directorate of Marketing and Inspection (DMI). The catalog is used to track price trends in agricultural markets.',
    textual_description='The Current Daily Price of Various Commodities from Various Markets (Mandi) catalog provides daily price data for agricultural commodities across India. The data is sourced from the AGMARKNET Portal 

In [7]:
# Total number of catalog documents in the collection (empty filter matches all).
ctlgs.count_documents({})

12451

In [10]:
from tqdm import tqdm

# Generate a long description for every catalog that does not have one yet and
# store it on the catalog document as `ai_long_text`. Each result is written
# as soon as it is produced, so an interrupted run can be resumed by simply
# re-running this cell.
pending_filter = {"ai_long_text": {"$exists": False}}
# Derive the progress-bar total from the same filter as the query so it stays
# correct as the backlog shrinks (a hard-coded count goes stale after one run).
pending_total = ctlgs.count_documents(pending_filter)

for ctlg in tqdm(ctlgs.find(pending_filter), total=pending_total):
  ai_writeup = writer(catalog_json=ctlg)
  ctlgs.update_one(
    {"uuid": ctlg["uuid"]},
    {"$set": {"ai_long_text": ai_writeup.textual_description}},
  )

  0%|          | 0/4455 [00:00<?, ?it/s]

100%|██████████| 4455/4455 [4:22:44<00:00,  3.54s/it]  


In [1]:
from chromadb import PersistentClient
from datatalker.config import CHROMADB_DIR
from datatalker.ogdp import DocumentAdapter
from chromadb.utils.embedding_functions.ollama_embedding_function import (
    OllamaEmbeddingFunction
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Embedding function that calls the Ollama server on localhost with the
# "bge-m3" model; passed to the Chroma collection below.
ollama_ef = OllamaEmbeddingFunction(
  url="http://localhost:11434",
  model_name="bge-m3"
)

In [3]:
# Open the on-disk Chroma store and fetch the collection used for catalog
# embeddings, creating it on first use.
chroma = PersistentClient(path=CHROMADB_DIR)
datasets = chroma.get_or_create_collection(
  name="datasets_bge-m3",
  embedding_function=ollama_ef,
)

In [4]:
# Preview a handful of stored entries (ids, embeddings, ...) as a sanity check.
datasets.peek()

{'ids': ['ogd:catalog:afa77408-8ea8-438b-852c-4853f087b999',
  'ogd:catalog:ef753aab-38e5-47d2-9fa5-e3d635cd9a1e',
  'ogd:catalog:b8cb3c3e-25bc-4a49-9066-033e0068385a',
  'ogd:catalog:848af7e0-4c64-4b95-9fb5-343495a463b5',
  'ogd:catalog:4b339ac7-18d7-4576-9ea9-95f0501cd612',
  'ogd:catalog:9166e625-3bea-4d2d-aea7-52cc1b447f08',
  'ogd:catalog:43418445-20f0-44eb-8251-d163bcecf8af',
  'ogd:catalog:49cf9a4a-9f02-4931-bd50-2fe432f0bcab',
  'ogd:catalog:148a6379-76d6-4652-9817-4676f502247c',
  'ogd:catalog:560e038e-ce8e-42d5-aab9-e53e1417bd55'],
 'embeddings': array([[-0.03835353, -0.03534721, -0.03108723, ..., -0.03816152,
          0.00314629, -0.01740362],
        [-0.05079083, -0.02483686, -0.04155719, ..., -0.02326389,
          0.01268724, -0.00756219],
        [-0.01459817,  0.02926313, -0.03315291, ..., -0.0179025 ,
         -0.05203446,  0.00308013],
        ...,
        [-0.02487734,  0.00925774, -0.04612325, ..., -0.00696173,
          0.0022013 , -0.04153474],
        [-0.03178

In [6]:
def prep_chroma_inputs(catalog):
  """Build the Chroma ``document`` / ``metadata`` / ``id`` triple for one catalog.

  The text to embed is the adapter's ``content`` entry, followed by an
  ``AI Summary:`` line when the catalog carries an ``ai_long_text`` value.
  Whatever remains of the adapter output once ``content`` has been removed is
  used as the metadata.

  Args:
    catalog: Catalog document (a mapping) as read from MongoDB.

  Returns:
    A dict with the keys ``"document"`` (text to embed), ``"metadata"``
    (remaining adapter fields) and ``"id"`` (``"ogd:catalog:<uuid>"``).
  """
  doc = DocumentAdapter.from_catalog(catalog)
  page_content = doc.pop("content")
  # Catalogs that have not been summarised yet have no `ai_long_text`; index
  # them with the adapter content alone instead of failing with KeyError.
  ai_summary = catalog.get("ai_long_text")
  if ai_summary is not None:
    page_content += f"\nAI Summary: {ai_summary}"
  doc_id = f"ogd:catalog:{doc['uuid']}"
  return {
    "document": page_content,
    "metadata": doc,
    "id": doc_id
  }

In [None]:
# Spot-check the helper on whichever catalog find_one returns for an empty filter.
prep_chroma_inputs(ctlgs.find_one({}))

{'document': 'Title: Delhi Government School Details\nDescription: The catalog contains district wise Number of Schools, Student Enrollment , Pass Percentage and Quality Index for Class X and XII \nKeywords: [\'student enrollment\', \'School\', \'Pass Percentage \']\nFrequency: Unknown\nOpen Government Data Site: [\'delhi.data.gov.in\']Government Type: None\nAsset Jurisdiction Name: [\'Delhi (NCT)\']\nGroup Name Name: [\'School Statistics\']\nSector Name: [\'Education\']\nAI Summary: The Delhi Government School Details catalog offers a comprehensive overview of educational institutions within the Delhi National Capital Territory (NCT) of Delhi. This catalog provides district-level data, including the number of schools, student enrollment figures, pass percentages, and a quality index, specifically for Classes X and XII. The data is valuable for researchers, policymakers, and the public seeking insights into the educational landscape of Delhi. The catalog is categorized under "School St

In [8]:
from tqdm import tqdm

# Index every catalog into the Chroma collection in batches of BATCH_SIZE.
BATCH_SIZE = 100
documents, metadatas, ids = [], [], []

# Ask MongoDB for the current count rather than hard-coding it, so the
# progress bar stays accurate as the collection grows or shrinks.
for ctlg in tqdm(ctlgs.find({}), total=ctlgs.count_documents({})):
    data = prep_chroma_inputs(ctlg)

    documents.append(data["document"])
    metadatas.append(data["metadata"])
    ids.append(data["id"])

    if len(ids) >= BATCH_SIZE:
        datasets.add(documents=documents, metadatas=metadatas, ids=ids)
        documents, metadatas, ids = [], [], []

# Flush the final partial batch. Without this, whenever the number of catalogs
# is not a multiple of BATCH_SIZE the trailing documents are never indexed.
if ids:
    datasets.add(documents=documents, metadatas=metadatas, ids=ids)
    documents, metadatas, ids = [], [], []

100%|██████████| 12451/12451 [07:00<00:00, 29.61it/s]


In [8]:
# Collect every resource whose `catalog_uuid` points at the current catalog.
# NOTE(review): `ctlg` is whatever an earlier cell left bound (possibly the last
# item of a loop) — confirm which catalog is intended before relying on this.
ctlg_rsrcs = list(rsrcs.find({"catalog_uuid": ctlg["uuid"]}))
len(ctlg_rsrcs)

0

In [1]:
# Instruction prompt for the resource-description signature: task statement,
# field-by-field schema notes, and an API usage hint.
# NOTE(review): the prompt text contains a few spelling slips ("descripton",
# "Resrouce", "respresented"), and it describes `status` with string examples
# while the resource document displayed later in this notebook shows
# `'status': 1` — confirm and tidy in a dedicated prompt revision.
resource_notes = """
Your job is to write a professional long descripton of the Resource using the following notes on its schema. Highlight it's content, spatio-temporal span, variables, data collection methodology or body, and potential use cases.

#### Resrouce Schema Notes
##### Schema Note 1: Boolean values respresented as 0 and 1
The following fields represent boolean values using '0' or '1':
- active — whether the resource is currently active.
- visualizable — whether the resource supports visualization (charts, graphs, etc.).
- external_ws — indicates whether the data is fetched from an external web service.

##### Schema Note 2: Link to Catalogs
- catalog_uuid — a reference to the parent catalog this resource belongs to. Use this to group or trace resources back to their catalog definition.

##### Schema Note 3: Metadata for temporal tracking
Several fields provide timestamps and date information:
- created, updated — Unix timestamps indicating creation and last modification time.
- created_date, updated_date — ISO 8601 formatted timestamps for the same.
- timestamp, data_fetch_date — may be used to track dataset ingestion or updates (usage context dependent).

##### Schema Note 4: Schema description of the resource
These fields help describe and identify the resource meaningfully:
- title — title of the resource.
- desc — short description or summary of the resource.
- sector — high-level thematic categories (e.g., "Agriculture").
- source — origin platform or system (e.g., "data.gov.in").

##### Schema Note 5: Organizational context
These fields indicate the governing or contributing organizations:
- org — hierarchical list of government organizations involved in the dataset.
- org_type — identifies if the organization is Central or State.

##### Schema Note 6: Field-level metadata
These fields define the schema of the actual data:
- field — list of fields present in the resource, each with name, id, and type (e.g., keyword, date).
- field_exposed — fields that are exposed via the public API/UI. May include extra metadata such as mandatory, format.
- field_dependent — defines hierarchical relationships (e.g., State → District).
- order — default sort order of fields for displaying or querying data.
- primary_field — may designate the main identifier or dimension (if present).

##### Schema Note 7: Internal indexing and linking
These fields are primarily used for system indexing, Elasticsearch compatibility, or internal routing and should not be part of user-facing content:
- _id, resource_uuid, index_name
- target_bucket — includes host, index, type, and internal field mappings.
- target_type
- query, script, doc, domain, user_id

##### Schema Note 8: Resource access and type
These fields govern how the resource is accessed and its visibility:
- is_public — (if present) whether the resource is public.
- public_type — may indicate category/type of public access.
- status — current state of the resource (e.g., "active", "deprecated").
- external_ws_url — URL of the external web service if external_ws == 1.

#### API Usage Note
Resources can be queried directly using their uuid via internal APIs.
// data = fetch_records("https://api.data.gov.in/resource/{resource_uuid}") 
"""

In [3]:
# Signature: resource JSON in, free-text description out, with the resource
# notes as instructions. `writer` is rebound here to the resource variant.
WriteResourceDescription = dspy.Signature(
    "resource_json -> textual_description",
    instructions=resource_notes,
)
writer = dspy.ChainOfThought(WriteResourceDescription)

In [14]:
import textwrap

# Pick one resource that does not come from visualize.data.gov.in and compare
# its title with the generated description, both wrapped to 100 columns.
non_visualize_filter = {"source": {"$ne": "visualize.data.gov.in"}}
rsrc = rsrcs.find_one(non_visualize_filter)
print(textwrap.fill(rsrc["title"], width=100))
gen = writer(resource_json=rsrc)
print()
print(textwrap.fill(gen.textual_description, width=100))

State/ UT-wise Male  Health Assistant at PHCs as on 31-03-2017

This dataset provides a snapshot of Male Health Assistant staffing levels at Primary Health Centers
(PHCs) across India as of March 31, 2017. It details the number of sanctioned positions, the number
of positions currently filled, and the resulting shortfall in staffing. The data is organized by
State/Union Territory and is sourced from data.gov.in. The dataset includes fields for tracking the
number of sanctioned positions, positions in actual occupation, and the resulting shortfall,
offering insights into the distribution of health professionals within the Indian healthcare system.
The data was last updated on November 29, 2018.


In [15]:
# Number of resources whose `source` is anything other than visualize.data.gov.in.
rsrcs.count_documents({
    "source": {"$ne": "visualize.data.gov.in"}
})

277677

In [19]:
# Display the raw resource document that was fed to the writer, for reference.
rsrc

{'_id': ObjectId('6813b724abbc9676584c53b7'),
 'index_name': '50876343-1069-4d70-bb69-9a30172ed48c',
 'title': 'State/ UT-wise Male  Health Assistant at PHCs as on 31-03-2017',
 'desc': 'State/ UT-wise Male  Health Assistant at PHCs as on 31-03-2017',
 'created': 1521789177,
 'updated': 1543515042,
 'visualizable': '1',
 'source': 'data.gov.in',
 'org_type': 'Central',
 'org': ['Ministry of Health and Family Welfare',
  'Department of Health and Family Welfare'],
 'sector': ['Health'],
 'catalog_uuid': '6efa5981-5ec1-490d-9f9f-07c9a3e3f559',
 'status': 1,
 'field': [{'id': 'document_id', 'name': 'document_id', 'type': 'double'},
  {'id': '_s_no_', 'name': 'S. No.', 'type': 'keyword'},
  {'id': 'state_ut', 'name': 'State/ UT', 'type': 'keyword'},
  {'id': 'required___r_', 'name': 'Required - [R]', 'type': 'double'},
  {'id': 'sanctioned___s_', 'name': 'Sanctioned - [S]', 'type': 'double'},
  {'id': 'in_position___p_', 'name': 'In Position - [P]', 'type': 'double'},
  {'id': 'vacant___s_