In [1]:
# TODO
def prices_fetch(args):
	# Stub: Return fake mandi prices
	return {"data": [{"commodity": "wheat", "price": 2000}], "source_stamp": "mandi_stub"}
# TODO


In [2]:
# If needed — safe to re-run
%pip install --quiet requests pydantic langchain langchain-core python-dateutil pandas


Note: you may need to restart the kernel to use updated packages.


In [26]:
import os
import json
import math
import pathlib
from datetime import datetime, date
from typing import Any, Dict, List, Optional

import requests
from dateutil import parser as dateparser
from pydantic import BaseModel, Field, validator

# LangChain imports (works for LC >= 0.2.x)
try:
    from langchain_core.tools import StructuredTool
except ImportError:
    # fallback for older LC
    from langchain.tools import StructuredTool

# ---- Constants ----
OGD_RESOURCE_ID = "9ef84268-d588-465a-a308-a864a43d0070"
OGD_BASE = "https://api.data.gov.in/resource/"  # final endpoint = OGD_BASE + OGD_RESOURCE_ID
DEFAULT_LIMIT = 500  # we’ll page if needed
STATIC_DIR = pathlib.Path("static_json")  # fallback folder
MANDI_STATIC_GLOB = "*.json"             # e.g., agmarknet_*.json

# ENV VAR for your data.gov.in API key
OGD_API_KEY = os.getenv("DATA_GOV_IN_API_KEY", "").strip()


In [28]:
print(OGD_API_KEY)




In [5]:
class MandiArgs(BaseModel):
    state: str = Field(..., description="State name (e.g., 'Karnataka')")
    district: str = Field(..., description="District name (e.g., 'Belagavi')")
    commodity: str = Field(..., description="Commodity name (e.g., 'Tomato')")
    market: Optional[str] = Field(None, description="Market/APMC name")
    variety: Optional[str] = Field(None, description="Variety name")
    start_date: Optional[str] = Field(None, description="YYYY-MM-DD inclusive")
    end_date: Optional[str] = Field(None, description="YYYY-MM-DD inclusive")
    max_rows: int = Field(1000, description="Hard cap on rows to return after filtering")

    @validator("start_date", "end_date")
    def _date_fmt(cls, v):
        if v is None: 
            return v
        # accept several forms; normalize to YYYY-MM-DD
        dt = dateparser.parse(v).date()
        return dt.isoformat()


C:\Users\Hp\AppData\Local\Temp\ipykernel_19496\2233489019.py:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator("start_date", "end_date")


In [6]:
def _to_float(x):
    if x is None: 
        return None
    try:
        return float(str(x).strip())
    except:
        return None

def _norm_str(x):
    if x is None: 
        return None
    s = str(x).strip()
    return s if s else None

def _in_date_range(d: str, start: Optional[str], end: Optional[str]) -> bool:
    if not d:
        return False
    try:
        dd = dateparser.parse(d).date()
    except Exception:
        return False
    if start:
        if dd < dateparser.parse(start).date():
            return False
    if end:
        if dd > dateparser.parse(end).date():
            return False
    return True

def _map_row_to_schema(r: Dict[str, Any], source_url: str) -> Dict[str, Any]:
    # API fields commonly exposed by AGMARKNET via OGD: state, district, market, commodity, variety,
    # modal_price, min_price, max_price, arrival_date, arrival, and sometimes unit, etc.
    # We normalize into your target schema keys and types.
    return {
        "state": _norm_str(r.get("state")) or _norm_str(r.get("State")),
        "district": _norm_str(r.get("district")) or _norm_str(r.get("District")),
        "market": _norm_str(r.get("market")) or _norm_str(r.get("Market")),
        "arrival_date": _norm_str(r.get("arrival_date")) or _norm_str(r.get("date")) or _norm_str(r.get("Date")),
        "commodity": _norm_str(r.get("commodity")) or _norm_str(r.get("Commodity")),
        "variety": _norm_str(r.get("variety")) or _norm_str(r.get("Variety")),
        "min_price_rs_per_qtl": _to_float(r.get("min_price") or r.get("Min Price")),
        "max_price_rs_per_qtl": _to_float(r.get("max_price") or r.get("Max Price")),
        "modal_price_rs_per_qtl": _to_float(r.get("modal_price") or r.get("Modal Price")),
        "arrival_qty": _to_float(r.get("arrival") or r.get("Arrivals") or r.get("arrival_qty")),
        "source_url": source_url,
        "last_checked": date.today().isoformat(),
    }


In [7]:
def _fetch_from_ogd(args: MandiArgs) -> Dict[str, Any]:
    if not OGD_API_KEY:
        raise RuntimeError("DATA_GOV_IN_API_KEY not set in environment.")

    # Build base URL and params
    url = f"{OGD_BASE}{OGD_RESOURCE_ID}"
    params = {
        "api-key": OGD_API_KEY,
        "format": "json",
        "limit": DEFAULT_LIMIT,
        "offset": 0,
        # NB: Official OGD supports `filters[field]=value` style filtering.
        f"filters[state]": args.state,
        f"filters[district]": args.district,
        f"filters[commodity]": args.commodity,
    }
    if args.market:
        params["filters[market]"] = args.market
    if args.variety:
        params["filters[variety]"] = args.variety

    # We’ll page until we either reach max_rows or results run out.
    rows: List[Dict[str, Any]] = []
    total_fetched = 0
    while True:
        resp = requests.get(url, params=params, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        batch = payload.get("records") or payload.get("data") or []
        if not batch:
            break

        rows.extend(batch)
        total_fetched += len(batch)
        if total_fetched >= args.max_rows:
            break

        # next page
        params["offset"] = params.get("offset", 0) + params["limit"]

        # simple stop if fewer than a full page returned
        if len(batch) < params["limit"]:
            break

    # Client-side date filter (API date filtering is inconsistent across resources)
    filtered = []
    for r in rows:
        mapped = _map_row_to_schema(r, source_url=url)
        if _in_date_range(mapped["arrival_date"], args.start_date, args.end_date):
            filtered.append(mapped)

    return {
        "data": filtered[: args.max_rows],
        "source_stamp": url,
    }


In [8]:
def _fetch_from_static(args: MandiArgs) -> Dict[str, Any]:
    if not STATIC_DIR.exists():
        return {"data": [], "source_stamp": str(STATIC_DIR)}

    all_rows: List[Dict[str, Any]] = []
    for fp in STATIC_DIR.glob(MANDI_STATIC_GLOB):
        try:
            with open(fp, "r", encoding="utf-8") as f:
                raw = json.load(f)
        except Exception:
            continue

        # raw can be: list of dicts OR {records: [...]} OR {data: [...]}
        if isinstance(raw, dict):
            candidates = raw.get("records") or raw.get("data") or raw.get("rows") or []
        elif isinstance(raw, list):
            candidates = raw
        else:
            candidates = []

        for r in candidates:
            mapped = _map_row_to_schema(r, source_url=str(fp))
            # basic filters
            if mapped["state"] and mapped["state"].lower() != args.state.lower():
                continue
            if mapped["district"] and mapped["district"].lower() != args.district.lower():
                continue
            if mapped["commodity"] and mapped["commodity"].lower() != args.commodity.lower():
                continue
            if args.market and mapped["market"] and mapped["market"].lower() != args.market.lower():
                continue
            if args.variety and mapped["variety"] and mapped["variety"].lower() != args.variety.lower():
                continue
            if not _in_date_range(mapped["arrival_date"], args.start_date, args.end_date):
                continue
            all_rows.append(mapped)

    return {
        "data": all_rows[: args.max_rows],
        "source_stamp": str(STATIC_DIR.resolve()),
    }


In [9]:
def prices_fetch(args: Dict[str, Any]) -> Dict[str, Any]:
    """
    Contract:
      input: dict with keys: state, district, commodity, [market], [variety], [start_date], [end_date], [max_rows]
      output: { "data": [ ...schema rows... ], "source_stamp": "<API URL or file path>" }
    """
    parsed = MandiArgs(**args)

    # Try OGD API first; if not available, fallback to static JSON.
    try:
        if OGD_API_KEY:
            return _fetch_from_ogd(parsed)
        else:
            # No key, go static
            return _fetch_from_static(parsed)
    except Exception as e:
        # Robust fallback
        try:
            return _fetch_from_static(parsed)
        except Exception:
            raise e


In [10]:
MandiPricesTool = StructuredTool.from_function(
    func=prices_fetch,
    name="mandi_prices_lookup",
    description=(
        "Fetch daily mandi (wholesale) prices from AGMARKNET/OGD (or local static fallback). "
        "Inputs: state, district, commodity, [market], [variety], [start_date], [end_date], [max_rows]. "
        "Returns: {'data': [...], 'source_stamp': '...'} matching the Fasal-Setu mandi schema."
    ),
)


In [13]:
# Example demo call — adjust district/commodity to something that exists in your sample/static files
demo_args = {
    "state": "Karnataka",
    "district": "Belagavi",
    "commodity": "Tomato",
    "start_date": (date.today().replace(day=max(1, date.today().day-7))).isoformat(),
    "end_date": date.today().isoformat(),
    "max_rows": 50,
}

res = prices_fetch(demo_args)

print("=== Mandi Tool Demo ===")
print("Args:", demo_args)
print("Source:", res.get("source_stamp"))
print("Rows:", len(res.get("data", [])))
for i, row in enumerate(res.get("data", [])[:5], 1):
    print(f"\n#{i} {row['arrival_date']} | {row['state']} > {row['district']} > {row.get('market')}")
    print(f"   {row['commodity']} ({row.get('variety')})  modal={row['modal_price_rs_per_qtl']}  "
          f"min={row['min_price_rs_per_qtl']}  max={row['max_price_rs_per_qtl']}  arrival={row.get('arrival_qty')}")


=== Mandi Tool Demo ===
Args: {'state': 'Karnataka', 'district': 'Belagavi', 'commodity': 'Tomato', 'start_date': '2025-08-08', 'end_date': '2025-08-15', 'max_rows': 50}
Source: static_json
Rows: 0


In [15]:
# Cell D2 — Live API smoke test (adjust args to a real combo that exists frequently)
test_args = {
    "state": "Bihar",
    "district": "Patna",
    "commodity": "Tomato",
    "start_date": (date.today().replace(day=max(1, date.today().day-7))).isoformat(),
    "end_date": date.today().isoformat(),
    "max_rows": 50,
}
out = prices_fetch(test_args)
print("Source:", out["source_stamp"])
print("Rows:", len(out["data"]))
print("First row:", out["data"][0] if out["data"] else None)


Source: static_json
Rows: 0
First row: None


In [35]:
import os, json, time
from datetime import date
from typing import Any, Dict, List, Optional

import requests
from pydantic import BaseModel, Field, validator
from dateutil import parser as dateparser

try:
    from langchain_core.tools import StructuredTool
except ImportError:
    from langchain.tools import StructuredTool

# === data.gov.in (AGMARKNET) ===
OGD_RESOURCE_ID = "9ef84268-d588-465a-a308-a864a43d0070"  # Current Daily Price of Various Commodities...
OGD_BASE = "https://api.data.gov.in/resource/"
OGD_API_KEY = os.getenv("DATA_GOV_IN_API_KEY", "").strip()

DEFAULT_LIMIT = 500
DEFAULT_SLEEP_BETWEEN_PAGES = 0.25  # be nice to API

# Optional alias normalization (helps Belagavi/Belgaum etc.)
ALIASES = {
    "district": {
        "belgaum": "belagavi",
        "bangalore rural": "bengaluru rural",
        "bangalore": "bengaluru urban",
    },
    "market": {
        "belgaum apmc": "belagavi apmc",
        "belgaum": "belagavi",
    },
    "commodity": {
        "tomatoes": "tomato",
    },
}


In [36]:
def _canon(val: Optional[str], kind: str) -> Optional[str]:
    if not val: return val
    key = val.strip().lower()
    return ALIASES.get(kind, {}).get(key, val)

def _norm_str(x):
    if x is None: return None
    s = str(x).strip()
    return s if s else None

def _to_float(x):
    try:
        return float(str(x).strip())
    except Exception:
        return None

def _in_date_range(d: Optional[str], start: Optional[str], end: Optional[str]) -> bool:
    if not d:
        return False
    dd = dateparser.parse(d).date()
    if start and dd < dateparser.parse(start).date():
        return False
    if end and dd > dateparser.parse(end).date():
        return False
    return True


In [37]:
def _ogd_call(params: Dict[str, Any]) -> Dict[str, Any]:
    if not OGD_API_KEY:
        raise RuntimeError("DATA_GOV_IN_API_KEY is not set. Get a key on data.gov.in and set it in env.")
    url = f"{OGD_BASE}{OGD_RESOURCE_ID}"
    resp = requests.get(url, params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()


In [38]:
class AgmarknetQuery(BaseModel):
    commodity: str = Field(..., description="Commodity, e.g., 'Tomato'")
    state: str = Field(..., description="State, e.g., 'Karnataka'")
    market: Optional[str] = Field(None, description="Market/APMC name")
    district: Optional[str] = Field(None, description="District name")
    variety: Optional[str] = Field(None, description="Variety name")
    start_date: Optional[str] = Field(None, description="YYYY-MM-DD")
    end_date: Optional[str] = Field(None, description="YYYY-MM-DD")
    max_rows: int = Field(1000, description="Cap returned rows")

    @validator("start_date", "end_date")
    def _datefmt(cls, v):
        if v is None: return v
        return dateparser.parse(v).date().isoformat()

def agmarknet_request(
    commodity: str,
    state: str,
    market: Optional[str] = None,
    district: Optional[str] = None,
    variety: Optional[str] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    max_rows: int = 1000,
) -> Dict[str, Any]:
    """
    Fetch raw AGMARKNET rows from data.gov.in for the given filters.
    Mimics the GitHub repo interface (commodity/state/market) but uses the official API.
    Returns: { "records": [...], "source_url": "<API URL with resource id>" }
    """
    q = AgmarknetQuery(
        commodity=commodity, state=state, market=market, district=district,
        variety=variety, start_date=start_date, end_date=end_date, max_rows=max_rows
    )

    # Build filters (API uses filters[field]=value)
    base_params = {
        "api-key": OGD_API_KEY,
        "format": "json",
        "limit": DEFAULT_LIMIT,
        "offset": 0,
        "filters[commodity]": _canon(q.commodity, "commodity") or q.commodity,
        "filters[state]": q.state,
    }
    if q.district:
        base_params["filters[district]"] = _canon(q.district, "district") or q.district
    if q.market:
        base_params["filters[market]"] = _canon(q.market, "market") or q.market
    if q.variety:
        base_params["filters[variety]"] = q.variety

    all_rows: List[Dict[str, Any]] = []
    url = f"{OGD_BASE}{OGD_RESOURCE_ID}"

    while True:
        payload = _ogd_call(base_params)
        batch = payload.get("records") or payload.get("data") or []
        if not batch:
            break

        # Client-side date filter (resource date filtering is inconsistent)
        for r in batch:
            # try common date keys from AGMARKNET feed
            d = r.get("arrival_date") or r.get("date") or r.get("Date")
            if (q.start_date or q.end_date):
                if not _in_date_range(d, q.start_date, q.end_date):
                    continue
            all_rows.append(r)
            if len(all_rows) >= q.max_rows:
                break

        if len(all_rows) >= q.max_rows:
            break

        # pagination
        prev = base_params["offset"]
        base_params["offset"] = prev + base_params["limit"]
        if len(batch) < base_params["limit"]:
            break
        time.sleep(DEFAULT_SLEEP_BETWEEN_PAGES)

    return {"records": all_rows, "source_url": url}


C:\Users\Hp\AppData\Local\Temp\ipykernel_19496\692113237.py:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator("start_date", "end_date")


In [39]:
def _map_row_to_schema(r: Dict[str, Any], source_url: str) -> Dict[str, Any]:
    state = _norm_str(r.get("state") or r.get("State"))
    district = _norm_str(r.get("district") or r.get("District"))
    market = _norm_str(r.get("market") or r.get("Market"))
    commodity = _norm_str(r.get("commodity") or r.get("Commodity"))
    variety = _norm_str(r.get("variety") or r.get("Variety"))
    arrival_date = _norm_str(r.get("arrival_date") or r.get("date") or r.get("Date"))

    # alias normalization
    district = _canon(district, "district") if district else district
    market   = _canon(market, "market") if market else market
    commodity= _canon(commodity, "commodity") if commodity else commodity

    return {
        "state": state,
        "district": district,
        "market": market,
        "arrival_date": arrival_date,
        "commodity": commodity,
        "variety": variety,
        "min_price_rs_per_qtl": _to_float(r.get("min_price") or r.get("Min Price")),
        "max_price_rs_per_qtl": _to_float(r.get("max_price") or r.get("Max Price")),
        "modal_price_rs_per_qtl": _to_float(r.get("modal_price") or r.get("Modal Price")),
        "arrival_qty": _to_float(r.get("arrival") or r.get("Arrivals") or r.get("arrival_qty")),
        "source_url": source_url,
        "last_checked": date.today().isoformat(),
    }


In [40]:
class MandiArgs(BaseModel):
    state: str
    district: Optional[str] = None
    commodity: str
    market: Optional[str] = None
    variety: Optional[str] = None
    start_date: Optional[str] = None
    end_date: Optional[str] = None
    max_rows: int = 1000

    @validator("start_date", "end_date")
    def _datefmt(cls, v):
        if v is None: return v
        return dateparser.parse(v).date().isoformat()

def prices_fetch(args: Dict[str, Any]) -> Dict[str, Any]:
    """
    Inputs: state, district, commodity, [market], [variety], [start_date], [end_date], [max_rows]
    Returns:
      {
        "data": [ ...schema rows... ],
        "source_stamp": "<API URL>"
      }
    """
    a = MandiArgs(**args)
    raw = agmarknet_request(
        commodity=a.commodity,
        state=a.state,
        market=a.market,
        district=a.district,
        variety=a.variety,
        start_date=a.start_date,
        end_date=a.end_date,
        max_rows=a.max_rows,
    )
    data = [_map_row_to_schema(r, raw["source_url"]) for r in raw["records"]]
    return {"data": data, "source_stamp": raw["source_url"]}


C:\Users\Hp\AppData\Local\Temp\ipykernel_19496\585644801.py:11: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator("start_date", "end_date")


In [41]:
MandiPricesTool = StructuredTool.from_function(
    func=prices_fetch,
    name="mandi_prices_lookup",
    description=(
        "Fetch mandi prices from AGMARKNET via data.gov.in (API only). "
        "Inputs: state, district, commodity, [market], [variety], [start_date], [end_date], [max_rows]. "
        "Returns {data: [...], source_stamp: '...'} in Fasal-Setu schema."
    ),
)


In [45]:
# Set your key for this notebook session (or export in your shell)
# os.environ["DATA_GOV_IN_API_KEY"] = "YOUR_KEY_HERE"

print("API key present:", bool(os.getenv("DATA_GOV_IN_API_KEY")))

demo_args = {
    "state": "Karnataka",
    "district": "Belagavi",     # try also: "Belgaum"
    "commodity": "Tomato",
    # widen window if you see 0 rows
    "start_date": None,
    "end_date": None,
    "max_rows": 100,
}

out = prices_fetch(demo_args)
print("Source:", out["source_stamp"])
print("Rows:", len(out["data"]))
for i, r in enumerate(out["data"][:5], 1):
    print(f"{i}. {r['arrival_date']} | {r['district']} > {r['market']} | {r['commodity']} ({r.get('variety')}) "
          f"modal={r['modal_price_rs_per_qtl']} min={r['min_price_rs_per_qtl']} max={r['max_price_rs_per_qtl']}")


API key present: True


RuntimeError: DATA_GOV_IN_API_KEY is not set. Get a key on data.gov.in and set it in env.

#SCRAPPING

In [46]:
%pip install --quiet requests beautifulsoup4 lxml pydantic langchain langchain-core python-dateutil pandas


Note: you may need to restart the kernel to use updated packages.


In [47]:
import re
import time
from datetime import date
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
from pydantic import BaseModel, Field, validator

try:
    from langchain_core.tools import StructuredTool
except ImportError:
    from langchain.tools import StructuredTool

AGMARKNET_SEARCH_URL = "https://agmarknet.gov.in/SearchCmmMkt.aspx"

# polite scraping defaults
DEFAULT_TIMEOUT = 30
DEFAULT_SLEEP = 0.8  # seconds between GET and POST
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; FasalSetuScraper/1.0; +https://example.com)",
    "Accept-Language": "en-US,en;q=0.9",
}


In [48]:
def _norm(x: Optional[str]) -> Optional[str]:
    if x is None:
        return None
    s = str(x).strip()
    return s if s else None

def _to_float(x):
    try:
        return float(str(x).strip().replace(",", ""))
    except Exception:
        return None

def _canon(val: Optional[str]) -> Optional[str]:
    return _norm(val)

def _in_date_range(d: Optional[str], start: Optional[str], end: Optional[str]) -> bool:
    if not d:
        return False
    dd = dateparser.parse(d).date()
    if start and dd < dateparser.parse(start).date():
        return False
    if end and dd > dateparser.parse(end).date():
        return False
    return True


In [49]:
def _extract_hidden_fields(soup: BeautifulSoup) -> Dict[str, str]:
    fields = {}
    for name in ["__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "__VIEWSTATEENCRYPTED"]:
        el = soup.find("input", {"name": name})
        if el and el.has_attr("value"):
            fields[name] = el["value"]
    # ASP.NET needs these even if empty
    fields["__EVENTTARGET"] = ""
    fields["__EVENTARGUMENT"] = ""
    return fields

def _find_field_name(soup: BeautifulSoup, label_text_candidates: List[str], fallback_regex: str) -> Optional[str]:
    """
    Try to locate the 'name' attribute of a <select>/<input> whose <label> or nearby text
    matches any of label_text_candidates. Otherwise fallback to regex on name/id.
    """
    # 1) Try labels
    for lab in soup.find_all(["label", "span"]):
        text = (lab.get_text() or "").strip().lower()
        if any(k in text for k in label_text_candidates):
            # look for a nearby select/input
            nxt = lab.find_next(["select", "input"])
            if nxt and nxt.has_attr("name"):
                return nxt["name"]

    # 2) Regex fallback on name/id
    pat = re.compile(fallback_regex, re.I)
    for sel in soup.find_all(["select", "input"]):
        nm = sel.get("name") or sel.get("id") or ""
        if pat.search(nm):
            return sel.get("name") or sel.get("id")

    return None


In [50]:
EXPECTED_HEADERS = {
    "state": ["state"],
    "district": ["district"],
    "market": ["market", "mkt"],
    "arrival_date": ["date", "arrival date"],
    "commodity": ["commodity"],
    "variety": ["variety"],
    "min_price": ["min price", "min"],
    "max_price": ["max price", "max"],
    "modal_price": ["modal price", "modal"],
    "arrival": ["arrival", "arrivals", "qty", "quantity"],
}

def _header_match(h: str, keys: List[str]) -> bool:
    ht = (h or "").strip().lower()
    return any(k in ht for k in keys)

def _find_results_table(soup: BeautifulSoup):
    # heuristic: choose the table whose header row contains at least 5 of the expected columns
    best = None
    best_score = -1
    for tbl in soup.find_all("table"):
        headers = [th.get_text(strip=True) for th in tbl.find_all("th")]
        if not headers:
            # try first row <tr><td> as header-like
            first_tr = tbl.find("tr")
            if not first_tr:
                continue
            headers = [td.get_text(strip=True) for td in first_tr.find_all(["td", "th"])]
        score = 0
        for key, keys in EXPECTED_HEADERS.items():
            if any(_header_match(h, keys) for h in headers):
                score += 1
        if score > best_score:
            best_score = score
            best = (tbl, headers)
    return best  # (table, headers)


In [51]:
def _map_to_schema(row: Dict[str, Any], source_url: str) -> Dict[str, Any]:
    return {
        "state": _canon(row.get("state")),
        "district": _canon(row.get("district")),
        "market": _canon(row.get("market")),
        "arrival_date": _canon(row.get("arrival_date")),
        "commodity": _canon(row.get("commodity")),
        "variety": _canon(row.get("variety")),
        "min_price_rs_per_qtl": _to_float(row.get("min_price")),
        "max_price_rs_per_qtl": _to_float(row.get("max_price")),
        "modal_price_rs_per_qtl": _to_float(row.get("modal_price")),
        "arrival_qty": _to_float(row.get("arrival")),
        "source_url": source_url,
        "last_checked": date.today().isoformat(),
    }


In [59]:
def agmarknet_scrape(
    commodity: str,
    state: str,
    district: Optional[str] = None,
    market: Optional[str] = None,
    start_date: Optional[str] = None,  # YYYY-MM-DD
    end_date: Optional[str] = None,    # YYYY-MM-DD
) -> Dict[str, Any]:
    """
    Scrape AGMARKNET SearchCmmMkt.aspx like the GitHub repo approach (no data.gov.in API).
    Returns: { "records": [raw_rows], "source_url": <page URL> }
    """
    session = requests.Session()
    session.headers.update(HEADERS)

    # Step 1: GET the search page to collect WebForms tokens and control names
    r = session.get(AGMARKNET_SEARCH_URL, timeout=DEFAULT_TIMEOUT)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    hidden = _extract_hidden_fields(soup)

    # Try to discover likely control names for dropdowns/inputs
    name_state = _find_field_name(soup, ["state"], r"state|ddlstate")
    name_district = _find_field_name(soup, ["district"], r"district|ddldistrict")
    name_market = _find_field_name(soup, ["market"], r"market|ddlmarket|apmc")
    name_commodity = _find_field_name(soup, ["commodity"], r"commodity|ddlcommodity")
    name_from = _find_field_name(soup, ["from", "from date", "start"], r"from|start|fromdate")
    name_to = _find_field_name(soup, ["to", "to date", "end"], r"to|end|todate")

    # A common submit button name/id on this page is often like 'btnGo' / 'btnSearch'
    # We'll try to find any submit button.
    btn = soup.find("input", {"type": "submit"}) or soup.find("button", {"type": "submit"})
    submit_name = (btn.get("name") if btn and btn.has_attr("name") else "btnGo")

    # Build payload (ASP.NET requires ALL visible form inputs; at minimum hidden + your fields)
    payload = dict(hidden)  # copies __VIEWSTATE, etc.
    if name_state:
        payload[name_state] = state
    if name_district and district:
        payload[name_district] = district
    if name_market and market:
        payload[name_market] = market
    if name_commodity:
        payload[name_commodity] = commodity
    if name_from and start_date:
        payload[name_from] = start_date
    if name_to and end_date:
        payload[name_to] = end_date

    # ASP.NET WebForms usually needs a submit name in payload
    payload[submit_name] = "Search"

    # Small delay to be polite
    time.sleep(DEFAULT_SLEEP)

    # Step 2: POST the form
    r2 = session.post(AGMARKNET_SEARCH_URL, data=payload, timeout=DEFAULT_TIMEOUT, headers=HEADERS)
    r2.raise_for_status()
    soup2 = BeautifulSoup(r2.text, "lxml")

    # Step 3: locate the results table
    tbl_headers = _find_results_table(soup2)
    if not tbl_headers:
        # Sometimes results are rendered in update panels — try another pass: look for any table with 'Commodity'
        candidates = soup2.find_all("table")
        if not candidates:
            raise RuntimeError("Could not locate any results table on AGMARKNET page.")
        # fallback: pick the first table as a last resort
        tbl = candidates[0]
        headers = [th.get_text(strip=True) for th in tbl.find_all("th")]
    else:
        tbl, headers = tbl_headers

    # Normalize header names and build column index map
    head_texts = [h.strip() for h in headers] if headers else []
    if not head_texts:
        # try first row as header
        first_tr = tbl.find("tr")
        head_texts = [td.get_text(strip=True) for td in first_tr.find_all(["td","th"])] if first_tr else []

    col_map = {}
    for idx, h in enumerate(head_texts):
        ht = h.lower()
        if any(k in ht for k in EXPECTED_HEADERS["state"]): col_map["state"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["district"]): col_map["district"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["market"]): col_map["market"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["arrival_date"]): col_map["arrival_date"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["commodity"]): col_map["commodity"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["variety"]): col_map["variety"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["min_price"]): col_map["min_price"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["max_price"]): col_map["max_price"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["modal_price"]): col_map["modal_price"] = idx
        if any(k in ht for k in EXPECTED_HEADERS["arrival"]): col_map["arrival"] = idx

    # Step 4: iterate data rows
    rows = []
    for tr in tbl.find_all("tr"):
        tds = tr.find_all("td")
        if not tds or len(tds) < max(col_map.values(), default=0)+1:
            continue
        # Heuristic: skip header-like rows
        if tds[0].find("th"): 
            continue

        def pick(key):
            i = col_map.get(key, None)
            return tds[i].get_text(strip=True) if i is not None else None

        row = {
            "state": pick("state"),
            "district": pick("district"),
            "market": pick("market"),
            "arrival_date": pick("arrival_date"),
            "commodity": pick("commodity"),
            "variety": pick("variety"),
            "min_price": pick("min_price"),
            "max_price": pick("max_price"),
            "modal_price": pick("modal_price"),
            "arrival": pick("arrival"),
        }

        # If a date window was passed, filter here (site may ignore date fields if selections are partial)
        if (start_date or end_date) and not _in_date_range(row.get("arrival_date"), start_date, end_date):
            continue

        rows.append(row)

    return {"records": rows, "source_url": AGMARKNET_SEARCH_URL}


In [60]:
class ScrapeArgs(BaseModel):
    state: str
    district: Optional[str] = None
    commodity: str
    market: Optional[str] = None
    start_date: Optional[str] = None  # YYYY-MM-DD
    end_date: Optional[str] = None
    max_rows: int = 1000

    @validator("start_date", "end_date")
    def _datefmt(cls, v):
        if v is None:
            return v
        return dateparser.parse(v).date().isoformat()

def prices_fetch_scrape(args: Dict[str, Any]) -> Dict[str, Any]:
    """
    Scraping-based mandi lookup (AGMARKNET HTML), returns your schema + source_stamp.
    """
    a = ScrapeArgs(**args)
    raw = agmarknet_scrape(
        commodity=a.commodity,
        state=a.state,
        district=a.district,
        market=a.market,
        start_date=a.start_date,
        end_date=a.end_date,
    )
    # map & cap
    mapped = [_map_to_schema(r, raw["source_url"]) for r in raw["records"]]
    return {"data": mapped[: a.max_rows], "source_stamp": raw["source_url"]}


C:\Users\Hp\AppData\Local\Temp\ipykernel_19496\1067131509.py:10: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator("start_date", "end_date")


In [61]:
MandiPricesScrapeTool = StructuredTool.from_function(
    func=prices_fetch_scrape,
    name="mandi_prices_scrape",
    description=(
        "Scrape AGMARKNET (SearchCmmMkt.aspx) for daily mandi prices — repo-style scraper. "
        "Inputs: state, district, commodity, [market], [start_date], [end_date], [max_rows]. "
        "Returns {data: [...], source_stamp: '...'} in Fasal-Setu schema."
    ),
)


In [64]:
demo_args = {
    "state": "Bihar",
    "district": "",       # try alternate spellings if needed ("Belgaum")
    "commodity": "Rice",
    # "market": "Belagavi APMC",  # optional
    # "start_date": "2025-08-01",
    # "end_date": "2025-08-15",
    "max_rows": 50,
}

out = prices_fetch_scrape(demo_args)
print("Source:", out["source_stamp"])
print("Rows:", len(out["data"]))
for i, r in enumerate(out["data"][:5], 1):
    print(f"{i}. {r['arrival_date']} | {r['state']} > {r['district']} > {r['market']} "
          f"| {r['commodity']} ({r.get('variety')}) "
          f"modal={r['modal_price_rs_per_qtl']} min={r['min_price_rs_per_qtl']} max={r['max_price_rs_per_qtl']}")


Source: https://agmarknet.gov.in/SearchCmmMkt.aspx
Rows: 4
1. None | None > None > None | None (None) modal=None min=None max=None
2. None | None > None > None | None (None) modal=None min=None max=None
3. None | None > None > None | None (None) modal=None min=None max=None
4. None | None > None > None | None (None) modal=None min=None max=None
