In [1]:
from __future__ import annotations
import json
from pathlib import Path
from typing import Dict, Any, Optional, List

import pandas as pd
from joblib import load

from langchain.tools import StructuredTool

from langgraph.prebuilt import create_react_agent
from langchain.chat_models.base import init_chat_model

In [2]:
MODELS_DIR = Path("../models")

with open(MODELS_DIR / "metadata.json", "r") as f:
    METADATA = json.load(f)

SCHEMA = METADATA["schema"]
CAT_COLS: List[str] = SCHEMA["categorical"]
NUM_COLS: List[str] = SCHEMA["numerical"]
ALL_FEATURES: List[str] = CAT_COLS + NUM_COLS
POS_LABEL: str = SCHEMA["positive_class"]
MAPPING = SCHEMA["mapping"]
INV_MAPPING = {v: k for k, v in MAPPING.items()}

PIPES: Dict[str, Any] = {
    name: load(cfg["path"]) for name, cfg in METADATA["models"].items()
}

In [3]:
from pydantic import BaseModel, Field

class RecordInput(BaseModel):
    age: int = Field(..., description="Age in years")
    workclass: str = Field(..., description="Type of work class")
    final_weight: int = Field(..., description="Census final weight")
    education_number: int = Field(..., description="Numeric encoding of education level")
    marital_status: str = Field(..., description="Marital status")
    occupation: str = Field(..., description="Occupation category")
    relationship: str = Field(..., description="Relationship status")
    race: str = Field(..., description="Race of the individual")
    sex: str = Field(..., description="Sex of the individual")
    capital_gain: int = Field(..., description="Capital gain value")
    capital_loss: int = Field(..., description="Capital loss value")
    hours_per_week: int = Field(..., description="Hours worked per week")
    native_country: str = Field(..., description="Native country")

class ModelPrediction(BaseModel):
    model: str
    pred: str
    label: str
    prob_gt_50k: float

from typing import Any, Dict, Mapping, Union
from pydantic import BaseModel

def _to_dict(obj: Union[Mapping[str, Any], BaseModel]) -> Dict[str, Any]:
    # Works for Pydantic v2 (.model_dump) and v1 (.dict)
    if isinstance(obj, BaseModel):
        if hasattr(obj, "model_dump"):
            return obj.model_dump()
        return obj.dict()
    return dict(obj)  # Mapping -> dict

def coerce_df(rec: Union[Dict[str, Any], BaseModel]) -> pd.DataFrame:
    data = _to_dict(rec)
    row = {k: data.get(k, None) for k in ALL_FEATURES}
    df = pd.DataFrame([row], columns=ALL_FEATURES)
    for c in ("occupation", "workclass", "native_country"):
        if c in df.columns and (pd.isna(df.at[0, c]) or df.at[0, c] == ""):
            df.at[0, c] = "is_missing_from_data"
    return df

def pipe_predict(pipe, df: pd.DataFrame, model_key: str) -> ModelPrediction:
    pred = pipe.predict(df)[0]
    label = INV_MAPPING[int(pred)] if isinstance(pred, (int, float)) else str(pred)
    prediction = "<=50K" if label == 0 else ">50K"
    prob = float(pipe.predict_proba(df)[:, 1][0]) if hasattr(pipe, "predict_proba") else (1.0 if label == POS_LABEL else 0.0)
    return ModelPrediction(model=model_key, pred = prediction, label=label, prob_gt_50k=prob)

def make_tool(model_key: str) -> StructuredTool:
    pipe = PIPES[model_key]

    def _run(**kwargs) -> Dict[str, Any]:
        inp = RecordInput(**kwargs)
        df = coerce_df(inp)
        out = pipe_predict(pipe, df, model_key)
        if hasattr(out, "model_dump"):
            return out.model_dump()
        return out.dict()

    return StructuredTool.from_function(
        name=f"{model_key}_predict",
        description=(f"Predict income using trained pipeline '{model_key}'. "
                     "Provide all feature fields; returns JSON {model,label,prob_gt_50k}."),
        args_schema=RecordInput,
        func=_run,
    )

TOOLS: List[StructuredTool] = [make_tool(k) for k in PIPES.keys()]

class Empty(BaseModel): pass
def schema_fn(_: Empty = None) -> str:
    return json.dumps(
        {"categorical": CAT_COLS, "numerical": NUM_COLS, "positive_class": POS_LABEL}
    )
TOOLS.append(
    StructuredTool.from_function(
        name="get_feature_schema",
        description="Return required feature columns and positive class label.",
        func=schema_fn,
    )
)

In [4]:
available_tools = "\n".join([f"- {tool.name}: {tool.description}" for tool in TOOLS])

SYS_PROMPT = f"""
You are an ensemble meta-classifier for solving the problem of predicting whether a person's income exceeds $50K/year based on census data.

You will use multiple trained models (tools) to predict income based on input features. Each model has been trained on the same dataset and can provide a prediction and probability.

You will:
1. Call one or more tools to gather predictions.Each tool predicts income and returns valid JSON:
    {{
      "model": "<name>",
      "label": "<=50K or >50K>",
      "prob_gt_50k": <float between 0 and 1>
    }}
2. Compare the outputs from different models.
3. Choose a final income label and probability based on the evidence.
4. Prefer consistent, higher-confidence predictions.

Output strictly in the following JSON format (no text, no explanations):

{{
  "final_label": "<=50K or >50K>",
  "final_prob_gt_50k": <float between 0 and 1>,
  "evidence": [
    {{"model": "<name>", "label": "<=50K or >50K>", "prob_gt_50k": <float>}},
    ...
  ]
}}

Constraints:
- Call at most 6 tools total.
- Once you call a prediction tool, you dont need to call it again as it will not change.
- If you have enough evidence to make a decision, STOP and produce the final JSON immediately.
- If you have called 6 tools and still don't have a clear decision, produce the final JSON with all evidence collected so far.
- If no tools return a valid prediction, return an empty "evidence" array and set "final_label" to "unknown" and "final_prob_gt_50k" to 0.0.
- If a tool call fails or is slow, SKIP it (no retries).
- You must finish in ≤ 6 tool calls and ≤ 200 output tokens.

Available tools:
{available_tools}
""".strip()

# print(SYS_PROMPT)

llm = init_chat_model(model="gpt-oss:20b", model_provider="ollama", temperature=0.1)
# llm = init_chat_model(model="qwen3:8b", model_provider="ollama", temperature=0.1)
# llm = init_chat_model(model="llama3.1:8b", model_provider="ollama", temperature=0.1)  
answer_agent = create_react_agent(
                model=llm,
                tools=TOOLS,
                prompt=SYS_PROMPT,
                debug=True
    )

config={"recursion_limit": 20} 


In [5]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [6]:
import re
import json

def parse_agent_output(output_str: str) -> dict:
    think_match = re.search(r"<think>(.*?)</think>", output_str, re.DOTALL)
    thought_process = think_match.group(1).strip() if think_match else None
    
    json_match = re.search(r"\{.*\}", output_str, re.DOTALL)
    json_data = json.loads(json_match.group(0)) if json_match else {}
    
    return {
        "thought_process": thought_process,
        "label": json_data.get("final_label"),
        "prob_gt_50k": json_data.get("final_prob_gt_50k"),
        "evidence": json_data.get("evidence", [])
    }


In [8]:
sample_record = {
    "age": 39,
    "workclass": "Private",
    "final_weight": 77516,
    "education_number": 13,
    "marital_status": "Never-married",
    "occupation": "Tech-support",
    "relationship": "Not-in-family",
    "race": "White",
    "sex": "Male",
    "capital_gain": 0,
    "capital_loss": 0,
    "hours_per_week": 40,
    "native_country": "United-States",
}

user_payload = {"record": sample_record}
user_message = (
    "Predict the income class for this record. "
    "First call a few model tools, then give a single final JSON.\n"
    + json.dumps(user_payload)
)

result = answer_agent.invoke({"messages": [{"role": "user", "content": user_message}]},config=config)

final_text = result["messages"][-1].content
print("\n=== AGENT OUTPUT ===\n", final_text)

try:
    final_json = parse_agent_output(final_text)
    print("\nParsed FINAL JSON:", final_json)
except Exception as e:
    import traceback
    print(f"Error parsing JSON: {e}")
    traceback.print_exc()
    pass

[1m[values][0m {'messages': [HumanMessage(content='Predict the income class for this record. First call a few model tools, then give a single final JSON.\n{"record": {"age": 39, "workclass": "Private", "final_weight": 77516, "education_number": 13, "marital_status": "Never-married", "occupation": "Tech-support", "relationship": "Not-in-family", "race": "White", "sex": "Male", "capital_gain": 0, "capital_loss": 0, "hours_per_week": 40, "native_country": "United-States"}}', additional_kwargs={}, response_metadata={}, id='e7cf18eb-50cd-4cee-823f-88dc576e0ef9')]}
[1m[updates][0m {'agent': {'messages': [AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'gpt-oss:20b', 'created_at': '2025-08-16T15:51:39.67248601Z', 'done': True, 'done_reason': 'stop', 'total_duration': 41098363965, 'load_duration': 47333598, 'prompt_eval_count': 2039, 'prompt_eval_duration': 189394384, 'eval_count': 214, 'eval_duration': 40857573236, 'model_name': 'gpt-oss:20b'}, id='run--29facb6e-b7

In [9]:
print(final_text)

{"final_label": ">50K","final_prob_gt_50k": 0.35298973664673033,"evidence":[{"model":"logreg","label":"0","prob_gt_50k":0.35298973664673033},{"model":"rf_200","label":"0","prob_gt_50k":0.15}]}


In [10]:
final_json = parse_agent_output(final_text)
print("\nParsed FINAL JSON:", final_json)


Parsed FINAL JSON: {'thought_process': None, 'label': '>50K', 'prob_gt_50k': 0.35298973664673033, 'evidence': [{'model': 'logreg', 'label': '0', 'prob_gt_50k': 0.35298973664673033}, {'model': 'rf_200', 'label': '0', 'prob_gt_50k': 0.15}]}


In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

def evaluate(y_true, y_pred, y_prob=None):
    if y_prob is None:
        y_prob = np.where(y_pred == 1, 1.0, 0.0)
    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision_pos": float(precision_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "recall_pos": float(recall_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "f1_pos": float(f1_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "roc_auc": float(roc_auc_score(y_true, y_prob)),
    }

In [12]:
test_df = pd.read_csv("../data/test.csv")
X_test = test_df.drop(columns=["income"])
y_test = test_df["income"]

In [13]:
X_test.shape, y_test.shape

((6508, 13), (6508,))

In [14]:
# y_test.unique()
POS_LABEL = ">50K"
y_true_bin = (y_test.astype(str).str.strip() == POS_LABEL).astype(int).to_numpy()

In [None]:
results = {}
for tool in TOOLS:
    if not tool.name.endswith("_predict"):
        continue
    
    preds = []
    probs = []
    
    for _, row in X_test.iterrows():
        rec_json = row.to_dict()
        raw = tool.run(rec_json)  
        out = json.loads(raw)
        label = out["label"]
        prob = out["prob_gt_50k"]

        preds.append(1 if label.strip() == ">50K" else 0)
        probs.append(prob)
    
    metrics = evaluate(y_true_bin, np.array(preds), np.array(probs))
    results[tool.name] = metrics

import pprint
pprint.pprint(results)

In [None]:
preds = []
probs = []
final_results = []

for idx, row in X_test.iterrows():
    print("="*10,idx,"="*10)
    user_payload = {"record": row.to_dict()}
    user_message = (
        "Predict the income class for this record. "
        "First call a few model tools, then give a single final JSON.\n"
        + json.dumps(user_payload)
    )

    result = answer_agent.invoke({"messages": [{"role": "user", "content": user_message}]})

    final_text = result["messages"][-1].content
    label = 1.0
    prob = 0.0
    try:
        final_json = parse_agent_output(final_text)
        label = final_json["label"]
        prob = final_json["prob_gt_50k"]
        final_results.append(final_json)
        print("\nParsed FINAL JSON:", final_json)
    except Exception as e:
        import traceback
        print(f"Error parsing JSON: {e}")
        traceback.print_exc()
    
    preds.append(label)
    probs.append(prob)

metrics = evaluate(y_true_bin, np.array(preds), np.array(probs))

[1m[values][0m {'messages': [HumanMessage(content='Predict the income class for this record. First call a few model tools, then give a single final JSON.\n{"record": {"age": 39, "workclass": "Self-emp-inc", "final_weight": 163057, "education_number": 10, "marital_status": "Divorced", "occupation": "Craft-repair", "relationship": "Not-in-family", "race": "White", "sex": "Male", "capital_gain": 0, "capital_loss": 0, "hours_per_week": 99, "native_country": "United-States"}}', additional_kwargs={}, response_metadata={}, id='a12e1214-c490-4366-bf65-ad95cf99b32e')]}
[1m[updates][0m {'agent': {'messages': [AIMessage(content='', additional_kwargs={}, response_metadata={'model': 'gpt-oss:20b', 'created_at': '2025-08-16T15:56:28.857373838Z', 'done': True, 'done_reason': 'stop', 'total_duration': 34209636852, 'load_duration': 36880101, 'prompt_eval_count': 2043, 'prompt_eval_duration': 9518964282, 'eval_count': 158, 'eval_duration': 24649887231, 'model_name': 'gpt-oss:20b'}, id='run--a0445978

In [None]:
preds

In [None]:
results["agent"] = metrics

In [None]:
import pprint
pprint.pprint(results)