In [1]:
from __future__ import annotations
import json
from pathlib import Path
from typing import List

import pandas as pd
from joblib import load

from langchain.tools import StructuredTool

from langgraph.prebuilt import create_react_agent
from langchain.chat_models.base import init_chat_model

In [2]:
MODELS_DIR = Path("../models")

with open(MODELS_DIR / "metadata.json", "r") as f:
    METADATA = json.load(f)

SCHEMA = METADATA["schema"]
CAT_COLS: List[str] = SCHEMA["categorical"]
NUM_COLS: List[str] = SCHEMA["numerical"]
ALL_FEATURES: List[str] = CAT_COLS + NUM_COLS
POS_LABEL: str = SCHEMA["positive_class"]
MAPPING = SCHEMA["mapping"]
INV_MAPPING = {v: k for k, v in MAPPING.items()}

valid_models = ["logreg","dt_depth5","knn_5"]

ALL_MODELS = {
    name: load(cfg["path"]) for name, cfg in METADATA["models"].items()
}

In [3]:
PIPES = {k: v for k, v in ALL_MODELS.items() if k in valid_models}
PIPES

{'logreg': Pipeline(steps=[('prep',
                  ColumnTransformer(transformers=[('num', StandardScaler(),
                                                   ['age', 'final_weight',
                                                    'education_number',
                                                    'capital_gain',
                                                    'capital_loss',
                                                    'hours_per_week']),
                                                  ('cat',
                                                   OneHotEncoder(handle_unknown='ignore'),
                                                   ['workclass',
                                                    'marital_status',
                                                    'occupation', 'relationship',
                                                    'race', 'sex',
                                                    'native_country'])])),
                 ('model'

In [4]:
# from pydantic import BaseModel, Field

# class RecordInput(BaseModel):
#     age: int = Field(..., description="Age in years")
#     workclass: str = Field(..., description="Type of work class")
#     final_weight: int = Field(..., description="Census final weight")
#     education_number: int = Field(..., description="Numeric encoding of education level")
#     marital_status: str = Field(..., description="Marital status")
#     occupation: str = Field(..., description="Occupation category")
#     relationship: str = Field(..., description="Relationship status")
#     race: str = Field(..., description="Race of the individual")
#     sex: str = Field(..., description="Sex of the individual")
#     capital_gain: int = Field(..., description="Capital gain value")
#     capital_loss: int = Field(..., description="Capital loss value")
#     hours_per_week: int = Field(..., description="Hours worked per week")
#     native_country: str = Field(..., description="Native country")

# class ModelPrediction(BaseModel):
#     model: str
#     pred: str
#     label: str
#     prob_gt_50k: float

# from typing import Any, Dict, Mapping, Union
# from pydantic import BaseModel

# def _to_dict(obj: Union[Mapping[str, Any], BaseModel]) -> Dict[str, Any]:
#     # Works for Pydantic v2 (.model_dump) and v1 (.dict)
#     if isinstance(obj, BaseModel):
#         if hasattr(obj, "model_dump"):
#             return obj.model_dump()
#         return obj.dict()
#     return dict(obj)  # Mapping -> dict

# def coerce_df(rec: Union[Dict[str, Any], BaseModel]) -> pd.DataFrame:
#     data = _to_dict(rec)
#     row = {k: data.get(k, None) for k in ALL_FEATURES}
#     df = pd.DataFrame([row], columns=ALL_FEATURES)
#     for c in ("occupation", "workclass", "native_country"):
#         if c in df.columns and (pd.isna(df.at[0, c]) or df.at[0, c] == ""):
#             df.at[0, c] = "is_missing_from_data"
#     return df

# def pipe_predict(pipe, df: pd.DataFrame, model_key: str) -> ModelPrediction:
#     pred = pipe.predict(df)[0]
#     label = INV_MAPPING[int(pred)] if isinstance(pred, (int, float)) else str(pred)
#     prediction = "<=50K" if label == 0 else ">50K"
#     prob = float(pipe.predict_proba(df)[:, 1][0]) if hasattr(pipe, "predict_proba") else (1.0 if label == POS_LABEL else 0.0)
#     return ModelPrediction(model=model_key, pred = prediction, label=label, prob_gt_50k=prob)

# def make_tool(model_key: str) -> StructuredTool:
#     pipe = PIPES[model_key]

#     def _run(**kwargs) -> Dict[str, Any]:
#         inp = RecordInput(**kwargs)
#         df = coerce_df(inp)
#         out = pipe_predict(pipe, df, model_key)
#         if hasattr(out, "model_dump"):
#             return out.model_dump()
#         return out.dict()

#     return StructuredTool.from_function(
#         name=f"{model_key}_predict",
#         description=(f"Predict income using trained pipeline '{model_key}'. "
#                      "Provide all feature fields; returns JSON {model,label,prob_gt_50k}."),
#         args_schema=RecordInput,
#         func=_run,
#     )

# TOOLS: List[StructuredTool] = [make_tool(k) for k in PIPES.keys()]

# # class Empty(BaseModel): pass
# # def schema_fn(_: Empty = None) -> str:
# #     return json.dumps(
# #         {"categorical": CAT_COLS, "numerical": NUM_COLS, "positive_class": POS_LABEL}
# #     )
# # TOOLS.append(
# #     StructuredTool.from_function(
# #         name="get_feature_schema",
# #         description="Return required feature columns and positive class label.",
# #         func=schema_fn,
# #     )
# # )

In [5]:
test_df = pd.read_csv("../data/test.csv")
X_test = test_df.drop(columns=["income"])
y_test = test_df["income"]

In [6]:
def batch_predict_all(PIPES, X):
    out = {}
    for k in PIPES.keys():
        pipe = PIPES[k]
        out[k] = (pipe.predict_proba(X)[:, 1].astype(float)
                  if hasattr(pipe, "predict_proba")
                  else (pipe.predict(X).astype(int) == 1).astype(float))
    return out

probs_by_model = batch_predict_all(ALL_MODELS, X_test)



In [7]:
len(probs_by_model), len(X_test), len(X_test.columns), len(valid_models)

(6, 6508, 13, 3)

In [None]:
# available_tools = "\n".join([f"- {tool.name}: {tool.description}" for tool in TOOLS])

SYS_PROMPT = f"""
You are an ensemble meta-classifier for solving the problem of predicting whether a person's income exceeds $50K/year based on census data.

I will give the output of several weak classifiers (trained models) that provide probabilities for the positive class (>50K income). Your task is to combine these outputs to make a final prediction.

Think step-by-step about how to combine the outputs of these models to make a final prediction. Since these are weak classifiers, you may need to use the general knowledge of the problem and input data to improve the final prediction.
Logreg - Logistic regression model
DT5 - Decision tree model with max depth 5
KNN5 - K-nearest neighbors model with k=5

Response format:
```
{{
    "final_prediction": "<=50K" or ">50K",
    "prob_gt_50k": 0.0 or 1.0,
    "reasoning": "Explain your reasoning here",
}}
""".strip()

# llm = init_chat_model(model="gpt-oss:20b", model_provider="ollama", temperature=0.1)
llm = init_chat_model(model="qwen3:8b", model_provider="ollama", temperature=0.1)
# llm = init_chat_model(model="llama3.1:8b", model_provider="ollama", temperature=0.1)


In [60]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [61]:
import re
import json

def parse_agent_output(output_str: str) -> dict:
    think_match = re.search(r"<think>(.*?)</think>", output_str, re.DOTALL)
    thought_process = think_match.group(1).strip() if think_match else None
    
    json_match = re.search(r"\{.*\}", output_str, re.DOTALL)
    json_data = json.loads(json_match.group(0)) if json_match else {}
    
    return {
        "thought_process": thought_process,
        "label": json_data.get("final_prediction"),
        "prob_gt_50k": json_data.get("final_prob_gt_50k"),
        "evidence": json_data.get("reasoning", [])
    }


In [62]:
index = 1010
sample_record = X_test.iloc[index].to_dict()

user_payload = {"record": sample_record}
user_message = (
    "Predict the income class for this record. "
    + json.dumps(user_payload)
)

for model, prediction in probs_by_model.items():
    if model not in valid_models:
        continue
    user_message += f"\nModel: {model} - Prediction: {float(prediction[index]):.4f}"

print(f"User payload: {user_message}")
out = llm.invoke([("system", SYS_PROMPT), ("user", user_message)])
text = out.content.strip()

print("="*10)
print(text)

User payload: Predict the income class for this record. {"record": {"age": 71, "workclass": "Local-gov", "final_weight": 337064, "education_number": 14, "marital_status": "Widowed", "occupation": "Prof-specialty", "relationship": "Not-in-family", "race": "White", "sex": "Female", "capital_gain": 0, "capital_loss": 0, "hours_per_week": 40, "native_country": "United-States"}}
Model: logreg - Prediction: 0.6215
Model: dt_depth5 - Prediction: 0.1453
Model: knn_5 - Prediction: 0.0000
To make a final prediction, I'll combine the outputs of these models by considering their strengths and weaknesses.

The Logistic Regression (Logreg) model has a high probability of >50K income (0.6215), which suggests that it's confident in its prediction. This is likely due to its ability to capture complex relationships between features.

The Decision Tree (DT5) model, on the other hand, has a relatively low probability of >50K income (0.1453). This might be because decision trees can suffer from overfitting

In [71]:
import re

def extract_simple(text: str):
    print(text)
    # final_prediction
    m = re.search(r'final_prediction.*?(<=50K|>50K)', text)
    final_prediction = m.group(1) if m else None

    # prob_gt_50k
    m = re.search(r'prob_gt_50k.*?([0-9]*\.?[0-9]+)', text)
    prob_gt_50k = float(m.group(1)) if m else None
    label = ">50K" if prob_gt_50k and prob_gt_50k > 0.5 else "<=50K"
    final_prediction = label if not final_prediction else final_prediction

    # reasoning
    m = re.search(r'reasoning.*?:\s*"(.*?)"', text, flags=re.DOTALL)
    reasoning = m.group(1).strip() if m else None

    return {
        "final_prediction": final_prediction,
        "prob_gt_50k": prob_gt_50k,
        "reasoning": reasoning
    }

In [72]:
final_text = extract_simple(text)
print(final_text)

To make a final prediction, I'll combine the outputs of these models by taking their average probability for the positive class (>50K income). This is a simple and effective way to combine the predictions of multiple models.

Here's my reasoning:

* The logistic regression model (Logreg) predicts a probability of 0.7672, which means it has a moderate confidence in predicting that the person's income exceeds $50K.
* The decision tree model with max depth 5 (DT5) predicts a lower probability of 0.6603, indicating less confidence in its prediction.
* The K-nearest neighbors model with k=5 (KNN5) predicts a very high probability of 1.0000, which is likely an outlier and may not be reliable.

Taking the average of these probabilities gives us:

(0.7672 + 0.6603 + 1.0000) / 3 = 0.8098

Based on this combined prediction, I conclude that the person's income exceeds $50K.

Here is my response in the required format:
```
{
    "final_prediction": ">50K",
    "prob_gt_50k": 0.8098,
    "reasoning

In [73]:
# final_json = parse_agent_output(final_text)
# print("\nParsed FINAL JSON:", final_json)

In [92]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

def evaluate(y_true, y_pred, y_prob=None):
    if y_prob is None:
        y_prob = np.where(y_pred == 1, 1.0, 0.0)
    
    # Convert y_prob to numpy array and handle non-numeric values
    y_prob = np.array(y_prob)
    
    # Check for non-numeric values and convert them
    if y_prob.dtype == 'object' or not np.issubdtype(y_prob.dtype, np.number):
        print(f"Warning: y_prob contains non-numeric values. Data type: {y_prob.dtype}")
        print(f"Sample values: {y_prob[:5]}")
        
        # Try to convert to float, replacing non-convertible values with NaN
        y_prob_numeric = []
        for val in y_prob:
            try:
                if val is None or val == 'None' or val == '':
                    y_prob_numeric.append(np.nan)
                else:
                    y_prob_numeric.append(float(val))
            except (ValueError, TypeError):
                y_prob_numeric.append(np.nan)
        
        y_prob = np.array(y_prob_numeric, dtype=float)
    
    # Now handle NaN values in probability scores
    if np.any(np.isnan(y_prob)):
        # Option 1: Remove NaN entries from all arrays
        valid_mask = ~np.isnan(y_prob)
        y_true_clean = np.array(y_true)[valid_mask]
        y_pred_clean = np.array(y_pred)[valid_mask]
        y_prob_clean = np.array(y_prob)[valid_mask]
        
        # Check if we have enough valid samples
        if len(y_prob_clean) == 0:
            print("Warning: All probability scores are NaN. Cannot compute ROC AUC.")
            roc_auc_value = np.nan
        elif len(np.unique(y_true_clean)) < 2:
            print("Warning: Only one class present in valid samples. Cannot compute ROC AUC.")
            roc_auc_value = np.nan
        else:
            roc_auc_value = float(roc_auc_score(y_true_clean, y_prob_clean))
    else:
        # No NaN values, proceed normally
        if len(np.unique(y_true)) < 2:
            print("Warning: Only one class present. Cannot compute ROC AUC.")
            roc_auc_value = np.nan
        else:
            roc_auc_value = float(roc_auc_score(y_true, y_prob))
    
    return {
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision_pos": float(precision_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "recall_pos": float(recall_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "f1_pos": float(f1_score(y_true, y_pred, pos_label=1, zero_division=0)),
        "roc_auc": roc_auc_value,
    }

In [75]:
X_test.shape, y_test.shape

((6508, 13), (6508,))

In [76]:
# y_test.unique()
POS_LABEL = ">50K"
y_true_bin = (y_test.astype(str).str.strip() == POS_LABEL).astype(int).to_numpy()

In [77]:
# results = {}
# for tool in TOOLS:
#     if not tool.name.endswith("_predict"):
#         continue
    
#     preds = []
#     probs = []
    
#     for _, row in X_test.iterrows():
#         rec_json = row.to_dict()
#         raw = tool.run(rec_json)  
#         out = json.loads(raw)
#         label = out["label"]
#         prob = out["prob_gt_50k"]

#         preds.append(1 if label.strip() == ">50K" else 0)
#         probs.append(prob)
    
#     metrics = evaluate(y_true_bin, np.array(preds), np.array(probs))
#     results[tool.name] = metrics

# import pprint
# pprint.pprint(results)

In [79]:
preds = []
probs = []
final_results = []

for idx, row in X_test.iterrows():
    print("="*10,idx,"="*10)

    user_payload = {"record": row.to_dict()}
    user_message = (
        "Predict the income class for this record. "
        + json.dumps(user_payload)
    )

    for model, prediction in probs_by_model.items():
        if model not in valid_models:
            continue
        user_message += f"\nModel: {model} - Prediction: {float(prediction[idx]):.4f}"

    print(f"User payload: {user_message}")
    out = llm.invoke([("system", SYS_PROMPT), ("user", user_message)])
    text = out.content.strip()

    label = 1.0
    prob = 0.0
    try:
        final_json = extract_simple(text)
        label = final_json["final_prediction"]
        prob = final_json["prob_gt_50k"]
        final_results.append(final_json)
        print("\nParsed FINAL JSON:", final_json)
    except Exception as e:
        import traceback
        print(f"Error parsing JSON: {e}")
        traceback.print_exc()
    
    preds.append(label)
    probs.append(prob)

User payload: Predict the income class for this record. {"record": {"age": 39, "workclass": "Self-emp-inc", "final_weight": 163057, "education_number": 10, "marital_status": "Divorced", "occupation": "Craft-repair", "relationship": "Not-in-family", "race": "White", "sex": "Male", "capital_gain": 0, "capital_loss": 0, "hours_per_week": 99, "native_country": "United-States"}}
Model: logreg - Prediction: 0.6735
Model: dt_depth5 - Prediction: 0.1019
Model: knn_5 - Prediction: 0.4000
To make a final prediction, I will combine the outputs of these models by taking into account their strengths and weaknesses.

The logistic regression model (Logreg) has a high probability of >50K income (0.6735), which suggests that it is confident in its prediction. This model is often good at capturing complex relationships between variables, so its high confidence is a good sign.

On the other hand, the decision tree model (DT5) has a relatively low probability of >50K income (0.1019). Decision trees can be

KeyboardInterrupt: 

In [82]:
preds = np.array(preds, dtype=str)
preds_bin = (np.char.strip(preds) == POS_LABEL).astype(int)

In [83]:
processed_record_count = len(preds)
print(f"Processed {processed_record_count} records.")

Processed 2576 records.


In [93]:
metrics = evaluate(y_true_bin[:processed_record_count], preds_bin[:processed_record_count], np.array(probs)[:processed_record_count])

Sample values: [0.4771 0.6218 0.82825 0.0863 0.5687]


In [94]:
results = {}
results["agent"] = metrics

In [95]:
metrics_by_model = {}

for model, probs in probs_by_model.items():
    probs_600 = probs[:processed_record_count]
    preds_600 = (probs_600 >= 0.5).astype(int)

    results[model] = evaluate(y_true_bin[:processed_record_count], preds_600, probs_600)

In [96]:
df = pd.DataFrame(results).T
df = df.reset_index().rename(columns={'index': 'Model'})

df = df.round(3)

In [97]:
df

Unnamed: 0,Model,accuracy,precision_pos,recall_pos,f1_pos,roc_auc
0,agent,0.743,0.485,0.8,0.604,0.897
1,dummy_most_frequent,0.755,0.0,0.0,0.0,0.5
2,logreg,0.808,0.575,0.821,0.676,0.899
3,dt_depth5,0.847,0.776,0.524,0.626,0.876
4,rf_200,0.847,0.737,0.584,0.652,0.9
5,knn_5,0.825,0.67,0.56,0.61,0.847
6,lgbm,0.863,0.774,0.621,0.689,0.922
