In [None]:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Loan Approval Model - Notebook-Friendly Single-File Script (Patched, Protected-Feature Handling)

- No argparse; main() is parameter-driven.
- Protected/Derived/Given handling via feature_classification.csv:
    * Only "Derived" + "Given" features are used for training.
    * All "Protected" features are excluded and logged.
    * Saves a feature_tags.csv showing tag per feature and whether used.
- Robust preprocessing pipeline:
    * Median imputation (numeric), most_frequent (categorical)
    * Outlier capping per-feature: Z-score (≈normal) else IQR-based
    * RobustScaler (median/IQR scaling)
- Dynamic, safe feature selection to avoid KeyError.
- GridSearchCV with compatible solver/penalty combinations.
- StratifiedKFold with safe n_splits based on minority class size.
- Robust target casting to avoid IntCastingNaNError.
- Returns artifacts & model; also saves JSON + pickle to out_dir.
"""

import json
import pickle
from pathlib import Path
import time
from typing import List, Tuple, Optional, Dict, Any

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (f1_score, precision_score, recall_score, roc_auc_score,
                             average_precision_score, classification_report)

# -----------------------------
# Global config (edit these variables)
# -----------------------------

# Input CSV and label column used when main() is called without overrides.
DEFAULT_DATA = "Data/Loan_dataset_india_20000.csv"
DEFAULT_TARGET = "credit_approved"
# Seed shared by the train/test split, the CV shuffling and the classifier.
RANDOM_STATE = 42

# Path to the feature classification file (columns: Feature, Classification)
# Values in Classification should be one of: Protected, Derived, Given (case-insensitive).
# NOTE(review): this path uses "data/" while DEFAULT_DATA and OUT_DIR use "Data/";
# presumably the same directory - confirm on case-sensitive filesystems.
FEATURE_CLASSIFICATION: Optional[str] = "data/feature_classification.csv"

# I/O and training knobs
OUT_DIR: str = "Data"  # directory receiving feature_tags.csv, run_artifacts.json, best_model.pkl
TEST_SIZE: float = 0.3  # forwarded to train_test_split(test_size=...)
# Requested number of CV folds. run_training() clamps the effective value to at
# least 2, so the 1 configured here results in 2-fold cross-validation.
CV_FOLDS: int = 1
# Smoke mode: main() reads only 2000 rows (unless SAMPLE_NROWS is set) and
# get_param_grid() returns a single small lbfgs grid.
SMOKE: bool = False
# Optional cap on the number of CSV rows read by main(); None reads the whole file.
SAMPLE_NROWS: Optional[int] = None


# -----------------------------
# Utilities
# -----------------------------

def make_ohe():
    """Build a dense OneHotEncoder that ignores categories unseen during fit.

    scikit-learn >= 1.2 names the dense switch ``sparse_output``; older releases
    only accept ``sparse``. The newer spelling is tried first and the older one
    is used when the constructor rejects it with ``TypeError``.
    """
    shared_kwargs = {'handle_unknown': 'ignore'}
    try:
        encoder = OneHotEncoder(sparse_output=False, **shared_kwargs)
    except TypeError:
        encoder = OneHotEncoder(sparse=False, **shared_kwargs)
    return encoder


def safe_intersection(names: List[str], cols: List[str]) -> List[str]:
    """Keep the entries of ``names`` that also occur in ``cols``.

    The order (and any repetition) of ``names`` is preserved; ``cols`` is only
    used for membership checks.
    """
    available = frozenset(cols)
    kept: List[str] = []
    for candidate in names:
        if candidate in available:
            kept.append(candidate)
    return kept


def read_feature_classification(csv_path: Optional[str]) -> Optional[pd.DataFrame]:
    """Load and normalize the feature classification CSV.

    Args:
        csv_path: Path to a CSV with a feature-name column and a classification
            column. Headers are matched case-insensitively ("feature",
            "classification"); when a header is missing the first / second
            column is used instead.

    Returns:
        A DataFrame whose ``Feature`` column holds whitespace-stripped feature
        names and whose ``Classification`` column holds one of "protected",
        "derived" or "given" (unrecognized values become "given", with a
        warning). Returns None when no path is given, the file does not exist,
        or the file cannot be parsed; the latter two cases print a warning.
    """
    if not csv_path:
        return None
    p = Path(csv_path)
    if not p.exists():
        print(f"[warn] feature_classification not found at: {csv_path}. Proceeding without tags.")
        return None
    try:
        df = pd.read_csv(p)
        columns = list(df.columns)
        # Match headers case-insensitively and ignore stray padding around them.
        colmap = {str(c).strip().lower(): c for c in columns}
        feat_col = colmap.get("feature")
        if feat_col is None:
            feat_col = columns[0]
        cls_col = colmap.get("classification")
        if cls_col is None:
            # Positional fallback; an IndexError on a one-column file is
            # reported by the handler below.
            cls_col = columns[1]
        df = df.rename(columns={feat_col: "Feature", cls_col: "Classification"})
        # Strip padding so names line up with the data columns, which main()
        # strips too. Without this a padded "Protected" row would never match
        # and the protected feature would silently be used for training.
        df["Feature"] = df["Feature"].astype(str).str.strip()
        labels = df["Classification"].astype(str).str.strip().str.lower()
        recognized = labels.isin(["protected", "derived", "given"])
        unknown = sorted(set(labels[~recognized]))
        if unknown:
            print(f"[warn] Unrecognized classification value(s) {unknown} treated as 'given'.")
        df["Classification"] = labels.where(recognized, "given")
        return df
    except Exception as e:
        # Best effort by design: a broken tag file downgrades to "no tags".
        print(f"[warn] Failed to read/parse feature_classification '{csv_path}': {e}")
        return None


def tag_features_from_classification(all_cols: List[str], fc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """
    Build one row per column in ``all_cols`` with keys: feature, tag, raw_tag.

    ``raw_tag`` is the value looked up in ``fc_df`` ("given" when ``fc_df`` is
    None or the column is not listed); ``tag`` is the same value unless it is
    not one of given/derived/protected, in which case it falls back to "given".
    """
    known_tags = frozenset({"given", "derived", "protected"})
    if fc_df is None:
        lookup = {}
    else:
        lookup = dict(zip(fc_df["Feature"].astype(str), fc_df["Classification"]))

    def _describe(column):
        raw = lookup.get(column, "given")
        return {"feature": column, "tag": raw if raw in known_tags else "given", "raw_tag": raw}

    return pd.DataFrame([_describe(column) for column in all_cols])


def infer_feature_types(df: pd.DataFrame,
                        target: str) -> Tuple[List[str], List[str]]:
    """Split the non-target columns of ``df`` into (numeric, categorical) name lists.

    A column is numeric when pandas reports an ``np.number`` dtype for it; every
    other non-target column is treated as categorical. Column order is kept.
    """
    candidates = [name for name in df.columns if name != target]
    numeric = df[candidates].select_dtypes(include=[np.number]).columns.tolist()
    numeric_lookup = set(numeric)
    categorical = [name for name in candidates if name not in numeric_lookup]
    return numeric, categorical


class OutlierCapper(BaseEstimator, TransformerMixin):
    """
    Caps numeric features column-wise using either:
        - Z-score (mean ± 3*std) if abs(skew) < 1 (roughly normal), else
        - IQR method (Q1 - 1.5*IQR, Q3 + 1.5*IQR) for skewed distributions.
    Thresholds are learned on training data and applied to transform.

    Degenerate spreads:
        - std == 0 in the Z-score branch: bounds collapse to (mean, mean).
        - IQR == 0 in the IQR branch (e.g. a zero-inflated column where more
          than half the values are identical): the IQR bounds would be
          (Q1, Q1) and flatten the whole feature to a constant, so the column
          falls back to mean ± 3*std. Only if the std is zero as well are the
          bounds (Q1, Q3).
        - A column with no non-NaN training values is left uncapped.
    """
    def __init__(self):
        # One (low, high) pair per column, filled by fit(); (None, None) = no capping.
        self.bounds_: List[Tuple[Optional[float], Optional[float]]] = []

    def fit(self, X, y=None):
        """Learn per-column clipping bounds from X (2-D, numeric); y is ignored."""
        X_arr = self._to_array(X)
        self.bounds_ = [self._column_bounds(X_arr[:, i]) for i in range(X_arr.shape[1])]
        return self

    @staticmethod
    def _column_bounds(col):
        """Return the (low, high) clipping pair for a single 1-D float column."""
        col_nonan = col[~np.isnan(col)]
        if col_nonan.size == 0:
            return (None, None)
        skewness = pd.Series(col_nonan).skew()
        # NaN skew (fewer than 3 values) fails this comparison and uses the IQR branch.
        if abs(skewness) < 1:
            m, s = float(np.mean(col_nonan)), float(np.std(col_nonan, ddof=0))
            if s == 0 or np.isnan(s):
                return (m, m)
            return (m - 3*s, m + 3*s)
        Q1, Q3 = np.percentile(col_nonan, 25), np.percentile(col_nonan, 75)
        IQR = Q3 - Q1
        if IQR == 0:
            # Zero IQR does not mean a constant column: keep the variation and
            # cap only extreme values via the Z-score rule when there is spread.
            s = float(np.std(col_nonan, ddof=0))
            if s > 0:
                m = float(np.mean(col_nonan))
                return (m - 3*s, m + 3*s)
            return (Q1, Q3)
        return (Q1 - 1.5*IQR, Q3 + 1.5*IQR)

    def transform(self, X):
        """Return a float copy of X with every column clipped to its learned bounds."""
        X_arr = self._to_array(X)
        X_capped = X_arr.copy()
        for i, (low, high) in enumerate(self.bounds_):
            if low is not None and high is not None:
                X_capped[:, i] = np.clip(X_arr[:, i], low, high)
        return X_capped

    @staticmethod
    def _to_array(X):
        """Convert a DataFrame or array-like to a float ndarray."""
        if isinstance(X, pd.DataFrame):
            return X.values.astype(float)
        return np.asarray(X, dtype=float)


def build_preprocessor(numeric_cols: List[str], cat_cols: List[str]) -> ColumnTransformer:
    """Assemble the ColumnTransformer applied before the classifier.

    Numeric columns: median imputation -> OutlierCapper -> RobustScaler.
    Categorical columns: most-frequent imputation -> one-hot encoding.
    Columns in neither list are dropped.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('outlier', OutlierCapper()),
        ('scaler', RobustScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', make_ohe()),
    ]
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numeric_cols),
            ('cat', Pipeline(steps=categorical_steps), cat_cols),
        ],
        remainder='drop',
    )


def build_pipeline(preprocessor: ColumnTransformer) -> Pipeline:
    """Chain ``preprocessor`` ('prep') with a seeded LogisticRegression ('clf')."""
    return Pipeline(steps=[
        ('prep', preprocessor),
        ('clf', LogisticRegression(random_state=RANDOM_STATE)),
    ])


def get_param_grid(smoke: bool = False) -> List[Dict[str, Any]]:
    """Return the GridSearchCV parameter grid for the 'clf' pipeline step.

    Args:
        smoke: When True, return a single tiny lbfgs/L2 grid for quick runs.

    Returns:
        A list of grid dictionaries, each holding only solver/penalty
        combinations that belong together. ``clf__l1_ratio`` appears only with
        ``penalty='elasticnet'``: LogisticRegression ignores it for 'l1'/'l2',
        so crossing it with those penalties only produced duplicate candidates
        (and a warning per fit). l1_ratio 0.0 and 1.0 are omitted because they
        are equivalent to the plain 'l2' and 'l1' saga candidates listed
        separately.
    """
    if smoke:
        return [
            {
                'clf__solver': ['lbfgs'],
                'clf__penalty': ['l2'],
                'clf__C': [1.0],
                'clf__class_weight': [None, 'balanced'],
                'clf__max_iter': [200]
            }
        ]
    return [
        {
            'clf__solver': ['liblinear'],
            'clf__penalty': ['l1', 'l2'],
            'clf__C': [0.1, 1.0, 10.0],
            'clf__class_weight': [None, 'balanced'],
            'clf__max_iter': [200]
        },
        {
            'clf__solver': ['lbfgs'],
            'clf__penalty': ['l2'],
            'clf__C': [0.1, 1.0, 10.0],
            'clf__class_weight': [None, 'balanced'],
            'clf__max_iter': [200]
        },
        {
            # saga with a pure penalty: no l1_ratio, it would be ignored.
            'clf__solver': ['saga'],
            'clf__penalty': ['l1', 'l2'],
            'clf__C': [0.1, 1.0, 10.0],
            'clf__class_weight': [None, 'balanced'],
            'clf__max_iter': [500]
        },
        {
            # saga with a genuine L1/L2 mix.
            'clf__solver': ['saga'],
            'clf__penalty': ['elasticnet'],
            'clf__l1_ratio': [0.5],
            'clf__C': [0.1, 1.0, 10.0],
            'clf__class_weight': [None, 'balanced'],
            'clf__max_iter': [500]
        }
    ]


def evaluate(y_true, y_pred, y_proba=None) -> Dict[str, Any]:
    """Compute test metrics for hard predictions and, optionally, scores.

    Always returns 'f1', 'precision' and 'recall' (zero_division=0). When
    ``y_proba`` is given, 'roc_auc' and 'pr_auc' are added; either is set to
    None if its scorer raises.
    """
    metrics: Dict[str, Any] = {}
    label_scorers = (
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for key, scorer in label_scorers:
        metrics[key] = scorer(y_true, y_pred, zero_division=0)

    if y_proba is None:
        return metrics

    score_scorers = (
        ('roc_auc', roc_auc_score),
        ('pr_auc', average_precision_score),
    )
    for key, scorer in score_scorers:
        try:
            metrics[key] = scorer(y_true, y_proba)
        except Exception:
            metrics[key] = None
    return metrics


def apply_feature_tags_and_filter(df: pd.DataFrame,
                                  target: str,
                                  fc_path: Optional[str]) -> Tuple[pd.DataFrame, pd.DataFrame, List[str], List[str]]:
    """
    Tag every non-target column and keep only the "given"/"derived" ones.

    Returns: (df_filtered, tag_df, allowed_features, protected_excluded)
    - df_filtered: allowed feature columns followed by the target column
    - tag_df columns: feature, tag, raw_tag, used_in_training (bool)
    """
    feature_cols = [col for col in df.columns if col != target]
    tag_df = tag_features_from_classification(feature_cols, read_feature_classification(fc_path))

    usable = tag_df['tag'].isin(['given', 'derived'])
    allowed = tag_df.loc[usable, 'feature'].tolist()
    blocked = tag_df['tag'] == 'protected'
    protected_excluded = tag_df.loc[blocked, 'feature'].tolist()

    tag_df['used_in_training'] = tag_df['feature'].isin(allowed)
    # Allowed features first, target last.
    df_filtered = pd.concat([df[allowed], df[target]], axis=1)
    return df_filtered, tag_df, allowed, protected_excluded


def run_training(df: pd.DataFrame,
                 target: str,
                 cv_folds: int = 5,
                 test_size: float = 0.2,
                 random_state: int = RANDOM_STATE,
                 metadata: Optional[str] = None,
                 smoke: bool = False,
                 feature_classification: Optional[str] = FEATURE_CLASSIFICATION,
                 out_dir: str = OUT_DIR) -> Tuple[Dict[str, Any], Pipeline]:
    """Clean the target, drop protected features, grid-search a logistic model and score it.

    Args:
        df: Raw data including the target column.
        target: Name of the label column.
        cv_folds: Requested number of stratified CV folds; the effective number
            is clamped to at least 2 and at most the minority-class count in
            the training split.
        test_size: Hold-out fraction passed to train_test_split.
        random_state: Seed for the split and the CV shuffling.
        metadata: Accepted but not referenced anywhere in this function.
        smoke: Selects the small parameter grid from get_param_grid().
        feature_classification: Path to the feature tag CSV (or None).
        out_dir: Directory where feature_tags.csv is written.

    Returns:
        (artifacts, best): a dict summarising the run (features used, best
        params, CV score, test metrics) and the best estimator found by the
        grid search.

    Raises:
        AssertionError: if ``target`` is not a column of ``df``.
        ValueError: if fewer than two target classes remain after cleaning.
    """
    assert target in df.columns, f"Target column '{target}' not found in data."

    # Drop any clearly ID-like columns if present (non-informative)
    drop_like = [c for c in df.columns if c.lower() in {'loan_id', 'id', 'application_id'}]
    if drop_like:
        df = df.drop(columns=drop_like)

    # --- Robust target handling to avoid IntCastingNaNError ---
    y_raw = df[target]
    mapping = {
        'y': 1, 'yes': 1, 'true': 1, 't': 1, 'approved': 1, 'approve': 1, '1': 1,
        'n': 0, 'no': 0, 'false': 0, 'f': 0, 'rejected': 0, 'reject': 0, '0': 0
    }
    y_mapped = None
    if not pd.api.types.is_numeric_dtype(y_raw):
        y_lower = y_raw.astype(str).str.strip().str.lower()
        y_mapped = y_lower.map(mapping)
        # If more than 10% of the labels are not in the mapping, discard the
        # mapped result and fall back to numeric coercion below.
        if y_mapped.isna().mean() > 0.1:
            y_mapped = None

    if y_mapped is not None:
        y_series = y_mapped
    else:
        # Non-numeric values become NaN here and are dropped in the next step.
        y_num = pd.to_numeric(y_raw, errors='coerce')
        y_series = y_num

    # Drop rows with missing target
    mask_valid = y_series.notna()
    if mask_valid.sum() < len(y_series):
        print(f"[warn] Dropping {len(y_series) - mask_valid.sum()} rows with missing target '{target}'.")
    df = df.loc[mask_valid].copy()
    y_series = y_series.loc[mask_valid]

    # If series is float but actually binary like {0.0,1.0}, cast to int
    uniques = pd.Series(y_series.dropna().unique())
    if set(uniques.tolist()).issubset({0, 1, 0.0, 1.0}):
        y = y_series.astype(int)
    else:
        # Non-binary labels: non-integer values are replaced by category codes,
        # integer labels are used unchanged.
        if not pd.api.types.is_integer_dtype(y_series):
            y = y_series.astype('category').cat.codes
        else:
            y = y_series

    n_classes = pd.Series(y).nunique()
    if n_classes < 2:
        raise ValueError(f"Target '{target}' has only one class after cleaning; cannot train.")

    # --- Apply feature tags & filter protected ---
    df_tagged, tag_df, allowed_features, protected_excluded = apply_feature_tags_and_filter(
        df=df, target=target, fc_path=feature_classification
    )

    # Train/test split
    X = df_tagged.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Infer features from TRAIN ONLY to avoid leakage and to ensure presence
    # (target="" because X_train no longer contains the target column)
    num_cols, cat_cols = infer_feature_types(X_train, target="")

    if not num_cols and not cat_cols:
        num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = [c for c in X_train.columns if c not in num_cols]

    preprocessor = build_preprocessor(num_cols, cat_cols)
    pipe = build_pipeline(preprocessor)
    param_grid = get_param_grid(smoke=smoke)

    # Safe StratifiedKFold
    # At most one fold per minority-class sample, and never fewer than 2 folds.
    min_class_count = int(pd.Series(y_train).value_counts().min())
    n_splits = max(2, min(int(cv_folds), min_class_count))
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    print("[info] Starting GridSearchCV ...")
    # error_score='raise': a failing candidate aborts the search instead of
    # being scored as NaN.
    grid = GridSearchCV(
        pipe,
        param_grid=param_grid,
        cv=cv,
        scoring='f1',
        n_jobs=-1,
        verbose=1,
        error_score='raise'
    )

    grid.fit(X_train, y_train)

    best = grid.best_estimator_
    y_pred = best.predict(X_test)
    # Probability of class 1 on the test set, when the estimator can provide it.
    y_proba = None
    if hasattr(best, "predict_proba"):
        try:
            y_proba = best.predict_proba(X_test)[:, 1]
        except Exception:
            y_proba = None

    metrics = evaluate(y_test, y_pred, y_proba=y_proba)
    print("\n[report] Classification report on test:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Save feature tags alongside artifacts
    out_dir_path = Path(out_dir)
    out_dir_path.mkdir(parents=True, exist_ok=True)
    tag_path = out_dir_path / "feature_tags.csv"
    tag_df.to_csv(tag_path, index=False)

    # Summary of the run; main() dumps this dict to JSON.
    artifacts = {
        'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
        'target': target,
        'numeric_features': num_cols,
        'categorical_features': cat_cols,
        'allowed_features': allowed_features,
        'protected_features_excluded': protected_excluded,
        'feature_tags_csv': str(tag_path),
        'best_params': grid.best_params_,
        'best_score_mean_cv_f1': float(grid.best_score_),
        'test_metrics': metrics
    }
    return artifacts, best


def main(data_path: str = DEFAULT_DATA,
         target: str = DEFAULT_TARGET,
         metadata: Optional[str] = None,
         test_size: float = TEST_SIZE,
         cv_folds: int = CV_FOLDS,
         random_state: int = RANDOM_STATE,
         smoke: bool = SMOKE,
         sample_nrows: Optional[int] = SAMPLE_NROWS,
         out_dir: str = OUT_DIR,
         feature_classification: Optional[str] = FEATURE_CLASSIFICATION):
    """
    Notebook-friendly entry point: load the CSV, train, and persist the results.

    All settings are plain keyword parameters. Writes run_artifacts.json and
    best_model.pkl into ``out_dir`` and returns (artifacts: dict, model: estimator).
    """
    csv_file = Path(data_path)
    assert csv_file.exists(), f"Data file not found: {csv_file}"
    print(f"[info] Loading data: {csv_file}")

    # An explicit sample size wins; smoke mode alone reads a small fixed sample.
    if sample_nrows is not None:
        row_limit = int(sample_nrows)
    elif smoke:
        row_limit = 2000  # small sample for quicker run
    else:
        row_limit = None
    read_kwargs: Dict[str, Any] = {} if row_limit is None else {'nrows': row_limit}

    df = pd.read_csv(csv_file, **read_kwargs)

    # Basic cleaning: strip column names
    df.columns = [str(name).strip() for name in df.columns]

    # If target not found, try common fallbacks
    if target not in df.columns:
        fallback_names = {"credit_approved", "approved", "label", "target"}
        fallback = next((name for name in df.columns if name.lower() in fallback_names), None)
        if fallback is None:
            raise ValueError(f"Target column '{target}' not found and no fallback detected.")
        print(f"[warn] Target '{target}' not found. Using '{fallback}' instead.")
        target = fallback

    artifacts, model = run_training(
        df=df,
        target=target,
        cv_folds=cv_folds,
        test_size=test_size,
        random_state=random_state,
        metadata=metadata or None,
        smoke=smoke,
        feature_classification=feature_classification,
        out_dir=out_dir
    )

    # Save artifacts
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    artifacts_path = target_dir / "run_artifacts.json"
    model_path = target_dir / "best_model.pkl"

    with open(artifacts_path, "w") as json_file:
        json.dump(artifacts, json_file, indent=2)

    with open(model_path, "wb") as pickle_file:
        pickle.dump(model, pickle_file)

    # Print summary
    print("\n=== Run Summary ===")
    print(json.dumps(artifacts, indent=2))
    print(f"\nArtifacts saved to: {artifacts_path}")
    print(f"Model saved to: {model_path}")
    print(f"Feature tags saved to: {artifacts.get('feature_tags_csv')}")

    return artifacts, model


if __name__ == "__main__":
    # Forward the module-level configuration to main() explicitly.
    run_config = dict(
        data_path=DEFAULT_DATA,
        target=DEFAULT_TARGET,
        metadata=None,
        test_size=TEST_SIZE,
        cv_folds=CV_FOLDS,
        random_state=RANDOM_STATE,
        smoke=SMOKE,
        sample_nrows=SAMPLE_NROWS,
        out_dir=OUT_DIR,
        feature_classification=FEATURE_CLASSIFICATION,
    )
    main(**run_config)
    


## Code with threshold

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Loan Approval Model - L2 Balanced Fast Grid + Threshold Constraints

- Only L2 penalty
- class_weight='balanced' (fixed)
- Narrow, fast grid (lbfgs/saga × the C values listed in C_GRID)
- max_iter taken from MAX_ITER
- Threshold tuning to require precision >= MIN_PRECISION and recall >= MIN_RECALL (if achievable)
"""

import json, pickle, time
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Any

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (f1_score, precision_score, recall_score, roc_auc_score,
                             average_precision_score, classification_report, precision_recall_curve)

# ---------------- Config ----------------
DEFAULT_DATA = "Data/Loan_dataset_india_110000_final_update copy(in).csv"
DEFAULT_TARGET = "credit_approved"
FEATURE_CLASSIFICATION: Optional[str] = "data/feature_classification.csv"
OUT_DIR = "Data"

# speed/quality knobs
CV_FOLDS = 5  # requested stratified folds; main() clamps to [2, minority-class count]
# FAST_MODE True -> only lbfgs; False -> lbfgs & saga.
# Candidate count is len(solvers) * len(C_GRID): 3 or 6 with the C_GRID below.
FAST_MODE = False
# Iteration cap handed to LogisticRegression (both as constructor argument and grid value).
# NOTE(review): 50 is lower than the 200-500 used by the first cell's grid; confirm
# this is intended, since low caps can stop the solver before convergence.
MAX_ITER = 50
C_GRID = [10, 100, 1000]  # values searched for the 'clf__C' grid parameter

# threshold constraints
# choose_threshold() looks for a cut-off meeting both floors and otherwise
# falls back to the best-F1 threshold.
MIN_PRECISION = 0.60
MIN_RECALL = 0.60

RANDOM_STATE = 42
TEST_SIZE = 0.3
# NOTE: main() in this cell currently reads the full CSV; the code that used
# SMOKE / SAMPLE_NROWS is commented out there, so these two have no effect.
SMOKE = False
SAMPLE_NROWS: Optional[int] = None

# --------------- Utils ------------------
def make_ohe():
    """Build a dense OneHotEncoder that ignores unknowns and groups rare levels.

    Categories seen in fewer than 1% of training rows are collapsed via
    ``min_frequency=0.01``. Constructor spellings are tried from newest to
    oldest so that the rare-level grouping is kept whenever the installed
    scikit-learn supports it:
      1. ``sparse_output`` + ``min_frequency`` (current API),
      2. ``sparse`` + ``min_frequency`` (releases that have ``min_frequency``
         but not yet ``sparse_output``),
      3. ``sparse`` only (older releases without rare-level grouping).
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False, min_frequency=0.01)
    except TypeError:
        pass
    try:
        # Previously this fallback dropped min_frequency even when available.
        return OneHotEncoder(handle_unknown="ignore", sparse=False, min_frequency=0.01)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

def read_feature_classification(csv_path: Optional[str]) -> Optional[pd.DataFrame]:
    """Load the feature classification CSV and normalize it.

    Args:
        csv_path: Path to a CSV with a feature-name column and a classification
            column, matched case-insensitively by header ("feature",
            "classification"); a missing header falls back to the first /
            second column respectively.

    Returns:
        None when ``csv_path`` is empty or the file does not exist. Otherwise a
        DataFrame with ``Feature`` (whitespace-stripped names) and
        ``Classification`` (one of "protected", "derived", "given"; anything
        else becomes "given").

    Raises:
        Whatever pandas raises for an unreadable file, and IndexError when the
        classification column must be taken positionally from a one-column file.
    """
    if not csv_path:
        return None
    p = Path(csv_path)
    if not p.exists():
        return None
    df = pd.read_csv(p)
    columns = list(df.columns)
    colmap = {str(c).strip().lower(): c for c in columns}
    # Resolve fallbacks lazily so a file with proper headers never touches
    # positional indexing.
    feat = colmap["feature"] if "feature" in colmap else columns[0]
    cls = colmap["classification"] if "classification" in colmap else columns[1]
    df = df.rename(columns={feat: "Feature", cls: "Classification"})
    # Strip padding: main() strips the data column names, so an unstripped
    # " gender " here would never match and a protected feature would be
    # used for training.
    df["Feature"] = df["Feature"].astype(str).str.strip()
    df["Classification"] = (df["Classification"].astype(str).str.strip().str.lower()
                            .map({"protected": "protected", "derived": "derived", "given": "given"})
                            .fillna("given"))
    return df

def tag_features_from_classification(all_cols: List[str], fc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Return one row per column with its tag.

    Output columns: ``feature``, ``tag`` (given/derived/protected, defaulting
    to "given") and ``raw_tag`` (the looked-up value before validation).
    """
    if fc_df is None:
        lookup = {}
    else:
        names = fc_df["Feature"].astype(str)
        lookup = dict(zip(names, fc_df["Classification"]))

    accepted = frozenset(("given", "derived", "protected"))
    records = []
    for column in all_cols:
        raw = lookup.get(column, "given")
        if raw in accepted:
            resolved = raw
        else:
            resolved = "given"
        records.append({"feature": column, "tag": resolved, "raw_tag": raw})
    return pd.DataFrame(records)


def apply_feature_tags_and_filter(df: pd.DataFrame, target: str, fc_path: Optional[str]):
    """Tag the non-target columns and keep only given/derived features.

    Returns (filtered_df, tags, allowed, protected): the allowed feature
    columns plus the target, the tag table with a ``used_in_training`` flag,
    and the lists of allowed and protected feature names.
    """
    feature_cols = [name for name in df.columns if name != target]
    fc_df = read_feature_classification(fc_path)
    tags = tag_features_from_classification(feature_cols, fc_df)

    usable = tags["tag"].isin(["given","derived"])
    allowed = tags.loc[usable, "feature"].tolist()
    blocked = tags["tag"] == "protected"
    protected = tags.loc[blocked, "feature"].tolist()

    tags["used_in_training"] = tags["feature"].isin(allowed)
    filtered = pd.concat([df[allowed], df[target]], axis=1)
    return filtered, tags, allowed, protected

class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip each numeric column to bounds learned during fit.

    Per column (NaNs ignored while fitting):
      - abs(skew) < 1: mean ± 3*std; (mean, mean) when the std is zero.
      - otherwise: Q1 - 1.5*IQR .. Q3 + 1.5*IQR.
      - IQR == 0 in the skewed branch (e.g. a zero-inflated column): the IQR
        bounds would collapse to a single value and flatten the feature, so
        mean ± 3*std is used instead; only when the std is zero too are the
        bounds (Q1, Q3).
      - no non-NaN values: the column is left uncapped.
    """

    def __init__(self):
        # One (low, high) pair per column; (None, None) means "do not cap".
        self.bounds_ = []

    def fit(self, X, y=None):
        """Learn clipping bounds from X (2-D, numeric); y is ignored."""
        A = self._arr(X)
        self.bounds_ = [self._bounds(A[:, i]) for i in range(A.shape[1])]
        return self

    @staticmethod
    def _bounds(col):
        """Return the (low, high) pair for one 1-D float column."""
        v = col[~np.isnan(col)]
        if v.size == 0:
            return (None, None)
        skew = pd.Series(v).skew()
        # NaN skew fails the comparison and is handled by the IQR branch.
        if abs(skew) < 1:
            m, s = float(np.mean(v)), float(np.std(v, ddof=0))
            return (m, m) if (s == 0 or np.isnan(s)) else (m - 3*s, m + 3*s)
        q1, q3 = np.percentile(v, 25), np.percentile(v, 75)
        iqr = q3 - q1
        if iqr == 0:
            # Zero IQR with real spread: cap extremes without erasing the feature.
            s = float(np.std(v, ddof=0))
            if s > 0:
                m = float(np.mean(v))
                return (m - 3*s, m + 3*s)
            return (q1, q3)
        return (q1 - 1.5*iqr, q3 + 1.5*iqr)

    def transform(self, X):
        """Return a float copy of X with each column clipped to its bounds."""
        A = self._arr(X)
        B = A.copy()
        for i, (lo, hi) in enumerate(self.bounds_):
            if lo is not None and hi is not None:
                B[:, i] = np.clip(A[:, i], lo, hi)
        return B

    @staticmethod
    def _arr(X):
        """Convert a DataFrame or array-like to a float ndarray."""
        if isinstance(X, pd.DataFrame):
            return X.values.astype(float)
        return np.asarray(X, dtype=float)

def infer_feature_types(df: pd.DataFrame, target: str):
    """Return (numeric, categorical) lists of the non-target column names.

    Numeric means an ``np.number`` dtype; all remaining non-target columns are
    reported as categorical. Column order is preserved.
    """
    feature_names = []
    for name in df.columns:
        if name != target:
            feature_names.append(name)
    numeric_names = df[feature_names].select_dtypes(include=[np.number]).columns.tolist()
    numeric_lookup = set(numeric_names)
    categorical_names = [name for name in feature_names if name not in numeric_lookup]
    return numeric_names, categorical_names

def build_preprocessor(num_cols, cat_cols):
    """Build the ColumnTransformer used ahead of the classifier.

    Numeric: median imputation -> OutlierCapper -> RobustScaler.
    Categorical: most-frequent imputation -> one-hot encoding.
    Columns in neither list are dropped.
    """
    numeric_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("cap", OutlierCapper()),
        ("scale", RobustScaler()),
    ])
    categorical_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", make_ohe()),
    ])
    branches = [
        ("num", numeric_branch, num_cols),
        ("cat", categorical_branch, cat_cols),
    ]
    return ColumnTransformer(branches, remainder="drop")

def build_pipeline(prep):
    """Chain ``prep`` ("prep") with an L2, class-balanced LogisticRegression ("clf")."""
    classifier_kwargs = {
        "penalty": "l2",
        "class_weight": "balanced",
        "max_iter": MAX_ITER,
        "random_state": RANDOM_STATE,
    }
    steps = [("prep", prep), ("clf", LogisticRegression(**classifier_kwargs))]
    return Pipeline(steps)

def get_param_grid():
    """Return the single-entry grid: solver(s) x C_GRID with L2 / balanced fixed.

    FAST_MODE restricts the solvers to lbfgs; otherwise lbfgs and saga are searched.
    """
    if FAST_MODE:
        solver_choices = ["lbfgs"]
    else:
        solver_choices = ["lbfgs","saga"]

    grid_entry = {}
    grid_entry["clf__solver"] = solver_choices
    grid_entry["clf__penalty"] = ["l2"]
    grid_entry["clf__C"] = C_GRID
    grid_entry["clf__class_weight"] = ["balanced"]
    grid_entry["clf__max_iter"] = [MAX_ITER]
    return [grid_entry]

def choose_threshold(y_true, y_proba, min_p, min_r):
    """Pick a decision threshold from the precision-recall curve.

    Among thresholds whose precision >= ``min_p`` and recall >= ``min_r`` the
    one with the highest F1 is returned with ``constraints_met=True``. If no
    threshold satisfies both floors, the overall best-F1 threshold is returned
    with ``constraints_met=False`` (0.5 when the curve has no thresholds).
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_proba)
    # The curve's final (precision, recall) point has no threshold attached.
    prec, rec = precisions[:-1], recalls[:-1]
    f1_scores = 2*prec*rec/(prec+rec+1e-12)

    feasible = np.flatnonzero((prec >= min_p) & (rec >= min_r))
    if feasible.size:
        pick = feasible[np.argmax(f1_scores[feasible])]
        return {"threshold": float(thresholds[pick]), "precision": float(prec[pick]),
                "recall": float(rec[pick]), "f1": float(f1_scores[pick]),
                "constraints_met": True}

    if f1_scores.size:
        pick = int(np.argmax(f1_scores))
        best_f1 = float(f1_scores[pick])
    else:
        pick, best_f1 = 0, 0.0
    fallback_threshold = float(thresholds[pick]) if len(thresholds)>0 else 0.5
    return {"threshold": fallback_threshold, "precision": float(precisions[pick]),
            "recall": float(recalls[pick]), "f1": best_f1, "constraints_met": False}

def evaluate(y_true, y_pred, y_proba=None):
    """Compute test metrics for hard predictions and, optionally, scores.

    Always returns "f1", "precision" and "recall" (zero_division=0). When
    ``y_proba`` is given, "roc_auc" and "pr_auc" are added; a scorer that
    raises yields None for its metric. Only ``Exception`` subclasses are
    absorbed, so KeyboardInterrupt / SystemExit still propagate (the previous
    bare ``except:`` swallowed them as well).
    """
    m = {"f1": f1_score(y_true, y_pred, zero_division=0),
         "precision": precision_score(y_true, y_pred, zero_division=0),
         "recall": recall_score(y_true, y_pred, zero_division=0)}
    if y_proba is not None:
        try:
            m["roc_auc"] = roc_auc_score(y_true, y_proba)
        except Exception:
            m["roc_auc"] = None
        try:
            m["pr_auc"] = average_precision_score(y_true, y_proba)
        except Exception:
            m["pr_auc"] = None
    return m

def main(data_path=DEFAULT_DATA, target=DEFAULT_TARGET, feature_classification=FEATURE_CLASSIFICATION):
    """Train the L2/balanced logistic model, tune the decision threshold, save artifacts.

    Steps: load the CSV, clean the target (rows whose label cannot be mapped
    to 0/1 are dropped with a warning), exclude protected features, grid-search
    with stratified CV (scoring="f1"), choose a threshold against
    MIN_PRECISION / MIN_RECALL on the test split, then write
    feature_tags_fast.csv, run_artifacts_fast.json and best_model_fast.pkl
    into OUT_DIR.

    Args:
        data_path: CSV file to load.
        target: Name of the label column.
        feature_classification: Path to the feature tag CSV (or None).

    Raises:
        AssertionError: if ``data_path`` does not exist.
        ValueError: if ``target`` is not a column of the data.
    """
    data_path = Path(data_path)
    assert data_path.exists(), f"Missing data: {data_path}"
    # low_memory=False parses each column in one pass. The default chunked
    # parser infers dtypes per chunk, which triggers pandas' mixed-dtype
    # warning on this file and can leave one column holding values of
    # different Python types.
    df = pd.read_csv(data_path, low_memory=False)
    print("Shape of DF",df.shape)
    df.columns = [str(c).strip() for c in df.columns]
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not in data.")

    # Target cleaning
    y_raw = df[target]
    if not pd.api.types.is_numeric_dtype(y_raw):
        mapping = {"y":1,"yes":1,"true":1,"t":1,"approved":1,"1":1,"n":0,"no":0,"false":0,"f":0,"rejected":0,"0":0}
        y_series = y_raw.astype(str).str.strip().str.lower().map(mapping)
    else:
        y_series = pd.to_numeric(y_raw, errors="coerce")
    mask = y_series.notna()
    if mask.sum()<len(y_series):
        print(f"[warn] dropped {len(y_series)-mask.sum()} rows with invalid target")
    df = df.loc[mask].copy()
    y = y_series.loc[mask].astype(int)
    print("Dataframe after cleaning:", df.shape)

    df_f, tags, allowed, protected = apply_feature_tags_and_filter(df, target, feature_classification)

    X = df_f.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                        random_state=RANDOM_STATE, stratify=y)
    # target="" because X_train no longer contains the target column.
    num, cat = infer_feature_types(X_train, target="")
    prep = build_preprocessor(num, cat)
    pipe = build_pipeline(prep)
    grid = get_param_grid()

    # At most one fold per minority-class sample, and never fewer than 2 folds.
    min_class = int(pd.Series(y_train).value_counts().min())
    n_splits = max(2, min(CV_FOLDS, min_class))
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    combos = len(grid[0]["clf__solver"]) * len(grid[0]["clf__C"])
    print(f"[info] Effective grid combos: {combos}  |  CV folds: {n_splits}  |  total fits: {combos*n_splits}")

    gs = GridSearchCV(pipe, param_grid=grid, cv=cv, scoring="f1", n_jobs=-1, verbose=1)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_

    y_proba = best.predict_proba(X_test)[:,1] if hasattr(best,"predict_proba") else None
    y_pred = best.predict(X_test)
    tinfo = None
    if y_proba is not None:
        # Replace the default 0.5 cut-off with the tuned threshold.
        tinfo = choose_threshold(y_test, y_proba, MIN_PRECISION, MIN_RECALL)
        y_pred = (y_proba >= tinfo["threshold"]).astype(int)

    mets = evaluate(y_test, y_pred, y_proba)
    print("\n[report] Post-threshold classification report")
    print(classification_report(y_test, y_pred, zero_division=0))

    out = {
        "best_params": gs.best_params_,
        "best_score_mean_cv_f1": float(gs.best_score_),
        "threshold_constraints": {"min_precision": MIN_PRECISION, "min_recall": MIN_RECALL, "tuning": tinfo},
        "test_metrics": mets,
        "protected_features_excluded": protected,
        "allowed_features_count": len(allowed),
        "solvers": grid[0]["clf__solver"],
        "C_grid": grid[0]["clf__C"],
        "max_iter": MAX_ITER,
        "cv_folds": n_splits,
        "fast_mode": FAST_MODE
    }

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8 so feature names survive regardless of the platform locale.
    (out_dir/"feature_tags_fast.csv").write_text(tags.to_csv(index=False), encoding="utf-8")
    with open(out_dir/"run_artifacts_fast.json", "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
    with open(out_dir/"best_model_fast.pkl", "wb") as f:
        pickle.dump(best, f)

    print("\n=== Summary ===")
    print(json.dumps(out, indent=2))
    if tinfo and not tinfo.get("constraints_met", False):
        print(f"\n[warn] Could not meet precision>={MIN_PRECISION:.2f} & recall>={MIN_RECALL:.2f} at any threshold. "
              f"Best: P={tinfo['precision']:.3f}, R={tinfo['recall']:.3f}.")

if __name__ == "__main__":
    # Script entry point: run main() with the module-level defaults.
    main()


  df = pd.read_csv(data_path)


Shape of DF (110000, 29)
[warn] dropped 85000 rows with invalid target
Dataframe after cleaning: (25000, 29)
[info] Effective grid combos: 6  |  CV folds: 5  |  total fits: 30
Fitting 5 folds for each of 6 candidates, totalling 30 fits

[report] Post-threshold classification report
              precision    recall  f1-score   support

           0       0.97      0.80      0.88      4509
           1       0.76      0.97      0.85      2991

    accuracy                           0.87      7500
   macro avg       0.87      0.88      0.86      7500
weighted avg       0.89      0.87      0.87      7500


=== Summary ===
{
  "best_params": {
    "clf__C": 1000,
    "clf__class_weight": "balanced",
    "clf__max_iter": 50,
    "clf__penalty": "l2",
    "clf__solver": "lbfgs"
  },
  "best_score_mean_cv_f1": 0.8508354167393648,
  "threshold_constraints": {
    "min_precision": 0.6,
    "min_recall": 0.6,
    "tuning": {
      "threshold": 0.42297931660877264,
      "precision": 0.7606635071

Exception ignored in: <function ResourceTracker.__del__ at 0x105fd5bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x104c95bc0>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/opt/anaconda3/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x1043d1bc0>
Traceback (most recent call last

## 4th Optimization

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Loan Approval Model — Precision-First (L2, balanced) — SPARSE & ROBUST
----------------------------------------------------------------------
- Precision-first CV scorer (maximize precision at recall floor)
- Logistic Regression (L2), class_weight='balanced', solver='saga' (sparse-aware)
- OneHotEncoder forced to sparse to avoid memory blow-ups
- Optional SelectKBest(mutual_info) kept; can be disabled via search
- RandomizedSearchCV over broad C/tol/fit_intercept + selector k
- Pipeline caching enabled
"""

import json, pickle, time, os, warnings
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Any

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import (f1_score, precision_score, recall_score, roc_auc_score,
                             average_precision_score, classification_report, precision_recall_curve)

# ---------------- USER CONFIG ----------------
# Input CSV, name of the binary target column, optional feature-classification
# CSV (columns: Feature, Classification) and the directory artifacts are written to.
# NOTE(review): FEATURE_CLASSIFICATION points at "data/" while DEFAULT_DATA and
# OUT_DIR use "Data/". On a case-sensitive filesystem these are different
# directories, and a missing classification file means no feature is tagged as
# protected — confirm the intended path.
DEFAULT_DATA = "Data/Loan_dataset_india_110000.csv"
DEFAULT_TARGET = "credit_approved"
FEATURE_CLASSIFICATION: Optional[str] = "data/feature_classification.csv"
OUT_DIR = "Data"

# Precision-first targets
# PRECISION_TARGET is the precision the tuned threshold should reach;
# RECALL_FLOOR is the minimum recall required by both the CV scorer and the
# threshold search.
PRECISION_TARGET = 0.60
RECALL_FLOOR    = 0.20

# Search/fit knobs
# CV_FOLDS is an upper bound on the number of stratified folds, N_ITER the number
# of sampled configurations, MAX_ITER the solver iteration cap, TEST_SIZE the
# held-out fraction and SAMPLE_NROWS an optional row cap when reading the CSV.
CV_FOLDS   = 5
N_ITER     = 36
MAX_ITER   = 5000
RANDOM_STATE = 42
TEST_SIZE  = 0.10
SAMPLE_NROWS: Optional[int] = None

# OHE rare-level collapse (kept modest; we rely on sparsity)
OHE_MIN_FREQ = None  # set to float like 0.05 to collapse rare categories

# -----------------------------------------------------

def make_ohe():
    """Create the categorical one-hot encoder, always producing sparse output.

    The ``sparse_output``/``min_frequency`` keywords are tried first; if the
    installed scikit-learn rejects them with ``TypeError`` the legacy ``sparse``
    keyword is used instead (without rare-level collapsing).
    """
    preferred_kwargs = {
        "handle_unknown": "ignore",
        "sparse_output": True,
        "min_frequency": OHE_MIN_FREQ,
    }
    try:
        encoder = OneHotEncoder(**preferred_kwargs)
    except TypeError:
        # Older releases only know the `sparse` keyword.
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)
    return encoder

def read_feature_classification(csv_path: Optional[str]) -> Optional[pd.DataFrame]:
    """Load and normalise the feature-classification table.

    The CSV is expected to have a feature-name column and a classification
    column (matched case-insensitively by the names "feature" and
    "classification", otherwise taken as the first and second column).

    Args:
        csv_path: Path to the classification CSV, or a falsy value to skip it.

    Returns:
        A DataFrame with a whitespace-stripped ``Feature`` column and a
        ``Classification`` column restricted to "protected", "derived" or
        "given" (anything else becomes "given"); ``None`` when no path is
        given or the file does not exist.

    Raises:
        ValueError: If the CSV has fewer than two columns.
    """
    if not csv_path:
        return None
    p = Path(csv_path)
    if not p.exists():
        # Without the table nothing can be tagged as protected, so make the
        # fallback visible instead of silently training on every column.
        print(f"[warn] feature classification file not found: {p}; no feature will be tagged as protected")
        return None

    df = pd.read_csv(p)
    if df.shape[1] < 2:
        raise ValueError(
            f"Feature classification file {p} needs at least two columns (Feature, Classification)."
        )

    colmap = {c.lower(): c for c in df.columns}
    feat = colmap.get("feature", df.columns[0])
    cls = colmap.get("classification", df.columns[1])
    df = df.rename(columns={feat: "Feature", cls: "Classification"})

    # Data column names are stripped by the caller; strip here as well so that
    # a padded entry such as " gender " still matches and stays excluded.
    df["Feature"] = df["Feature"].astype(str).str.strip()
    normalised = df["Classification"].astype(str).str.strip().str.lower()
    known = {"protected": "protected", "derived": "derived", "given": "given"}
    df["Classification"] = normalised.map(known).fillna("given")
    return df

def tag_features_from_classification(all_cols: List[str], fc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Build one row per column with its tag from the classification table.

    Columns missing from ``fc_df`` (or every column when ``fc_df`` is None) are
    tagged "given". The returned frame has the columns ``feature``, ``tag``
    (always one of given/derived/protected) and ``raw_tag`` (the looked-up value).
    """
    valid_tags = {"given", "derived", "protected"}
    if fc_df is None:
        lookup = {}
    else:
        lookup = dict(zip(fc_df["Feature"].astype(str), fc_df["Classification"]))

    def _record(col):
        raw_tag = lookup.get(col, "given")
        # Anything outside the three known tags is treated as "given".
        clean_tag = raw_tag if raw_tag in valid_tags else "given"
        return {"feature": col, "tag": clean_tag, "raw_tag": raw_tag}

    return pd.DataFrame([_record(col) for col in all_cols])

def apply_feature_tags_and_filter(df: pd.DataFrame, target: str, fc_path: Optional[str]):
    """Tag every non-target column and keep only the given/derived ones.

    Returns a tuple ``(out, tags, allowed, protected)``: the frame restricted to
    the allowed features plus the target, the tag table (with an added
    ``used_in_training`` flag), and the lists of allowed and protected names.
    """
    feature_cols = [name for name in df.columns if name != target]
    fc_table = read_feature_classification(fc_path)
    tags = tag_features_from_classification(feature_cols, fc_table)

    usable_mask = tags["tag"].isin(["given", "derived"])
    protected_mask = tags["tag"] == "protected"
    allowed = tags.loc[usable_mask, "feature"].tolist()
    protected = tags.loc[protected_mask, "feature"].tolist()

    tags["used_in_training"] = tags["feature"].isin(allowed)
    out = pd.concat([df[allowed], df[target]], axis=1)
    return out, tags, allowed, protected

class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip each numeric column to bounds learned during ``fit``.

    Columns whose sample skewness is below 1 in absolute value are capped at
    mean ± 3 standard deviations. All other columns use Tukey fences
    (Q1 - 1.5*IQR, Q3 + 1.5*IQR). NaN values are ignored when learning the
    bounds and are left as NaN by ``transform``.
    """

    def __init__(self):
        # One (low, high) pair per column; (None, None) means "do not clip".
        self.bounds_ = []

    def fit(self, X, y=None):
        """Learn per-column clipping bounds from X; ``y`` is ignored."""
        A = self._arr(X)
        self.bounds_ = []
        for col in A.T:
            v = col[~np.isnan(col)]
            if v.size == 0:
                # Column is entirely NaN: nothing to learn.
                self.bounds_.append((None, None))
                continue
            skew = pd.Series(v).skew()
            if abs(skew) < 1:
                m, s = float(np.mean(v)), float(np.std(v, ddof=0))
                low, high = (m, m) if (s == 0 or np.isnan(s)) else (m - 3 * s, m + 3 * s)
            else:
                q1, q3 = np.percentile(v, 25), np.percentile(v, 75)
                iqr = q3 - q1
                if iqr == 0:
                    # A zero IQR does not mean the column is constant (e.g. a
                    # mostly-zero count). Clipping to [q1, q3] would flatten
                    # the whole column to one value, so keep the observed
                    # training range instead.
                    low, high = float(np.min(v)), float(np.max(v))
                else:
                    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            self.bounds_.append((low, high))
        return self

    def transform(self, X):
        """Return a float copy of X with every column clipped to its learned bounds."""
        A = self._arr(X)
        B = A.copy()
        for i, (lo, hi) in enumerate(self.bounds_):
            if lo is not None and hi is not None:
                B[:, i] = np.clip(A[:, i], lo, hi)
        return B

    @staticmethod
    def _arr(X):
        """Convert a DataFrame or array-like to a float ndarray."""
        return X.values.astype(float) if isinstance(X, pd.DataFrame) else np.asarray(X, dtype=float)

def infer_feature_types(df: pd.DataFrame, target: str):
    """Split the non-target columns of ``df`` into numeric and non-numeric names.

    Returns ``(num, cat)``: numeric-dtype column names, then all remaining
    columns, both in their original order.
    """
    candidates = [name for name in df.columns if name != target]
    numeric_frame = df[candidates].select_dtypes(include=[np.number])
    num = numeric_frame.columns.tolist()
    numeric_names = set(num)
    cat = [name for name in candidates if name not in numeric_names]
    return num, cat

def build_preprocessor(num_cols, cat_cols):
    """Assemble the ColumnTransformer for numeric and categorical columns.

    Numeric columns: median imputation -> outlier capping -> robust scaling.
    Categorical columns: most-frequent imputation -> one-hot encoding.
    Columns not listed in either group are dropped.
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("cap", OutlierCapper()),
        ("scale", RobustScaler()),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", make_ohe()),
    ]
    transformers = [
        ("num", Pipeline(numeric_steps), num_cols),
        ("cat", Pipeline(categorical_steps), cat_cols),
    ]
    return ColumnTransformer(transformers, remainder="drop")

def build_pipeline(prep, cache_dir="cache_precision_sparse"):
    """Build the cached preprocessing -> feature selection -> classifier pipeline.

    Args:
        prep: Preprocessing transformer used as the first pipeline step.
        cache_dir: Directory used for the pipeline's transformer cache; it is
            created if it does not exist.

    Returns:
        A ``Pipeline`` with the steps ``prep``, ``select`` (SelectKBest on
        mutual information, ``k="all"`` by default) and ``clf`` (L2 logistic
        regression, balanced class weights, saga solver).
    """
    from functools import partial

    os.makedirs(cache_dir, exist_ok=True)
    clf = LogisticRegression(penalty="l2",
                             class_weight="balanced",
                             solver="saga",            # ensure sparse support
                             max_iter=MAX_ITER,
                             random_state=RANDOM_STATE)
    # mutual_info_classif adds random noise to continuous features; pin its
    # seed so that feature scores (and therefore the selected columns) are
    # reproducible like the rest of the run.
    score_func = partial(mutual_info_classif, random_state=RANDOM_STATE)
    selector = SelectKBest(score_func=score_func, k="all")
    return Pipeline([("prep", prep), ("select", selector), ("clf", clf)], memory=cache_dir)

def choose_threshold_precision_first(y_true, y_proba, precision_target=0.60, recall_floor=0.20):
    """Pick a decision threshold from the precision-recall curve.

    Selection order:
      1. thresholds with precision >= target and recall >= floor: take the one
         with the highest recall (ties broken by F1);
      2. otherwise, thresholds with recall >= floor: take the highest precision;
      3. otherwise, the threshold with the best F1.

    Returns a dict with ``threshold``, ``precision``, ``recall``, ``f1``,
    ``constraints_met`` and ``criterion``.
    """
    def _f1(prec, rec):
        return 2*prec*rec/(prec+rec+1e-12)

    p, r, thr = precision_recall_curve(y_true, y_proba)
    # (threshold, precision, recall) triples; the final PR point has no threshold.
    points = list(zip(thr, p[:-1], r[:-1]))

    feasible = [
        (ti, pi, ri, _f1(pi, ri))
        for ti, pi, ri in points
        if (pi >= precision_target) and (ri >= recall_floor)
    ]
    if feasible:
        ti, pi, ri, f1 = max(feasible, key=lambda row: (row[2], row[3]))
        return {"threshold": float(ti), "precision": float(pi), "recall": float(ri),
                "f1": float(f1), "constraints_met": True, "criterion": "precision>=target & recall>=floor (max recall)"}

    above_floor = [row for row in points if row[2] >= recall_floor]
    if above_floor:
        ti, pi, ri = max(above_floor, key=lambda row: row[1])
        f1 = _f1(pi, ri)
        return {"threshold": float(ti), "precision": float(pi), "recall": float(ri),
                "f1": float(f1), "constraints_met": False, "criterion": "max precision @ recall>=floor"}

    f1s = [_f1(pi, ri) for _, pi, ri in points]
    bi = int(np.argmax(f1s)) if f1s else 0
    bt = float(thr[bi]) if len(thr) > 0 else 0.5
    best_f1 = float(f1s[bi]) if f1s else 0.0
    return {"threshold": bt, "precision": float(p[bi]), "recall": float(r[bi]),
            "f1": best_f1, "constraints_met": False, "criterion": "best F1"}

def precision_at_recall_floor_cv(estimator, X, y):
    """CV scorer: best precision on the PR curve among points with recall >= RECALL_FLOOR.

    Estimators without ``predict_proba`` are scored by plain precision of
    their hard predictions. If no curve point reaches the recall floor, the
    precision at threshold 0.5 is returned scaled by 0.25.
    """
    if hasattr(estimator, "predict_proba"):
        scores = estimator.predict_proba(X)[:, 1]
        prec, rec, _ = precision_recall_curve(y, scores)
        eligible = [pv for pv, rv in zip(prec[:-1], rec[:-1]) if rv >= RECALL_FLOOR]
        if eligible:
            return float(np.max(eligible))
        # Floor never reached: penalised precision of the default 0.5 cut-off.
        hard_labels = (scores >= 0.5).astype(int)
        return precision_score(y, hard_labels, zero_division=0) * 0.25
    return precision_score(y, estimator.predict(X), zero_division=0)

def evaluate(y_true, y_pred, y_proba=None):
    """Compute F1, precision and recall, plus ROC-AUC / PR-AUC when scores are given.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Hard predictions.
        y_proba: Optional positive-class scores.

    Returns:
        Dict with ``f1``, ``precision``, ``recall`` and, when ``y_proba`` is
        provided, ``roc_auc`` and ``pr_auc`` (``None`` if the metric cannot be
        computed for the given labels).
    """
    m = {"f1": f1_score(y_true, y_pred, zero_division=0),
         "precision": precision_score(y_true, y_pred, zero_division=0),
         "recall": recall_score(y_true, y_pred, zero_division=0)}
    if y_proba is not None:
        # Catch only ValueError (undefined metric / bad input) so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        try:
            m["roc_auc"] = roc_auc_score(y_true, y_proba)
        except ValueError:
            m["roc_auc"] = None
        try:
            m["pr_auc"] = average_precision_score(y_true, y_proba)
        except ValueError:
            m["pr_auc"] = None
    return m

def main(data_path=DEFAULT_DATA, target=DEFAULT_TARGET, feature_classification=FEATURE_CLASSIFICATION):
    """Run the precision-first sparse training workflow end to end.

    Steps: load the CSV, coerce the target to 0/1, drop protected features,
    split train/test, run ``RandomizedSearchCV`` over the pipeline, tune the
    decision threshold on out-of-fold training predictions, evaluate on the
    held-out split and write the feature tags (CSV), the best model (pickle)
    and the run artifacts (JSON) to ``OUT_DIR``.

    Args:
        data_path: CSV file containing the features and the target column.
        target: Name of the binary target column.
        feature_classification: Optional path to the feature-classification CSV.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If ``target`` is not a column of the data.
    """
    # Only needed for threshold tuning below; imported locally so the block
    # does not depend on the file-level import list.
    from sklearn.model_selection import cross_val_predict

    data_path = Path(data_path)
    if not data_path.exists():
        # Explicit exception instead of `assert`, which is stripped under `python -O`.
        raise FileNotFoundError(f"Missing data: {data_path}")
    df = pd.read_csv(data_path, nrows=SAMPLE_NROWS)  # nrows=None reads the whole file
    df.columns = [str(c).strip() for c in df.columns]
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not in data.")
    print("[info] shape:", df.shape)

    # Target cleaning: map textual labels to 0/1, coerce numeric ones, drop invalid rows.
    y_raw = df[target]
    if not pd.api.types.is_numeric_dtype(y_raw):
        mapping = {"y":1,"yes":1,"true":1,"t":1,"approved":1,"1":1,"n":0,"no":0,"false":0,"f":0,"rejected":0,"0":0}
        y_series = y_raw.astype(str).str.strip().str.lower().map(mapping)
    else:
        y_series = pd.to_numeric(y_raw, errors="coerce")
    mask = y_series.notna()
    if mask.sum() < len(y_series):
        print(f"[warn] dropped {len(y_series)-mask.sum()} rows with invalid target")
    df = df.loc[mask].copy()
    y = y_series.loc[mask].astype(int)

    # Exclude protected features before any modelling.
    df_f, tags, allowed, protected = apply_feature_tags_and_filter(df, target, feature_classification)
    X = df_f.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                        random_state=RANDOM_STATE, stratify=y)

    num, cat = infer_feature_types(X_train, target="")
    prep = build_preprocessor(num, cat)
    pipe = build_pipeline(prep, cache_dir="cache_precision_sparse")

    # Randomized search space (sparse-friendly)
    C_grid = np.logspace(-3, 2, num=40).tolist()
    tol_grid = [1e-3, 1e-4, 1e-5]
    fit_intercept_grid = [True, False]

    # NOTE(review): integer k values above the encoded feature count may fail
    # or only warn depending on the scikit-learn version — confirm.
    param_dist = {
        "clf__C": C_grid,
        "clf__tol": tol_grid,
        "clf__fit_intercept": fit_intercept_grid,
        "select__k": ["all", 200, 400, 800]  # allow disabling by "all"
    }

    # Never ask for more folds than the minority class has samples.
    min_class = int(pd.Series(y_train).value_counts().min())
    n_splits = max(2, min(CV_FOLDS, min_class))
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    print(f"[info] RandomizedSearchCV: n_iter={N_ITER}, cv={n_splits}, scorer=precision@recall>={RECALL_FLOOR}. (saga + sparse OHE)")
    rs = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=N_ITER,
        scoring=precision_at_recall_floor_cv,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=RANDOM_STATE
    )

    t0 = time.perf_counter()
    rs.fit(X_train, y_train)
    print(f"[timing] randomized search fit: {time.perf_counter() - t0:.1f}s")

    best = rs.best_estimator_
    y_proba = best.predict_proba(X_test)[:,1] if hasattr(best,"predict_proba") else None
    y_pred = best.predict(X_test)

    tinfo = None
    if y_proba is not None:
        # Tune the threshold on out-of-fold *training* predictions. Choosing
        # it on y_test and then scoring the same y_test would leak the test
        # labels into the model and inflate the reported test metrics.
        oof_proba = cross_val_predict(best, X_train, y_train, cv=cv,
                                      method="predict_proba", n_jobs=-1)[:, 1]
        tinfo = choose_threshold_precision_first(
            y_train, oof_proba,
            precision_target=PRECISION_TARGET,
            recall_floor=RECALL_FLOOR
        )
        tinfo["tuned_on"] = "train_out_of_fold"
        y_pred = (y_proba >= tinfo["threshold"]).astype(int)

    mets = evaluate(y_test, y_pred, y_proba)

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir/"feature_tags_precision_sparse.csv").write_text(tags.to_csv(index=False))
    with open(out_dir/"best_model_precision_sparse.pkl","wb") as f:
        pickle.dump(best, f)

    artifacts = {
        "best_params": rs.best_params_,
        "best_cv_score_precision_at_recall_floor": float(rs.best_score_),
        "precision_target": PRECISION_TARGET,
        "recall_floor": RECALL_FLOOR,
        "threshold_choice": tinfo,
        "test_metrics": mets,
        "protected_features_excluded": protected,
        "allowed_features_count": len(allowed),
        "max_iter": MAX_ITER,
        "cv_folds": n_splits,
        "n_iter": N_ITER,
        "ohe_min_frequency": OHE_MIN_FREQ,
        "solver": "saga",
        "sparse_ohe": True
    }
    with open(out_dir/"run_artifacts_precision_sparse.json","w") as f:
        json.dump(artifacts, f, indent=2)

    print("\n=== Summary (Precision-First SPARSE) ===")
    print(json.dumps(artifacts, indent=2))
    if tinfo and not tinfo.get("constraints_met", False):
        print(f"[warn] Could not meet precision>={PRECISION_TARGET:.2f} with recall>={RECALL_FLOOR:.2f}. "
              f"Used best available: P={tinfo['precision']:.3f}, R={tinfo['recall']:.3f} ({tinfo['criterion']}).")

# Entry point: run the workflow with the configured defaults.
if __name__ == "__main__":
    main()


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Loan Approval Model — Precision-First (L2, balanced)
----------------------------------------------------
- L2 penalty only
- class_weight='balanced' (fixed)
- Multi-metric CV scoring: precision, recall, f1, PR-AUC, ROC-AUC
- Refit rule: maximize mean CV precision subject to mean CV recall >= MIN_RECALL (precision-first with recall floor)
- Post-fit threshold tuning: choose threshold that maximizes precision with recall >= MIN_RECALL (if possible)
- Expanded solver set (lbfgs, liblinear, newton-cg, saga) + finer C grid to favor higher precision models
"""

import json, pickle, time
from pathlib import Path
from typing import List, Tuple, Optional, Dict, Any

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    f1_score, precision_score, recall_score, roc_auc_score,
    average_precision_score, classification_report, precision_recall_curve,
    make_scorer
)

# ---------------- Config ----------------
# Input CSV, name of the binary target column, optional feature-classification
# CSV (columns: Feature, Classification) and the directory artifacts are written to.
# NOTE(review): FEATURE_CLASSIFICATION points at "data/" while DEFAULT_DATA and
# OUT_DIR use "Data/". On a case-sensitive filesystem these are different
# directories, and a missing classification file means no feature is tagged as
# protected — confirm the intended path.
DEFAULT_DATA = "Data/Loan_dataset_india_110000_updated.csv"
DEFAULT_TARGET = "credit_approved"
FEATURE_CLASSIFICATION: Optional[str] = "data/feature_classification.csv"
OUT_DIR = "Data"

# speed/quality knobs
# CV_FOLDS is an upper bound on the number of stratified folds; MAX_ITER is the
# solver iteration cap passed to LogisticRegression and to the grid.
CV_FOLDS = 5
FAST_MODE = False  # if True -> only lbfgs; else lbfgs/liblinear/newton-cg/saga
MAX_ITER = 300

# Precision-leaning grid (smaller C often increases conservatism → fewer FPs)
C_GRID = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0]

# threshold constraints (precision-first with recall floor)
# MIN_RECALL is also the recall floor used by the CV refit rule.
MIN_PRECISION = 0.70
MIN_RECALL = 0.60

# Seed for the split, CV shuffling and the classifier; held-out fraction.
RANDOM_STATE = 42
TEST_SIZE = 0.3
SMOKE = False
SAMPLE_NROWS: Optional[int] = None

# --------------- Utils ------------------
def make_ohe():
    """Create the dense one-hot encoder for categorical columns.

    Tries the ``sparse_output``/``min_frequency`` keywords first and falls
    back to the legacy ``sparse`` keyword when the constructor raises
    ``TypeError`` (older scikit-learn).
    """
    preferred_kwargs = {
        "handle_unknown": "ignore",
        "sparse_output": False,
        "min_frequency": 0.01,
    }
    try:
        encoder = OneHotEncoder(**preferred_kwargs)
    except TypeError:
        # Backward compat for older scikit-learn
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
    return encoder

def read_feature_classification(csv_path: Optional[str]) -> Optional[pd.DataFrame]:
    """Load and normalise the feature-classification table.

    The CSV is expected to have a feature-name column and a classification
    column (matched case-insensitively by the names "feature" and
    "classification", otherwise taken as the first and second column).

    Args:
        csv_path: Path to the classification CSV, or a falsy value to skip it.

    Returns:
        A DataFrame with a whitespace-stripped ``Feature`` column and a
        ``Classification`` column restricted to "protected", "derived" or
        "given" (anything else becomes "given"); ``None`` when no path is
        given or the file does not exist.

    Raises:
        ValueError: If the CSV has fewer than two columns.
    """
    if not csv_path:
        return None
    p = Path(csv_path)
    if not p.exists():
        # Without the table nothing can be tagged as protected, so make the
        # fallback visible instead of silently training on every column.
        print(f"[warn] feature classification file not found: {p}; no feature will be tagged as protected")
        return None

    df = pd.read_csv(p)
    if df.shape[1] < 2:
        raise ValueError(
            f"Feature classification file {p} needs at least two columns (Feature, Classification)."
        )

    colmap = {c.lower(): c for c in df.columns}
    feat = colmap.get("feature", df.columns[0])
    cls = colmap.get("classification", df.columns[1])
    df = df.rename(columns={feat: "Feature", cls: "Classification"})

    # Data column names are stripped by the caller; strip here as well so that
    # a padded entry such as " gender " still matches and stays excluded.
    df["Feature"] = df["Feature"].astype(str).str.strip()
    normalised = df["Classification"].astype(str).str.strip().str.lower()
    known = {"protected": "protected", "derived": "derived", "given": "given"}
    df["Classification"] = normalised.map(known).fillna("given")
    return df

def tag_features_from_classification(all_cols: List[str], fc_df: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Look up the tag of every column in the classification table.

    Unlisted columns default to "given". The result has one row per column
    with ``feature``, ``tag`` (given/derived/protected) and ``raw_tag`` (the
    value found in the table, before validation).
    """
    valid_tags = {"given", "derived", "protected"}
    lookup = {}
    if fc_df is not None:
        lookup = dict(zip(fc_df["Feature"].astype(str), fc_df["Classification"]))

    records = []
    for col in all_cols:
        raw_tag = lookup.get(col, "given")
        if raw_tag in valid_tags:
            clean_tag = raw_tag
        else:
            clean_tag = "given"  # unknown labels fall back to "given"
        records.append({"feature": col, "tag": clean_tag, "raw_tag": raw_tag})
    return pd.DataFrame(records)

def apply_feature_tags_and_filter(df: pd.DataFrame, target: str, fc_path: Optional[str]):
    """Tag the non-target columns and drop everything tagged "protected".

    Returns ``(out, tags, allowed, protected)`` where ``out`` holds the allowed
    feature columns followed by the target, ``tags`` is the tag table with a
    ``used_in_training`` flag, and the last two items are name lists.
    """
    feature_cols = [name for name in df.columns if name != target]
    tags = tag_features_from_classification(
        feature_cols, read_feature_classification(fc_path)
    )

    is_usable = tags["tag"].isin(["given", "derived"])
    is_protected = tags["tag"] == "protected"
    allowed = tags.loc[is_usable, "feature"].tolist()
    protected = tags.loc[is_protected, "feature"].tolist()

    tags["used_in_training"] = tags["feature"].isin(allowed)
    out = pd.concat([df[allowed], df[target]], axis=1)
    return out, tags, allowed, protected

class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip each numeric column to bounds learned during ``fit``.

    Columns whose sample skewness is below 1 in absolute value are capped at
    mean ± 3 standard deviations. All other columns use Tukey fences
    (Q1 - 1.5*IQR, Q3 + 1.5*IQR). NaN values are ignored when learning the
    bounds and are left as NaN by ``transform``.
    """

    def __init__(self):
        # One (low, high) pair per column; (None, None) means "do not clip".
        self.bounds_ = []

    def fit(self, X, y=None):
        """Learn per-column clipping bounds from X; ``y`` is ignored."""
        A = self._arr(X)
        self.bounds_ = []
        for col in A.T:
            v = col[~np.isnan(col)]
            if v.size == 0:
                # Column is entirely NaN: nothing to learn.
                self.bounds_.append((None, None))
                continue
            skew = pd.Series(v).skew()
            if abs(skew) < 1:
                m, s = float(np.mean(v)), float(np.std(v, ddof=0))
                low, high = (m, m) if (s == 0 or np.isnan(s)) else (m - 3 * s, m + 3 * s)
            else:
                q1, q3 = np.percentile(v, 25), np.percentile(v, 75)
                iqr = q3 - q1
                if iqr == 0:
                    # A zero IQR does not mean the column is constant (e.g. a
                    # mostly-zero count). Clipping to [q1, q3] would flatten
                    # the whole column to one value, so keep the observed
                    # training range instead.
                    low, high = float(np.min(v)), float(np.max(v))
                else:
                    low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
            self.bounds_.append((low, high))
        return self

    def transform(self, X):
        """Return a float copy of X with every column clipped to its learned bounds."""
        A = self._arr(X)
        B = A.copy()
        for i, (lo, hi) in enumerate(self.bounds_):
            if lo is not None and hi is not None:
                B[:, i] = np.clip(A[:, i], lo, hi)
        return B

    @staticmethod
    def _arr(X):
        """Convert a DataFrame or array-like to a float ndarray."""
        return X.values.astype(float) if isinstance(X, pd.DataFrame) else np.asarray(X, dtype=float)

def infer_feature_types(df: pd.DataFrame, target: str):
    """Return ``(num, cat)``: numeric-dtype column names and all other column names.

    The ``target`` column is excluded from both lists; original column order
    is preserved.
    """
    feature_names = [name for name in df.columns if name != target]
    num = list(df[feature_names].select_dtypes(include=[np.number]).columns)
    numeric_lookup = set(num)
    cat = []
    for name in feature_names:
        if name not in numeric_lookup:
            cat.append(name)
    return num, cat

def build_preprocessor(num_cols, cat_cols):
    """Assemble the ColumnTransformer for numeric and categorical columns.

    Numeric columns: median imputation -> outlier capping -> robust scaling.
    Categorical columns: most-frequent imputation -> one-hot encoding.
    Columns not listed in either group are dropped.
    """
    num_tf = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("cap", OutlierCapper()),
        ("scale", RobustScaler()),
    ])
    cat_tf = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", make_ohe()),
    ])
    branches = [("num", num_tf, num_cols), ("cat", cat_tf, cat_cols)]
    return ColumnTransformer(branches, remainder="drop")

def build_pipeline(prep):
    """Chain the preprocessor with an L2, class-balanced logistic regression.

    The solver is left at the estimator default; the grid search overrides it.
    """
    lr_settings = {
        "penalty": "l2",
        "class_weight": "balanced",
        "max_iter": MAX_ITER,
        "random_state": RANDOM_STATE,
    }
    steps = [("prep", prep), ("clf", LogisticRegression(**lr_settings))]
    return Pipeline(steps)

def get_param_grid():
    """Return the single-dict parameter grid for GridSearchCV.

    Only the solver, ``C`` and ``tol`` vary; penalty, class weight and
    max_iter are pinned to one value each. FAST_MODE restricts the solvers
    to lbfgs.
    """
    # liblinear/newton-cg/lbfgs/saga all support L2 for binary LR
    if FAST_MODE:
        solvers = ["lbfgs"]
    else:
        solvers = ["lbfgs", "liblinear", "newton-cg", "saga"]

    grid = dict()
    grid["clf__solver"] = solvers
    grid["clf__penalty"] = ["l2"]
    grid["clf__C"] = C_GRID
    grid["clf__class_weight"] = ["balanced"]
    grid["clf__max_iter"] = [MAX_ITER]
    grid["clf__tol"] = [1e-3, 1e-4]
    return [grid]

def choose_threshold(y_true, y_proba, min_p, min_r):
    """Precision-first thresholding with a recall floor.

    Selection order:
      1. thresholds with precision >= ``min_p`` and recall >= ``min_r``:
         highest precision, ties broken by F1 (``constraints_met`` True);
      2. otherwise, thresholds with recall >= ``min_r``: highest precision,
         ties broken by F1 (``constraints_met`` False);
      3. otherwise (recall floor unreachable), the highest-precision
         threshold overall (``constraints_met`` False).

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Positive-class scores.
        min_p: Required precision.
        min_r: Recall floor.

    Returns:
        Dict with ``threshold``, ``precision``, ``recall``, ``f1`` and
        ``constraints_met``.
    """
    p, r, thr = precision_recall_curve(y_true, y_proba)
    n_thr = len(thr)

    def _f1(i):
        return (2*p[i]*r[i])/(p[i]+r[i]+1e-12)

    def _result(i, threshold, met):
        return {"threshold": threshold, "precision": float(p[i]),
                "recall": float(r[i]), "f1": float(_f1(i)),
                "constraints_met": met}

    def _rank(i):
        return (p[i], _f1(i))

    # 1) Both constraints satisfied.
    feasible = [i for i in range(n_thr) if (p[i] >= min_p and r[i] >= min_r)]
    if feasible:
        best_i = max(feasible, key=_rank)
        return _result(best_i, float(thr[best_i]), True)

    # 2) Precision target unreachable: still honour the recall floor instead
    #    of jumping to a near-zero-recall threshold.
    above_floor = [i for i in range(n_thr) if r[i] >= min_r]
    if above_floor:
        best_i = max(above_floor, key=_rank)
        return _result(best_i, float(thr[best_i]), False)

    # 3) Last resort: pure precision maximization (may heavily sacrifice recall).
    #    The final PR point (precision=1, recall=0) has no threshold, so skip it.
    cand_i = int(np.argmax(p[:-1])) if len(p) > 1 else 0
    best_thr = float(thr[cand_i]) if n_thr > 0 else 0.5
    return _result(cand_i, best_thr, False)

def evaluate(y_true, y_pred, y_proba=None):
    """Compute F1, precision and recall, plus ROC-AUC / PR-AUC when scores are given.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Hard predictions.
        y_proba: Optional positive-class scores.

    Returns:
        Dict with ``f1``, ``precision``, ``recall`` and, when ``y_proba`` is
        provided, ``roc_auc`` and ``pr_auc`` (``None`` if the metric cannot be
        computed for the given labels).
    """
    m = {"f1": f1_score(y_true, y_pred, zero_division=0),
         "precision": precision_score(y_true, y_pred, zero_division=0),
         "recall": recall_score(y_true, y_pred, zero_division=0)}
    if y_proba is not None:
        # Catch only ValueError (undefined metric / bad input) so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        try:
            m["roc_auc"] = roc_auc_score(y_true, y_proba)
        except ValueError:
            m["roc_auc"] = None
        try:
            m["pr_auc"] = average_precision_score(y_true, y_proba)
        except ValueError:
            m["pr_auc"] = None
    return m

def _scorers():
    """Return the multi-metric scoring dict used by the grid search.

    Precision, recall and F1 are wrapped with ``zero_division=0``; ROC-AUC and
    PR-AUC use scikit-learn's built-in scorer names.
    """
    label_metrics = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    scorers = {name: make_scorer(fn, zero_division=0) for name, fn in label_metrics}
    scorers["roc_auc"] = "roc_auc"
    scorers["pr_auc"] = "average_precision"
    return scorers

def _refit_precision_with_recall_floor(cv_results):
    """Select the CV setting with highest mean precision subject to mean recall >= MIN_RECALL.

    If no candidate meets the recall floor, the best mean precision overall is
    used. Candidates whose mean precision is NaN (e.g. a failed fit) are never
    preferred over a candidate with a valid score.

    Args:
        cv_results: The ``cv_results_`` mapping of a multi-metric search with
            ``mean_test_precision`` and ``mean_test_recall`` entries.

    Returns:
        Integer index of the chosen candidate.
    """
    prec = np.asarray(cv_results["mean_test_precision"], dtype=float)
    rec = np.asarray(cv_results["mean_test_recall"], dtype=float)
    # np.argmax treats NaN as the maximum, which would select a failed
    # candidate; rank NaN scores as -inf instead.
    safe_prec = np.where(np.isnan(prec), -np.inf, prec)
    feas_idxs = np.where(rec >= MIN_RECALL)[0]
    if len(feas_idxs):
        return int(feas_idxs[np.argmax(safe_prec[feas_idxs])])
    return int(np.argmax(safe_prec))

def _topn_cv_by_precision(gs, topn=5):
    """For reporting: the top-N CV rows by mean precision among those meeting the recall floor.

    When no row reaches MIN_RECALL the floor is ignored. Returns a list of
    JSON-friendly dicts, or an empty list if the precision/recall columns are
    missing from ``gs.cv_results_``.
    """
    results = pd.DataFrame(gs.cv_results_)
    required = ("mean_test_precision", "mean_test_recall")
    if any(col not in results for col in required):
        return []

    meets_floor = results[results["mean_test_recall"] >= MIN_RECALL]
    # fallback: ignore recall floor
    pool = results if meets_floor.empty else meets_floor

    report_cols = [
        "mean_test_precision", "std_test_precision", "mean_test_recall", "mean_test_f1",
        "mean_test_pr_auc", "mean_test_roc_auc",
        "param_clf__solver", "param_clf__C", "param_clf__tol",
    ]
    top_rows = pool.sort_values("mean_test_precision", ascending=False).head(topn)
    top_rows = top_rows[report_cols]
    # Round-trip through JSON so numpy scalars become plain Python values.
    return json.loads(top_rows.to_json(orient="records"))

def main(data_path=DEFAULT_DATA, target=DEFAULT_TARGET, feature_classification=FEATURE_CLASSIFICATION):
    """Run the precision-first grid-search workflow end to end.

    Steps: load the CSV, coerce the target to 0/1, drop protected features,
    split train/test, run a multi-metric ``GridSearchCV`` with the
    precision-with-recall-floor refit rule, tune the decision threshold on
    out-of-fold training predictions, evaluate on the held-out split and write
    the feature tags (CSV), run artifacts (JSON) and best model (pickle) to
    ``OUT_DIR``.

    Args:
        data_path: CSV file containing the features and the target column.
        target: Name of the binary target column.
        feature_classification: Optional path to the feature-classification CSV.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If ``target`` is not a column of the data.
    """
    # Only needed for threshold tuning below; imported locally so the block
    # does not depend on the file-level import list.
    from sklearn.model_selection import cross_val_predict

    data_path = Path(data_path)
    if not data_path.exists():
        # Explicit exception instead of `assert`, which is stripped under `python -O`.
        raise FileNotFoundError(f"Missing data: {data_path}")

    # Honour the row-sampling knobs (no effect with the default config values).
    read_kwargs = {}
    if SAMPLE_NROWS is not None:
        read_kwargs["nrows"] = int(SAMPLE_NROWS)
    elif SMOKE:
        read_kwargs["nrows"] = 2000
    df = pd.read_csv(data_path, **read_kwargs)
    print("Shape of DF", df.shape)
    df.columns = [str(c).strip() for c in df.columns]
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not in data.")

    # Target cleaning
    y_raw = df[target]
    if not pd.api.types.is_numeric_dtype(y_raw):
        mapping = {"y":1,"yes":1,"true":1,"t":1,"approved":1,"1":1,"n":0,"no":0,"false":0,"f":0,"rejected":0,"0":0}
        y_series = y_raw.astype(str).str.strip().str.lower().map(mapping)
    else:
        y_series = pd.to_numeric(y_raw, errors="coerce")
    mask = y_series.notna()
    if mask.sum()<len(y_series):
        print(f"[warn] dropped {len(y_series)-mask.sum()} rows with invalid target")
    df = df.loc[mask].copy()
    y = y_series.loc[mask].astype(int)
    print("Dataframe after cleaning:", df.shape)

    # Exclude protected features
    df_f, tags, allowed, protected = apply_feature_tags_and_filter(df, target, feature_classification)

    X = df_f.drop(columns=[target])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    num, cat = infer_feature_types(X_train, target="")
    prep = build_preprocessor(num, cat)
    pipe = build_pipeline(prep)
    grid = get_param_grid()

    # Never ask for more folds than the minority class has samples.
    min_class = int(pd.Series(y_train).value_counts().min())
    n_splits = max(2, min(CV_FOLDS, min_class))
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

    combos = len(grid[0]["clf__solver"]) * len(grid[0]["clf__C"]) * len(grid[0]["clf__tol"])
    total_fits = combos * n_splits
    print(f"[info] Effective grid combos: {combos}  |  CV folds: {n_splits}  |  total fits: {total_fits}")

    gs = GridSearchCV(
        pipe,
        param_grid=grid,
        cv=cv,
        scoring=_scorers(),
        refit=_refit_precision_with_recall_floor,  # Precision-first with recall floor
        n_jobs=-1,
        verbose=1,
        return_train_score=False
    )
    gs.fit(X_train, y_train)
    best = gs.best_estimator_

    # Probability scores & precision-first thresholding
    y_proba = best.predict_proba(X_test)[:,1] if hasattr(best,"predict_proba") else None
    y_pred = best.predict(X_test)
    tinfo = None
    if y_proba is not None:
        # Tune the threshold on out-of-fold *training* predictions. Choosing
        # it on y_test and then scoring the same y_test would leak the test
        # labels into the model and inflate the reported test metrics.
        oof_proba = cross_val_predict(best, X_train, y_train, cv=cv,
                                      method="predict_proba", n_jobs=-1)[:, 1]
        tinfo = choose_threshold(y_train, oof_proba, MIN_PRECISION, MIN_RECALL)
        tinfo["tuned_on"] = "train_out_of_fold"
        y_pred = (y_proba >= tinfo["threshold"]).astype(int)

    mets = evaluate(y_test, y_pred, y_proba)
    print("\n[report] Post-threshold classification report (precision-first)")
    print(classification_report(y_test, y_pred, zero_division=0))

    top_cv = _topn_cv_by_precision(gs, topn=5)

    out = {
        "best_params": gs.best_params_,
        "best_cv_index": int(gs.best_index_) if hasattr(gs, "best_index_") else None,
        "best_score_mean_cv_precision": float(gs.cv_results_["mean_test_precision"][gs.best_index_])
            if hasattr(gs, "best_index_") else None,
        "refit_rule": "precision_with_recall_floor",
        "cv_top5_by_precision": top_cv,
        "threshold_constraints": {
            "min_precision": MIN_PRECISION,
            "min_recall": MIN_RECALL,
            "tuning": tinfo
        },
        "test_metrics": mets,
        "protected_features_excluded": protected,
        "allowed_features_count": len(allowed),
        "solvers": grid[0]["clf__solver"],
        "C_grid": grid[0]["clf__C"],
        "tol_grid": grid[0]["clf__tol"],
        "max_iter": MAX_ITER,
        "cv_folds": n_splits,
        "fast_mode": FAST_MODE
    }

    out_dir = Path(OUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir/"feature_tags_fast.csv").write_text(tags.to_csv(index=False))
    with open(out_dir/"run_artifacts_fast.json","w") as f:
        json.dump(out, f, indent=2)
    with open(out_dir/"best_model_fast.pkl","wb") as f:
        pickle.dump(best, f)

    print("\n=== Summary (Precision-first) ===")
    print(json.dumps(out, indent=2))
    if tinfo and not tinfo.get("constraints_met", False):
        print(f"\n[warn] Could not meet precision>={MIN_PRECISION:.2f} & recall>={MIN_RECALL:.2f} at any threshold."
              f" Best threshold: P={tinfo['precision']:.3f}, R={tinfo['recall']:.3f}.")

# Entry point: run the workflow with the configured defaults.
if __name__ == "__main__":
    main()


