In [1]:
import pickle
import numpy as np
import pandas as pd

# Load the deployment bundle you saved earlier. The cells below use it as a
# dict holding the fitted model, label encoder, categorical encoders, scaler,
# imputation values and the list of final feature names.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# a file that you created yourself or otherwise trust.
with open('credit_final_model.pkl', 'rb') as f:
    deploy_dict = pickle.load(f)


In [2]:
def _as_dense_array(encoded) -> np.ndarray:
    """Return an encoder's output as a dense ndarray (accepts sparse or dense results)."""
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()
    return np.asarray(encoded)


def preprocess_unseen(raw_df: pd.DataFrame, deploy_dict: dict) -> np.ndarray:
    """
    Turn raw, unseen records into the scaled feature matrix the model expects.

    Parameters
    ----------
    raw_df : DataFrame with the original columns (no Good_Bad). It is not
        modified; all work happens on a copy.
    deploy_dict : dict of fitted artefacts. Keys read here:
        "mode_MonthlyIncome", "median_NumberOfDependents", "one_hot_gender",
        "one_hot_region", "ordinal_rented", "ordinal_occupation",
        "ordinal_education", "feature_names", "scaler".

    Returns
    -------
    Whatever ``deploy_dict["scaler"].transform`` returns for the selected
    feature columns: the scaled matrix ready for model.predict(...).

    Raises
    ------
    KeyError
        If a required raw column or deploy_dict entry is missing, or if a
        name in ``feature_names`` was not produced by the steps below.
    """
    df = raw_df.copy()

    # ---------- 1) MonthlyIncome handling ----------
    mode_income = deploy_dict["mode_MonthlyIncome"]
    # Use MonthlyIncome and ignore the duplicate MonthlyIncome.1 (like training).
    df = df.drop(columns=["MonthlyIncome.1"], errors="ignore")

    # Impute MonthlyIncome -> MonthlyIncome_mode, then drop the raw column.
    df["MonthlyIncome_mode"] = df["MonthlyIncome"].fillna(mode_income)
    df = df.drop(columns=["MonthlyIncome"])

    # ---------- 2) NumberOfDependents handling ----------
    med_dep = deploy_dict["median_NumberOfDependents"]
    # errors="coerce": unparseable entries (e.g. the string "NA") become NaN so
    # they are imputed with the median below instead of raising.
    dependents = pd.to_numeric(df["NumberOfDependents"], errors="coerce")
    df["NumberOfDependents_median"] = dependents.fillna(med_dep)
    df = df.drop(columns=["NumberOfDependents"])

    # ---------- 3) Numeric log features (no clipping) ----------
    # Raw numeric source column -> name of the log feature built from it.
    log_map = {
        "RevolvingUtilizationOfUnsecuredLines": "RevolvingUtilizationOfUnsecuredLines_log_5th",
        "age": "age_log_5th",
        "DebtRatio": "DebtRatio_log_5th",
        "NumberOfOpenCreditLinesAndLoans": "NumberOfOpenCreditLinesAndLoans_log_5th",
        "NumberRealEstateLoansOrLines": "NumberRealEstateLoansOrLines_log_5th",
        "MonthlyIncome_mode": "MonthlyIncome_mode_log_5th",
        "NumberOfDependents_median": "NumberOfDependents_median_log_5th",
    }

    for raw_col, new_col in log_map.items():
        # Kept as log(x + 1) so the values are computed exactly as before.
        df[new_col] = np.log(df[raw_col] + 1)

    # ---------- 4) Categorical encoding ----------
    oh_gender = deploy_dict["one_hot_gender"]
    oh_region = deploy_dict["one_hot_region"]
    od_rented = deploy_dict["ordinal_rented"]
    od_occ = deploy_dict["ordinal_occupation"]
    od_edu = deploy_dict["ordinal_education"]

    # Gender -> Gender_male
    gender_arr = _as_dense_array(oh_gender.transform(df[["Gender"]]))
    # Assumes 2 categories with index 1 being "Male", same as training.
    df["Gender_male"] = gender_arr[:, 1].astype(int)

    # Region -> one indicator column per category known to the encoder.
    region_arr = _as_dense_array(oh_region.transform(df[["Region"]]))
    for i, cat in enumerate(oh_region.categories_[0]):
        df[cat] = region_arr[:, i].astype(int)

    # Ordinal encodings: (source column, output column, fitted encoder).
    ordinal_specs = (
        ("Rented_OwnHouse", "Rented", od_rented),
        ("Occupation", "Occupation_re", od_occ),
        ("Education", "Education_re", od_edu),
    )
    for src_col, out_col, encoder in ordinal_specs:
        # The encoders return an (n, 1) array; flatten it to a plain column.
        df[out_col] = np.asarray(encoder.transform(df[[src_col]])).astype(int).ravel()

    # Drop original categorical columns
    df = df.drop(
        columns=["Gender", "Region", "Rented_OwnHouse", "Occupation", "Education"],
        errors="ignore",
    )

    # ---------- 5) Keep only the final training features ----------
    feature_names = deploy_dict["feature_names"]
    missing = [name for name in feature_names if name not in df.columns]
    if missing:
        raise KeyError(f"Input is missing required feature columns: {missing}")
    X = df[feature_names].copy()

    # ---------- 6) Scale ----------
    scaler = deploy_dict["scaler"]
    X_scaled = scaler.transform(X)

    return X_scaled


In [3]:
def predict_unseen(raw_df: pd.DataFrame, deploy_dict: dict) -> pd.DataFrame:
    """
    Score raw records and return a copy of them with prediction columns added.

    The returned frame is ``raw_df`` plus:
      * "Predicted_Label" - the model's predicted class, decoded back to its
        original label by the stored label encoder.
      * "Prob_Good" - the predicted probability of the "Good" class, or NaN
        when the model does not provide ``predict_proba``.
    """
    features = preprocess_unseen(raw_df, deploy_dict)
    clf = deploy_dict["model"]
    encoder = deploy_dict["label_encoder"]

    # Encoded class predictions, decoded back to their string labels.
    decoded = encoder.inverse_transform(clf.predict(features))

    scored = raw_df.copy()
    scored["Predicted_Label"] = decoded

    # Models without probability output get a NaN placeholder column.
    if not hasattr(clf, "predict_proba"):
        scored["Prob_Good"] = np.nan
        return scored

    probabilities = clf.predict_proba(features)
    # The position of "Good" in the encoder's class list selects the column.
    good_col = list(encoder.classes_).index("Good")
    scored["Prob_Good"] = probabilities[:, good_col]
    return scored


In [4]:
# Show the label encoder's class order: predict_unseen looks up "Good" in this
# list to decide which predict_proba column is reported as Prob_Good.
print(deploy_dict["label_encoder"].classes_)


['Bad' 'Good']


In [5]:
# Example unseen record: a single applicant given with the raw column names
# that preprocess_unseen reads. Change the values to score other applicants.
sample = {
    "NPA Status": 0.0,
    "RevolvingUtilizationOfUnsecuredLines": 0.25,
    "age": 35.0,
    "Gender": "Male",
    "Region": "East",              # should be a category seen in training (it is passed to the fitted one-hot encoder)
    "MonthlyIncome": 5000.0,
    "Rented_OwnHouse": "Rented",   # should match a training category
    "Occupation": "Self_Emp",      # should be a training category; how unknown values are handled depends on the fitted OrdinalEncoder - verify
    "Education": "Graduate",       # same note as Occupation
    "NumberOfTime30-59DaysPastDueNotWorse": 0.0,
    "DebtRatio": 0.5,
    # Duplicate income column; preprocess_unseen drops it.
    "MonthlyIncome.1": 5000.0,
    "NumberOfOpenCreditLinesAndLoans": 5.0,
    "NumberOfTimes90DaysLate": 0.0,
    "NumberRealEstateLoansOrLines": 1.0,
    "NumberOfTime60-89DaysPastDueNotWorse": 0.0,
    # Given as a string here; preprocess_unseen converts it with pd.to_numeric.
    "NumberOfDependents": "2"
}

# Wrap the dict in a list so the DataFrame has exactly one row.
unseen_df = pd.DataFrame([sample])

result = predict_unseen(unseen_df, deploy_dict)
print(result[["Predicted_Label", "Prob_Good"]])



  Predicted_Label  Prob_Good
0            Good   0.993438


In [6]:
# A deliberately risky-looking applicant (high revolving utilisation, debt
# ratio above 1, several past-due counts) to see how the model scores such a
# record. Same raw column layout as the previous example.
bad_like_sample = {
    "NPA Status": 1.0,
    "RevolvingUtilizationOfUnsecuredLines": 0.95,
    "age": 22.0,
    "Gender": "Male",
    "Region": "South",
    "MonthlyIncome": 2000.0,
    "Rented_OwnHouse": "Rented",
    "Occupation": "Self_Emp",
    "Education": "Graduate",
    "NumberOfTime30-59DaysPastDueNotWorse": 3.0,
    "DebtRatio": 1.5,
    # Duplicate income column; preprocess_unseen drops it.
    "MonthlyIncome.1": 2000.0,
    "NumberOfOpenCreditLinesAndLoans": 15.0,
    "NumberOfTimes90DaysLate": 4.0,
    "NumberRealEstateLoansOrLines": 3.0,
    "NumberOfTime60-89DaysPastDueNotWorse": 2.0,
    # Given as a string here; preprocess_unseen converts it with pd.to_numeric.
    "NumberOfDependents": "5"
}

# Wrap the dict in a list so the DataFrame has exactly one row.
bad_df = pd.DataFrame([bad_like_sample])
res_bad = predict_unseen(bad_df, deploy_dict)
print(res_bad[["Predicted_Label", "Prob_Good"]])


  Predicted_Label  Prob_Good
0             Bad   0.127385
