# Explore x to train LC

In [1]:
# Quick hack to load local SDK code: work from the parent directory, which is
# also where the "./notebooks/..." data paths used later in this notebook resolve.
import os

from dotenv import load_dotenv

# Only step up when the current directory does not already contain
# "notebooks/". Without this guard, re-running the cell would climb one
# directory higher on every execution and break the relative paths below.
if not os.path.isdir("notebooks"):
    os.chdir(os.path.join(os.getcwd(), ".."))

# Load API key and secret from a local .env file into environment variables
load_dotenv()

True

In [2]:
import validmind as vm

# PD Model: initialise the ValidMind client for this project. The test results
# built at the end of the notebook are logged through this same `vm` module.
# NOTE(review): credentials are presumably read from the environment populated
# by load_dotenv() in the first cell — confirm.
vm.init(project="cl1jyvh2c000909lg1rk0a0zb")

True

In [3]:
import joblib

import numpy as np
import pandas as pd
import scipy

In [4]:
def jeffreys_test(p: float, n: int = 0, d: int = 0) -> float:
    """
    Test whether the probability p is consistent with observing d successes in n trials.

    This uses the Jeffreys posterior for a binomial proportion, which is the Beta
    distribution with shape parameters a = d + 1/2 and b = n - d + 1/2. The result is
    the one-sided p-value: the posterior probability that the true probability is less
    than or equal to the test probability p (i.e. the Beta CDF evaluated at p).

    :param p: the test probability to be compared to the number of successes given n trials
    :param n: the number of trials [optional, default = 0]
    :param d: the number of successes [optional, default = 0]

    :return p-value: one-sided p-value of the test
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is loaded on every SciPy version, which would surface
    # here as an AttributeError.
    from scipy import stats

    return stats.beta.cdf(p, d + 0.5, n - d + 0.5)

In [5]:
def update_result(s, d, n, dr, p, pval, out = 'Yet to decide'):
    """
    Package one row of the Jeffreys test summary as a dictionary.

    :param s: segment label
    :param d: number of defaults
    :param n: number of observations
    :param dr: default rate
    :param p: calibrated PD
    :param pval: p-value of the test
    :param out: outcome label [optional, default = 'Yet to decide']

    :return: dict keyed by the summary table's column names, in column order
    """
    labels = ('Segment', 'Defaults', 'Observations', 'Default Rate',
              'Calibrated PD', 'P-value', 'Outcome')
    row_values = (s, d, n, dr, p, pval, out)
    return dict(zip(labels, row_values))

In [6]:
def _jeffreys_row(segment, observed, p, threshold):
    """Run the Jeffreys test for one set of observations and return its summary row."""
    n = len(observed)
    d = np.sum(observed)
    dr = np.round(d / n, 2)
    pval = np.round(jeffreys_test(p, n, d), 4)
    out = 'Satisfactory' if pval >= threshold else 'Not Satisfactory'
    return update_result(segment, d, n, dr, p, pval, out)


def calculate_and_return(df = pd.DataFrame, cal_pd = {}, pool = None, obs = 'observed', threshold = 0.9):
    """
    Run the Jeffreys test on the whole input frame and, optionally, on each pool.

    The first row of the result ('Model') is computed from every row of `df`.
    When `pool` is given, `df` is grouped by that column and one further row is
    produced per group, in the (sorted) order returned by `groupby`.

    :param df: frame holding the observations; must be passed explicitly (the
        default is the DataFrame class itself and is not a usable value)
    :param cal_pd: mapping from segment name to calibrated PD; must contain the
        key 'Model' and, when `pool` is given, one key per group value
    :param pool: name of the column to group by [optional, default = None]
    :param obs: name of the column holding the observed outcomes; it is summed
        to obtain the number of defaults
    :param threshold: a row is 'Satisfactory' when its p-value is >= threshold

    :return: DataFrame with columns Segment, Defaults, Observations,
        Default Rate, Calibrated PD, P-value, Outcome
    :raises KeyError: if `cal_pd` lacks 'Model' or one of the group values
    """
    columns = ['Segment', 'Defaults', 'Observations', 'Default Rate', 'Calibrated PD', 'P-value', 'Outcome']

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = [_jeffreys_row('Model', df[obs], cal_pd['Model'], threshold)]

    if pool is not None:
        for name, group in df.groupby(pool):
            rows.append(_jeffreys_row(name, group[obs], cal_pd[name], threshold))

    return pd.DataFrame(rows, columns=columns)

In [7]:
# Training feature set. The path is relative to the working directory set in
# the first cell; this frame is later used to derive the calibrated PDs.
df = pd.read_csv("./notebooks/datasets/_temp/x_train_lc.csv")
df.head()

Unnamed: 0,loan_amnt,int_rate,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,inq_last_6mths,mths_since_last_delinq,...,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w,application_type_Individual
0,28000.0,7.12,10,125000.0,15.97,0.0,26,725.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1
1,11200.0,10.99,2,80600.0,15.93,0.0,15,670.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,1
2,14000.0,15.1,6,83000.0,18.17,0.0,13,660.0,1.0,76.0,...,0,0,0,0,0,0,0,0,1,1
3,12725.0,12.12,6,71300.0,29.7,0.0,13,675.0,2.0,25.0,...,0,0,0,0,0,0,0,1,0,1
4,7200.0,15.31,1,25000.0,32.98,0.0,8,700.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,1


In [8]:
# Distribution of acc_now_delinq, the column the "Delinquency" segment queries filter on.
df["acc_now_delinq"].value_counts()

0.0    59802
1.0      187
2.0        7
3.0        3
5.0        1
Name: acc_now_delinq, dtype: int64

In [9]:
# Row count, column count and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 131 entries, loan_amnt to application_type_Individual
dtypes: float64(60), int64(71)
memory usage: 60.0 MB


In [10]:
# Test feature set; later passed to process_observations to build the
# per-segment observations that the Jeffreys test is run on.
test_df = pd.read_csv("./notebooks/datasets/_temp/x_test_lc.csv")
test_df.head()

Unnamed: 0,loan_amnt,int_rate,emp_length,annual_inc,dti,delinq_2yrs,earliest_cr_line,fico_range_low,inq_last_6mths,mths_since_last_delinq,...,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_f,initial_list_status_w,application_type_Individual
0,15500.0,8.9,10,100000.0,0.74,0.0,14,715.0,3.0,25.0,...,0,0,0,0,0,0,0,1,0,1
1,10800.0,11.67,10,68000.0,15.44,1.0,20,670.0,1.0,8.0,...,0,0,0,0,0,0,0,1,0,1
2,15850.0,15.1,2,36000.0,26.5,0.0,31,720.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,1
3,16000.0,15.31,2,80000.0,24.54,1.0,12,705.0,0.0,21.0,...,0,0,0,0,0,0,0,0,1,1
4,14000.0,12.12,10,90000.0,14.07,0.0,14,695.0,1.0,44.0,...,0,0,0,0,0,0,0,1,0,1


In [11]:
# Row count, column count and dtypes of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Columns: 131 entries, loan_amnt to application_type_Individual
dtypes: float64(60), int64(71)
memory usage: 20.0 MB


In [12]:
# Load the fitted model; it is only used through predict_proba below.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model = joblib.load("./notebooks/datasets/_temp/lc_model.pickle")

In [13]:
# Segment groups used to slice the data. Each group has a display name and a
# list of segments; each segment pairs a display name with a DataFrame.query
# expression selecting its rows.
# Note: only acc_now_delinq values 0, 1 and 2 get a Delinquency segment.
segments = [
    {
        "name": "Grade",
        "segments": [
            {"name": f"Grade {letter}", "query": f"grade_{letter} == 1"}
            for letter in "ABCDEFG"
        ],
    },
    {
        "name": "Delinquency",
        "segments": [
            {"name": f"Delinquency: {label}", "query": f"acc_now_delinq == {count}"}
            for count, label in enumerate(("None", "1 Account", "2 Accounts"))
        ],
    },
]

In [14]:
def _predicted_default_rate(frame, model, class_cutoff):
    """Share of rows in `frame` whose last-column predicted probability exceeds the cutoff."""
    # An empty slice has no rate; return NaN instead of calling the model on
    # zero rows (many estimators raise) or dividing by zero.
    if len(frame) == 0:
        return float("nan")
    # Last column of predict_proba; for a binary classifier this is column 1.
    proba = model.predict_proba(frame)[:, -1]
    class_pred = (proba > class_cutoff).astype(int)
    return class_pred.sum() / len(class_pred)


def get_calibrated_pds(df, model, segments, class_cutoff=0.5):
    """
    Compute the predicted default rate for the whole frame and for every segment.

    The rate is the fraction of rows the model classifies as positive, i.e.
    whose predicted probability is strictly greater than `class_cutoff`.
    NOTE(review): this is a share of thresholded class predictions, not a mean
    predicted probability — confirm this is the intended "calibrated PD".

    :param df: feature frame accepted by `model.predict_proba` and queried with
        each segment's "query" expression
    :param model: object exposing `predict_proba(frame)` returning a 2-D array
    :param segments: list of groups, each a dict with a "segments" list of
        dicts holding "name" and "query"
    :param class_cutoff: probability above which a row counts as a predicted
        default [optional, default = 0.5]

    :return: dict mapping 'Model' and each segment name to its rate; a segment
        that selects no rows maps to NaN
    """
    pds = {"Model": _predicted_default_rate(df, model, class_cutoff)}

    # Distinct loop names: the original reused `segment` for both levels.
    for group in segments:
        for segment in group["segments"]:
            segment_df = df.query(segment["query"])
            pds[segment["name"]] = _predicted_default_rate(segment_df, model, class_cutoff)
    return pds

In [15]:
# Per-segment reference rates derived from the training frame; passed to
# calculate_and_return as `cal_pd` further down.
calibrated_pds = get_calibrated_pds(df, model, segments)
calibrated_pds

{'Model': 0.027933333333333334,
 'Grade A': 0.0022715539494062983,
 'Grade B': 0.007202947160059383,
 'Grade C': 0.014655226404459197,
 'Grade D': 0.043563336766220394,
 'Grade E': 0.10736266241167085,
 'Grade F': 0.1781818181818182,
 'Grade G': 0.24396135265700483,
 'Delinquency: None': 0.027791712651750778,
 'Delinquency: 1 Account': 0.06951871657754011,
 'Delinquency: 2 Accounts': 0.14285714285714285}

In [16]:
def process_observations(df, model, segments, class_cutoff=0.5):
    """
    Build a long frame of per-row class predictions labelled by segment.

    For every segment, the rows selected by its "query" are scored with
    `model.predict_proba`; the last probability column is thresholded at
    `class_cutoff` and stored in 'Observed' next to the segment name.
    NOTE(review): 'Observed' therefore holds model class predictions, not
    realised outcomes — confirm this is what the downstream test should use.

    :param df: feature frame accepted by `model.predict_proba`
    :param model: object exposing `predict_proba(frame)` returning a 2-D array
    :param segments: list of groups, each a dict with a "segments" list of
        dicts holding "name" and "query"
    :param class_cutoff: probability above which a row counts as a predicted
        default [optional, default = 0.5]

    :return: DataFrame with columns 'Segment' and 'Observed', one row per
        (segment, selected row); empty when no segment selects any rows
    """
    frames = []

    # Distinct loop names: the original reused `segment` for both levels.
    for group in segments:
        for segment in group["segments"]:
            segment_df = df.query(segment["query"])
            # An empty slice contributes no rows; skip it so the model is
            # never asked to predict on zero samples.
            if segment_df.empty:
                continue
            y_pred = model.predict_proba(segment_df)[:, -1]
            class_pred = (y_pred > class_cutoff).astype(int)
            # The scalar segment name is broadcast across all rows of class_pred.
            frames.append(pd.DataFrame({'Segment': segment["name"], 'Observed': class_pred}))

    if not frames:
        return pd.DataFrame(columns=['Segment', 'Observed'])

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [17]:
# Score the test frame segment by segment; the result has one row per
# (segment, matching test row).
observations = process_observations(test_df, model, segments)

In [18]:
# Run the Jeffreys test for the 'Model' row and for every value of 'Segment'.
# NOTE(review): the 'Model' row is computed over all rows of `observations`,
# which holds each test record once per segment group it falls in — hence
# 39999 observations below against 20000 test rows. Confirm this is intended.
results = calculate_and_return(
    observations,
    cal_pd=calibrated_pds,
    pool = 'Segment',
    obs=
    'Observed',
    threshold = 0.85
)
results

Unnamed: 0,Segment,Defaults,Observations,Default Rate,Calibrated PD,P-value,Outcome
0,Model,708,39999,0.02,0.027933,1.0,Satisfactory
1,Delinquency: 1 Account,3,54,0.06,0.069519,0.6307,Not Satisfactory
2,Delinquency: 2 Accounts,0,6,0.0,0.142857,0.8352,Not Satisfactory
3,Delinquency: None,351,19939,0.02,0.027792,1.0,Satisfactory
4,Grade A,2,3341,0.0,0.002272,0.9904,Satisfactory
5,Grade B,13,6023,0.0,0.007203,1.0,Satisfactory
6,Grade C,24,5318,0.0,0.014655,1.0,Satisfactory
7,Grade D,84,3133,0.03,0.043563,1.0,Satisfactory
8,Grade E,120,1509,0.08,0.107363,0.9999,Satisfactory
9,Grade F,82,537,0.15,0.178182,0.9408,Satisfactory


# Send results to ValidMind

In [19]:
# Test passed only if all values for 'Outcome' are 'Satisfactory'.
# Compare element-wise first and reduce afterwards: calling .all() on the raw
# string column and comparing that single result to 'Satisfactory' never
# checks the individual rows. bool() keeps `passed` a plain Python bool.
passed = bool((results['Outcome'] == 'Satisfactory').all())
passed

False

In [20]:
# One vm.TestResult per row of the results dataframe: a row passes when its
# 'Outcome' is 'Satisfactory', and the remaining columns are attached as values.
test_results = [
    vm.TestResult(
        passed=(record['Outcome'] == 'Satisfactory'),
        values={
            'segment': record['Segment'],
            'defaults': record['Defaults'],
            'observations': record['Observations'],
            'default_rate': record['Default Rate'],
            'calibrated_pd': record['Calibrated PD'],
            'p_value': record['P-value'],
        },
    )
    for _, record in results.iterrows()
]

In [21]:
# Parameters recorded alongside the test. The threshold is repeated here as a
# literal; keep it equal to the `threshold` passed to calculate_and_return above.
jeffreys_params = {
    "threshold": 0.85
}

# Bundle the per-segment results and the overall pass flag into one test entry.
jeffreys_test_result = vm.TestResults(
    category="model_performance",
    test_name="jeffreys_test",
    params=jeffreys_params,
    passed=passed,
    results=test_results,
)

In [22]:
# Send the bundled Jeffreys test entry to ValidMind.
vm.log_test_results([
    jeffreys_test_result
])

Successfully logged test results for test: jeffreys_test


True