# Probability of Default Model using ValidMind

## Step 1: Connect Notebook to ValidMind Project

#### Import Libraries

In [1]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env

import zipfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve, auc
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.stats import chi2_contingency
%matplotlib inline

#### Connect Notebook to ValidMind Project

In [2]:

import validmind as vm

vm.init(
  api_host = "http://localhost:3000/api/v1/tracking",
  api_key = "2494c3838f48efe590d531bfe225d90b",
  api_secret = "4f692f8161f128414fef542cab2a4e74834c75d01b3a8e088a1834f2afcfe838",
  project = "clibjj9cl00056qy6tz2hkc6l"
)
  

2023-06-06 17:02:11,077 - INFO - api_client - Connected to ValidMind. Project: PD Model - Initial Validation (clibjj9cl00056qy6tz2hkc6l)


## Step 2: Import Raw Data

#### Import Lending Club Dataset

In [3]:
# Specify the path to the zip file
filepath = '/Users/juanvalidmind/Dev/datasets/lending club/data_2007_2014/loan_data_2007_2014.csv'
df = pd.read_csv(filepath)

# Perform operations on the DataFrame as needed
print(df.head())

Columns (19) have mixed types. Specify dtype option on import or set low_memory=False.


        id  member_id  loan_amnt  funded_amnt  funded_amnt_inv        term  \
0  1077501    1296599       5000         5000           4975.0   36 months   
1  1077430    1314167       2500         2500           2500.0   60 months   
2  1077175    1313524       2400         2400           2400.0   36 months   
3  1076863    1277178      10000        10000          10000.0   36 months   
4  1075358    1311748       3000         3000           3000.0   60 months   

   int_rate  installment grade sub_grade  ... total_bal_il il_util  \
0     10.65       162.87     B        B2  ...          NaN     NaN   
1     15.27        59.83     C        C4  ...          NaN     NaN   
2     15.96        84.33     C        C5  ...          NaN     NaN   
3     13.49       339.31     C        C1  ...          NaN     NaN   
4     12.69        67.79     B        B5  ...          NaN     NaN   

  open_rv_12m  open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi  \
0         NaN          NaN        Na

## Step 3: Describe Raw Data

In [4]:
from validmind.vm_models.test_context import TestContext
from validmind.data_validation.metrics import TabularDescriptionTables

vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)
metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


VBox(children=(HTML(value='<p>This section provides descriptive statistics for numerical, categorical and date…

#### Format Dates

In [5]:
from datetime import datetime

def convert_to_datetime(df, columns):
    # Specify the date format
    date_format = "%b-%y"

    # Iterate over the specified columns and convert to datetime
    for column in columns:
        df[column] = pd.to_datetime(df[column], format=date_format)

    return df

In [6]:
# Convert the specified columns to datetime
columns_to_convert = ['earliest_cr_line', 'issue_d', 'last_credit_pull_d', 'last_pymnt_d', 'next_pymnt_d']
df = convert_to_datetime(df, columns_to_convert)

Run test again. 

In [7]:
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)
metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


VBox(children=(HTML(value='<p>This section provides descriptive statistics for numerical, categorical and date…

#### Remove Categorical Variables Not Included in the Analysis

In [8]:
unused_variables = ['url', 'zip_code', 'title', 'emp_title', 'desc', 'application_type', 'addr_state']
df = df.drop(columns=unused_variables)

Run test again.

In [9]:
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)
metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


VBox(children=(HTML(value='<p>This section provides descriptive statistics for numerical, categorical and date…

#### Remove Variables with Large Number of Missing Values

In [10]:
metric_output = metric.result.metric.value

In [11]:
def drop_variables_with_min_missing(metric_output, min_missing_count):
    variables_to_drop = []

    for key, df in metric_output.items():
        missing_counts = df["Missing Values (%)"]

        variables_to_drop.extend(
            df.loc[missing_counts > min_missing_count, df.columns[0]].tolist()
        )

    return variables_to_drop

In [12]:
min_missing_count = 45
variables_to_drop = drop_variables_with_min_missing(metric_output, min_missing_count)
print(variables_to_drop)

['inq_last_12m', 'open_rv_12m', 'verification_status_joint', 'open_acc_6m', 'open_il_6m', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'dti_joint', 'annual_inc_joint', 'all_util', 'open_rv_24m', 'total_cu_tl', 'inq_fi', 'max_bal_bc', 'mths_since_last_record', 'mths_since_last_major_derog', 'mths_since_last_delinq', 'next_pymnt_d']


In [13]:
df = df.drop(columns=variables_to_drop)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 46 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   id                          466285 non-null  int64         
 1   member_id                   466285 non-null  int64         
 2   loan_amnt                   466285 non-null  int64         
 3   funded_amnt                 466285 non-null  int64         
 4   funded_amnt_inv             466285 non-null  float64       
 5   term                        466285 non-null  object        
 6   int_rate                    466285 non-null  float64       
 7   installment                 466285 non-null  float64       
 8   grade                       466285 non-null  object        
 9   sub_grade                   466285 non-null  object        
 10  emp_length                  445277 non-null  object        
 11  home_ownership              466285 non-

Run test again.

In [15]:
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)
metric = TabularDescriptionTables(test_context)
metric.run()
metric.result.show()

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


VBox(children=(HTML(value='<p>This section provides descriptive statistics for numerical, categorical and date…