                This notebook explains how to use the bivariates function (`info_val`) and how to calculate the CSI from its output

Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from csi_bivar import info_val

For the purpose of this tutorial I'm using a dataset from Kaggle (`bank.csv`) that poses a binary classification problem

In [2]:
# Read the raw data set, then report how many missing values each column holds.
df = pd.read_csv("bank.csv")
df.isnull().sum(axis=0)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [3]:
# Encode the target as 0/1.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on the `df.y` accessor: that is a chained in-place
# update, which recent pandas warns about and which leaves `df` unchanged
# when Copy-on-Write is enabled.
# The mapping also accepts 1/0 so the cell can be re-run safely; any other
# label becomes NaN and makes astype(int) raise instead of passing silently.
df["y"] = df["y"].map({'yes': 1, 'no': 0, 1: 1, 0: 0}).astype(int)

Split the data into a training sample (70%) and a new out-of-time sample (the remaining 30%).

In [4]:
# Shuffle all rows once with a fixed seed so that Restart & Run All always
# produces the same split, then cut at the 70% mark: the first part is the
# training sample, the remainder is the out-of-time sample.
# (Outputs saved from an earlier, unseeded run will not match this split.)
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)
split_at = int(0.7 * len(shuffled))

# Plain positional slicing replaces np.split on a DataFrame; .copy() makes
# both samples independent of `shuffled` before they are modified later.
train_data = shuffled.iloc[:split_at].copy()
oot_data = shuffled.iloc[split_at:].copy()

In [5]:
# Discard the shuffled row labels so that both samples are indexed 0..n-1.
for sample in (train_data, oot_data):
    sample.reset_index(drop=True, inplace=True)

In [6]:
# Preview the first five rows of the training sample.
train_data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,48,self-employed,married,secondary,no,91,no,no,cellular,136,8,-1,0,unknown,0
1,70,retired,married,primary,no,579,no,no,cellular,82,2,181,1,failure,0
2,35,technician,single,tertiary,no,2012,yes,no,unknown,413,2,-1,0,unknown,0
3,31,entrepreneur,single,tertiary,no,38,no,no,cellular,185,2,-1,0,unknown,0
4,29,blue-collar,married,secondary,no,1074,no,no,cellular,153,1,-1,0,unknown,0


We need 4 parameters to call the bivariates function
1) y (string): Name of the Y variable
2) inp_fe (list): List containing the features for which the metrics should be calculated
3) in_df (DataFrame): Train DataFrame
4) out_df (DataFrame): Test/Out of time DataFrame

In [7]:
# Name of the target column.
y = "y"

# Every column except the target is an input feature. Selecting by name
# (instead of dropping the last column) keeps this correct even if the
# target is not the last column of the frame.
inp_fe = [col for col in train_data.columns if col != y]
inp_fe

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [8]:
# Hand info_val copies of both samples, so the call works on the copies
# rather than on train_data / oot_data themselves.
train_copy = train_data.copy()
oot_copy = oot_data.copy()

# Two frames come back: one for the in-time (train) data and one for the
# out-of-time data.
indf, outdf = info_val(y, inp_fe, train_copy, oot_copy)

age
job
marital
education
default
balance
housing
loan
contact
duration
campaign
pdays
previous
poutcome


In [9]:
# Preview the first five rows of the in-time bivariate table.
indf.head(n=5)


Unnamed: 0,keys_intime,Levels_intime,lower_intime,upper_intime,total_intime,perc_in_feature_intime,Target_counts_intime,non_Target_counts_intime,Target_perc_current_level_intime,Target_percent_wrt_total_intime,Non_Target_perc_current_level_intime,Non_Target_percent_wrt_total_intime,WOE_intime,IV_intime
0,age,"(17.0, 29.0]",17.0,29.0,3670,0.115967,632,3038,0.172207,0.172066,0.827793,0.108601,-0.460201,0.029207
1,age,"(29.0, 32.0]",29.0,32.0,4070,0.128606,451,3619,0.110811,0.122788,0.889189,0.12937,0.052219,0.000344
2,age,"(32.0, 34.0]",32.0,34.0,2723,0.086043,281,2442,0.103195,0.076504,0.896805,0.087295,0.131951,0.001424
3,age,"(34.0, 36.0]",34.0,36.0,2587,0.081746,286,2301,0.110553,0.077866,0.889447,0.082255,0.054841,0.000241
4,age,"(36.0, 39.0]",36.0,39.0,3259,0.10298,326,2933,0.100031,0.088756,0.899969,0.104847,0.166617,0.002681


In [None]:
# Glossary of the columns returned by info_val. In the output frames each
# name carries an `_intime` or `_outtime` suffix (e.g. `keys_intime`).
# Kept as comments so that this cell is valid Python and does not stop
# a Restart & Run All.
#
# 'keys'                          - Variable
# 'Levels'                        - Level/bin of variable
# 'lower'                         - Lowest value in the bin for numerical variables
# 'upper'                         - Highest value in the bin for numerical variables
# 'total'                         - Total number of rows/values
# 'perc_in_feature'               - % of rows for this particular level
# 'Target_counts'                 - Count of 1s in this level
# 'non_Target_counts'             - Count of 0s in this level
# 'Target_perc_current_level'     - % of 1s in this level out of total rows for the level
# 'Target_percent_wrt_total'      - % of 1s in this level out of total 1s in data
# 'Non_Target_perc_current_level' - % of 0s in this level out of total rows for the level
# 'Non_Target_percent_wrt_total'  - % of 0s in this level out of total 0s in data
# 'WOE'                           - Weight of evidence
# 'IV'                            - Information value

# Characteristic Stability Index

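Once we have the bivariates, calculating the CSI is straightforward: it only needs the share of rows in each level (`perc_in_feature`) for the in-time and the out-of-time data. For each level, $\text{CSI} = (p_{\text{in}} - p_{\text{out}}) \cdot \ln\left(p_{\text{in}} / p_{\text{out}}\right)$.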

In [10]:
# Keep only what the CSI needs: variable name, level/bin and the share of
# rows that fall into that level, for the in-time and out-of-time tables.
intime = indf[['keys_intime', 'Levels_intime', 'perc_in_feature_intime']]
outtime = outdf[['keys_outtime', 'Levels_outtime', 'perc_in_feature_outtime']]

# Line up every in-time level with the same level of the same variable in
# the out-of-time data. With a left merge, a level that does not occur
# out-of-time has a missing out-of-time share.
csi = pd.merge(
    intime,
    outtime,
    left_on=['keys_intime', 'Levels_intime'],
    right_on=['keys_outtime', 'Levels_outtime'],
    how='left',
)

# A missing or zero share would make the ratio/log below NaN or infinite.
# Treat a missing share as 0 and floor both shares at a small positive
# value so every level yields a finite contribution. The displayed share
# columns are left untouched; only the local series are adjusted.
MIN_SHARE = 1e-4
share_in = csi['perc_in_feature_intime'].fillna(0).clip(lower=MIN_SHARE)
share_out = csi['perc_in_feature_outtime'].fillna(0).clip(lower=MIN_SHARE)

# Per-level CSI contribution: (p_in - p_out) * ln(p_in / p_out).
csi['csi'] = (share_in - share_out) * np.log(share_in / share_out)

# The CSI of a variable is the sum of the contributions of its levels.
csi_by_feature = csi.groupby('keys_intime', sort=False)['csi'].sum()

In [11]:
# Preview the per-level CSI contributions for the first ten rows.
csi.head(n=10)

Unnamed: 0,keys_intime,Levels_intime,perc_in_feature_intime,keys_outtime,Levels_outtime,perc_in_feature_outtime,csi
0,age,"(17.0, 29.0]",0.115967,age,"(17.0, 29.0]",0.11818,4.2e-05
1,age,"(29.0, 32.0]",0.128606,age,"(29.0, 32.0]",0.130345,2.3e-05
2,age,"(32.0, 34.0]",0.086043,age,"(32.0, 34.0]",0.086921,9e-06
3,age,"(34.0, 36.0]",0.081746,age,"(34.0, 36.0]",0.082055,1e-06
4,age,"(36.0, 39.0]",0.10298,age,"(36.0, 39.0]",0.102477,2e-06
5,age,"(39.0, 42.0]",0.087654,age,"(39.0, 42.0]",0.082129,0.00036
6,age,"(42.0, 46.0]",0.103896,age,"(42.0, 46.0]",0.103214,4e-06
7,age,"(46.0, 51.0]",0.110342,age,"(46.0, 51.0]",0.107785,6e-05
8,age,"(51.0, 56.0]",0.093153,age,"(51.0, 56.0]",0.092082,1.2e-05
9,age,"(56.0, 95.0]",0.089614,age,"(56.0, 95.0]",0.09481,0.000293
