In [24]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

print('numpy version : ',np.__version__)
print('pandas version : ',pd.__version__)
print('seaborn version : ',sns.__version__)

numpy version :  1.20.1
pandas version :  1.2.4
seaborn version :  0.11.1


In [25]:
df = pd.read_csv('credit-g_csv.csv')

In [26]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


In [27]:
df.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


In [28]:
def segment(x):
    """Encode the credit class of one record as a string flag.

    Parameters
    ----------
    x : mapping-like
        Any object supporting ``x['class']`` (for example a DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        ``'0'`` when ``x['class']`` equals ``'good'``; ``'1'`` for every
        other value.
    """
    return '0' if x['class'] == 'good' else '1'

# Row-wise encoding of the target column; `segment` already accepts a row,
# so it is handed to `apply` directly.
df['class'] = df.apply(segment, axis=1)

In [29]:
df["class"] = pd.to_numeric(df["class"], downcast="float")

In [30]:
def calc_iv(df, feature, target, pr=False):
    """Build a Weight-of-Evidence table for ``feature`` and total its IV.

    Rows of ``df`` are grouped by ``feature``. Within each group the sum of
    ``target`` is used as the count of "bad" records and ``count - sum`` as
    the count of "good" records, which presumes ``target`` is numeric with
    1 marking a bad record and 0 a good one.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the ``feature`` and ``target`` columns.
    feature : str
        Name of the column to group by.
    target : str
        Name of the numeric target column.
    pr : bool, default False
        When true, print the resulting table and the IV.

    Returns
    -------
    tuple
        ``(iv, data)`` where ``iv`` is the sum of the per-group ``iv`` column
        and ``data`` is a DataFrame with one row per group and the columns
        ``variable, value, all, bad, good, share, bad_rate, d_g, d_b, woe, iv``
        on a fresh 0..n-1 index.
    """
    target_by_group = df.groupby(by=feature, as_index=True)[target]

    # Per-group counts.
    n_all = target_by_group.count()
    n_bad = target_by_group.sum()
    n_good = n_all - n_bad

    # Each group's share of all good / all bad records.
    dist_good = n_good / n_good.sum()
    dist_bad = n_bad / n_bad.sum()

    data = pd.DataFrame({
        'all': n_all,
        'bad': n_bad,
        'good': n_good,
        'share': n_all / n_all.sum(),
        'bad_rate': target_by_group.mean(),
        'd_g': dist_good,
        'd_b': dist_bad,
        'woe': np.log(dist_good / dist_bad),
    })

    # A group with no good or no bad records yields an infinite log ratio;
    # those WoE values are set to 0 so they contribute nothing to the IV.
    data = data.replace({'woe': {np.inf: 0, -np.inf: 0}})
    data['iv'] = data['woe'] * (data['d_g'] - data['d_b'])

    # Move the group labels out of the index into explicit leading columns.
    data.insert(0, 'variable', feature)
    data.insert(1, 'value', data.index)
    data.index = range(len(data))

    iv = data['iv'].sum()

    if pr:
        print(data)
        print('IV = %s' % iv)

    return iv, data

In [35]:
iv, data = calc_iv(df, 'credit_amount', 'class')

In [36]:
data

Unnamed: 0,variable,value,all,bad,good,share,bad_rate,d_g,d_b,woe,iv
0,credit_amount,"(249.999, 708.95]",50,12.0,38.0,0.05,0.24,0.054286,0.04,0.305382,0.004363
1,credit_amount,"(708.95, 932.0]",51,19.0,32.0,0.051,0.372549,0.045714,0.063333,-0.326001,0.005744
2,credit_amount,"(932.0, 1157.55]",49,14.0,35.0,0.049,0.285714,0.05,0.046667,0.068993,0.00023
3,credit_amount,"(1157.55, 1262.0]",51,16.0,35.0,0.051,0.313726,0.05,0.053333,-0.064539,0.000215
4,credit_amount,"(1262.0, 1365.5]",49,16.0,33.0,0.049,0.326531,0.047143,0.053333,-0.123379,0.000764
5,credit_amount,"(1365.5, 1479.4]",50,10.0,40.0,0.05,0.2,0.057143,0.033333,0.538996,0.012833
6,credit_amount,"(1479.4, 1602.65]",50,9.0,41.0,0.05,0.18,0.058571,0.03,0.66905,0.019116
7,credit_amount,"(1602.65, 1906.8]",50,13.0,37.0,0.05,0.26,0.052857,0.043333,0.198671,0.001892
8,credit_amount,"(1906.8, 2100.55]",50,16.0,34.0,0.05,0.32,0.048571,0.053333,-0.093526,0.000445
9,credit_amount,"(2100.55, 2319.5]",50,14.0,36.0,0.05,0.28,0.051429,0.046667,0.097164,0.000463


data['bad_rate']

In [120]:
data['bad_rate'].sort_values(ascending = True)

6     0.180000
14    0.183673
5     0.200000
10    0.215686
12    0.220000
13    0.235294
0     0.240000
7     0.260000
11    0.265306
9     0.280000
2     0.285714
3     0.313726
8     0.320000
16    0.320000
4     0.326531
18    0.360000
1     0.372549
15    0.400000
17    0.440000
19    0.580000
Name: bad_rate, dtype: float32

In [155]:
data

Unnamed: 0,variable,value,all,bad,good,share,bad_rate,d_g,d_b,woe,iv
0,credit_amount,"(249.999, 708.95]",50,12.0,38.0,0.05,0.24,0.054286,0.04,0.305382,0.004363
1,credit_amount,"(708.95, 932.0]",51,19.0,32.0,0.051,0.372549,0.045714,0.063333,-0.326001,0.005744
2,credit_amount,"(932.0, 1157.55]",49,14.0,35.0,0.049,0.285714,0.05,0.046667,0.068993,0.00023
3,credit_amount,"(1157.55, 1262.0]",51,16.0,35.0,0.051,0.313726,0.05,0.053333,-0.064539,0.000215
4,credit_amount,"(1262.0, 1365.5]",49,16.0,33.0,0.049,0.326531,0.047143,0.053333,-0.123379,0.000764
5,credit_amount,"(1365.5, 1479.4]",50,10.0,40.0,0.05,0.2,0.057143,0.033333,0.538996,0.012833
6,credit_amount,"(1479.4, 1602.65]",50,9.0,41.0,0.05,0.18,0.058571,0.03,0.66905,0.019116
7,credit_amount,"(1602.65, 1906.8]",50,13.0,37.0,0.05,0.26,0.052857,0.043333,0.198671,0.001892
8,credit_amount,"(1906.8, 2100.55]",50,16.0,34.0,0.05,0.32,0.048571,0.053333,-0.093526,0.000445
9,credit_amount,"(2100.55, 2319.5]",50,14.0,36.0,0.05,0.28,0.051429,0.046667,0.097164,0.000463


In [33]:
df['duration'] = pd.qcut(df['duration'], q=5)

In [37]:
df['age'] = pd.qcut(df['age'], q=5)

In [None]:
# NOTE: this cell was left unfinished -- the assignment had no right-hand side,
# which is a SyntaxError and stops "Restart & Run All". It is commented out
# until a transformation for this column is decided.
# df['residence_since'] =

In [41]:
df['residence_since'].unique()

array([4, 2, 3, 1], dtype=int64)

In [45]:
df['num_dependents'].unique()

array([1, 2], dtype=int64)

In [47]:
df['existing_credits'].unique()

array([2, 1, 3, 4], dtype=int64)

In [62]:
data['bad_rate'].sort_values(ascending=False)

3    0.333333
0    0.315956
1    0.276276
2    0.214286
Name: bad_rate, dtype: float32

In [79]:
def coarse_classer(df, indexloc_1, indexloc_2):
    """Merge two rows of ``df`` into one averaged row and re-sort by bad rate.

    The two rows at integer positions ``indexloc_1`` and ``indexloc_2`` are
    removed and replaced by a single row holding the arithmetic mean of their
    numeric columns. Non-numeric columns are NaN in the merged row. The result
    is sorted by ``bad_rate`` in descending order on a fresh 0..n-1 index.
    ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``bad_rate`` column.
    indexloc_1, indexloc_2 : int
        Integer positions (as used by ``iloc``) of the rows to merge.

    Returns
    -------
    pandas.DataFrame
        The coarsened table.

    Notes
    -----
    Both the selection and the removal of the two rows are positional, so the
    function also works when ``df`` does not carry a default RangeIndex.
    NOTE(review): the merged row is an unweighted mean of every numeric
    column, so count-like columns are averaged rather than summed -- confirm
    this is the intended way to combine bins.
    """
    positions = [indexloc_1, indexloc_2]

    # Average only the numeric columns, explicitly along the row axis, rather
    # than relying on np.mean(DataFrame) to discard non-numeric columns.
    merged_row = df.iloc[positions].mean(axis=0, numeric_only=True).to_frame().T

    # Translate positions to labels so the same rows that were averaged are
    # the ones removed.
    remaining = df.drop(index=df.index[positions])

    coarsed_df = pd.concat([remaining, merged_row])
    coarsed_df = coarsed_df.sort_values(by='bad_rate', ascending=False).reset_index(drop=True)

    return coarsed_df

In [13]:
df = coarse_classer(data, 6, 14)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,"(1157.55, 1262.0]",no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,0.0
1,0<=X<200,48,existing paid,radio/tv,"(4720.0, 5969.95]",<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,1.0
2,no checking,12,critical/other existing credit,education,"(1906.8, 2100.55]",<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,0.0
3,<0,42,existing paid,furniture/equipment,"(7179.4, 9162.7]",<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,0.0
4,<0,24,delayed previously,new car,"(4720.0, 5969.95]",<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12,existing paid,furniture/equipment,"(1602.65, 1906.8]",<100,4<=X<7,3,female div/dep/mar,none,...,real estate,31,none,own,1,unskilled resident,1,none,yes,0.0
996,<0,30,existing paid,used car,"(3590.0, 3972.25]",<100,1<=X<4,4,male div/sep,none,...,life insurance,40,none,own,1,high qualif/self emp/mgmt,1,yes,yes,0.0
997,no checking,12,existing paid,radio/tv,"(708.95, 932.0]",<100,>=7,4,male single,none,...,car,38,none,own,1,skilled,1,none,yes,0.0
998,<0,45,existing paid,radio/tv,"(1602.65, 1906.8]",<100,1<=X<4,4,male single,none,...,no known property,23,none,for free,1,skilled,1,yes,yes,1.0


In [None]:
data['woe'].replace({ 'Spain': 'Spain_and_France', 'France': 'Spain_and_France' }, inplace=True)

In [34]:
df['credit_amount'] = pd.qcut(df['credit_amount'], q=20)