In [1]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/home-credit-default-risk'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv(path + '/train.csv').set_index('SK_ID_CURR')
test = pd.read_csv(path + '/test.csv').set_index('SK_ID_CURR')

In [None]:
train.loc[:,['b_now_overdue_percent_Credit card', 'b_now_overdue_percent_Consumer credit', 'b_now_overdue_percent_Car loan',
             'b_now_overdue_percent_all']] = train[['b_now_overdue_percent_Credit card', 'b_now_overdue_percent_Consumer credit', 'b_now_overdue_percent_Car loan',
             'b_now_overdue_percent_all']].replace([np.inf], 0)
test.loc[:,['b_now_overdue_percent_Credit card', 'b_now_overdue_percent_Consumer credit', 'b_now_overdue_percent_Car loan',
             'b_now_overdue_percent_all']] = test[['b_now_overdue_percent_Credit card', 'b_now_overdue_percent_Consumer credit', 'b_now_overdue_percent_Car loan',
             'b_now_overdue_percent_all']].replace([np.inf], 0)      

In [None]:
train_cash = train[train['NAME_CONTRACT_TYPE'] == 'Cash loans']
y_train_cash = train_cash['TARGET']
train_cash = train_cash.drop(columns=['TARGET', 'NAME_CONTRACT_TYPE'])
test_cash = test[test['NAME_CONTRACT_TYPE'] == 'Cash loans']
y_test_cash = test_cash['TARGET']
test_cash = test_cash.drop(columns=['TARGET', 'NAME_CONTRACT_TYPE'])
train_rev = train[train['NAME_CONTRACT_TYPE'] == 'Revolving loans']
y_train_rev = train_rev['TARGET']
train_rev = train_rev.drop(columns=['TARGET', 'NAME_CONTRACT_TYPE'])
test_rev = test[test['NAME_CONTRACT_TYPE'] == 'Revolving loans']
y_test_rev = test_rev['TARGET']
test_rev = test_rev.drop(columns=['TARGET', 'NAME_CONTRACT_TYPE'])


## Cash loans

### Предворительная обработка данных (выделение категориальных и интервальных переменных, разбиение интервальных переменных на бины, обработка пропусков)

In [None]:
categorical = []
for c in train_cash.columns:
  if train_cash[c].dtype == 'object':
    categorical.append(c)
categorical += ['FLAG_MOBIL', 'FLAG_EMP_PHONE',	'FLAG_WORK_PHONE',	'FLAG_CONT_MOBILE',
               'FLAG_PHONE', 'FLAG_EMAIL', 'REG_REGION_NOT_LIVE_REGION',
               'REG_REGION_NOT_WORK_REGION',	'LIVE_REGION_NOT_WORK_REGION',	'REG_CITY_NOT_LIVE_CITY',	'REG_CITY_NOT_WORK_CITY',
               'LIVE_CITY_NOT_WORK_CITY', 'ORGANIZATION_TYPE', 'b_Microloan', 'b_overdue_all',
               'b_overdue_Credit card', 'b_overdue_Consumer credit', 'b_overdue_Mortgage', 'b_overdue_Car loan', 'b_overdue_all_1',
               'b_overdue_Credit card_1', 'b_overdue_Consumer credit_1', 'b_overdue_Mortgage_1', 'b_overdue_Car loan_1', 'b_overdue_all_3', 
               'b_overdue_Credit card_3', 'b_overdue_Consumer credit_3', 'b_overdue_Mortgage_3', 'b_overdue_Car loan_3', 'p_is_several_a_day',
               'p_new_client_1', 'p_new_client_3', 'p_ins_ind']
for i in range(2, 22):
  categorical.append('FLAG_DOCUMENT_'+str(i))
real = list(set(train_cash.columns) - set(categorical))

In [None]:
all_data = pd.concat([train_cash, test_cash])

In [None]:
for c in real:
  all_data[c + '_bin'] = pd.qcut(all_data[c], 20, duplicates='drop').cat.add_categories("NAN").fillna("NAN")
train_cash = all_data[:train_cash.shape[0]]
test_cash = all_data[train_cash.shape[0]:]

In [None]:
from sklearn.model_selection import train_test_split
train_cash, valid_cash, y_train_cash, y_valid_cash = train_test_split(train_cash, y_train_cash, test_size=0.2, random_state=42, stratify=(y_train_cash))
train_cash, count_cash, y_train_cash, y_count_cash = train_test_split(train_cash, y_train_cash, test_size=0.2, random_state=42, stratify=(y_train_cash))

In [None]:
train_cash[categorical] = train_cash[categorical].fillna('NAN')
valid_cash[categorical] = valid_cash[categorical].fillna('NAN')
count_cash[categorical] = count_cash[categorical].fillna('NAN')
test_cash[categorical] = test_cash[categorical].fillna('NAN')

### Вычисление weight of evidence на отложенной выборке count

In [None]:
woe_train_cash = train_cash.copy()
woe_valid_cash = valid_cash.copy()
woe_count_cash = count_cash.copy()
woe_test_cash = test_cash.copy()

In [None]:
def calculate_woe_iv(data, column, target):
    lst = []
    for i in range(data[column].nunique()):
        val = list(data[column].unique())[i]
        lst.append({
            'Value': val,
            'All': data[data[column] == val].count()[column],
            'Good': data[(data[column] == val) & (target == 0)].count()[column],
            'Bad': data[(data[column] == val) & (target == 1)].count()[column]
        })

    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()
    
    dset = dset.sort_values(by='WoE')
    
    return dset, iv

In [None]:
scorecard_cash = pd.DataFrame()

In [None]:
res_cat = pd.DataFrame(index = categorical, columns = ['iv'])
for c in categorical:
  df, iv = calculate_woe_iv(count_cash, c, y_count_cash)
  df['feature'] = c
  for i in df.index:
    woe_train_cash.loc[woe_train_cash[c] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_valid_cash.loc[woe_valid_cash[c] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_test_cash.loc[woe_test_cash[c] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']

  res_cat.loc[c, 'iv'] = iv
  scorecard_cash = scorecard_cash.append(df[['feature', 'Value', 'WoE']])
    
res_cat[res_cat['iv'] >= 0.02]

In [None]:
from sklearn.tree import DecisionTreeClassifier

res_tree = pd.DataFrame(index = real, columns = ['iv'])

for c in real:
  d = pd.DataFrame(index = woe_count_cash.index, columns = ['woe'])
  df, iv = calculate_woe_iv(count_cash, c + '_bin', y_count_cash)
  df['feature'] = c
  for i in df.index:
    d.loc[woe_count_cash[c + '_bin'] == df.loc[i, 'Value'], 'woe'] = df.loc[i, 'WoE']

  clf = DecisionTreeClassifier(random_state = 42, max_depth = 2)
  clf.fit(d, y_count_cash)
  d['leaf'] = clf.apply(d)
  all_intervals = []
  for i in d['leaf'].unique():
    intervals = woe_count_cash[d['leaf'] == i][c+'_bin'].values.unique()
    woe_count_cash.loc[:, c+'_bin'] = woe_count_cash.loc[:, c+'_bin'].replace(intervals, i)
    woe_train_cash.loc[:, c+'_bin'] = woe_train_cash.loc[:, c+'_bin'].replace(intervals, i)
    woe_valid_cash.loc[:, c+'_bin'] = woe_valid_cash.loc[:, c+'_bin'].replace(intervals, i)
    woe_test_cash.loc[:, c+'_bin'] = woe_test_cash.loc[:, c+'_bin'].replace(intervals, i)
    all_intervals.append(str(intervals).split('\n')[0])

  df, iv = calculate_woe_iv(woe_count_cash, c + '_bin', y_count_cash)
  df['feature'] = c
  for i in df.index:
    woe_train_cash.loc[woe_train_cash[c + '_bin'] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_valid_cash.loc[woe_valid_cash[c + '_bin'] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_test_cash.loc[woe_test_cash[c + '_bin'] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
  df.loc[:, 'Value'] = df.loc[:, 'Value'].replace(d['leaf'].unique(), all_intervals)
  res_tree.loc[c, 'iv'] = iv
  scorecard_cash = scorecard_cash.append(df[['feature', 'Value', 'WoE']])

res_tree[res_tree['iv'] >= 0.02]

Unnamed: 0,iv
b_last_closed_Credit card,0.0233529
b_start_Consumer credit_avg,0.110112
p_Revolving loans_sum_app_avg,0.0218261
b_Credit card_sum_3,0.0413691
b_Credit card_avg,0.0302844
...,...
ENTRANCES_AVG,0.0269654
b_limit_min,0.0300389
b_Credit card_num_1,0.046701
b_active_all_dur_max,0.0509989


In [None]:
iv_cash = pd.concat([res_cat, res_tree])

In [None]:
scorecard_cash.to_csv('scorecard_cash.csv', index=True)

### Отбор признаков (по information value и корреляциям Спирмана)

In [None]:
good_f = iv_cash[iv_cash['iv'] >= 0.02].index

In [None]:
woe_train_cash1 = woe_train_cash[good_f]
woe_valid_cash1 = woe_valid_cash[good_f]
woe_test_cash1 = woe_test_cash[good_f]

In [None]:
correlations_woe = woe_train_cash1.corr('spearman')
correlations_woe

Unnamed: 0,FLAG_EMP_PHONE,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_6,b_last_closed_Credit card,b_start_Consumer credit_avg,p_Revolving loans_sum_app_avg,b_Credit card_sum_3,b_Credit card_avg,b_active_Consumer credit_dur_max,p_all_down_payment_avg_,b_Credit card_num,b_Car loan_sum,p_Consumer loans_down_payment_avg_,b_dur_avg_Consumer credit,FLOORSMAX_MEDI,b_late_num_all,b_early_num_all,p_prod_group_Cash Street_percent,p_sum,p_Consumer loans_sum_avg_,EXT_SOURCE_1,AMT_ANNUITY,b_Consumer credit_avg,b_active_Credit card_dur_avg,AMT_REQ_CREDIT_BUREAU_QRT,b_closed_days_avg_all,b_Consumer credit_sum_3,b_start_Credit card_avg,APARTMENTS_MEDI,b_Consumer credit_avg_1,b_Mortgage_avg,b_Car loan_avg,LIVINGAREA_MEDI,b_Consumer credit_num_1,YEARS_BUILD_AVG,b_debt_sum,b_now_overdue_percent_all,REGION_RATING_CLIENT_W_CITY,b_good_percent_avg,...,BASEMENTAREA_MEDI,p_cnt_avg,b_all_sum_3,APARTMENTS_MODE,p_Revolving loans_num,YEARS_BUILD_MODE,p_cnt_max,b_long_percent_avg,p_yield_low_normal,p_Consumer loans_sum_app_avg_3,b_closed_Credit card_num,b_last_closed_Car loan,b_all_num_1,b_all_avg,b_last_closed_all,DAYS_BIRTH,b_Mortgage_num,YEARS_BEGINEXPLUATATION_AVG,b_Credit card_mean,AMT_REQ_CREDIT_BUREAU_MON,NONLIVINGAREA_AVG,b_active_Credit card_num,b_debt_num,b_all_sum_1,AMT_GOODS_PRICE,AMT_CREDIT,p_Consumer loans_high_percent,b_debt_avg_Credit card,NONLIVINGAREA_MODE,ENTRANCES_MODE,LIVINGAREA_MODE,b_Credit card_num_3,p_all_num_3,b_Mortgage_mean,b_all_num,ENTRANCES_AVG,b_limit_min,b_Credit card_num_1,b_active_all_dur_max,p_evening_percent
FLAG_EMP_PHONE,1.000000,0.093636,0.261971,0.615325,0.031271,0.034352,0.039073,0.030811,0.005735,0.018287,-0.045206,0.041183,-0.040700,-0.037482,0.037297,-0.012412,-0.000163,-0.001424,0.013696,0.034810,-0.005741,0.027197,0.047414,0.026660,0.019141,-0.004369,0.025020,0.026209,0.005001,-0.009207,-0.006643,-0.044051,-0.040614,-0.012928,-0.011577,-0.007084,0.041115,-0.009529,-0.013559,0.005133,...,-0.008933,-0.070840,0.004222,-0.008932,0.029683,-0.007753,-0.011416,-0.008799,0.067884,0.013014,-0.015048,0.031271,0.004234,-0.046116,0.031271,0.562456,-0.012799,-0.009122,0.025575,-0.018059,-0.011263,0.037677,0.023976,0.020066,-0.003360,0.003162,-0.010999,0.035449,-0.010890,-0.005861,-0.012039,0.037917,-0.031626,-0.044032,-0.012912,-0.006853,0.010915,0.003540,0.044890,-0.101452
REG_CITY_NOT_LIVE_CITY,0.093636,1.000000,0.437358,0.059096,0.015454,0.083277,0.031103,0.028529,0.021427,0.007815,0.006752,0.011523,0.026433,0.009316,0.064248,0.159357,0.043951,0.046592,-0.008791,0.037691,-0.001279,0.068178,0.011153,0.036826,0.026363,0.032491,0.079888,0.011117,0.054021,0.152494,0.020291,0.024116,0.026579,0.152266,0.023960,0.108837,0.015544,0.028234,0.022552,0.027331,...,0.129704,-0.020706,0.015667,0.149333,0.018090,0.110345,-0.007938,0.029578,0.039317,0.022327,0.030418,0.015454,0.026708,0.016653,0.015454,0.170125,0.029678,0.141021,0.023117,0.032373,0.141636,0.016966,0.012088,0.028191,0.037093,0.026740,0.027998,0.030395,0.141744,0.149586,0.148418,0.027319,-0.001231,0.024073,0.026699,0.149427,0.037134,0.030844,0.022072,-0.019155
REG_CITY_NOT_WORK_CITY,0.261971,0.437358,1.000000,0.165817,0.011782,0.067379,0.030672,0.013361,0.007173,0.018725,-0.007222,-0.004133,0.013210,-0.003939,0.051817,0.268478,0.040384,0.039747,-0.001495,0.046241,0.011569,0.085237,0.029617,0.035358,0.018845,0.025844,0.069045,0.014764,0.050550,0.256354,0.011254,0.022188,0.013078,0.259986,0.015864,0.186790,0.025294,0.023961,0.005601,0.024540,...,0.218223,-0.023696,0.016492,0.251185,0.022678,0.188748,0.001293,0.023766,0.052918,0.032821,0.029703,0.011782,0.019066,0.009631,0.011782,0.225919,0.023740,0.234396,0.009594,0.029162,0.239724,0.007533,0.015698,0.020082,0.038747,0.027016,0.025443,0.023763,0.241043,0.248358,0.251474,0.014915,-0.007143,0.022141,0.019604,0.249164,0.026239,0.023777,0.027834,-0.007429
FLAG_DOCUMENT_6,0.615325,0.059096,0.165817,1.000000,0.022153,0.016839,0.052297,0.024176,0.009595,0.012729,-0.017892,0.030787,-0.025387,-0.013768,0.023186,-0.004970,-0.003575,-0.001300,0.034054,0.027282,0.011124,0.017345,0.033480,0.018429,0.017829,-0.000053,0.011488,0.019305,0.003597,-0.002705,-0.003898,-0.027185,-0.025136,-0.004044,-0.006667,0.002309,0.028436,-0.004646,-0.022570,0.010745,...,0.001787,-0.035638,0.004097,-0.002402,0.047939,0.002487,0.011863,-0.003188,0.052818,0.028292,-0.006295,0.022153,0.001900,-0.028065,0.022153,0.378175,-0.005545,-0.003157,0.020922,-0.010645,-0.003013,0.028803,0.020195,0.014119,-0.019643,-0.009582,0.014011,0.027563,-0.002307,-0.001327,-0.004668,0.028364,0.002881,-0.027198,-0.009391,-0.001043,0.008800,0.005904,0.030365,-0.060441
b_last_closed_Credit card,0.031271,0.015454,0.011782,0.022153,1.000000,0.161759,0.012332,0.359745,0.293829,0.253581,0.017371,0.303048,0.122240,0.019091,0.125461,-0.007348,0.096209,0.047081,0.034635,0.018150,0.017020,-0.003633,0.014377,0.100212,0.249323,0.150814,0.154331,0.163117,0.195144,-0.006973,0.328423,0.110138,0.122483,-0.008325,0.230053,-0.008679,0.519572,0.166861,0.022869,0.119757,...,-0.007065,0.043892,0.266091,-0.007841,0.021065,-0.008488,0.020727,0.154331,0.021204,0.018061,0.123328,1.000000,0.282362,0.073706,1.000000,0.031777,0.154864,-0.007100,0.326528,0.126032,-0.007861,0.273050,0.351806,0.387358,0.010056,0.011521,0.015810,0.335871,-0.008474,-0.006712,-0.006903,0.256968,0.026410,0.110113,-0.005039,-0.007073,0.196990,0.181756,0.349367,0.000800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENTRANCES_AVG,-0.006853,0.149427,0.249164,-0.001043,-0.007073,0.036799,0.027995,-0.021748,-0.007900,0.026421,0.018177,-0.040571,0.036937,0.019978,0.017580,0.890381,0.032954,0.038207,0.002581,0.048499,0.036528,0.058774,0.017201,0.015962,-0.004065,0.026592,0.046635,-0.001590,0.041662,0.888118,0.019947,0.043900,0.037208,0.842222,0.019031,0.638952,0.008991,0.024492,0.078965,0.013628,...,0.754693,0.013490,0.017059,0.877752,0.021009,0.649033,0.026030,0.023626,0.039121,0.051589,0.039047,-0.007073,0.006914,0.038592,-0.007073,0.015770,0.027031,0.831597,-0.019058,0.044164,0.828311,-0.022746,0.014697,0.002393,0.038589,0.028633,0.046074,-0.002358,0.835034,0.977234,0.833153,-0.017579,0.011809,0.043907,0.025084,1.000000,0.017395,0.015720,0.007410,0.080386
b_limit_min,0.010915,0.037134,0.026239,0.008800,0.196990,0.359155,0.026049,0.570203,0.587283,0.460912,0.042065,0.519349,0.726615,0.040527,0.442044,0.018793,0.611296,0.564425,0.000134,0.033198,0.001503,0.042851,0.003575,0.600225,0.557389,0.763273,0.426623,0.512913,0.505049,0.017461,0.475478,0.726382,0.726501,0.017194,0.581976,0.013209,0.438077,0.762072,0.010194,0.537207,...,0.014302,0.003549,0.543911,0.016772,0.018913,0.014953,-0.006453,0.791455,0.037553,0.007261,0.733510,0.196990,0.424562,0.537912,0.196990,0.044512,0.799900,0.017273,0.584679,0.707520,0.013993,0.534400,0.455984,0.504305,0.040001,0.021218,0.042310,0.601375,0.014544,0.017970,0.015949,0.485841,-0.007491,0.726359,0.519399,0.017395,1.000000,0.682281,0.468083,0.027828
b_Credit card_num_1,0.003540,0.030844,0.023777,0.005904,0.181756,0.381517,0.033980,0.624803,0.594740,0.507894,0.040038,0.658344,0.745864,0.037862,0.454554,0.016142,0.627849,0.566355,0.009854,0.027275,0.004672,0.039755,0.004274,0.615417,0.635024,0.786548,0.447981,0.543740,0.562137,0.015620,0.528151,0.745832,0.745948,0.015559,0.636304,0.006921,0.467908,0.789671,0.005092,0.566456,...,0.013421,0.012791,0.577838,0.015542,0.026711,0.008215,0.002546,0.815029,0.034738,0.006484,0.732911,0.181756,0.594236,0.553622,0.181756,0.037205,0.824174,0.015489,0.571780,0.727828,0.013136,0.687782,0.535864,0.594750,0.042710,0.023825,0.042019,0.625276,0.013102,0.016684,0.014521,0.680509,0.011509,0.745806,0.540265,0.015720,0.682281,1.000000,0.540750,0.022108
b_active_all_dur_max,0.044890,0.022072,0.027834,0.030365,0.349367,0.281266,0.026393,0.541863,0.495762,0.634642,0.033751,0.547808,0.471804,0.031423,0.277224,0.007206,0.399088,0.326720,0.036606,0.011010,0.015255,0.016900,0.006334,0.406806,0.557841,0.523674,0.301882,0.449878,0.415440,0.008979,0.513833,0.514564,0.472302,0.007570,0.498758,0.000534,0.528018,0.530270,-0.000672,0.410303,...,0.005068,0.046447,0.492073,0.007713,0.031363,0.001517,0.026097,0.541863,0.026746,0.011613,0.465779,0.349367,0.470989,0.360054,0.349367,0.045597,0.549257,0.008269,0.503777,0.474592,0.006056,0.536117,0.522836,0.572950,0.027179,0.014837,0.030108,0.533283,0.005822,0.007935,0.007934,0.515136,0.030426,0.514541,0.303240,0.007410,0.468083,0.540750,1.000000,0.004393


In [None]:
features = correlations_woe.columns
corr = []
for i in range(len(features)-1):
  for j in range(i+1, len(features)):
    if abs(correlations_woe.iloc[i, j]) >= 0.8:
       if iv_cash.loc[features[i], 'iv'] > iv_cash.loc[features[j], 'iv']:
         corr.append(features[j])
       else:
         corr.append(features[i])
woe_train_cash1 = woe_train_cash1.drop(columns=corr)
woe_valid_cash1 = woe_valid_cash1.drop(columns=corr)
woe_test_cash1 = woe_test_cash1.drop(columns=corr)

In [None]:
from sklearn.metrics import roc_auc_score

def Gini(y_true, y_pred):
  return (2*roc_auc_score(y_true.values, y_pred) - 1)

### Обучение модели

In [None]:
from sklearn.linear_model import LogisticRegression
best_g = 0
best_c = 0
for c in [0.1, 0.3, 0.5, 0.7, 1]:
  clf = LogisticRegression(random_state=42, C=c).fit(woe_train_cash1, y_train_cash)
  pred = clf.predict_proba(woe_valid_cash1)
  er = Gini(y_valid_cash, pred[:, 1])
  if best_g == 0 or er > best_g:
    best_g = er
    best_c = c
print('best Gini =', best_g, '\nbest C =', best_c)

best Gini = 0.5045837514370708 
best C = 1


In [None]:
woe_train_all_cash = pd.concat([woe_train_cash1, woe_valid_cash1])
y_train_all_cash = pd.concat([y_train_cash, y_valid_cash])
clf = LogisticRegression(random_state=42, C=best_c).fit(woe_train_all_cash, y_train_all_cash)
pred = clf.predict_proba(woe_test_cash1)
print('test Gini =', Gini(y_test_cash, pred[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_cash.values, pred[:, 1]))

test Gini = 0.4832941279537375
test AUC_ROC = 0.7416470639768687


### Отбор признаков forward и backward selection

In [None]:
features_cash = []
prev_gini = 0
while len(features_cash) < 30:
    f = 0
    best_change = -1
    for i in range(woe_train_cash1.shape[1]):
        if (~(i in features_cash)):
            clf = LogisticRegression(random_state=42, C=best_c)
            features_cash.append(i)
            clf.fit(woe_train_cash1.iloc[:, features_cash], y_train_cash)
            pred = clf.predict_proba(woe_valid_cash1.iloc[:, features_cash])
            gini = Gini(y_valid_cash, pred[:, 1])
            features_cash.pop()
            if ((best_change == -1) | ((gini - prev_gini) > best_change)):
                best_change = gini - prev_gini
                f = i
    features_cash.append(f)

In [None]:
woe_train_cash1 = woe_train_cash1.iloc[:, features_cash]
woe_valid_cash = woe_valid_cash1.iloc[:, features_cash]
woe_test_cash1 = woe_test_cash1.iloc[:, features_cash]
woe_train_cash1.to_csv('woe_train_cash1.csv', index=True)
woe_valid_cash1.to_csv('woe_valid_cash1.csv', index=True)
woe_test_cash1.to_csv('woe_test_cash1.csv', index=True)

In [None]:
clf = LogisticRegression(random_state=42, C=best_c)
clf.fit(woe_train_cash1, y_train_cash)
pred = clf.predict_proba(woe_valid_cash1)
prev_gini = Gini(y_valid_cash, pred[:, 1])
features_cash = np.arange(0, woe_train_cash1.shape[1])
while len(features_cash) > 15:
  f = 0
  best_change = -1
  for i in range(len(features_cash)):
    clf = LogisticRegression(random_state=42, C=best_c)
    feat = np.delete(features_cash, i)
    clf.fit(woe_train_cash1.iloc[:, feat], y_train_cash)
    pred = clf.predict_proba(woe_valid_cash1.iloc[:, feat])
    gini = Gini(y_valid_cash, pred[:, 1])
    if ((gini - prev_gini) > best_change):
      best_change = gini - prev_gini
      f = i
  features_cash = np.delete(features_cash, f)
  prev_gini += best_change
len(features_cash)

### Обучение финальной модели для cash loans

In [None]:
best_g = 0
best_c_cash = 0
for c in [0.1, 0.3, 0.5, 0.7, 1]:
  clf = LogisticRegression(random_state=42, C=c).fit(woe_train_cash1.iloc[:, features_cash], y_train_cash)
  pred = clf.predict_proba(woe_valid_cash1.iloc[:, features_cash])
  er = Gini(y_valid_cash, pred[:, 1])
  if best_g == 0 or er > best_g:
    best_g = er
    best_c_cash = c
print('best Gini =', best_g, '\nbest C =', best_c_cash)

In [None]:
woe_train_all_cash = pd.concat([woe_train_cash1, woe_valid_cash1])
y_train_all_cash = pd.concat([y_train_cash, y_valid_cash])
clf = LogisticRegression(random_state=42, C=best_c_cash).fit(woe_train_all_cash.iloc[:, features_cash], y_train_all_cash)
pred = clf.predict_proba(woe_test_cash1.iloc[:, features_cash])
print('test Gini =', Gini(y_test_cash, pred[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_cash.values, pred[:, 1]))

In [None]:
woe_train_all_cash.iloc[:, features_cash].to_csv('woe_all_train_cash_final.csv', index=True)
woe_test_cash1.iloc[:, features_cash].to_csv('woe_test_cash_final.csv', index=True)

#Revolving loans

In [None]:
all_data = pd.concat([train_rev, test_rev])

In [None]:
for c in real:
  all_data[c + '_bin'] = pd.qcut(all_data[c], 20, duplicates='drop').cat.add_categories("0-0").fillna("0-0")
train_rev = all_data[:train_rev.shape[0]]
test_rev = all_data[train_rev.shape[0]:]

In [None]:
from sklearn.model_selection import train_test_split
train_rev, valid_rev, y_train_rev, y_valid_rev = train_test_split(train_rev, y_train_rev, test_size=0.2, random_state=42, stratify=(y_train_rev))

In [None]:
train_rev[categorical] = train_rev[categorical].fillna('NAN')
valid_rev[categorical] = valid_rev[categorical].fillna('NAN')
test_rev[categorical] = test_rev[categorical].fillna('NAN')

In [None]:
scorecard_rev = pd.DataFrame()

In [None]:
woe_train_rev = train_rev.copy()
woe_valid_rev = valid_rev.copy()
woe_test_rev = test_rev.copy()

In [None]:
res_cat = pd.DataFrame(index = categorical, columns = ['iv'])
for c in categorical:
  df, iv = calculate_woe_iv(train_rev, c, y_train_rev)
  df['feature'] = c
  for i in df.index:
    woe_train_rev.loc[woe_train_rev[c] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_valid_rev.loc[woe_valid_rev[c] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_test_rev.loc[woe_test_rev[c] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']

  res_cat.loc[c, 'iv'] = iv
  scorecard_rev = scorecard_rev.append(df[['feature', 'Value', 'WoE']])
    
res_cat[res_cat['iv'] >= 0.02]

Unnamed: 0,iv
NAME_INCOME_TYPE,0.062273
NAME_EDUCATION_TYPE,0.108189
NAME_FAMILY_STATUS,0.034721
NAME_HOUSING_TYPE,0.0350167
OCCUPATION_TYPE,0.127442
ORGANIZATION_TYPE,0.0993023
FONDKAPREMONT_MODE,0.0352369
HOUSETYPE_MODE,0.0363533
WALLSMATERIAL_MODE,0.0688655
EMERGENCYSTATE_MODE,0.0441584


In [None]:
from sklearn.tree import DecisionTreeClassifier

res_tree = pd.DataFrame(index = real, columns = ['iv'])

for c in real:
  d = pd.DataFrame(index = woe_train_rev.index, columns = ['woe'])
  df, iv = calculate_woe_iv(train_rev, c + '_bin', y_train_rev)
  df['feature'] = c
  for i in df.index:
    d.loc[woe_train_rev[c + '_bin'] == df.loc[i, 'Value'], 'woe'] = df.loc[i, 'WoE']

  clf = DecisionTreeClassifier(random_state = 42, max_depth = 2)
  clf.fit(d, y_train_rev)
  d['leaf'] = clf.apply(d)
  all_intervals = []
  for i in d['leaf'].unique():
    intervals = woe_train_rev[d['leaf'] == i][c+'_bin'].values.unique()
    woe_train_rev.loc[:, c+'_bin'] = woe_train_rev.loc[:, c+'_bin'].replace(intervals, i)
    woe_valid_rev.loc[:, c+'_bin'] = woe_valid_rev.loc[:, c+'_bin'].replace(intervals, i)
    woe_test_rev.loc[:, c+'_bin'] = woe_test_rev.loc[:, c+'_bin'].replace(intervals, i)
    all_intervals.append(str(intervals).split('\n')[0])

  df, iv = calculate_woe_iv(woe_train_rev, c + '_bin', y_train_rev)
  df['feature'] = c
  for i in df.index:
    woe_train_rev.loc[woe_train_rev[c + '_bin'] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_valid_rev.loc[woe_valid_rev[c + '_bin'] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
    woe_test_rev.loc[woe_test_rev[c + '_bin'] == df.loc[i, 'Value'], c] = df.loc[i, 'WoE']
  df.loc[:, 'Value'] = df.loc[:, 'Value'].replace(d['leaf'].unique(), all_intervals)
  res_tree.loc[c, 'iv'] = iv
  scorecard_rev = scorecard_rev.append(df[['feature', 'Value', 'WoE']])

res_tree[res_tree['iv'] >= 0.02]

Unnamed: 0,iv
b_last_closed_Credit card,0.0360395
b_start_Consumer credit_avg,0.153994
b_Credit card_sum_3,0.0284058
b_Credit card_avg,0.0473227
b_active_Consumer credit_dur_max,0.0697788
...,...
b_all_num,0.0302053
ENTRANCES_AVG,0.0513033
b_limit_min,0.0270723
b_Credit card_num_1,0.0346794


In [None]:
iv_rev = pd.concat([res_cat, res_tree])
good_f = iv_rev[iv_rev['iv'] >= 0.02].index

In [None]:
scorecard_rev.to_csv('scorecard_rev.csv', index=True)
iv_cash.to_csv('iv_cash.csv', index=True)
iv_rev.to_csv('iv_rev.csv', index=True)

In [None]:
woe_train_rev1 = woe_train_rev[good_f]
woe_valid_rev1 = woe_valid_rev[good_f]
woe_test_rev1 = woe_test_rev[good_f]

In [None]:
correlations_woe = woe_train_rev1.corr('spearman')
correlations_woe

Unnamed: 0,FLAG_WORK_PHONE,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,FLAG_DOCUMENT_3,b_last_closed_Credit card,b_start_Consumer credit_avg,b_Credit card_sum_3,b_Credit card_avg,b_active_Consumer credit_dur_max,p_type_suite_Family_percent,p_all_down_payment_avg_,p_prod_group_POS household_percent,p_Consumer loans_down_payment_avg_,b_dur_avg_Consumer credit,LANDAREA_AVG,FLOORSMAX_MEDI,b_early_num_all,p_sum,p_Consumer loans_sum_avg_,EXT_SOURCE_1,AMT_ANNUITY,b_Consumer credit_avg,b_active_Credit card_dur_avg,b_closed_days_avg_all,b_active_Mortgage_dur_max,b_Consumer credit_sum_3,b_start_Credit card_avg,APARTMENTS_MEDI,b_Consumer credit_avg_1,AMT_INCOME_TOTAL,b_Mortgage_avg,LIVINGAREA_MEDI,b_Consumer credit_num_1,YEARS_BUILD_AVG,b_debt_sum,LANDAREA_MODE,OBS_60_CNT_SOCIAL_CIRCLE,LIVINGAPARTMENTS_MEDI,p_Consumer loans_sum_avg_1,b_all_num_3,...,p_all_low_percent,EXT_SOURCE_2,BASEMENTAREA_MEDI,p_cnt_avg,b_all_sum_3,APARTMENTS_MODE,FLOORSMIN_AVG,YEARS_BUILD_MODE,p_all_sum_app_avg_1,LIVINGAPARTMENTS_MODE,OBS_30_CNT_SOCIAL_CIRCLE,p_yield_low_normal,p_Consumer loans_sum_app_avg_3,b_last_closed_Car loan,b_all_num_1,b_all_avg,NONLIVINGAPARTMENTS_MODE,b_last_closed_all,DAYS_BIRTH,YEARS_BEGINEXPLUATATION_AVG,b_Credit card_mean,AMT_REQ_CREDIT_BUREAU_MON,NONLIVINGAREA_AVG,b_debt_num,b_all_sum_1,AMT_GOODS_PRICE,AMT_CREDIT,b_debt_avg_Credit card,p_prod_group_POS mobile_percent,DEF_30_CNT_SOCIAL_CIRCLE,NONLIVINGAREA_MODE,ENTRANCES_MODE,LIVINGAREA_MODE,b_Credit card_num_3,b_Mortgage_mean,b_all_num,ENTRANCES_AVG,b_limit_min,b_Credit card_num_1,b_active_all_dur_max
FLAG_WORK_PHONE,1.000000,0.072000,0.121056,0.058850,-0.009527,0.059059,0.028143,0.035070,0.028922,-0.029682,-0.033360,-0.033493,-0.020240,0.043232,0.038538,0.058399,0.061336,0.054192,0.037167,0.011888,0.066054,0.037135,0.050435,0.039169,0.063160,0.037945,0.029056,0.046341,0.033201,0.072580,0.060619,0.040850,0.051687,0.039738,0.009626,0.037572,0.047212,0.043233,0.027202,0.037087,...,-0.073414,0.026574,0.036979,-0.000041,0.042013,0.036486,0.044731,0.038597,-0.011236,0.040701,0.044357,-0.000391,0.044113,-0.009527,0.040393,0.057775,0.041021,-0.009527,0.099636,0.037646,0.035194,0.075464,0.041948,0.036112,0.018251,0.113145,0.066054,0.012657,0.042094,0.067487,0.035054,0.047084,0.043141,0.025421,0.060852,0.060986,0.044151,0.056069,0.052718,0.016068
REG_CITY_NOT_LIVE_CITY,0.072000,1.000000,0.484076,0.010096,0.013926,0.083048,0.013302,0.039259,0.003724,0.016789,0.010299,-0.010010,0.005995,0.054502,0.151792,0.171042,0.043004,0.049006,0.008334,0.078954,0.102767,0.009886,0.032279,0.091532,0.022691,0.006916,0.069338,0.183571,0.011547,0.025012,0.021491,0.179702,0.022394,0.129350,0.007197,0.137611,0.008721,0.106956,0.028559,0.012396,...,-0.004650,0.062952,0.143597,0.011683,0.011243,0.158738,0.134488,0.131598,0.038806,0.114828,-0.000037,0.027670,0.037685,0.013926,0.026400,0.032352,0.133031,0.013926,0.174752,0.172723,0.031173,0.028461,0.170152,0.007794,0.019643,0.100632,0.102767,0.019652,0.050223,0.023210,0.153824,0.178999,0.164821,0.027702,0.021212,0.021422,0.179093,0.025875,0.026127,0.014740
REG_CITY_NOT_WORK_CITY,0.121056,0.484076,1.000000,0.008389,0.006975,0.067396,0.012893,0.040159,0.022860,0.024369,0.009060,0.004226,0.007451,0.024617,0.244736,0.275195,0.035842,0.058258,0.030916,0.094471,0.114345,0.028548,0.026400,0.067828,0.022935,0.026386,0.060466,0.285437,0.024021,0.029147,0.025928,0.271390,0.018154,0.194551,0.014569,0.225182,0.013865,0.175614,0.029550,0.001129,...,-0.004061,0.094699,0.230415,0.018754,0.013422,0.249174,0.212492,0.198921,0.043793,0.184507,0.013415,0.051642,0.050532,0.006975,0.012761,0.034603,0.207176,0.006975,0.192634,0.262190,0.038705,0.031279,0.266013,0.016995,0.015300,0.110612,0.114345,0.008509,0.061790,0.020199,0.233213,0.275308,0.260288,-0.001661,0.027137,0.020121,0.276305,0.020663,0.015256,0.015190
FLAG_DOCUMENT_3,0.058850,0.010096,0.008389,1.000000,0.021349,0.020983,0.007035,0.005871,0.031420,-0.070032,-0.061723,-0.077269,-0.047823,0.004920,-0.000648,-0.000824,0.023977,-0.007760,-0.045524,0.026990,0.006224,0.028774,0.017603,0.005951,0.022366,0.028816,0.009927,-0.003270,0.029860,0.011694,0.022209,-0.009928,0.030556,-0.000970,0.029132,-0.000151,0.025159,0.004295,-0.020571,0.031698,...,-0.078801,0.027718,0.000125,-0.022220,0.008625,0.007787,-0.003788,0.001139,-0.025670,0.011063,0.020130,-0.040294,-0.015207,0.021349,0.030725,0.018432,-0.001839,0.021349,0.048892,-0.016669,0.007046,0.036984,-0.005946,0.030176,0.035530,0.028624,0.006224,0.028115,-0.029268,0.029416,-0.007491,-0.006256,-0.007809,0.030956,0.021782,0.018737,-0.006571,0.041572,0.030708,0.026045
b_last_closed_Credit card,-0.009527,0.013926,0.006975,0.021349,1.000000,0.138117,0.139919,0.050294,0.280946,0.013088,0.006272,0.013562,-0.000024,0.093922,0.005217,-0.005747,0.057153,0.014005,0.002034,0.008271,0.010694,0.090447,0.059281,0.077738,0.092491,0.200368,-0.005110,-0.004216,0.228521,-0.023483,0.096125,-0.004277,0.182454,-0.002369,0.327481,0.001312,0.007388,-0.002958,0.006337,0.291559,...,0.015734,0.023722,0.003160,0.011673,0.120373,-0.008886,-0.010459,-0.005185,0.003338,-0.004962,0.009500,0.005278,0.001934,1.000000,0.223668,-0.008692,-0.004763,1.000000,0.022350,-0.006715,0.015280,0.108095,-0.004393,0.283606,0.288997,0.005838,0.010694,0.274674,0.005363,0.006971,0.000618,-0.002763,0.001457,0.204995,0.096865,0.052236,-0.005580,0.110802,0.150778,0.241113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b_all_num,0.060986,0.021422,0.020121,0.018737,0.052236,0.356228,0.457666,0.333132,0.189916,-0.010707,0.005023,0.004557,0.008569,0.382386,0.016796,0.004334,0.602932,0.080103,-0.038463,0.050022,0.053557,0.387241,0.459669,0.217547,0.679084,0.378170,0.163740,0.004801,0.206210,0.050022,0.678202,0.005778,0.440656,0.012852,0.196699,0.011446,-0.011785,-0.011689,-0.001492,0.324968,...,-0.015638,0.038270,0.007244,0.019815,0.419353,0.002412,0.010683,0.014179,-0.003752,-0.002154,-0.017472,0.038576,0.004866,0.052236,0.334014,0.451923,0.008169,0.052236,0.081666,0.018944,0.393951,0.672646,0.009937,0.252884,0.232915,0.070174,0.053557,0.297197,0.029826,-0.004150,0.007679,0.017666,0.004062,0.396857,0.679881,1.000000,0.013110,0.572638,0.609758,0.200573
ENTRANCES_AVG,0.044151,0.179093,0.276305,-0.006571,-0.005580,0.043744,0.027529,0.045171,0.045555,0.020336,0.048200,0.032975,0.040857,-0.007113,0.720215,0.777203,0.023062,0.023462,0.058156,0.084088,0.111290,0.031053,0.037976,0.030961,0.041777,0.040336,0.062301,0.849527,0.029631,0.138372,0.047212,0.827961,0.019034,0.603699,0.014541,0.665549,0.045489,0.483212,0.050645,0.008015,...,0.054200,0.119865,0.707530,0.021007,0.022638,0.720795,0.602140,0.605660,0.051018,0.511534,0.048157,0.036081,0.050976,-0.005580,0.011564,0.049246,0.610811,-0.005580,0.026396,0.819429,0.055628,0.028693,0.806634,0.019466,0.009282,0.116580,0.111290,-0.006734,0.020791,0.039372,0.717200,0.972231,0.778166,-0.016158,0.047445,0.013110,1.000000,0.020555,0.011643,0.029261
b_limit_min,0.056069,0.025875,0.020663,0.041572,0.110802,0.345051,0.482202,0.408069,0.262358,-0.024505,0.004561,-0.008728,0.006807,0.339099,0.021875,0.021903,0.557028,0.071269,-0.027781,0.051937,0.052835,0.422475,0.487281,0.171448,0.696178,0.448905,0.135330,0.015908,0.273460,0.054596,0.697819,0.007602,0.496886,0.015786,0.276653,0.010888,-0.007312,-0.003909,0.014146,0.433308,...,-0.011485,0.047307,0.012427,0.020283,0.454799,0.011663,0.011660,0.016527,0.018068,0.000090,-0.015005,0.037491,0.009858,0.110802,0.418650,0.472903,0.010563,0.110802,0.086750,0.023190,0.425009,0.691941,0.020536,0.367518,0.290058,0.071812,0.052835,0.400692,0.023363,-0.003916,0.014355,0.024312,0.012665,0.476693,0.699230,0.572638,0.020555,1.000000,0.656541,0.252157
b_Credit card_num_1,0.052718,0.026127,0.015256,0.030708,0.150778,0.391251,0.521019,0.422918,0.303631,-0.014415,0.008698,0.000223,0.012566,0.371527,0.018629,0.005008,0.600170,0.066721,-0.033274,0.048113,0.051334,0.450141,0.520325,0.201012,0.750594,0.498711,0.230038,0.006193,0.319962,0.039075,0.752334,0.002268,0.567501,0.008272,0.323667,0.011383,-0.005965,-0.008610,0.008474,0.564809,...,-0.008027,0.055397,0.006665,0.012345,0.503406,0.001860,0.003540,0.008766,0.012122,-0.003989,-0.013422,0.023291,0.009740,0.150778,0.613946,0.486328,0.006513,0.150778,0.097464,0.016866,0.406961,0.746521,0.005196,0.471750,0.411610,0.070451,0.051334,0.487103,0.015334,-0.010501,0.003814,0.012918,-0.000446,0.700158,0.754257,0.609758,0.011643,0.656541,1.000000,0.332610


In [None]:
features = correlations_woe.columns
corr = []
for i in range(len(features)-1):
  for j in range(i+1, len(features)):
    if abs(correlations_woe.iloc[i, j]) >= 0.8:
       if iv_rev.loc[features[i], 'iv'] > iv_rev.loc[features[j], 'iv']:
         corr.append(features[j])
       else:
         corr.append(features[i])
woe_train_rev1 = woe_train_rev1.drop(columns=corr)
woe_valid_rev1 = woe_valid_rev1.drop(columns=corr)
woe_test_rev1 = woe_test_rev1.drop(columns=corr)

In [None]:
from sklearn.linear_model import LogisticRegression
best_g = 0
best_c = 0
for c in [0.1, 0.3, 0.5, 0.7, 1]:
  clf = LogisticRegression(random_state=42, C=c).fit(woe_train_rev1, y_train_rev)
  pred = clf.predict_proba(woe_valid_rev1)
  er = Gini(y_valid_rev, pred[:, 1])
  if best_g == 0 or er > best_g:
    best_g = er
    best_c = c
print('best Gini =', best_g, '\nbest C =', best_c)

best Gini = 0.4398526845343136 
best C = 0.1


In [None]:
clf = LogisticRegression(random_state=42, C=best_c).fit(woe_train_rev, y_train_rev)
pred = clf.predict_proba(woe_test_rev)
print('test Gini =', Gini(y_test_rev, pred[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_rev.values, pred[:, 1]))

test Gini = 0.45711226910278047
test AUC_ROC = 0.7285561345513902


In [None]:
features_rev = []
prev_gini = 0
while len(features_rev) < 30:
    f = 0
    best_change = -1
    for i in range(woe_train_rev1.shape[1]):
        if (~(i in features_rev)):
            clf = LogisticRegression(random_state=42, C=best_c)
            features_rev.append(i)
            clf.fit(woe_train_rev1.iloc[:, features_rev], y_train_rev)
            pred = clf.predict_proba(woe_valid_rev1.iloc[:, features_rev])
            gini = Gini(y_valid_rev, pred[:, 1])
            features_rev.pop()
            if ((best_change == -1) | ((gini - prev_gini) > best_change)):
                best_change = gini - prev_gini
                f = i
    features_rev.append(f)

In [None]:
woe_train_rev1 = woe_train_rev1.iloc[:, features_rev]
woe_valid_rev = woe_valid_rev1.iloc[:, features_rev]
woe_test_rev1 = woe_test_rev1.iloc[:, features_rev]
woe_train_rev1.to_csv('woe_train_rev1.csv', index=True)
woe_valid_rev1.to_csv('woe_valid_rev1.csv', index=True)
woe_test_rev1.to_csv('woe_test_rev1.csv', index=True)

In [None]:
clf = LogisticRegression(random_state=42, C=best_c)
clf.fit(woe_train_rev1, y_train_rev)
pred = clf.predict_proba(woe_valid_rev1)
prev_gini = Gini(y_valid_rev, pred[:, 1])
features_rev = np.arange(0, woe_train_rev1.shape[1])
while len(features_rev) > 15:
  f = 0
  best_change = -1
  for i in range(len(features_rev)):
    clf = LogisticRegression(random_state=42, C=best_c)
    feat = np.delete(features_rev, i)
    clf.fit(woe_train_rev1.iloc[:, feat], y_train_rev)
    pred = clf.predict_proba(woe_valid_rev1.iloc[:, feat])
    gini = Gini(y_valid_rev, pred[:, 1])
    if ((gini - prev_gini) > best_change):
      best_change = gini - prev_gini
      f = i
  features_rev = np.delete(features_rev, f)
  prev_gini += best_change
len(features_rev)

In [None]:
best_g = 0
best_c_rev = 0
for c in [0.1, 0.3, 0.5, 0.7, 1]:
  clf = LogisticRegression(random_state=42, C=c).fit(woe_train_rev1, y_train_rev)
  pred = clf.predict_proba(woe_valid_rev1)
  er = Gini(y_valid_rev, pred[:, 1])
  if best_g == 0 or er > best_g:
    best_g = er
    best_c_rev = c
print('best Gini =', best_g, '\nbest C =', best_c_rev)

In [None]:
woe_train_all_rev = pd.concat([woe_train_rev1, woe_valid_rev1])
y_train_all_rev = pd.concat([y_train_rev, y_valid_rev])
clf = LogisticRegression(random_state=42, C=best_c_rev).fit(woe_train_all_rev.iloc[:, features_rev], y_train_all_rev)
pred = clf.predict_proba(woe_test_rev1.iloc[:, features_rev])
print('test Gini =', Gini(y_test_rev, pred[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_rev.values, pred[:, 1]))

In [None]:
woe_train_all_rev.iloc[:, features_rev].to_csv('woe_all_train_rev_final.csv', index=True)
woe_test_rev1.iloc[:, features_rev].to_csv('woe_test_rev_final.csv', index=True)

##Результат на нашем тесте

In [None]:
clf_cash = LogisticRegression(random_state=42, C=best_c_cash).fit(woe_train_all_cash.iloc[:, features_cash], y_train_all_cash)
pred_cash = pd.DataFrame(clf_cash.predict_proba(woe_test_cash1.iloc[:, features_cash]), index=woe_test_cash1.index)

clf_rev = LogisticRegression(random_state=42, C=best_c_rev).fit(woe_train_all_rev.iloc[:, features_rev], y_train_all_rev)
pred_rev = pd.DataFrame(clf_rev.predict_proba(woe_test_rev1.iloc[:, features_rev]), index=woe_test_rev1.index)

pred_test = pd.concat([pred_cash, pred_rev])
y_test = pd.concat([y_test_cash, y_test_rev])

print('test Gini =', Gini(y_test, pred_test.iloc[:,1]))
print('test AUC_ROC =', roc_auc_score(y_test.values, pred_test.iloc[:,1]))

## Построение скоркарты

In [1]:
import pandas as pd 
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.metrics import roc_auc_score

def Gini(y_true, y_pred):
  return (2*roc_auc_score(y_true.values, y_pred) - 1)

In [None]:
woe_test_cash = pd.read_csv('/content/drive/My Drive/woe_test_cash1.csv').set_index('SK_ID_CURR')
woe_train_cash = pd.read_csv('/content/drive/My Drive/woe_train_cash1.csv').set_index('SK_ID_CURR')
woe_test_rev = pd.read_csv('/content/drive/My Drive/woe_test_rev1.csv').set_index('SK_ID_CURR')
woe_train_rev = pd.read_csv('/content/drive/My Drive/woe_train_rev1.csv').set_index('SK_ID_CURR')

In [None]:
y_test_cash = pd.read_csv('/content/drive/My Drive/y_test_cash.csv').set_index('SK_ID_CURR')
y_train_cash = pd.read_csv('/content/drive/My Drive/y_train_cash.csv').set_index('SK_ID_CURR')
y_test_rev = pd.read_csv('/content/drive/My Drive/y_test_rev.csv').set_index('SK_ID_CURR')
y_train_rev = pd.read_csv('/content/drive/My Drive/y_train_rev.csv').set_index('SK_ID_CURR')

In [None]:
from sklearn.linear_model import LogisticRegression
clf_rev = LogisticRegression(random_state=42, C=0.1).fit(woe_train_rev, y_train_rev)
pred_rev = clf_rev.predict_proba(woe_test_rev)
print('test Gini =', Gini(y_test_rev, pred_rev[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_rev.values, pred_rev[:, 1]))

clf_cash = LogisticRegression(random_state=42, C=0.1).fit(woe_train_cash, y_train_cash)
pred_cash = clf_cash.predict_proba(woe_test_cash)
print('test Gini =', Gini(y_test_cash, pred_cash[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_cash.values, pred_cash[:, 1]))

test Gini = 0.48576706507755474
test AUC_ROC = 0.7428835325387774
test Gini = 0.47251207062438483
test AUC_ROC = 0.7362560353121924


In [None]:
coef_cash = pd.DataFrame(columns = ['feature', 'coef'])
coef_cash['feature'] = woe_train_cash.columns.values
coef_cash['coef'] = clf_cash.coef_[0]

coef_rev = pd.DataFrame(columns = ['feature', 'coef'])
coef_rev['feature'] = woe_train_rev.columns.values
coef_rev['coef'] = clf_rev.coef_[0]

In [None]:
coef_cash = coef_cash[coef_cash['coef'] < -0.3]
clf_cash = LogisticRegression(random_state=42, C=0.1).fit(woe_train_cash[coef_cash['feature']], y_train_cash)
pred_cash = clf_cash.predict_proba(woe_test_cash[coef_cash['feature']])
print('test Gini =', Gini(y_test_cash, pred_cash[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_cash.values, pred_cash[:, 1]))

test Gini = 0.4604392095347627
test AUC_ROC = 0.7302196047673813


In [None]:
coef_rev = coef_rev[coef_rev['coef'] < -0.3]
clf_rev = LogisticRegression(random_state=42, C=0.1).fit(woe_train_rev[coef_rev['feature']], y_train_rev)
pred_rev = clf_rev.predict_proba(woe_test_rev[coef_rev['feature']])
print('test Gini =', Gini(y_test_rev, pred_rev[:, 1]))
print('test AUC_ROC =', roc_auc_score(y_test_rev.values, pred_rev[:, 1]))

test Gini = 0.4804451459479342
test AUC_ROC = 0.7402225729739671


In [None]:
pred_test = pd.concat([pd.DataFrame(pred_cash, index=woe_test_cash.index), pd.DataFrame(pred_rev, index=woe_test_rev.index)])
y_test = pd.concat([y_test_cash, y_test_rev])

print('test Gini =', Gini(y_test, pred_test.iloc[:,1]))
print('test AUC_ROC =', roc_auc_score(y_test.values, pred_test.iloc[:,1]))

test Gini = 0.4652468071881404
test AUC_ROC = 0.7326234035940702


In [None]:
scorecard_cash = pd.read_csv('/content/drive/My Drive/scorecard_cash.csv').set_index('Unnamed: 0').rename_axis(None, axis=1).rename_axis('', axis=0)
scorecard_rev = pd.read_csv('/content/drive/My Drive/scorecard_rev.csv').set_index('Unnamed: 0').rename_axis(None, axis=1).rename_axis('', axis=0)
iv_cash = pd.read_csv('/content/drive/My Drive/iv_cash.csv')
iv_rev = pd.read_csv('/content/drive/My Drive/iv_rev.csv')

In [None]:
scorecard_cash['iv'] = 0
for f in scorecard_cash['feature'].unique():
  scorecard_cash.loc[scorecard_cash['feature'] == f, 'iv'] = iv_cash[iv_cash['Unnamed: 0'] == f]['iv'].values[0]

In [None]:
scorecard_rev['iv'] = 0
for f in scorecard_rev['feature'].unique():
  scorecard_rev.loc[scorecard_rev['feature'] == f, 'iv'] = iv_rev[iv_rev['Unnamed: 0'] == f]['iv'].values[0]

In [None]:
name_features_cash = list(coef_cash['feature'])
name_features_rev = list(coef_rev['feature'])
scorecard_cash['feature'] = np.vectorize(lambda x: x if x in name_features_cash else '')(scorecard_cash['feature'])
scorecard_cash = scorecard_cash[scorecard_cash['feature'] != '']
scorecard_rev['feature'] = np.vectorize(lambda x: x if x in name_features_rev else '')(scorecard_rev['feature'])
scorecard_rev = scorecard_rev[scorecard_rev['feature'] != '']

In [None]:
scorecard_cash = scorecard_cash.sort_values(by='iv', ascending=False)
scorecard_rev = scorecard_rev.sort_values(by='iv', ascending=False)
scorecard_cash.index = np.arange(0, scorecard_cash.shape[0])
scorecard_rev.index = np.arange(0, scorecard_rev.shape[0])

In [None]:
R = 40 / np.log(2)
A = 600 - R * np.log(72)
scorecard_cash_final['score'] = 0
for f in scorecard_cash_final['feature'].unique():
  scorecard_cash_final.loc[scorecard_cash_final['feature'] == f, 'score'] = (scorecard_cash_final[scorecard_cash_final['feature'] == f]['WoE']*coef_cash[coef_cash['feature'] == f]['coef'].values[0] - clf_cash.intercept_[0]/17)*R + A/17
scorecard_rev_final['score'] = 0
for f in scorecard_rev_final['feature'].unique():
  scorecard_rev_final.loc[scorecard_rev_final['feature'] == f, 'score'] = (scorecard_rev_final[scorecard_rev_final['feature'] == f]['WoE']*coef_rev[coef_rev['feature'] == f]['coef'].values[0] - clf_rev.intercept_[0]/16)*R + A/16

In [14]:
scorecard_cash_final = pd.read_csv('/content/drive/My Drive/scorecard_cash_final.csv').set_index('Unnamed: 0').rename_axis(None, axis=1).rename_axis('', axis=0)
scorecard_rev_final = pd.read_csv('/content/drive/My Drive/scorecard_rev_final.csv').set_index('Unnamed: 0').rename_axis(None, axis=1).rename_axis('', axis=0)

#### Парсинг строкового представление интервалов в pd.Interval

In [3]:
res_res = []
for i in range(scorecard_cash_final.shape[0]):
  s = scorecard_cash_final.loc[i, 'Value']
  j = 0
  if s[:9] != '[Interval':
    res = [[s]]
    res_res.append(res)
    continue
  res = []
  while j < len(s):
    while j < len(s) and s[j] != '(':
      if s[j] == 'N':
        res.append(['NAN'])
        j += 3
      else:
        j += 1
    first = ''
    j += 1
    while j < len(s) and s[j] != ',':
      first += s[j]
      j += 1
    j += 1
    second = ''
    while j < len(s) and s[j] != ',':
      second += s[j]
      j += 1
    if first != '' and second != '':
      res.append(pd.Interval(left=float(first), right=float(second), closed='right'))  
  res_res.append(res)

    


In [None]:
scorecard_cash_final.loc[:, 'Value'] = res_res

In [24]:
res_res = []
for i in range(scorecard_rev_final.shape[0]):
  s = scorecard_rev_final.loc[i, 'Value']
  j = 0
  if s[:9] != '[Interval':
    res = [[s]]
    res_res.append(res)
    continue
  res = []
  while j < len(s):
    while j < len(s) and s[j] != '(':
      if s[j] == 'N':
        res.append(['NAN'])
        j += 3
      else:
        j += 1
    first = ''
    j += 1
    while j < len(s) and s[j] != ',':
      first += s[j]
      j += 1
    j += 1
    second = ''
    while j < len(s) and s[j] != ',':
      second += s[j]
      j += 1
    if first != '' and second != '':
      res.append('('+str(first)+', '+str(second)+']')  
  res_res.append(res)

    


In [25]:
scorecard_rev_final.loc[:, 'Value'] = res_res

In [None]:
scorecard_cash_final.loc[:, 'WoE'] = np.round(scorecard_cash_final.loc[:, 'WoE'], decimals=3)
scorecard_cash_final.loc[:, 'iv'] = np.round(scorecard_cash_final.loc[:, 'iv'], decimals=3)
scorecard_cash_final.loc[:, 'score'] = np.round(scorecard_cash_final.loc[:, 'score'], decimals=3)
scorecard_cash_final.to_csv('cash_beautiful.csv')

In [26]:
scorecard_rev_final.loc[:, 'WoE'] = np.round(scorecard_rev_final.loc[:, 'WoE'], decimals=3)
scorecard_rev_final.loc[:, 'iv'] = np.round(scorecard_rev_final.loc[:, 'iv'], decimals=3)
scorecard_rev_final.loc[:, 'score'] = np.round(scorecard_rev_final.loc[:, 'score'], decimals=3)
scorecard_rev_final.to_csv('rev_beautiful.csv')

In [None]:
from sklearn.model_selection import train_test_split
a, valid_cash, b, y_valid_cash = train_test_split(train_cash, y_train_cash, test_size=0.2, random_state=11, stratify=y_train_cash)
b, valid_rev, b, y_valid_rev = train_test_split(train_rev, y_train_rev, test_size=0.2, random_state=11, stratify=y_train_rev)

In [None]:
test_cash = test_cash[scorecard_cash_final['feature'].unique()]
test_rev = test_rev[scorecard_rev_final['feature'].unique()]

In [None]:
test_cash = test_cash.fillna('NAN')
test_rev = test_rev.fillna('NAN')

In [None]:
valid_cash = valid_cash[scorecard_cash_final['feature'].unique()]
valid_rev = valid_rev[scorecard_rev_final['feature'].unique()]
valid_cash = valid_cash.fillna('NAN')
valid_rev = valid_rev.fillna('NAN')

In [None]:
score_cash = pd.Series(0, index=test_cash.index)
score_rev = pd.Series(0, index=test_rev.index)

In [None]:
score_cash_valid = pd.Series(0, index=valid_cash.index)
score_rev_valid = pd.Series(0, index=valid_rev.index)

In [None]:
for i in test_cash.index:
  for f in scorecard_cash_final['feature'].unique():
    val = test_cash.loc[i, f]
    for n, row in scorecard_cash_final[scorecard_cash_final['feature'] == f].iterrows():
      for interval in row['Value']:
        if (type(val) == str and type(interval) == list) or type(val) == float:
          if val in interval:
            score_cash[i] += row['score']

In [None]:
for i in valid_cash.index:
  for f in scorecard_cash_final['feature'].unique():
    val = valid_cash.loc[i, f]
    for n, row in scorecard_cash_final[scorecard_cash_final['feature'] == f].iterrows():
      for interval in row['Value']:
        if (type(val) == str and type(interval) == list) or type(val) == float:
          if val in interval:
            score_cash_valid[i] += row['score']

In [None]:
for i in test_rev.index:
  for f in scorecard_rev_final['feature'].unique():
    val = test_rev.loc[i, f]
    for n, row in scorecard_rev_final[scorecard_rev_final['feature'] == f].iterrows():
      for interval in row['Value']:
        if (type(val) == str and type(interval) == list) or type(val) == float:
          if val in interval:
            score_rev[i] += row['score']

In [None]:
for i in valid_rev.index:
  for f in scorecard_rev_final['feature'].unique():
    val = valid_rev.loc[i, f]
    for n, row in scorecard_rev_final[scorecard_rev_final['feature'] == f].iterrows():
      for interval in row['Value']:
        if (type(val) == str and type(interval) == list) or type(val) == float:
          if val in interval:
            score_rev_valid[i] += row['score']

In [None]:
test_cash_for_effect = pd.DataFrame(index = score_cash.index, columns=['target', 'score', 'sum'])
test_cash_for_effect['target'] = y_test_cash
test_cash_for_effect['score'] = score_cash
test_cash_for_effect['sum'] = test.loc[score_cash.index, 'AMT_CREDIT']
test_cash_for_effect.to_csv('test_cash_for_effect.csv')

valid_cash_for_effect = pd.DataFrame(index = score_cash_valid.index, columns=['target', 'score', 'sum'])
valid_cash_for_effect['target'] = y_valid_cash
valid_cash_for_effect['score'] = score_cash_valid
valid_cash_for_effect['sum'] = train.loc[score_cash_valid.index, 'AMT_CREDIT']
valid_cash_for_effect.to_csv('valid_cash_for_effect.csv')

test_rev_for_effect = pd.DataFrame(index = score_rev.index, columns=['target', 'score', 'sum'])
test_rev_for_effect['target'] = y_test_rev
test_rev_for_effect['score'] = score_rev
test_rev_for_effect['sum'] = test.loc[score_rev.index, 'AMT_CREDIT']
test_rev_for_effect.to_csv('test_rev_for_effect.csv')

valid_rev_for_effect = pd.DataFrame(index = score_rev_valid.index, columns=['target', 'score', 'sum'])
valid_rev_for_effect['target'] = y_valid_rev
valid_rev_for_effect['score'] = score_rev_valid
valid_rev_for_effect['sum'] = train.loc[score_rev_valid.index, 'AMT_CREDIT']
valid_rev_for_effect.to_csv('valid_rev_for_effect.csv')

## Экономический эффект

Считаем что кредиты берутся на 3 года под 10 процентов годовых для потребов и 20 для возобновляемых
Дифференцированный платеж (сумма долга должна уменьшаться равномерно), значит прибыль банка 0.1s+2/3 * 0.1s + 1/3 * 0.1s для потребов и 0.2s+2/3 * 0.2s + 1/3 * 0.2s для возобновляемых, причем для возоб. s=0.7*лимит (0.7 - процент утилизации)

In [None]:
test_cash_for_effect = pd.read_csv('/content/drive/My Drive/test_cash_for_effect.csv').set_index('SK_ID_CURR')
valid_cash_for_effect = pd.read_csv('/content/drive/My Drive/valid_cash_for_effect.csv').set_index('SK_ID_CURR')
test_rev_for_effect = pd.read_csv('/content/drive/My Drive/test_rev_for_effect.csv').set_index('SK_ID_CURR')
valid_rev_for_effect = pd.read_csv('/content/drive/My Drive/valid_rev_for_effect.csv').set_index('SK_ID_CURR')

In [None]:
valid_cash_for_effect['pred'] = 0

In [None]:
valid_cash_for_effect

Unnamed: 0_level_0,target,score,sum,pred
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
254150,0,347,254700.0,0
161456,0,388,99504.0,0
186858,0,437,494550.0,0
149833,1,375,203760.0,0
260206,0,360,808650.0,0
...,...,...,...,...
134417,0,334,143910.0,0
438005,0,406,390447.0,0
320354,0,455,765000.0,0
383230,0,339,942300.0,0


In [83]:
start = valid_cash_for_effect['score'].min()
finish = np.percentile(valid_cash_for_effect['score'], 50)

In [85]:
best_profit = 0
best_treshold = start
for treshold in range(int(start), int(finish)):
  valid_cash_for_effect['pred'] = np.vectorize(lambda x: 1 if x < treshold else 0)(valid_cash_for_effect['score'])
  profit = valid_cash_for_effect[valid_cash_for_effect['target'] == 0][valid_cash_for_effect['pred'] == 0]['sum'].sum()*0.2
  profit -= valid_cash_for_effect[valid_cash_for_effect['target'] == 1][valid_cash_for_effect['pred'] == 0]['sum'].sum()
  if profit >= best_profit:
    best_profit = profit
    best_treshold = treshold
print(best_profit)
print(best_treshold)


3161448432.9000006
236


In [86]:
test_cash_for_effect['pred'] = np.vectorize(lambda x: 1 if x < best_treshold else 0)(test_cash_for_effect['score'])
profit_cash = test_cash_for_effect[test_cash_for_effect['target'] == 0][test_cash_for_effect['pred'] == 0]['sum'].sum()*0.2
profit_cash -= test_cash_for_effect[test_cash_for_effect['target'] == 1][test_cash_for_effect['pred'] == 0]['sum'].sum()
loss_cash = test_cash_for_effect[test_cash_for_effect['target'] == 0][test_cash_for_effect['pred'] == 1]['sum'].sum()*0.2
profit_cash

2816077445.1000004

In [87]:
start = valid_rev_for_effect['score'].min()
finish = np.percentile(valid_rev_for_effect['score'], 50)

In [97]:
best_profit = 0
best_treshold = start
for treshold in range(int(start), int(finish)):
  valid_rev_for_effect['pred'] = np.vectorize(lambda x: 1 if x < treshold else 0)(valid_rev_for_effect['score'])
  profit = valid_rev_for_effect[valid_rev_for_effect['target'] == 0][valid_rev_for_effect['pred'] == 0]['sum'].sum()*0.7*0.4
  profit -= valid_rev_for_effect[valid_rev_for_effect['target'] == 1][valid_rev_for_effect['pred'] == 0]['sum'].sum()*0.7
  if profit > best_profit:
    best_profit = profit
    best_treshold = treshold
print(best_profit)
print(best_treshold)


379449000.0
273


In [99]:
test_rev_for_effect['pred'] = np.vectorize(lambda x: 1 if x < best_treshold else 0)(test_rev_for_effect['score'])
profit_rev = test_rev_for_effect[test_rev_for_effect['target'] == 0][test_rev_for_effect['pred'] == 0]['sum'].sum()*0.7*0.4
profit_rev -= test_rev_for_effect[test_rev_for_effect['target'] == 1][test_rev_for_effect['pred'] == 0]['sum'].sum()*0.7
loss_rev = test_rev_for_effect[test_rev_for_effect['target'] == 0][test_rev_for_effect['pred'] == 1]['sum'].sum()*0.7*0.4
profit_rev

341406450.0

In [101]:
total_profit = profit_cash+profit_rev
total_loss = loss_cash+loss_rev
total_profit

3157483895.1000004

In [102]:
total_loss

10704541.5

## Экономический эффект для бустинга

In [108]:
boost_valid = pd.read_csv('/content/drive/My Drive/Notebooksvalid_res.csv').set_index('SK_ID_CURR').drop(columns='0')
boost_test = pd.read_csv('/content/drive/My Drive/Notebookstest_res.csv').set_index('SK_ID_CURR').drop(columns='0')

In [112]:
boost_valid.columns=['prob']
boost_test.columns=['prob']

In [110]:
train = pd.read_csv(path + '/train.csv').set_index('SK_ID_CURR')
test = pd.read_csv(path + '/test.csv').set_index('SK_ID_CURR')

In [118]:
boost_valid['target'] = train.loc[boost_valid.index, 'TARGET']
boost_test['target'] = test['TARGET']
boost_valid['sum'] = train.loc[boost_valid.index, 'AMT_CREDIT']
boost_test['sum'] = test['AMT_CREDIT']
boost_valid['type'] = train.loc[boost_valid.index, 'NAME_CONTRACT_TYPE']
boost_test['type'] = test['NAME_CONTRACT_TYPE']

In [135]:
best_profit = 0
best_treshold = 0.0
treshold = 0.0
while treshold < 0.5:
  boost_valid['pred'] = np.vectorize(lambda x: 1 if x > treshold else 0)(boost_valid['prob'])
  profit = boost_valid[boost_valid['target'] == 0][boost_valid['pred'] == 0][boost_valid['type'] == 'Revolving loans']['sum'].sum()*0.7*0.4
  profit += boost_valid[boost_valid['target'] == 0][boost_valid['pred'] == 0][boost_valid['type'] == 'Cash loans']['sum'].sum()*0.2
  profit -= boost_valid[boost_valid['target'] == 1][boost_valid['pred'] == 0][boost_valid['type'] == 'Revolving loans']['sum'].sum()*0.7
  profit -= boost_valid[boost_valid['target'] == 1][boost_valid['pred'] == 0][boost_valid['type'] == 'Cash loans']['sum'].sum()
  if profit > best_profit:
    best_profit = profit
    best_treshold = treshold
  treshold += 0.05
print(best_profit)
print(best_treshold)

3957655484.700001
0.2


In [125]:
boost_test['pred'] = np.vectorize(lambda x: 1 if x > best_treshold else 0)(boost_test['prob'])
profit_boost = boost_test[boost_test['target'] == 0][boost_test['pred'] == 0][boost_test['type'] == 'Revolving loans']['sum'].sum()*0.7*0.4
profit_boost += boost_test[boost_test['target'] == 0][boost_test['pred'] == 0][boost_test['type'] == 'Cash loans']['sum'].sum()*0.2
profit_boost -= boost_test[boost_test['target'] == 1][boost_test['pred'] == 0][boost_test['type'] == 'Revolving loans']['sum'].sum()*0.7
profit_boost -= boost_test[boost_test['target'] == 1][boost_test['pred'] == 0][boost_test['type'] == 'Cash loans']['sum'].sum()
loss_boost = boost_test[boost_test['target'] == 0][boost_test['pred'] == 1][boost_test['type'] == 'Revolving loans']['sum'].sum()*0.7*0.4
loss_boost += boost_test[boost_test['target'] == 0][boost_test['pred'] == 1][boost_test['type'] == 'Cash loans']['sum'].sum()*0.2

In [129]:
loss_boost

141154136.1

In [132]:
profit_boost

3351374100.0