In [1]:
import pandas as pd
from itertools import combinations
import random
from sklearn.model_selection import train_test_split


In [2]:
# Load the dataset from CSV into the working frame.
df = pd.read_csv('data/df_done.csv')

In [3]:
# Show the frame's dimensions as (rows, columns).
df.shape

(3000000, 276)

In [6]:
# Save df_test so the pipeline can be tested on data from before feature engineering.
# NOTE(review): df_train is not used in the cells visible here.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['flag'],
)
df_test.to_csv('data/df_test.csv', index=False)


In [3]:
# Convert every column except id to int8; id is set aside first and
# re-attached afterwards, which places it after the other columns.
# NOTE(review): presumably astype('int8') does not range-check values —
# confirm every non-id column fits in [-128, 127] before relying on this cast.
id_column = df['id']
df = df.drop(columns='id').astype('int8')
df['id'] = id_column

In [4]:
# Feature: "no delinquencies (of any length) per number of loans".
# In the flag columns 1 means there was no delinquency; the values were summed
# during aggregation (so 3 in such a column means three loans had no such delinquency).
# The sum over all flag columns is therefore divided by the number of loans (rn).
flags_no_delays = [
    'is_zero_loans5',
    'is_zero_loans530',
    'is_zero_loans3060',
    'is_zero_loans6090',
    'is_zero_loans90',
]

def count_zeros_rate(row, flag):
    """Return the sum of ``row[col]`` over the columns named in ``flag``, divided by ``row['rn']``.

    ``row`` is anything indexable by column name (the notebook passes one
    DataFrame row); ``flag`` is an iterable of column names.
    """
    total = sum(row[name] for name in flag)
    return total / row['rn']
# Column-wise replacement for the row-by-row df.apply: sum the flag columns
# per row and divide by the number of loans (rn). The flags are widened to
# int64 before summing so that narrow integer columns cannot wrap.
df['no_delays_rate'] = df[flags_no_delays].astype('int64').sum(axis=1) / df['rn']
df.head()

Unnamed: 0,flag,rn,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,...,pre_since_confirmed_9,pre_since_confirmed_10,pre_since_confirmed_11,pre_since_confirmed_12,pre_since_confirmed_13,pre_since_confirmed_14,pre_since_confirmed_16,pre_since_confirmed_17,id,no_delays_rate
0,0,10,9,10,10,10,10,6,9,9,...,7,0,0,1,0,0,0,0,0,4.9
1,0,14,12,10,12,12,11,10,12,11,...,1,0,0,0,0,3,0,0,1,4.071429
2,0,3,3,2,2,2,3,1,3,2,...,2,0,0,0,0,1,0,0,2,4.0
3,0,15,15,15,15,15,15,8,14,14,...,7,0,0,0,0,1,1,0,3,5.0
4,0,1,1,1,1,1,1,1,1,1,...,1,0,0,0,0,0,0,0,4,5.0


In [5]:
def count_zero_feat(row, flag):
    """Return the sum of ``row[col]`` over the columns named in ``flag``.

    ``row`` is anything indexable by column name; ``flag`` is an iterable of
    column names. An empty ``flag`` yields 0.
    """
    return sum(row[name] for name in flag)
# Column-wise replacement for the row-by-row df.apply: total of the
# no-delinquency flags per row. Widened to int64 before summing so narrow
# integer columns cannot wrap; stored as float64 so the column keeps the
# float form shown in the cell output (e.g. 49.0).
df['no_delays'] = df[flags_no_delays].astype('int64').sum(axis=1).astype('float64')
df.head()

Unnamed: 0,flag,rn,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,...,pre_since_confirmed_10,pre_since_confirmed_11,pre_since_confirmed_12,pre_since_confirmed_13,pre_since_confirmed_14,pre_since_confirmed_16,pre_since_confirmed_17,id,no_delays_rate,no_delays
0,0,10,9,10,10,10,10,6,9,9,...,0,0,1,0,0,0,0,0,4.9,49.0
1,0,14,12,10,12,12,11,10,12,11,...,0,0,0,0,3,0,0,1,4.071429,57.0
2,0,3,3,2,2,2,3,1,3,2,...,0,0,0,0,1,0,0,2,4.0,12.0
3,0,15,15,15,15,15,15,8,14,14,...,0,0,0,0,1,1,0,3,5.0,75.0
4,0,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,4,5.0,5.0


In [6]:
# Correlation between the summed no-delinquency flags and the target column 'flag'.
df['no_delays'].corr(df['flag'])

-0.022329272290983224

In [7]:
# Same per-loan rate as above, restricted to the three flag columns listed here.
flags_no_delays_over_530 = ['is_zero_loans3060', 'is_zero_loans6090', 'is_zero_loans90']

# Column-wise replacement for the row-by-row df.apply; flags are widened to
# int64 before summing so narrow integer columns cannot wrap.
df['no_delays_over_530'] = df[flags_no_delays_over_530].astype('int64').sum(axis=1) / df['rn']
df['no_delays_over_530'].corr(df['flag'])

-0.07467422348569196

In [8]:
# Correlation between the two derived rate features (all flags vs. the 'over_530' subset).
df['no_delays_rate'].corr(df['no_delays_over_530'])

0.8525016177249926

In [9]:
# Look at the values of the remaining flags.
other_flags = [
    'is_zero_util',
    'is_zero_over2limit',
    'is_zero_maxover2limit',
    'pclose_flag',
    'fclose_flag',
]
# Also add separate features for each individual delinquency type (without summing).
flags = [*other_flags, *flags_no_delays]

In [10]:
# For every flag column add '<col>_rate' = flag value divided by the number
# of loans (rn), and remember the new column names in rate_cols.
# The division is done on whole columns instead of a row-by-row df.apply.
rate_cols = []
for col in flags:
    name = col + '_rate'
    rate_cols.append(name)
    df[name] = df[col] / df['rn']

In [11]:
# Print each rate feature's correlation with the target column 'flag'.
for col in rate_cols:
    correlation = df[col].corr(df['flag'])
    print(col, ':', correlation)

is_zero_util_rate : -0.04729338365066287
is_zero_over2limit_rate : -0.017485173067125888
is_zero_maxover2limit_rate : -0.03840838106515936
pclose_flag_rate : 0.012771470869451768
fclose_flag_rate : 0.019829590447032405
is_zero_loans5_rate : -0.032215293245396376
is_zero_loans530_rate : -0.05667006647395925
is_zero_loans3060_rate : -0.07001918338016913
is_zero_loans6090_rate : -0.06705042682234126
is_zero_loans90_rate : -0.06842015752154092


In [12]:
# After generating features by pairwise addition and subtraction, and by division
# by 'rn' (the number of loans taken), the following pairs were found to have a
# correlation with the target > 0.06.
# Add these features to the dataset.
# Pairs combined as first - second (no division).
features_for_subtraction = [['enc_paym_10_3','is_zero_loans530'],
                            ['is_zero_loans530','enc_paym_21_3']]

# Pairs combined as (first + second) / rn.
features_for_sum_per_rn = [['pre_maxover2limit_infrequent_sklearn', 'enc_paym_3_3'],
                           ['enc_paym_17_infrequent_sklearn', 'enc_paym_4_3'],
                           ['enc_paym_13_infrequent_sklearn', 'enc_paym_4_3'],
                           ['enc_paym_4_3', 'enc_paym_1_1'],
                           ['enc_paym_4_3', 'enc_paym_12_1'],
                           ['enc_paym_4_3', 'pclose_flag_rate'],
                           ['enc_paym_1_infrequent_sklearn', 'enc_paym_6_3'],
                           ['fclose_flag_rate', 'enc_paym_4_3'],
                           ['pre_util_6','enc_loans_credit_type_5']
                          ]

# Pairs combined as (first - second) / rn.
features_for_difference_per_rn = [['enc_paym_17_infrequent_sklearn', 'enc_paym_4_3'],
                                  ['enc_paym_9_3', 'pre_maxover2limit_17'],
                                  ['enc_paym_17_3', 'is_zero_loans5'],
                                  ['pre_util_5', 'enc_paym_5_3']
                                 ]



def add_feature_from_pair(data, pairs, operation):
    """Add one derived column to ``data`` for every column pair in ``pairs``.

    Parameters
    ----------
    data : DataFrame
        Modified in place and also returned. Must contain an ``rn`` column
        when a ``*_per_rn`` operation is requested.
    pairs : iterable of sequences
        Items 0 and 1 of each entry name the two source columns ``a`` and ``b``.
    operation : str
        'plus'         -> column ``a_plus_b``       = a + b, cast to int8
        'minus'        -> column ``a_minus_b``      = a - b, cast to int8
        'plus_per_rn'  -> column ``a_plus_b_rate``  = (a + b) / rn
        'minus_per_rn' -> column ``a_minus_b_rate`` = (a - b) / rn

    Returns
    -------
    DataFrame
        The same ``data`` object with the new columns added.

    Raises
    ------
    ValueError
        If ``operation`` is not one of the four supported names.
    """
    supported = ('plus', 'minus', 'plus_per_rn', 'minus_per_rn')
    # Validate once, up front: a misspelled operation must not silently
    # leave the frame without the requested features.
    if operation not in supported:
        raise ValueError(
            f"Wrong operation name: {operation!r}; expected one of {supported}"
        )

    is_sum = operation.startswith('plus')
    per_rn = operation.endswith('_per_rn')
    infix = '_plus_' if is_sum else '_minus_'

    for pair in pairs:
        first, second = pair[0], pair[1]
        # Whole-column arithmetic instead of a row-by-row apply. Operands are
        # widened to float64 so narrow integer columns (e.g. int8) cannot wrap
        # before the division / final cast.
        left = data[first].astype('float64')
        right = data[second].astype('float64')
        combined = left + right if is_sum else left - right

        name = first + infix + second
        if per_rn:
            data[name + '_rate'] = combined / data['rn']
        else:
            data[name] = combined.astype('int8')
    return data

In [13]:
# First pass: apply each pair list with its matching operation, in order.
for pair_list, op_name in (
    (features_for_subtraction, 'minus'),
    (features_for_sum_per_rn, 'plus_per_rn'),
    (features_for_difference_per_rn, 'minus_per_rn'),
):
    df = add_feature_from_pair(df, pair_list, op_name)

In [14]:
# Second pass - after generating features again and selecting by maximum correlation.
# Several entries below reference columns created by the first pass
# (names ending in '_rate' or containing '_minus_' / '_plus_').
# Pairs combined as (first + second) / rn.
features_for_sum_per_rn_second_time = [['pre_util_4','enc_paym_10_3_minus_is_zero_loans530'],
 ['enc_paym_10_3_minus_is_zero_loans530','pre_util_5'],
 ['fclose_flag_rate','enc_paym_10_3_minus_is_zero_loans530'],
 ['enc_paym_0_1','enc_paym_1_infrequent_sklearn_plus_enc_paym_6_3_rate'],
 ['enc_paym_10_3_minus_is_zero_loans530','pre_maxover2limit_infrequent_sklearn_plus_enc_paym_3_3_rate'],
 ['is_zero_loans530_minus_enc_paym_21_3','pre_loans_credit_limit_2'],
 ['enc_paym_4_3_plus_enc_paym_12_1_rate','enc_paym_0_1']
]

# Pairs combined as (first - second) / rn.
features_for_difference_per_rn_second_time = [['enc_paym_0_1','is_zero_loans530_minus_enc_paym_21_3'],
 ['enc_paym_10_3_minus_is_zero_loans530','pre_util_5'],
 ['enc_paym_10_3_minus_is_zero_loans530','pre_util_4'],
 ['is_zero_loans530_minus_enc_paym_21_3','fclose_flag_rate'],
 ['pre_util_5_minus_enc_paym_5_3_rate','enc_loans_credit_type_5']
]

# Add the second-pass features to the frame.
df = add_feature_from_pair(df,features_for_sum_per_rn_second_time, 'plus_per_rn')
df = add_feature_from_pair(df,features_for_difference_per_rn_second_time, 'minus_per_rn')

In [15]:
# Preview the first rows, including the newly added feature columns.
df.head()

Unnamed: 0,flag,rn,is_zero_loans5,is_zero_loans530,is_zero_loans3060,is_zero_loans6090,is_zero_loans90,is_zero_util,is_zero_over2limit,is_zero_maxover2limit,...,fclose_flag_rate_plus_enc_paym_10_3_minus_is_zero_loans530_rate,enc_paym_0_1_plus_enc_paym_1_infrequent_sklearn_plus_enc_paym_6_3_rate_rate,enc_paym_10_3_minus_is_zero_loans530_plus_pre_maxover2limit_infrequent_sklearn_plus_enc_paym_3_3_rate_rate,is_zero_loans530_minus_enc_paym_21_3_plus_pre_loans_credit_limit_2_rate,enc_paym_4_3_plus_enc_paym_12_1_rate_plus_enc_paym_0_1_rate,enc_paym_0_1_minus_is_zero_loans530_minus_enc_paym_21_3_rate,enc_paym_10_3_minus_is_zero_loans530_minus_pre_util_5_rate,enc_paym_10_3_minus_is_zero_loans530_minus_pre_util_4_rate,is_zero_loans530_minus_enc_paym_21_3_minus_fclose_flag_rate_rate,pre_util_5_minus_enc_paym_5_3_rate_minus_enc_loans_credit_type_5_rate
0,0,10,9,10,10,10,10,6,9,9,...,-0.48,0.13,-0.48,0.3,0.12,-0.1,-0.5,-0.5,0.18,-0.02
1,0,14,12,10,12,12,11,10,12,11,...,-0.27551,0.107143,-0.260204,0.0,0.086735,0.071429,-0.285714,-0.285714,-0.010204,-0.030612
2,0,3,3,2,2,2,3,1,3,2,...,0.222222,0.111111,0.222222,0.0,0.222222,0.0,0.0,0.0,-0.222222,-0.111111
3,0,15,15,15,15,15,15,8,14,14,...,-0.573333,0.008889,-0.595556,0.466667,0.004444,-0.466667,-0.6,-0.666667,0.44,-0.071111
4,0,1,1,1,1,1,1,1,1,1,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0


In [17]:
# Write the frame with all engineered features to CSV, without the index column.
df.to_csv('data/df_tuned.csv', index = False)