In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

PATH = '/home/kai/data/kaggle/homecredit/'

inst0 = pd.read_pickle(PATH + '/inter/tmp/inst.pkl')
inst0.shape

(13605229, 7)

In [2]:
inst0.columns

Index(['SK_ID_PREV', 'SK_ID_CURR', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT',
       'DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT', 'AMT_INSTALMENT'],
      dtype='object')

In [3]:
from sklearn.linear_model import LinearRegression

def minus_name(col1, col2): return col1 + '_minus_' + col2
def minus(df, col1, col2): return df[col1] - df[col2]

def ratio_name(col1, col2): return col1 + '_divide_' + col2
def ratio(df, col1, col2): return df[col1] / (df[col2] + 1e-3)

def positive_count(df, gp_col, col):
    group = (df[col] > 0).astype('int8')
    group = pd.concat([df[gp_col], group], axis=1).groupby(gp_col)[[col]].sum().reset_index().rename(columns={col: 'positivecount_'+col})
    return group

def negative_count(df, gp_col, col):
    group = (df[col] < 0).astype('int8')
    group = pd.concat([df[gp_col], group], axis=1).groupby(gp_col)[[col]].sum().reset_index().rename(columns={col: 'negativecount_'+col})
    return group

def count(df, gp_col, col):
    group = df.groupby(gp_col)[[col]].count().reset_index().rename(index=str, columns={col:'count_' + '_'.join(gp_col)})
    return group

def numerical(df, gp_col, col, agg_fun):
    _df = df.groupby(gp_col)[[col]].agg(agg_fun)
    
    columns = []
    for x in _df.columns.levels[0]:
        for y in _df.columns.levels[1]:
            columns.append('_'.join([x, y]))
    _df.columns = columns
    return _df.reset_index()

def feature_in_time_window(df, gp_col, col, func, agg_fun=None, n=None, time_col=None):
    tmp = None
    _df = df.copy()
    if n is not None:
        _df = _df[_df[time_col] >= n]
    if agg_fun is None:
        tmp = func(_df, gp_col, col)
    else:
        tmp = func(_df, gp_col, col, agg_fun)
    columns = [str(abs(n))+'_'+x for x in set(tmp.columns) - set(gp_col)]
    tmp.columns = columns
    del _df
    gc.collect()
    return tmp

# def slope_agg(df, gp_col, x, y):
#     gp = df.copy()
#     gp['slope_'+y] = gp.groupby(gp_col)[y].shift(-1) - gp[y]
#     r = gp.groupby(gp_col).size().reset_index()[gp_col]
#     l = ['max', 'min', 'mean', 'std']
#     r = r.merge(numerical(gp, gp_col, 'slope_'+y, l), on=gp_col, how='left')
#     r = r.merge(positive_count(gp, gp_col, 'slope_'+y), on=gp_col, how='left')
#     r = r.merge(negative_count(gp, gp_col, 'slope_'+y), on=gp_col, how='left')
#     return r

def slope_agg(df, gp_col, x, y):
    gp = df.copy()
    gp_max = gp.groupby(gp_col)[[x]].max().reset_index().rename(columns={x:'max'})
    gp_min = gp.groupby(gp_col)[[x]].min().reset_index().rename(columns={x:'min'})
    gp = gp.merge(gp_max)
    gp = gp.merge(gp_min)
    gp['normal_x'] = (gp[x]-gp['max']) / (gp['max']-gp['min']+1)
    gp['value'] = gp.groupby(gp_col)[y].shift(-1) - gp[y]
    gp['x_diff'] = gp.groupby(gp_col)[x].shift(-1) - gp[x]
    gp['normal_x_diff'] = gp.groupby(gp_col)['normal_x'].shift(-1) - gp['normal_x']
    gp['slope_'+y] = gp['value'] / gp['x_diff']
    gp['normalslope_'+y] = gp['value'] / gp['normal_x_diff']
    r = gp.groupby(gp_col).size().reset_index()[gp_col]
    l = ['max', 'min', 'mean', 'std']
    r = r.merge(numerical(gp, gp_col, 'slope_'+y, l), on=gp_col, how='left')
    r = r.merge(positive_count(gp, gp_col, 'slope_'+y), on=gp_col, how='left')
    r = r.merge(negative_count(gp, gp_col, 'slope_'+y), on=gp_col, how='left')
    r = r.merge(numerical(gp, gp_col, 'normalslope_'+y, l), on=gp_col, how='left')
    r = r.merge(positive_count(gp, gp_col, 'normalslope_'+y), on=gp_col, how='left')
    r = r.merge(negative_count(gp, gp_col, 'normalslope_'+y), on=gp_col, how='left')
    return r
    
def area_under_curve(df, gp_col, x, y):
    gp = df.copy()
    gp_max = gp.groupby(gp_col)[[x]].max().reset_index().rename(columns={x:'max'})
    gp_min = gp.groupby(gp_col)[[x]].min().reset_index().rename(columns={x:'min'})
    gp = gp.merge(gp_max)
    gp = gp.merge(gp_min)
    gp['normal_x'] = (gp[x]-gp['max']) / (gp['max']-gp['min']+1)
    
    group = gp.groupby(gp_col)
    gp['tmp'] = (group[y].shift(-1)+gp[y]) * (group[x].shift(-1)-gp[x]) / 2
    gp['tmp_normal'] = (group[y].shift(-1)+gp[y]) * (group['normal_x'].shift(-1)-gp['normal_x']) / 2
    return gp.groupby(gp_col).agg({'tmp':'sum', 'tmp_normal':'sum'}).reset_index().rename(columns={'tmp':x+'_area_'+y, 'tmp_normal':x+'_normalarea_'+y})

'done'

'done'

In [4]:
gp_col = ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'SK_ID_CURR']
new = inst0.groupby(gp_col)[['AMT_INSTALMENT']].max().reset_index()
new['AMT_PAYMENT'] = 0
new['DAYS_ENTRY_PAYMENT'] = new['DAYS_INSTALMENT']
inst = pd.concat([inst0, new])

inst['EARLY_DAYS'] = inst['DAYS_INSTALMENT'] - inst['DAYS_ENTRY_PAYMENT']
inst['LATE_DAYS'] = -inst['EARLY_DAYS']
inst['LATE_DAYS_SIGN'] = (np.sign(inst['LATE_DAYS']) > 0) * inst['LATE_DAYS']
inst['EARLY_DAYS_SIGN'] = (np.sign(inst['EARLY_DAYS']) > 0) * inst['EARLY_DAYS']

p = inst.groupby(gp_col+['LATE_DAYS_SIGN']).agg({'AMT_PAYMENT': 'sum', 'DAYS_ENTRY_PAYMENT': 'max', 'AMT_INSTALMENT': 'max', 'EARLY_DAYS_SIGN': 'max'}).reset_index()
p['AMT_CUM_PAYMENT'] = p.groupby(gp_col)[['AMT_PAYMENT']].cumsum()
p['AMT_LATE_PAYMENT'] = p['AMT_INSTALMENT'] - p['AMT_CUM_PAYMENT']
p['AMT_LATE_PAYMENT'] = p['AMT_LATE_PAYMENT'] * (abs(p['AMT_LATE_PAYMENT'])>1e-4)
p['RATE_LATE_PAYMENT'] = p['AMT_LATE_PAYMENT'] / p['AMT_INSTALMENT']

p['RATE_AMT_CUM_PAYMENT'] = p['AMT_CUM_PAYMENT'] / p['DAYS_INSTALMENT']
p['RATE_AMT_LATE_PAYMENT'] = p['AMT_LATE_PAYMENT'] / p['DAYS_INSTALMENT']
p['RATE_RATE_LATE_PAYMENT'] = p['RATE_LATE_PAYMENT'] / p['DAYS_INSTALMENT']
inst['RATE_LATE_DAYS_SIGN'] = inst['LATE_DAYS_SIGN'] / inst['DAYS_INSTALMENT']
inst['RATE_EARLY_DAYS_SIGN'] = inst['EARLY_DAYS_SIGN'] / inst['DAYS_INSTALMENT']

p_new_cols = ['AMT_CUM_PAYMENT', 'AMT_LATE_PAYMENT', 'RATE_LATE_PAYMENT', 'RATE_AMT_CUM_PAYMENT', 'RATE_AMT_LATE_PAYMENT', 'RATE_RATE_LATE_PAYMENT']

inst_new_cols = ['RATE_LATE_DAYS_SIGN', 'LATE_DAYS_SIGN', 'RATE_EARLY_DAYS_SIGN', 'EARLY_DAYS_SIGN']

p.shape

(14011460, 15)

In [5]:
df = p.groupby(gp_col).size().reset_index()[gp_col]

df = df.merge(count(inst, gp_col, 'AMT_INSTALMENT'), on=gp_col, how='left')
count_col = [x for x in df.columns if 'count_' in x][0]
print(df.shape, count_col)


df = df.merge(slope_agg(p, gp_col, 'LATE_DAYS_SIGN', 'AMT_LATE_PAYMENT'))
df = df.merge(slope_agg(p, gp_col, 'LATE_DAYS_SIGN', 'RATE_LATE_PAYMENT'))
print('slope', df.shape)

p['DURATION'] = p['DAYS_ENTRY_PAYMENT'] - p['DAYS_INSTALMENT']
df = df.merge(p.groupby(gp_col)[['DURATION']].max().reset_index(), on=gp_col, how='left')
p.drop('DURATION', axis=1, inplace=True)
print('DURATION', df.shape)


p['EARLY_DURATION'] = p['DAYS_INSTALMENT'] - p['DAYS_ENTRY_PAYMENT']
df = df.merge(p.groupby(gp_col)[['EARLY_DURATION']].max().reset_index(), on=gp_col, how='left')
p.drop('EARLY_DURATION', axis=1, inplace=True)
print('EARLY_DURATION', df.shape)


df['TOTAL_DURATION'] = df['DURATION'] + df['EARLY_DURATION']
print('TOTAL_DURATION', df.shape)


df = df.merge(area_under_curve(p, gp_col, 'LATE_DAYS_SIGN', 'AMT_LATE_PAYMENT'))
df = df.merge(area_under_curve(p, gp_col, 'LATE_DAYS_SIGN', 'RATE_LATE_PAYMENT'))
print('AREA', df.shape)


df = df.merge(p.groupby(gp_col)[['AMT_INSTALMENT']].max().reset_index(), on=gp_col, how='left')

df = df.merge(p.groupby(gp_col)[['AMT_LATE_PAYMENT']].max().reset_index().rename(columns={'AMT_LATE_PAYMENT':'AMT_TOTAL_LATE'}), on=gp_col, how='left')
name = ratio_name('AMT_TOTAL_LATE', 'AMT_INSTALMENT')
df[name] = ratio(df, 'AMT_TOTAL_LATE', 'AMT_INSTALMENT')
print('AMT_TOTAL_LATE', df.shape)

df['AMT_ONTIME_PAYMENT'] = df['AMT_INSTALMENT'] - df['AMT_TOTAL_LATE']
name = ratio_name('AMT_ONTIME_PAYMENT', 'AMT_INSTALMENT')
df[name] = ratio(df, 'AMT_ONTIME_PAYMENT', 'AMT_INSTALMENT')
print('AMT_ONTIME_PAYMENT', df.shape)


df = df.merge(positive_count(inst, gp_col, 'LATE_DAYS_SIGN'), on=gp_col, how='left')
df = df.merge(positive_count(inst, gp_col, 'EARLY_DAYS_SIGN'), on=gp_col, how='left')
name1 = ratio_name('positivecount_LATE_DAYS_SIGN', count_col)
df[name1] = ratio(df, 'positivecount_LATE_DAYS_SIGN', count_col)
name2 = ratio_name('positivecount_EARLY_DAYS_SIGN', count_col)
df[name2] = ratio(df, 'positivecount_EARLY_DAYS_SIGN', count_col)
print('positivecount', df.shape)
    
    
df.columns

(12862309, 5) count_SK_ID_PREV_NUM_INSTALMENT_NUMBER_DAYS_INSTALMENT_SK_ID_CURR
slope (12862309, 29)
DURATION (12862309, 30)
EARLY_DURATION (12862309, 31)
TOTAL_DURATION (12862309, 32)
AREA (12862309, 36)
AMT_TOTAL_LATE (12862309, 39)
AMT_ONTIME_PAYMENT (12862309, 41)
positivecount (12862309, 45)


Index(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'SK_ID_CURR',
       'count_SK_ID_PREV_NUM_INSTALMENT_NUMBER_DAYS_INSTALMENT_SK_ID_CURR',
       'slope_AMT_LATE_PAYMENT_max', 'slope_AMT_LATE_PAYMENT_min',
       'slope_AMT_LATE_PAYMENT_mean', 'slope_AMT_LATE_PAYMENT_std',
       'positivecount_slope_AMT_LATE_PAYMENT',
       'negativecount_slope_AMT_LATE_PAYMENT',
       'normalslope_AMT_LATE_PAYMENT_max', 'normalslope_AMT_LATE_PAYMENT_min',
       'normalslope_AMT_LATE_PAYMENT_mean', 'normalslope_AMT_LATE_PAYMENT_std',
       'positivecount_normalslope_AMT_LATE_PAYMENT',
       'negativecount_normalslope_AMT_LATE_PAYMENT',
       'slope_RATE_LATE_PAYMENT_max', 'slope_RATE_LATE_PAYMENT_min',
       'slope_RATE_LATE_PAYMENT_mean', 'slope_RATE_LATE_PAYMENT_std',
       'positivecount_slope_RATE_LATE_PAYMENT',
       'negativecount_slope_RATE_LATE_PAYMENT',
       'normalslope_RATE_LATE_PAYMENT_max',
       'normalslope_RATE_LATE_PAYMENT_min',
       'normalslope_RATE_LAT

In [6]:
train = pd.read_csv(PATH + 'application_train.csv')
test = pd.read_csv(PATH + 'application_test.csv')

merge_col = ['SK_ID_CURR']
m = pd.concat([train[merge_col], test[merge_col]])

for x in set(df.columns) - set(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'SK_ID_CURR']):
    m = m.merge(numerical(df, merge_col, x, ['mean', 'max', 'min', 'sum', 'std']))
print('df', m.shape)
    
for x in p_new_cols:
    m = m.merge(numerical(p, merge_col, x, ['mean', 'max', 'min', 'sum', 'std']))
print('p', m.shape)
    
for x in inst_new_cols:
    m = m.merge(numerical(inst, merge_col, x, ['mean', 'max', 'min', 'sum', 'std']))
print('inst', m.shape)
m.columns

df (339587, 211)
p (339587, 241)
inst (339587, 261)


Index(['SK_ID_CURR', 'slope_RATE_LATE_PAYMENT_max_mean',
       'slope_RATE_LATE_PAYMENT_max_max', 'slope_RATE_LATE_PAYMENT_max_min',
       'slope_RATE_LATE_PAYMENT_max_sum', 'slope_RATE_LATE_PAYMENT_max_std',
       'LATE_DAYS_SIGN_normalarea_AMT_LATE_PAYMENT_mean',
       'LATE_DAYS_SIGN_normalarea_AMT_LATE_PAYMENT_max',
       'LATE_DAYS_SIGN_normalarea_AMT_LATE_PAYMENT_min',
       'LATE_DAYS_SIGN_normalarea_AMT_LATE_PAYMENT_sum',
       ...
       'RATE_EARLY_DAYS_SIGN_mean', 'RATE_EARLY_DAYS_SIGN_max',
       'RATE_EARLY_DAYS_SIGN_min', 'RATE_EARLY_DAYS_SIGN_sum',
       'RATE_EARLY_DAYS_SIGN_std', 'EARLY_DAYS_SIGN_mean',
       'EARLY_DAYS_SIGN_max', 'EARLY_DAYS_SIGN_min', 'EARLY_DAYS_SIGN_sum',
       'EARLY_DAYS_SIGN_std'],
      dtype='object', length=261)

In [7]:
col = []
for x in m.columns:
    tmp = 'inst_' + x if x != 'SK_ID_CURR' else x
    col.append(tmp)
m.columns = col
m.to_pickle(PATH + 'inter/brandnew_inst2curr_with_days.pkl')