In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Root directory of the competition data; note that it already ends with a slash.
PATH = '/home/kai/data/kaggle/homecredit/'

# The joined path contains a double slash (PATH ends with '/' and the suffix starts with one).
# NOTE(review): presumably harmless on POSIX filesystems - confirm if this runs elsewhere.
# Pickle files can execute code when loaded; only read files you produced yourself.
inst = pd.read_pickle(PATH + '/inter/tmp/inst.pkl')
# Last expression of the cell: displayed as (rows, columns).
inst.shape

(13605229, 7)

In [9]:
from sklearn.linear_model import LinearRegression

def minus_name(col1, col2):
    """Return the feature name used for the difference ``col1 - col2``."""
    return '_minus_'.join([col1, col2])
def minus(df, col1, col2):
    """Return the element-wise difference of column ``col1`` and column ``col2`` of ``df``."""
    left = df[col1]
    right = df[col2]
    return left - right

def ratio_name(col1, col2):
    """Return the feature name used for the ratio of ``col1`` to ``col2``."""
    return '_divide_'.join([col1, col2])
def ratio(df, col1, col2):
    """Return column ``col1`` divided by (column ``col2`` + 1), element-wise.

    The denominator is shifted by one, so a zero in ``col2`` divides by 1
    instead of by 0.
    """
    numerator = df[col1]
    denominator = df[col2] + 1
    return numerator / denominator

def positive_count(df, gp_col, col):
    """Count, per ``gp_col`` group, the rows whose ``col`` value is strictly positive.

    Returns a frame holding the group keys as ordinary columns plus a
    ``'positivecount_' + col`` column with the per-group tally.
    """
    is_positive = df[col].gt(0).astype('int8')
    flagged = pd.concat([df[gp_col], is_positive], axis=1)
    totals = flagged.groupby(gp_col)[[col]].sum().reset_index()
    return totals.rename(columns={col: 'positivecount_'+col})

def negative_count(df, gp_col, col):
    """Count, per ``gp_col`` group, the rows whose ``col`` value is strictly negative.

    Returns a frame holding the group keys as ordinary columns plus a
    ``'negativecount_' + col`` column with the per-group tally.
    """
    is_negative = df[col].lt(0).astype('int8')
    flagged = pd.concat([df[gp_col], is_negative], axis=1)
    totals = flagged.groupby(gp_col)[[col]].sum().reset_index()
    return totals.rename(columns={col: 'negativecount_'+col})

def count(df, gp_col, col):
    """Count the non-null ``col`` values in each ``gp_col`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    gp_col : str or list of str
        Grouping key(s). A single column name may be passed as a plain string.
    col : str
        Column whose non-null entries are counted.

    Returns
    -------
    pandas.DataFrame
        The group keys as ordinary columns plus one count column named
        ``'count_' + '_'.join(keys)``. The name depends only on the keys, not
        on ``col``, so two calls with the same keys yield the same column name.
    """
    # A bare string must be wrapped first: joining a str would join its
    # individual characters ('k1' -> 'k_1') and produce a mangled name.
    keys = [gp_col] if isinstance(gp_col, str) else list(gp_col)
    new_name = 'count_' + '_'.join(keys)
    group = df.groupby(gp_col)[[col]].count().reset_index()
    # index=str is kept for backward compatibility: the returned row labels
    # are strings, exactly as before.
    return group.rename(index=str, columns={col: new_name})

def numerical(df, gp_col, col, agg_fun):
    """Aggregate ``col`` per ``gp_col`` group with one or more functions.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    gp_col : str or list of str
        Grouping key(s).
    col : str
        Column to aggregate.
    agg_fun : str, callable, or list of those
        Aggregation(s) to apply. A single function is treated as a
        one-element list.

    Returns
    -------
    pandas.DataFrame
        The group keys as ordinary columns plus one column per aggregation,
        named ``'<col>_<function>'``.
    """
    funcs = [agg_fun] if isinstance(agg_fun, str) or callable(agg_fun) else agg_fun
    _df = df.groupby(gp_col)[[col]].agg(funcs)

    # Build the flat names from the column tuples themselves. Taking the
    # product of columns.levels assumes the levels are stored in column order,
    # which a MultiIndex does not guarantee, and would attach a name to the
    # wrong aggregate whenever the two orders differ.
    if isinstance(_df.columns, pd.MultiIndex):
        _df.columns = ['_'.join(str(part) for part in key) for key in _df.columns]
    return _df.reset_index()

def feature_in_time_window(df, gp_col, col, func, agg_fun=None, n=None, time_col=None):
    """Compute a grouped feature on the rows inside a time window.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    gp_col : str or list of str
        Grouping key(s), passed through to ``func``.
    col : str
        Column the feature is computed from, passed through to ``func``.
    func : callable
        Called as ``func(frame, gp_col, col)``, or as
        ``func(frame, gp_col, col, agg_fun)`` when ``agg_fun`` is given.
        Expected to return a DataFrame that contains the key columns.
    agg_fun : optional
        Extra argument forwarded to ``func``.
    n : number, optional
        Window bound: only rows with ``df[time_col] >= n`` are used. When
        omitted, all rows are used and the feature columns keep their names.
    time_col : str, optional
        Column compared against ``n``. Required when ``n`` is given.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``func``; when ``n`` is given, every non-key
        column is prefixed with ``str(abs(n)) + '_'``.

    Raises
    ------
    ValueError
        If ``n`` is given without ``time_col``.
    """
    # Local import: nothing at the top of the notebook imports gc.
    import gc

    if n is not None and time_col is None:
        raise ValueError('time_col is required when n is given')

    keys = [gp_col] if isinstance(gp_col, str) else list(gp_col)

    # Boolean indexing already yields a new frame, so an explicit copy is only
    # needed when no filtering takes place.
    _df = df[df[time_col] >= n] if n is not None else df.copy()

    if agg_fun is None:
        tmp = func(_df, gp_col, col)
    else:
        tmp = func(_df, gp_col, col, agg_fun)

    if n is not None:
        prefix = str(abs(n)) + '_'
        # Rename by mapping so the key columns stay untouched and every feature
        # column keeps its position.
        tmp = tmp.rename(columns={c: prefix + str(c) for c in tmp.columns if c not in keys})

    del _df
    gc.collect()
    return tmp

def slope_agg(df, gp_col, x, y):
    """Summarise, per ``gp_col`` group, the slopes of ``y`` against ``x``.

    For each row the slope towards the next row of the same group (in the
    frame's current row order) is computed twice: against raw ``x`` and
    against ``x`` rescaled by the group's max/min. Each slope series is then
    reduced per group to max, min, mean, std and the counts of positive and
    negative values.

    Returns one row per group with the key columns followed by the summary
    columns for ``'slope_' + y`` and then ``'normalslope_' + y``.
    """
    slope_col = 'slope_' + y
    normal_slope_col = 'normalslope_' + y

    work = df.copy()

    # Attach every group's largest and smallest x to each of its rows.
    grouped_x = work.groupby(gp_col)[[x]]
    x_max = grouped_x.max().reset_index().rename(columns={x: 'max'})
    x_min = grouped_x.min().reset_index().rename(columns={x: 'min'})
    work = work.merge(x_max).merge(x_min)
    work['normal_x'] = (work[x] - work['max']) / (work['max'] - work['min'] + 1)

    # Value of each column on the following row of the same group.
    by_group = work.groupby(gp_col)
    next_y = by_group[y].shift(-1)
    next_x = by_group[x].shift(-1)
    next_normal_x = by_group['normal_x'].shift(-1)

    work['value'] = next_y - work[y]
    work['x_diff'] = next_x - work[x]
    work['normal_x_diff'] = next_normal_x - work['normal_x']
    work[slope_col] = work['value'] / work['x_diff']
    work[normal_slope_col] = work['value'] / work['normal_x_diff']

    # Start from the bare list of groups and left-join each summary onto it.
    result = work.groupby(gp_col).size().reset_index()[gp_col]
    stats = ['max', 'min', 'mean', 'std']
    for feature in (slope_col, normal_slope_col):
        summaries = (
            numerical(work, gp_col, feature, stats),
            positive_count(work, gp_col, feature),
            negative_count(work, gp_col, feature),
        )
        for part in summaries:
            result = result.merge(part, on=gp_col, how='left')
    return result
    
def area_under_curve(df, gp_col, x, y):
    """Trapezoid area under ``y`` over ``x`` for every ``gp_col`` group.

    Consecutive rows of a group (in the frame's current row order) form the
    trapezoids. The area is computed twice: over raw ``x`` and over ``x``
    rescaled by the group's max/min.

    Returns one row per group with the key columns plus
    ``x + '_area_' + y`` and ``x + '_normalarea_' + y``.
    """
    work = df.copy()

    # Attach every group's largest and smallest x to each of its rows.
    grouped_x = work.groupby(gp_col)[[x]]
    x_max = grouped_x.max().reset_index().rename(columns={x: 'max'})
    x_min = grouped_x.min().reset_index().rename(columns={x: 'min'})
    work = work.merge(x_max).merge(x_min)
    work['normal_x'] = (work[x] - work['max']) / (work['max'] - work['min'] + 1)

    by_group = work.groupby(gp_col)
    # Sum of the two parallel sides of each trapezoid; the last row of a group
    # has no successor and therefore contributes NaN, which the sum skips.
    side_sum = by_group[y].shift(-1) + work[y]
    width = by_group[x].shift(-1) - work[x]
    normal_width = by_group['normal_x'].shift(-1) - work['normal_x']
    work['tmp'] = side_sum * width / 2
    work['tmp_normal'] = side_sum * normal_width / 2

    totals = work.groupby(gp_col).agg({'tmp': 'sum', 'tmp_normal': 'sum'}).reset_index()
    return totals.rename(columns={'tmp': x+'_area_'+y, 'tmp_normal': x+'_normalarea_'+y})

# Bare string literal as the cell's last expression: it is echoed as the cell output
# once all the definitions above have been executed.
'done'

'done'

In [3]:
# Reload the raw instalment rows under a separate name so this cell can be re-run
# even though `inst` is reassigned below.
inst0 = pd.read_pickle(PATH + '/inter/tmp/inst.pkl')

# Key identifying one scheduled instalment.
gp_col = ['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'SK_ID_CURR']

# One synthetic zero-amount payment per instalment, dated exactly on the due date,
# so every instalment has a row at LATE_DAYS_SIGN == 0.
new = inst0.groupby(gp_col)[['AMT_INSTALMENT']].max().reset_index()
new['AMT_PAYMENT'] = 0
new['DAYS_ENTRY_PAYMENT'] = new['DAYS_INSTALMENT']
inst = pd.concat([inst0, new])

# EARLY_DAYS > 0: paid before the due date; LATE_DAYS is its negation.
inst['EARLY_DAYS'] = inst['DAYS_INSTALMENT'] - inst['DAYS_ENTRY_PAYMENT']
inst['LATE_DAYS'] = -inst['EARLY_DAYS']
# LATE_DAYS where it is positive, otherwise 0.
inst['LATE_DAYS_SIGN'] = (np.sign(inst['LATE_DAYS']) > 0) * inst['LATE_DAYS']

# One row per (instalment, LATE_DAYS_SIGN); groupby sorts by its keys, so rows of an
# instalment come out in increasing LATE_DAYS_SIGN and the cumulative sum below runs
# in that order.
p = inst.groupby(gp_col+['LATE_DAYS_SIGN']).agg({'AMT_PAYMENT': 'sum', 'DAYS_ENTRY_PAYMENT': 'max', 'AMT_INSTALMENT': 'max'}).reset_index()
p['AMT_CUM_PAYMENT'] = p.groupby(gp_col)[['AMT_PAYMENT']].cumsum()
# Amount still outstanding after the payments made so far.
p['AMT_LATE_PAYMENT'] = p['AMT_INSTALMENT'] - p['AMT_CUM_PAYMENT']
# Zero out residuals whose magnitude is at most 1e-4.
p['AMT_LATE_PAYMENT'] = p['AMT_LATE_PAYMENT'] * (abs(p['AMT_LATE_PAYMENT'])>1e-4)
p['RATE_LATE_PAYMENT'] = p['AMT_LATE_PAYMENT'] / p['AMT_INSTALMENT']
# The former `p.sort_values('DAYS_ENTRY_PAYMENT', ascending=False)` call was removed:
# sort_values returns a new frame, the result was discarded, and it only cost a full
# sort. The row order of p is therefore unchanged, which the shift-based helpers
# (slope_agg, area_under_curve) rely on.
p.shape

(14011460, 11)

In [13]:
# Scratch arithmetic on row counts: 14011460 matches the p.shape output above and
# 13605229 matches the inst.shape output of the first cell.
# NOTE(review): the origin of 527828 is not shown in this notebook - confirm or delete this cell.
527828 - 14011460 + 13605229

121597

In [11]:
# Display the column labels of the aggregated payment frame p.
p.columns

Index(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'SK_ID_CURR',
       'LATE_DAYS_SIGN', 'AMT_PAYMENT', 'DAYS_ENTRY_PAYMENT', 'AMT_INSTALMENT',
       'AMT_CUM_PAYMENT', 'AMT_LATE_PAYMENT', 'RATE_LATE_PAYMENT'],
      dtype='object')

In [10]:
# Spine: one row per instalment (unique gp_col combination); every feature is joined onto it.
df = p.groupby(gp_col).size().reset_index()[gp_col]

# Join explicitly on the keys so a future column-name overlap cannot silently become a join key.
df = df.merge(slope_agg(p, gp_col, 'LATE_DAYS_SIGN', 'AMT_LATE_PAYMENT'), on=gp_col, how='left')
df = df.merge(slope_agg(p, gp_col, 'LATE_DAYS_SIGN', 'RATE_LATE_PAYMENT'), on=gp_col, how='left')
print(df.shape)

# DURATION: days between the due date and the latest recorded payment entry of the instalment.
# The helper column is added to p only for the aggregation and dropped again right after.
p['DURATION'] = p['DAYS_ENTRY_PAYMENT'] - p['DAYS_INSTALMENT']
df = df.merge(p.groupby(gp_col)[['DURATION']].max().reset_index(), on=gp_col, how='left')
p.drop('DURATION', axis=1, inplace=True)
print(df.shape)

# True when at least one row of the instalment was entered before the due date.
inst['HAS_EARLY_DAYS'] = inst['EARLY_DAYS'] > 0
df = df.merge(inst.groupby(gp_col)[['HAS_EARLY_DAYS']].max().reset_index(), on=gp_col, how='left')
inst.drop('HAS_EARLY_DAYS', axis=1, inplace=True)
print(df.shape)

df = df.merge(p.groupby(gp_col)[['AMT_INSTALMENT']].max().reset_index(), on=gp_col, how='left')

df = df.merge(count(inst, gp_col, 'AMT_INSTALMENT'), on=gp_col, how='left')
# count() names its output 'count_' + '_'.join(gp_col). Build that name directly:
# searching df.columns for the substring 'count_' finds
# 'positivecount_slope_AMT_LATE_PAYMENT' first, which made the ratio further down
# divide by the wrong column.
count_col = 'count_' + '_'.join(gp_col)
print(df.shape)

# Largest outstanding amount seen for the instalment, and its share of the instalment.
df = df.merge(p.groupby(gp_col)[['AMT_LATE_PAYMENT']].max().reset_index().rename(columns={'AMT_LATE_PAYMENT':'AMT_TOTAL_LATE'}), on=gp_col, how='left')
name = ratio_name('AMT_TOTAL_LATE', 'AMT_INSTALMENT')
df[name] = ratio(df, 'AMT_TOTAL_LATE', 'AMT_INSTALMENT')
print(df.shape)

df = df.merge(area_under_curve(p, gp_col, 'LATE_DAYS_SIGN', 'AMT_LATE_PAYMENT'), on=gp_col, how='left')
df = df.merge(area_under_curve(p, gp_col, 'LATE_DAYS_SIGN', 'RATE_LATE_PAYMENT'), on=gp_col, how='left')
print(df.shape)

# NOTE(review): unlike ratio(), these divisions have no +1 guard, so an
# AMT_INSTALMENT of 0 yields inf/NaN here - confirm that is acceptable.
df['SCORE'] = (df['DURATION'] / df['AMT_INSTALMENT'] + df[name]) / df['AMT_INSTALMENT']
print(df.shape)

# Share of the instalment's rows that have a positive LATE_DAYS_SIGN.
df = df.merge(positive_count(p, gp_col, 'LATE_DAYS_SIGN'), on=gp_col, how='left')
name1 = ratio_name('positivecount_LATE_DAYS_SIGN', count_col)
df[name1] = ratio(df, 'positivecount_LATE_DAYS_SIGN', count_col)
print(df.shape)

df.columns

(12862309, 28)
(12862309, 29)
(12862309, 30)
(12862309, 32)
(12862309, 34)
(12862309, 38)
(12862309, 39)
(12862309, 41)


Index(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER', 'DAYS_INSTALMENT', 'SK_ID_CURR',
       'slope_AMT_LATE_PAYMENT_max', 'slope_AMT_LATE_PAYMENT_min',
       'slope_AMT_LATE_PAYMENT_mean', 'slope_AMT_LATE_PAYMENT_std',
       'positivecount_slope_AMT_LATE_PAYMENT_x',
       'positivecount_slope_AMT_LATE_PAYMENT_y',
       'normalslope_AMT_LATE_PAYMENT_max', 'normalslope_AMT_LATE_PAYMENT_min',
       'normalslope_AMT_LATE_PAYMENT_mean', 'normalslope_AMT_LATE_PAYMENT_std',
       'positivecount_normalslope_AMT_LATE_PAYMENT_x',
       'positivecount_normalslope_AMT_LATE_PAYMENT_y',
       'slope_RATE_LATE_PAYMENT_max', 'slope_RATE_LATE_PAYMENT_min',
       'slope_RATE_LATE_PAYMENT_mean', 'slope_RATE_LATE_PAYMENT_std',
       'positivecount_slope_RATE_LATE_PAYMENT_x',
       'positivecount_slope_RATE_LATE_PAYMENT_y',
       'normalslope_RATE_LATE_PAYMENT_max',
       'normalslope_RATE_LATE_PAYMENT_min',
       'normalslope_RATE_LATE_PAYMENT_mean',
       'normalslope_RATE_LATE_PAYMENT_std',
   

In [None]:
train = pd.read_csv(PATH + 'application_train.csv')
test = pd.read_csv(PATH + 'application_test.csv')

# Spine: one row per applicant id found in train or test.
merge_col = ['SK_ID_CURR']
m = pd.concat([train[merge_col], test[merge_col]])

# count() names its column after the keys only, so both calls would return
# 'count_SK_ID_CURR' and the second merge would produce '_x'/'_y' suffixes.
# Give each count a distinct name before joining.
count_name = 'count_' + '_'.join(merge_col)
m = m.merge(count(p, merge_col, 'DAYS_ENTRY_PAYMENT').rename(columns={count_name: count_name + '_DAYS_ENTRY_PAYMENT'}), on=merge_col, how='left')
m = m.merge(count(df, merge_col, 'SCORE').rename(columns={count_name: count_name + '_SCORE'}), on=merge_col, how='left')
print(m.shape)

agg_funs = ['mean', 'max', 'min', 'sum', 'std']
feature_cols = ['AMT_INSTALMENT', 'DURATION', 'AMT_TOTAL_LATE', 'LATE_DAYS_SIGN'] + [c for c in df.columns if '_area_' in c]
for x in feature_cols:
    # NOTE(review): 'LATE_DAYS_SIGN' is not a column of df (it only exists in p), so
    # aggregating it from df raised KeyError. It is taken from p here - confirm that
    # row-level aggregation over p is what was intended.
    source = df if x in df.columns else p
    # Left join on the key: the default inner join would drop every applicant
    # without instalment history from m.
    m = m.merge(numerical(source, merge_col, x, agg_funs), on=merge_col, how='left')
print(m.shape)

