In [1]:
from scipy.stats import skew,kurtosis,iqr
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import numpy as np
sys.path.append('../../LIB/')
from env import ENV
import pickle
import warnings

from functools import partial
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')
import multiprocessing as mp
from functools import reduce

In [2]:
def calculate_na(ser):
    return np.sum(ser.isnull())


def scan_nan_portion(df):
    portions = []
    columns = []
    for col in df.columns:
        columns.append(col)
        portions.append(np.sum(df[col].isnull())/len(df))
    return pd.Series(data=portions, index=columns)

In [3]:
def add_features(feature_name, aggs, features, feature_names, groupby):
    feature_names.extend(['{}_{}'.format(feature_name, agg) for agg in aggs])

    for agg in aggs:
        if agg == 'kurt':
            agg_func = kurtosis
        elif agg == 'iqr':
            agg_func = iqr
        else:
            agg_func = agg
        
        g = groupby[feature_name].agg(agg_func).reset_index().rename(index=str,
                                                                columns={feature_name: '{}_{}'.format(feature_name,
                                                                                                      agg)})
        features = features.merge(g, on='SK_ID_CURR', how='left')
    return features, feature_names



def add_features_in_group(features, gr_, feature_name, aggs, prefix):
    for agg in aggs:
        if agg == 'sum':
            features['{}{}_sum'.format(prefix, feature_name)] = gr_[feature_name].sum()
        elif agg == 'mean':
            features['{}{}_mean'.format(prefix, feature_name)] = gr_[feature_name].mean()
        elif agg == 'max':
            features['{}{}_max'.format(prefix, feature_name)] = gr_[feature_name].max()
        elif agg == 'min':
            features['{}{}_min'.format(prefix, feature_name)] = gr_[feature_name].min()
        elif agg == 'std':
            features['{}{}_std'.format(prefix, feature_name)] = gr_[feature_name].std()
        elif agg == 'count':
            features['{}{}_count'.format(prefix, feature_name)] = gr_[feature_name].count()
        elif agg == 'skew':
            features['{}{}_skew'.format(prefix, feature_name)] = skew(gr_[feature_name])
        elif agg == 'kurt':
            features['{}{}_kurt'.format(prefix, feature_name)] = kurtosis(gr_[feature_name])
        elif agg == 'iqr':
            features['{}{}_iqr'.format(prefix, feature_name)] = iqr(gr_[feature_name])
        elif agg == 'median':
            features['{}{}_median'.format(prefix, feature_name)] = gr_[feature_name].median()

    return features

def chunk_groups(groupby_object, chunk_size):
    n_groups = groupby_object.ngroups
    group_chunk, index_chunk = [], []
    for i, (index, df) in enumerate(groupby_object):
        group_chunk.append(df)
        index_chunk.append(index)

        if (i + 1) % chunk_size == 0 or i + 1 == n_groups:
            group_chunk_, index_chunk_ = group_chunk.copy(), index_chunk.copy()
            group_chunk, index_chunk = [], []
            yield index_chunk_, group_chunk_

def parallel_apply(groups, func, index_name='Index', num_workers=1, chunk_size=100000):
    n_chunks = np.ceil(1.0 * groups.ngroups / chunk_size)
    indeces, features = [], []
    for index_chunk, groups_chunk in tqdm(chunk_groups(groups, chunk_size), total=n_chunks):
        with mp.pool.Pool(num_workers) as executor:
            features_chunk = executor.map(func, groups_chunk)
        features.extend(features_chunk)
        indeces.extend(index_chunk)

    features = pd.DataFrame(features)
    features.index = indeces
    features.index.name = index_name
    return features

In [4]:
application = pickle.load(open(ENV.application_train_reproduce.value,'rb'))
installments = pd.read_csv(ENV.installments_payments_ori.value)

In [5]:
p = scan_nan_portion(application)
print(p.describe())

count    405.000000
mean       0.346126
std        0.298695
min        0.000000
25%        0.000052
50%        0.313455
75%        0.679680
max        0.829271
dtype: float64


# Feature_engineering

In [6]:
INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = []
for agg in ['mean', 'min', 'max', 'sum', 'var']:
    for select in ['AMT_INSTALMENT',
                   'AMT_PAYMENT',
                   'DAYS_ENTRY_PAYMENT',
                   'DAYS_INSTALMENT',
                   'NUM_INSTALMENT_NUMBER',
                   'NUM_INSTALMENT_VERSION'
                   ]:
        INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES.append((select, agg))
INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES = [(['SK_ID_CURR'], INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES)]

In [7]:
groupby_aggregate_names = []
for groupby_cols, specs in tqdm(INSTALLMENTS_PAYMENTS_AGGREGATION_RECIPIES):
    group_object = installments.groupby(groupby_cols)
    for select, agg in tqdm(specs):
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        application = application.merge(group_object[select]
                              .agg(agg)
                              .reset_index()
                              .rename(index=str,
                                      columns={select: groupby_aggregate_name})
                              [groupby_cols + [groupby_aggregate_name]],
                              on=groupby_cols,
                              how='left')
        groupby_aggregate_names.append(groupby_aggregate_name)

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [8]:
print(application.shape)
print(len(application.SK_ID_CURR.unique()))

(307511, 435)
307511


In [9]:
p = scan_nan_portion(application)
print(p.describe())

count    435.000000
mean       0.325845
std        0.297686
min        0.000000
25%        0.001525
50%        0.313455
75%        0.670958
max        0.829271
dtype: float64


# Solution 4

In [10]:

installments_one = installments[installments['SK_ID_CURR']==328162]

# installments_ = installments.sample(10000)
installments_ = installments.sample(frac=1)
installments_['instalment_paid_late_in_days'] = installments_['DAYS_ENTRY_PAYMENT'] - installments_['DAYS_INSTALMENT'] 
installments_['instalment_paid_late'] = (installments_['instalment_paid_late_in_days'] > 0).astype(int)
installments_['instalment_paid_over_amount'] = installments_['AMT_PAYMENT'] - installments_['AMT_INSTALMENT']
installments_['instalment_paid_over'] = (installments_['instalment_paid_over_amount'] > 0).astype(int)

features = pd.DataFrame({'SK_ID_CURR':installments_['SK_ID_CURR'].unique()})
groupby = installments_.groupby(['SK_ID_CURR'])

## Per id aggregations

In [11]:
feature_names = []

features, feature_names = add_features('NUM_INSTALMENT_VERSION', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                     features, feature_names, groupby)

features, feature_names = add_features('instalment_paid_late_in_days', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                     features, feature_names, groupby)

features, feature_names = add_features('instalment_paid_late', ['sum','mean'],
                                     features, feature_names, groupby)

features, feature_names = add_features('instalment_paid_over_amount', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                     features, feature_names, groupby)

features, feature_names = add_features('instalment_paid_over', ['sum','mean'],
                                     features, feature_names, groupby)
    
display(features.head())

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_VERSION_mean,NUM_INSTALMENT_VERSION_max,NUM_INSTALMENT_VERSION_min,NUM_INSTALMENT_VERSION_std,NUM_INSTALMENT_VERSION_median,NUM_INSTALMENT_VERSION_skew,NUM_INSTALMENT_VERSION_kurt,NUM_INSTALMENT_VERSION_iqr,...,instalment_paid_over_amount_mean,instalment_paid_over_amount_max,instalment_paid_over_amount_min,instalment_paid_over_amount_std,instalment_paid_over_amount_median,instalment_paid_over_amount_skew,instalment_paid_over_amount_kurt,instalment_paid_over_amount_iqr,instalment_paid_over_sum,instalment_paid_over_mean
0,265131,26.0,1.083333,2.0,1.0,0.28233,1.0,3.21996,7.090909,0.0,...,-299.626875,0.0,-2394.0,808.624837,0.0,-2.421853,3.142839,0.75375,0,0.0
1,277426,103.0,0.635802,2.0,0.0,0.50778,1.0,-0.279466,-1.248001,1.0,...,-154.003333,0.0,-8482.5,1014.019726,0.0,-6.955568,48.041474,0.0,0,0.0
2,196228,30.0,0.47619,2.0,0.0,0.56389,0.0,0.653489,-0.643037,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-3.0,0.0,0,0.0
3,423170,104.0,0.514851,2.0,0.0,0.53928,0.0,0.325034,-1.111557,1.0,...,-1988.972822,0.0,-18075.33,5197.231851,0.0,-2.335282,3.56914,0.0,0,0.0
4,101625,26.0,1.0,1.0,1.0,0.0,1.0,0.0,-3.0,0.0,...,-1421.125962,0.0,-18341.1,4777.951965,0.0,-3.359621,8.081362,0.0,0,0.0


## Per id K last installment information


In [12]:
def last_k_instalment_features(gr, periods):
    gr_ = gr.copy()
    gr_.sort_values(['DAYS_INSTALMENT'],ascending=False, inplace=True)
    
    features = {}

    for period in periods:
        gr_period = gr_.iloc[:period]

        features = add_features_in_group(features,gr_period, 'NUM_INSTALMENT_VERSION', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                         'last_{}_'.format(period))
        
        features = add_features_in_group(features,gr_period, 'instalment_paid_late_in_days', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                         'last_{}_'.format(period))
        features = add_features_in_group(features,gr_period ,'instalment_paid_late', 
                                     ['count','mean'],
                                         'last_{}_'.format(period))
        features = add_features_in_group(features,gr_period ,'instalment_paid_over_amount', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                         'last_{}_'.format(period))
        features = add_features_in_group(features,gr_period,'instalment_paid_over', 
                                     ['count','mean'],
                                         'last_{}_'.format(period))        
    
    return features

In [13]:
func = partial(last_k_instalment_features, periods=[1,5,10,20,50,100])

g = parallel_apply(groupby, func, index_name='SK_ID_CURR',
                   num_workers=16, chunk_size=10000).reset_index()
features = features.merge(g, on='SK_ID_CURR', how='left')

display(features.head())

HBox(children=(IntProgress(value=0, max=34), HTML(value='')))




Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_VERSION_mean,NUM_INSTALMENT_VERSION_max,NUM_INSTALMENT_VERSION_min,NUM_INSTALMENT_VERSION_std,NUM_INSTALMENT_VERSION_median,NUM_INSTALMENT_VERSION_skew,NUM_INSTALMENT_VERSION_kurt,NUM_INSTALMENT_VERSION_iqr,...,last_5_instalment_paid_over_amount_kurt,last_5_instalment_paid_over_amount_max,last_5_instalment_paid_over_amount_mean,last_5_instalment_paid_over_amount_median,last_5_instalment_paid_over_amount_min,last_5_instalment_paid_over_amount_skew,last_5_instalment_paid_over_amount_std,last_5_instalment_paid_over_amount_sum,last_5_instalment_paid_over_count,last_5_instalment_paid_over_mean
0,265131,26.0,1.083333,2.0,1.0,0.28233,1.0,3.21996,7.090909,0.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0
1,277426,103.0,0.635802,2.0,0.0,0.50778,1.0,-0.279466,-1.248001,1.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0
2,196228,30.0,0.47619,2.0,0.0,0.56389,0.0,0.653489,-0.643037,1.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0
3,423170,104.0,0.514851,2.0,0.0,0.53928,0.0,0.325034,-1.111557,1.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0
4,101625,26.0,1.0,1.0,1.0,0.0,1.0,0.0,-3.0,0.0,...,-3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,0.0


In [14]:
p = scan_nan_portion(features)
print(p.describe())

count    218.000000
mean       0.014790
std        0.116656
min        0.000000
25%        0.000000
50%        0.000027
75%        0.002862
max        1.000000
dtype: float64


## per id dynamic

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
def trend_in_last_k_instalment_features(gr, periods):
    gr_ = gr.copy()
    gr_.sort_values(['DAYS_INSTALMENT'],ascending=False, inplace=True)
    
    features = {}

    for period in periods:
        gr_period = gr_.iloc[:period]


        features = _add_trend_feature(features,gr_period,
                                      'instalment_paid_late_in_days','{}_period_trend_'.format(period)
                                     )
        features = _add_trend_feature(features,gr_period,
                                      'instalment_paid_over_amount','{}_period_trend_'.format(period)
                                     )
    return features

def _add_trend_feature(features,gr,feature_name, prefix):
    y = gr[feature_name].values
    try:
        x = np.arange(0,len(y)).reshape(-1,1)
        lr = LinearRegression()
        lr.fit(x,y)
        trend = lr.coef_[0]
    except:
        trend=np.nan
    features['{}{}'.format(prefix,feature_name)] = trend
    return features

In [17]:
func = partial(trend_in_last_k_instalment_features, periods=[10,50,100,500])

g = parallel_apply(groupby, func, index_name='SK_ID_CURR',
                   num_workers=16, chunk_size=10000).reset_index()
features = features.merge(g, on='SK_ID_CURR', how='left')

display(features.head())

HBox(children=(IntProgress(value=0, max=34), HTML(value='')))




Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_VERSION_mean,NUM_INSTALMENT_VERSION_max,NUM_INSTALMENT_VERSION_min,NUM_INSTALMENT_VERSION_std,NUM_INSTALMENT_VERSION_median,NUM_INSTALMENT_VERSION_skew,NUM_INSTALMENT_VERSION_kurt,NUM_INSTALMENT_VERSION_iqr,...,last_5_instalment_paid_over_count,last_5_instalment_paid_over_mean,100_period_trend_instalment_paid_late_in_days,100_period_trend_instalment_paid_over_amount,10_period_trend_instalment_paid_late_in_days,10_period_trend_instalment_paid_over_amount,500_period_trend_instalment_paid_late_in_days,500_period_trend_instalment_paid_over_amount,50_period_trend_instalment_paid_late_in_days,50_period_trend_instalment_paid_over_amount
0,265131,26.0,1.083333,2.0,1.0,0.28233,1.0,3.21996,7.090909,0.0,...,5,0.0,-0.052174,-30.225854,0.175758,0.0,-0.052174,-30.225854,-0.052174,-30.225854
1,277426,103.0,0.635802,2.0,0.0,0.50778,1.0,-0.279466,-1.248001,1.0,...,5,0.0,0.01904,-7.403459,0.0,0.0,0.000336,-1.536248,-0.045906,0.0
2,196228,30.0,0.47619,2.0,0.0,0.56389,0.0,0.653489,-0.643037,1.0,...,5,0.0,-0.117127,0.0,-0.175758,0.0,-0.117127,0.0,-0.109628,0.0
3,423170,104.0,0.514851,2.0,0.0,0.53928,0.0,0.325034,-1.111557,1.0,...,5,0.0,-0.009049,28.20233,0.0,0.0,-0.008649,21.97413,-0.149772,-69.742059
4,101625,26.0,1.0,1.0,1.0,0.0,1.0,0.0,-3.0,0.0,...,5,0.0,0.246838,-64.307585,-1.818182,0.0,0.246838,-64.307585,0.246838,-64.307585


# Solution 5

In [18]:
def last_k_instalment_features_with_fractions(gr, periods, fraction_periods):
    gr_ = gr.copy()
    gr_.sort_values(['DAYS_INSTALMENT'],ascending=False, inplace=True)
    
    features = {}

    for period in periods:
        gr_period = gr_.iloc[:period]

        features = add_features_in_group(features,gr_period, 'NUM_INSTALMENT_VERSION', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                         'last_{}_'.format(period))
        
        features = add_features_in_group(features,gr_period, 'instalment_paid_late_in_days', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                         'last_{}_'.format(period))
        features = add_features_in_group(features,gr_period ,'instalment_paid_late', 
                                     ['count','mean'],
                                         'last_{}_'.format(period))
        features = add_features_in_group(features,gr_period ,'instalment_paid_over_amount', 
                                       ['sum','mean','max','min','std', 'median','skew', 'kurt','iqr'],
                                         'last_{}_'.format(period))
        features = add_features_in_group(features,gr_period,'instalment_paid_over', 
                                     ['count','mean'],
                                         'last_{}_'.format(period))        
    
    for short_period, long_period in fraction_periods:
        short_feature_names = _get_feature_names(features, short_period)
        long_feature_names = _get_feature_names(features, long_period)
        
        for short_feature, long_feature in zip(short_feature_names, long_feature_names):
            old_name_chunk = '_{}_'.format(short_period)
            new_name_chunk ='_{}by{}_fraction_'.format(short_period, long_period)
            fraction_feature_name = short_feature.replace(old_name_chunk, new_name_chunk)
            features[fraction_feature_name] = safe_div(features[short_feature], features[long_feature])
    return pd.Series(features)

def _get_feature_names(features, period):
    return sorted([feat for feat in features.keys() if '_{}_'.format(period) in feat])


def safe_div(a,b):
    try:
        return float(a)/float(b)
    except:
        return 0.0

In [19]:
func = partial(last_k_instalment_features_with_fractions, 
               periods=[1,5,10,20,50,100],
               fraction_periods=[(5,20),(5,50),(10,100)])

g = parallel_apply(groupby, func, index_name='SK_ID_CURR',
                   num_workers=10, chunk_size=1000).reset_index()
features = features.merge(g, on='SK_ID_CURR', how='left')

display(features.head())

HBox(children=(IntProgress(value=0, max=340), HTML(value='')))




Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_VERSION_mean,NUM_INSTALMENT_VERSION_max,NUM_INSTALMENT_VERSION_min,NUM_INSTALMENT_VERSION_std,NUM_INSTALMENT_VERSION_median,NUM_INSTALMENT_VERSION_skew,NUM_INSTALMENT_VERSION_kurt,NUM_INSTALMENT_VERSION_iqr,...,last_10by100_fraction_instalment_paid_over_amount_kurt,last_10by100_fraction_instalment_paid_over_amount_max,last_10by100_fraction_instalment_paid_over_amount_mean,last_10by100_fraction_instalment_paid_over_amount_median,last_10by100_fraction_instalment_paid_over_amount_min,last_10by100_fraction_instalment_paid_over_amount_skew,last_10by100_fraction_instalment_paid_over_amount_std,last_10by100_fraction_instalment_paid_over_amount_sum,last_10by100_fraction_instalment_paid_over_count,last_10by100_fraction_instalment_paid_over_mean
0,265131,26.0,1.083333,2.0,1.0,0.28233,1.0,3.21996,7.090909,0.0,...,-0.954551,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.416667,0.0
1,277426,103.0,0.635802,2.0,0.0,0.50778,1.0,-0.279466,-1.248001,1.0,...,-0.048682,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.1,0.0
2,196228,30.0,0.47619,2.0,0.0,0.56389,0.0,0.653489,-0.643037,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15873,0.0
3,423170,104.0,0.514851,2.0,0.0,0.53928,0.0,0.325034,-1.111557,1.0,...,-6.861169,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.1,0.0
4,101625,26.0,1.0,1.0,1.0,0.0,1.0,0.0,-3.0,0.0,...,-0.371225,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.384615,0.0


In [20]:
X = application.merge(features, on='SK_ID_CURR',how='left')
X.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,last_10by100_fraction_instalment_paid_over_amount_kurt,last_10by100_fraction_instalment_paid_over_amount_max,last_10by100_fraction_instalment_paid_over_amount_mean,last_10by100_fraction_instalment_paid_over_amount_median,last_10by100_fraction_instalment_paid_over_amount_min,last_10by100_fraction_instalment_paid_over_amount_skew,last_10by100_fraction_instalment_paid_over_amount_std,last_10by100_fraction_instalment_paid_over_amount_sum,last_10by100_fraction_instalment_paid_over_count,last_10by100_fraction_instalment_paid_over_mean
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.526316,0.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,-0.054418,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,0.151515,0.0


In [21]:
p = scan_nan_portion(X)
print(p.describe())

count    939.000000
mean       0.185223
std        0.252511
min        0.000000
25%        0.051601
50%        0.051744
75%        0.211599
max        1.000000
dtype: float64


# Saving

In [22]:
print(X.shape)
print(len(X.SK_ID_CURR.unique()))

(307511, 939)
307511


In [23]:
pickle.dump(X,open(ENV.application_train_reproduce.value,'wb'))

In [24]:
p = scan_nan_portion(X)
print(p.describe())

count    939.000000
mean       0.185223
std        0.252511
min        0.000000
25%        0.051601
50%        0.051744
75%        0.211599
max        1.000000
dtype: float64
