In [6]:
import pandas as pd
import os
import gc
import numpy as np
from sklearn.linear_model import LinearRegression

def _set_type(series, dtype):
    _max, _min = max(series), min(series)
    if dtype == 'uint':
        if _max <= 255: return np.uint8
        elif _max <= 65535: return np.uint16
        elif _max <= 4294967295: return np.uint32
        else: return np.uint64
    elif dtype == 'int':
        if _min >= -128 and _max <= 127: return np.int8
        elif _min >=-32768 and _max <= 32767: return np.int16
        elif _min >= -2147483648 and _max <= 2147483647: return np.int32
        else: return np.int64
    elif dtype == 'float':
        if max(abs(_min), _max) <= 3.4028235e+38: return np.float32
        else: return np.float64

def split_categorical_feature(df, group_col, calc_col):
    tmp_df = pd.concat([df[group_col], pd.get_dummies(df[calc_col], prefix=calc_col)], axis=1).groupby(by=group_col).sum().reset_index()
    dtype = {x: _set_type(tmp_df[x], 'uint') for x in tmp_df.columns if x != group_col}
    for x in tmp_df:
        if x in df.columns:
            dtype[x] = df[x].dtype
    return tmp_df.astype(dtype)

def score(df, group_col, calc_col, time_col, score_map, table_name):
    total = list(group_col)
    total.extend([calc_col, time_col])
    _df = df.sort_values('MONTHS_BALANCE').reset_index()
    _df[calc_col] = _df[calc_col].map(score_map)
    group = _df.groupby(by=group_col)
    _df[calc_col] = (group[calc_col].shift(1).fillna(0) + group[calc_col].shift(-1).fillna(0) + _df[calc_col]) ** 2 / np.exp(-(_df[time_col])**2/144/2)
    dtype = {x: df[x].dtype for x in group_col if x in df.columns}
    dtype[calc_col] = _set_type(_df[calc_col], 'float')
    __df = _df[[group_col, calc_col]].astype(dtype).rename(index=str, columns={calc_col: table_name + '_score'})
    del _df
    gc.collect()
    return __df

def count(df, group_col, calc_col, table_name):
    group = df[[group_col, calc_col]].groupby(by=group_col)[[calc_col]].count().reset_index()
    dtype = {x: df[x].dtype for x in group_col if x in df.columns}
    dtype[calc_col] = _set_type(group[calc_col], 'uint')
    _df = group.astype(dtype).rename(index=str, columns={calc_col: table_name + '_' + calc_col + '_count'})
    return _df

def linear(df_, group_col, value_col, time_col, table_name):
    l = []
    df = df_.sort_values(time_col).reset_index()
    gp = df[[group_col, time_col, value_col]].groupby(by=group_col)
    for i, group in gp:
        lg = LinearRegression()
        lg.fit(group[[time_col]], group[[value_col]])
        
        group1 = group.copy()
        group1[time_col] = (group1[time_col] - group1[time_col].max()) / (group1[time_col].max() - group1[time_col].min() + 1)
        lg1 = LinearRegression()
        lg1.fit(group[[time_col]], group[[value_col]])
        l.append([i, lg.coef_[0][0], lg1.coef_[0][0]])
        
    tmp_df = pd.DataFrame(l, columns=[group_col, table_name + '_' + 'lg_coef', table_name + '_' + 'lg_normalized_coef'])
    dtype = {x: df[x].dtype for x in group_col if x in df.columns}
    dtype[table_name + '_' + 'lg_coef'] = _set_type(tmp_df[table_name + '_' + 'lg_coef'], 'float')
    dtype[table_name + '_' + 'lg_normalized_coef'] = _set_type(tmp_df[table_name + '_' + 'lg_normalized_coef'], 'float')
    return tmp_df.astype(dtype)

def last_before_C(df_, group_col, value_col, time_col, score_map, table_name):
    l = []
    df = df_.sort_values('MONTHS_BALANCE', ascending=False).reset_index()
    df[value_col] = df[value_col].map(score_map)
    d = {}
    for i in range(df.shape[0]):
        idx = df.loc[i, group_col]
        val = df.loc[i, value_col]
        if d.get(idx) == None:
            d[idx] = 'on C'
        elif d.get(idx) == 'on C':
            if val != 7:
                d[idx] = 'not on C'
                l.append([idx, val])
    tmp = pd.DataFrame(l, columns=[group_col, table_name+'_last_on_C_'+value_col])
    return tmp.astype({group_col: df[group_col].dtype, table_name+'_last_on_C_'+value_col: _set_type(tmp[table_name+'_last_on_C_'+value_col], 'uint')})

def ratio_name(numerator, denominator): return numerator + '_divide_' + denominator

def ratio(df, numerator, denominator):
    return df[numerator] / df[denominator]

def substraction_name(col1, col2): return col1 + '_minus_' + col2

def substraction(df, col1, col2):
    return df[col1] - df[col2]

In [7]:
def numerical_agg( df, gp_col, agg_col, extrafunc_list = None):
    
    agg_list = ['max', 'min', 'std','mean']
    '''
    Possible extrafunc_list: ['sum','median','two_minus_one_third','positive_count', 'negative_count','standard_error', 'trimmed_mean_10_pct', 'trimmed_mean_25_pct','normed_std', 'max_minus_min','one_third','two_third']
    '''
    if extrafunc_list:
        if 'one_third' in extrafunc_list:
            def one_third(series):
                return series.quantile(1/3)
            
        if 'two_third' in extrafunc_list:
            def two_third(series):
                return series.quantile(2/3)
            
        if 'max_minus_min' in extrafunc_list:
            def max_minus_min(series):
                return (series.max() - series.min())
        
        if 'two_minus_one_third' in extrafunc_list:
            def two_minus_one_third(series):
                return(series.quantile(0.66666) - series.quantile(0.33333))
       
        if 'positive_count' in extrafunc_list:
            def positive_count(series):
                return pd.Series(series > 0).sum()

        if 'negative_count' in extrafunc_list:
            def negative_count(series):
                return pd.Series(series < 0).sum()

        if 'standard_error' in extrafunc_list:
            def standard_error(series):
                return series.std()/np.sqrt(len(series))

        if 'normed_std' in extrafunc_list:
            def normed_std(series):
                return series.std()/series.mean()

        if 'trimmed_mean_10_pct' in extrafunc_list:
            def trimmed_mean_10_pct(series):
                return stats.trim_mean(series.dropna(), 0.1)

        if 'trimmed_mean_25_pct' in extrafunc_list:
            def trimmed_mean_25_pct(series):
                return stats.trim_mean(series.dropna(), 0.25)
        
        list_tocall = []
        for i in extrafunc_list:
            if i not in set(['sum', 'median']):
                list_tocall.append(eval(i))
            
    if extrafunc_list != None:   
        agg_list.extend(list_tocall)
    
    _df = df.groupby(gp_col).agg({agg_col:agg_list})
    columns = []
    for pre in _df.columns.levels[0]:
        for middle in _df.columns.levels[1]:
            columns.append('bureau_%s_%s' %(pre,middle))
    _df.columns = columns
    
    return _df.reset_index()

In [8]:
import pandas as pd
import numpy as np

PATH = '/home/kai/data/kaggle/homecredit/'
inst = pd.read_csv(PATH + 'installments_payments.csv')
print('installments_payments done')

installments_payments done


In [9]:
sv_path = PATH + 'data/'
inst[substraction_name('DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT')] = substraction(inst, 'DAYS_INSTALMENT' ,'DAYS_ENTRY_PAYMENT')

inst[substraction_name('AMT_PAYMENT', 'AMT_INSTALMENT')] = substraction(inst, 'AMT_PAYMENT' ,'AMT_INSTALMENT')

inst['late_days'] = 0
x = inst['DAYS_INSTALMENT_minus_DAYS_ENTRY_PAYMENT'] < 0
inst['late_days'][x] = inst['DAYS_INSTALMENT_minus_DAYS_ENTRY_PAYMENT'][x]

inst['AMT_PAYMENT_minus_AMT_INSTALMENT_divide_DAYS_INSTALMENT_minus_DAYS_ENTRY_PAYMENT'] = inst['AMT_PAYMENT_minus_AMT_INSTALMENT'] / abs(inst['late_days'] - 1)
inst = inst.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
x = linear(inst, 'SK_ID_CURR', 'AMT_PAYMENT_minus_AMT_INSTALMENT', 'DAYS_ENTRY_PAYMENT', 'installment')
x.to_pickle(sv_path + '1.pkl')
print('done')

done


In [11]:
y = linear(inst, 'SK_ID_CURR', 'DAYS_INSTALMENT_minus_DAYS_ENTRY_PAYMENT', 'DAYS_ENTRY_PAYMENT', 'installment')
y.to_pickle(sv_path + '2.pkl')
print('done')

done


In [12]:
z = linear(inst, 'SK_ID_CURR', 'AMT_PAYMENT_minus_AMT_INSTALMENT_divide_DAYS_INSTALMENT_minus_DAYS_ENTRY_PAYMENT', 'DAYS_ENTRY_PAYMENT', 'installment')
z.to_pickle(sv_path + '3.pkl')
print('done')

done


In [None]:
print(x.SK_ID_CURR)

In [13]:
import pandas as pd
import os
PATH = '/home/kai/data/kaggle/homecredit/'
train = pd.read_pickle(PATH + 'train_merged.pkl')
test = pd.read_pickle(PATH + 'test_merged.pkl')

In [15]:
for x in os.listdir(PATH + 'data/'):
    if x.split('.')[-1] == 'pkl':
        print(x)
        tmp = pd.read_pickle(PATH + 'data/' + x)
        train = train.merge(tmp, on='SK_ID_CURR', how='left')
        test = test.merge(tmp, on='SK_ID_CURR', how='left')
train.to_pickle(PATH + 'train_merged.pkl')
test.to_pickle(PATH + 'test_merged.pkl')
print('done')

3.pkl
2.pkl
1.pkl
done
