In [None]:
import pandas as pd
import numpy as np

# Directory holding the Home Credit Kaggle CSV files.
PATH = '/home/kai/data/kaggle/homecredit/'
train = pd.read_csv(PATH + 'application_train.csv')
print('train')
test = pd.read_csv(PATH + 'application_test.csv')
print('test')
# Keep the previous-application table under its own name: assigning it to
# `test` would overwrite the application test set that the concat below needs.
previous = pd.read_csv(PATH + 'previous_application.csv')
print('previous')
# All applicant ids (train followed by test), one column: SK_ID_CURR.
df = pd.concat([train[['SK_ID_CURR']], test[['SK_ID_CURR']]])
print('concat')

In [None]:
import pandas as pd
import os
import gc
import numpy as np
from sklearn.linear_model import LinearRegression

def _set_type(series, dtype):
    """Return the narrowest NumPy scalar type of the given kind that fits ``series``.

    Args:
        series: Iterable of numbers (e.g. a pandas Series or a list).
        dtype: Kind of type to pick: ``'uint'``, ``'int'`` or ``'float'``.

    Returns:
        One of ``np.uint8/16/32/64``, ``np.int8/16/32/64`` or
        ``np.float32/64``. NaN entries are ignored when measuring the value
        range; if no values remain, the smallest type of the kind is returned.

    Raises:
        ValueError: If ``dtype`` is not a supported kind, or if ``'uint'`` is
            requested for data that contains negative values.
    """
    if dtype not in ('uint', 'int', 'float'):
        raise ValueError("dtype must be 'uint', 'int' or 'float', got %r" % (dtype,))

    values = np.asarray(series)
    if values.dtype.kind == 'f':
        # NaN makes max()/min() comparisons meaningless; measure real values only.
        values = values[~np.isnan(values)]
    if values.size == 0:
        return {'uint': np.uint8, 'int': np.int8, 'float': np.float32}[dtype]

    _max, _min = values.max(), values.min()
    if isinstance(_max, np.generic):
        # Compare as plain Python numbers to avoid fixed-width overflow/promotion.
        _max, _min = _max.item(), _min.item()

    if dtype == 'float':
        if max(abs(_min), _max) <= float(np.finfo(np.float32).max):
            return np.float32
        return np.float64

    if dtype == 'uint':
        if _min < 0:
            # Casting negatives to an unsigned type would silently wrap around.
            raise ValueError("cannot use an unsigned type: minimum value is %r" % (_min,))
        candidates, widest = (np.uint8, np.uint16, np.uint32), np.uint64
    else:
        candidates, widest = (np.int8, np.int16, np.int32), np.int64

    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= _min and _max <= limits.max:
            return candidate
    return widest

def split_categorical_feature(df, group_col, calc_col):
    """One-hot encode ``calc_col`` and sum the indicator columns per ``group_col``.

    Args:
        df: Source DataFrame containing ``group_col`` and ``calc_col``.
        group_col: Column to group by.
        calc_col: Categorical column to expand; its name is used as the
            prefix of the generated indicator columns.

    Returns:
        DataFrame with ``group_col`` plus one summed column per category.
        Generated columns are cast to the unsigned type chosen by
        ``_set_type``; columns that also exist in ``df`` keep ``df``'s dtype.
    """
    indicators = pd.get_dummies(df[calc_col], prefix=calc_col)
    combined = pd.concat([df[group_col], indicators], axis=1)
    summed = combined.groupby(by=group_col).sum().reset_index()

    # First pass: smallest unsigned type for every non-grouping column.
    target_types = {}
    for column in summed.columns:
        if column != group_col:
            target_types[column] = _set_type(summed[column], 'uint')

    # Second pass: columns present in the input keep their original dtype.
    for column in summed:
        if column in df.columns:
            target_types[column] = df[column].dtype

    return summed.astype(target_types)

def score(df, group_col, calc_col, time_col, score_map, table_name):
    """Compute a per-row, neighbour-smoothed score of ``calc_col``.

    Rows are ordered by ``time_col``; ``calc_col`` is mapped through
    ``score_map``; then, within each group, every row's score is

        (previous + next + current) ** 2 / exp(-time ** 2 / 144 / 2)

    where a missing previous/next neighbour counts as 0.

    Args:
        df: Source DataFrame.
        group_col: Column name, or list of column names, identifying a group.
        calc_col: Column holding the raw status values to be scored.
        time_col: Numeric time column used for ordering and for the weight.
        score_map: Mapping (or anything ``Series.map`` accepts) from raw
            status values to numeric scores.
        table_name: Prefix for the output column, ``<table_name>_score``.

    Returns:
        DataFrame with the group column(s) and ``<table_name>_score``, one
        row per input row, with the index labels converted to ``str``.
    """
    # Accept both a single column name and a list of names.
    group_cols = [group_col] if isinstance(group_col, str) else list(group_col)

    # Order by the caller's time column so shift(+/-1) yields time neighbours.
    _df = df.sort_values(time_col).reset_index()
    _df[calc_col] = _df[calc_col].map(score_map)
    group = _df.groupby(by=group_cols)
    neighbours = group[calc_col].shift(1).fillna(0) + group[calc_col].shift(-1).fillna(0)
    _df[calc_col] = (neighbours + _df[calc_col]) ** 2 / np.exp(-(_df[time_col]) ** 2 / 144 / 2)

    dtype = {x: df[x].dtype for x in group_cols if x in df.columns}
    dtype[calc_col] = _set_type(_df[calc_col], 'float')
    result = _df[group_cols + [calc_col]].astype(dtype).rename(
        index=str, columns={calc_col: table_name + '_score'})

    # Release the sorted working copy before returning.
    del _df
    gc.collect()
    return result

def count(df, group_col, calc_col, table_name):
    """Count the non-null ``calc_col`` values in each group.

    Args:
        df: Source DataFrame.
        group_col: Column name, or list of column names, identifying a group.
        calc_col: Column whose non-null entries are counted.
        table_name: Prefix for the output column,
            ``<table_name>_<calc_col>_count``.

    Returns:
        DataFrame with the group column(s) and the count column (cast to the
        unsigned type chosen by ``_set_type``), one row per group, with the
        index labels converted to ``str``.
    """
    # Accept both a single column name and a list of names.
    group_cols = [group_col] if isinstance(group_col, str) else list(group_col)

    group = df[group_cols + [calc_col]].groupby(by=group_cols)[[calc_col]].count().reset_index()
    dtype = {x: df[x].dtype for x in group_cols if x in df.columns}
    dtype[calc_col] = _set_type(group[calc_col], 'uint')
    _df = group.astype(dtype).rename(
        index=str, columns={calc_col: table_name + '_' + calc_col + '_count'})
    return _df

def linear(df, group_col, time_col, value_col, table_name):
    """Fit ``value_col ~ time_col`` with ordinary least squares for each group.

    Args:
        df: Source DataFrame.
        group_col: Name of the column identifying a group.
        time_col: Name of the single regressor column.
        value_col: Name of the target column.
        table_name: Prefix for the output columns.

    Returns:
        DataFrame with one row per group and the columns ``group_col``,
        ``<table_name>_lg_coef`` (slope) and ``<table_name>_lg_inte``
        (intercept).
    """
    rows = []
    # The groupby key is the group's id; row labels are not reliable for that.
    for key, group in df[[group_col, time_col, value_col]].groupby(by=group_col):
        lg = LinearRegression()
        # The estimator needs a 2-D feature matrix, hence the double brackets.
        lg.fit(group[[time_col]], group[value_col])
        # With a 1-D target, coef_ has one entry per feature and intercept_ is a scalar.
        rows.append([key, lg.coef_[0], lg.intercept_])
    return pd.DataFrame(rows, columns=[group_col, table_name + '_' + 'lg_coef', table_name + '_' + 'lg_inte'])

def ratio_name(numerator, denominator):
    """Build the column name ``<numerator>_<denominator>_ratio``."""
    return '_'.join((numerator, denominator, 'ratio'))

def ratio(df, numerator, denominator):
    """Return ``df[numerator]`` divided by ``df[denominator]``."""
    top = df[numerator]
    bottom = df[denominator]
    return top / bottom

def markov_time_score(df_, group_col, calc_col, time_col):
    """Accumulate time-weighted status-transition matrices, one per group.

    Rows are ordered by ``time_col``. For every pair of consecutive rows of
    the same group, the entry ``[previous_state][current_state]`` of that
    group's matrix is increased by ``exp(-t ** 2 / 144 / 2)``, where ``t`` is
    the ``time_col`` value of the current row. A group's first row only
    creates its (all-zero) matrix.

    Args:
        df_: Source DataFrame; it is not modified.
        group_col: Name of the column identifying a group.
        calc_col: Name of the status column. Values are mapped with
            ``'0'..'5' -> 0..5``, ``'C' -> 6``, ``'X' -> 7``; other values
            become NaN and are not handled (they cannot index the matrix).
        time_col: Name of the numeric time column.

    Returns:
        dict mapping each group key to an ``8 x 8`` float ``numpy`` array.
    """
    def _time(x):
        return np.exp(-x**2/144/2)

    # Order by the caller's time column so consecutive rows are real transitions.
    df = df_.sort_values(time_col).reset_index()
    mapp = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, 'C':6, 'X':7}
    # Pull the columns out once; per-row .loc lookups are very slow on large frames.
    keys = df[group_col].to_numpy()
    states = df[calc_col].map(mapp).to_numpy()
    times = df[time_col].to_numpy()
    del df

    last_state = {}  # group key -> state seen on that group's previous row
    M = {}
    for i, (key, value, t) in enumerate(zip(keys, states, times)):
        if i % 2000000 == 0: print(i)
        previous = last_state.get(key)
        if previous is not None:
            M[key][previous][value] += _time(t)
        else:
            M[key] = np.zeros((len(mapp), len(mapp)))
        last_state[key] = value
    del keys, states, times, last_state
    gc.collect()
    return M