In [None]:
import gc
import os

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression

# Directory that holds the Home Credit CSV files. Set the HOMECREDIT_DATA_DIR
# environment variable to point somewhere else; the original location remains
# the default so existing runs are unaffected.
PATH = os.environ.get('HOMECREDIT_DATA_DIR', '/home/kai/data/kaggle/homecredit/')
bureau = pd.read_csv(os.path.join(PATH, 'bureau.csv'))
bureau_balance = pd.read_csv(os.path.join(PATH, 'bureau_balance.csv'))

# utils

In [26]:
def save(df, name):
    """Serialise ``df`` to the file path ``name`` using its ``to_pickle`` method."""
    destination = name
    df.to_pickle(destination)
    
    
def _set_type(series, dtype):
    """Pick the narrowest numpy dtype name of the requested kind that fits ``series``.

    Parameters
    ----------
    series : pandas.Series or array-like
        Numeric values whose range decides the width.
    dtype : str
        Requested kind: ``'uint'``, ``'int'`` or ``'float'``.

    Returns
    -------
    str or None
        One of ``'uint8'``..``'uint64'``, ``'int8'``..``'int64'``,
        ``'float32'``/``'float64'``. ``None`` is returned for any other
        ``dtype`` value.

    Notes
    -----
    * Missing values are ignored when measuring the range.
    * Empty or all-missing input is treated as having range ``[0, 0]``.
    * If ``'uint'`` is requested but a negative value is present, a signed
      type is chosen instead, because casting a negative number to an
      unsigned type would wrap around silently.
    """
    values = pd.Series(series)
    # Series.max()/min() are vectorised and skip NaN; the builtin max()/min()
    # iterate in Python and give order-dependent answers when NaN is present.
    _max, _min = values.max(), values.min()
    if pd.isna(_max) or pd.isna(_min):
        _max = _min = 0

    if dtype == 'uint' and _min < 0:
        dtype = 'int'

    if dtype == 'uint':
        for candidate in ('uint8', 'uint16', 'uint32'):
            if _max <= np.iinfo(candidate).max:
                return candidate
        return 'uint64'

    if dtype == 'int':
        for candidate in ('int8', 'int16', 'int32'):
            bounds = np.iinfo(candidate)
            if bounds.min <= _min and _max <= bounds.max:
                return candidate
        return 'int64'

    if dtype == 'float':
        # 3.4028235e+38 is the largest magnitude a float32 can represent.
        if max(abs(_min), abs(_max)) <= 3.4028235e+38:
            return 'float32'
        return 'float64'

    return None
        
def set_dtype(df, original_df, type_dict):
    """Return a copy of ``df`` with newly created columns cast to compact dtypes.

    Columns whose name also appears in ``original_df`` keep the dtype they
    currently have in ``df``. Every other column is cast to the dtype name
    that ``_set_type`` selects for the kind given in ``type_dict[column]``.
    """
    target = {}
    for column in df.columns:
        if column in original_df.columns:
            target[column] = df[column].dtype
        else:
            target[column] = _set_type(df[column], type_dict[column])
    return df.astype(target)

# linear regression

In [None]:
def linear(df, group_col, value_col, time_col, score_map=None, table_name=None):
    """Per-group linear trend of ``value_col`` over ``time_col``.

    For every distinct value of ``group_col`` two ordinary least-squares
    lines are fitted with ``LinearRegression``:

    * value against the raw time column;
    * value against time rescaled per group to
      ``(t - t.max()) / (t.max() - t.min() + 1)``, which lies in ``(-1, 0]``.

    Parameters
    ----------
    df : pandas.DataFrame
    group_col, value_col, time_col : str
        Column names in ``df``.
    score_map : mapping or callable, optional
        Applied to ``value_col`` with ``Series.map`` before fitting.
    table_name : str
        Prefix for the generated column names. Required; it carries a
        ``None`` default only because it follows ``score_map`` in the
        signature.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``group_col``,
        ``<table_name>_lgcoef_<time_col>_<value_col>`` and
        ``<table_name>_lgnormalizedcoef_<time_col>_<value_col>``.

    Raises
    ------
    TypeError
        If ``table_name`` is not supplied.
    """
    if table_name is None:
        raise TypeError("linear() missing required argument: 'table_name'")

    _df = df[[group_col, time_col, value_col]].sort_values(time_col).reset_index(drop=True)
    if score_map is not None:
        _df[value_col] = _df[value_col].map(score_map)

    rows = []
    for key, group in _df.groupby(by=group_col):
        lg = LinearRegression()
        lg.fit(group[[time_col]], group[[value_col]])

        # Rescale time within the group; the +1 keeps the denominator
        # non-zero when a group has a single distinct time value.
        t = group[time_col]
        group1 = group.copy()
        group1[time_col] = (t - t.max()) / (t.max() - t.min() + 1)
        lg1 = LinearRegression()
        lg1.fit(group1[[time_col]], group1[[value_col]])

        rows.append([key, lg.coef_[0][0], lg1.coef_[0][0]])

    name = [table_name + '_lgcoef_' + time_col + '_' + value_col,
            table_name + '_lgnormalizedcoef_' + time_col + '_' + value_col]
    tmp_df = pd.DataFrame(rows, columns=[group_col] + name)
    return set_dtype(tmp_df, df, {x: 'float' for x in name})

# split categorical features

In [None]:
def split_categorical_feature(df, group_col, calc_col):
    """Count, per group, how often each category of ``calc_col`` occurs.

    ``calc_col`` is one-hot encoded with ``pd.get_dummies`` (columns are
    prefixed with ``calc_col``) and the indicator columns are summed within
    each value of ``group_col``.

    Returns
    -------
    pandas.DataFrame
        One row per group: ``group_col`` plus one count column per category,
        with the new columns cast through ``set_dtype`` using the ``'uint'``
        kind.
    """
    indicators = pd.get_dummies(df[calc_col], prefix=calc_col)
    tmp_df = (pd.concat([df[group_col], indicators], axis=1)
                .groupby(by=group_col)
                .sum()
                .reset_index())
    return set_dtype(tmp_df, df, {x: 'uint' for x in tmp_df.columns})

# home made score

In [None]:
def score(df, group_col, calc_col, time_col, score_map=None, table_name=None):
    """Row-level score built from each value and its in-group time neighbours.

    Rows are ordered by ``time_col``. For each row the previous and next
    value of ``calc_col`` inside the same ``group_col`` group (missing
    neighbours count as 0) are added to the row's own value, the sum is
    squared, and the result is divided by ``exp(-time**2 / 144 / 2)``.

    Parameters
    ----------
    df : pandas.DataFrame
    group_col, calc_col, time_col : str
        Column names in ``df``.
    score_map : mapping or callable, optional
        Applied to ``calc_col`` with ``Series.map`` before scoring.
    table_name : str
        Prefix for the generated column name. Required; it carries a
        ``None`` default only because it follows ``score_map`` in the
        signature.

    Returns
    -------
    pandas.DataFrame
        Same rows, order and index as ``df`` with two columns:
        ``group_col`` and ``<table_name>_score_<time_col>_<calc_col>``.

    Raises
    ------
    TypeError
        If ``table_name`` is not supplied.
    """
    if table_name is None:
        raise TypeError("score() missing required argument: 'table_name'")

    # Sort positionally so results can be written back to the caller's row
    # order afterwards, whatever the index of ``df`` looks like.
    order = np.argsort(df[time_col].to_numpy(), kind='stable')
    _df = df[[group_col, calc_col, time_col]].iloc[order].reset_index(drop=True)
    if score_map is not None:
        _df[calc_col] = _df[calc_col].map(score_map)

    grouped = _df.groupby(by=group_col)[calc_col]
    neighbourhood = grouped.shift(1).fillna(0) + grouped.shift(-1).fillna(0) + _df[calc_col]
    # NOTE(review): dividing by exp(-t**2/288) makes the weight grow with |t|;
    # confirm this is intended rather than a multiplication.
    weighted = neighbourhood ** 2 / np.exp(-(_df[time_col])**2/144/2)

    # Undo the sort: position ``order[i]`` of the input produced sorted row i.
    values = np.empty(len(df), dtype='float64')
    values[order] = weighted.to_numpy(dtype='float64')

    name = table_name + '_score_' + time_col + '_' + calc_col
    r = df[[group_col]].copy()
    r[name] = values
    return set_dtype(r, df, {x: 'float' for x in r.columns})

# value / time ratio

In [None]:
def time_ratio(df, group_col, calc_col, time_col, score_map, table_name):
    """Row-level ratio ``calc_col / |time_col - 1|``.

    Parameters
    ----------
    df : pandas.DataFrame
    group_col, calc_col, time_col : str
        Column names in ``df``.
    score_map : mapping, callable or None
        When not ``None``, applied to ``calc_col`` with ``Series.map`` first.
    table_name : str
        Prefix for the generated column name.

    Returns
    -------
    pandas.DataFrame
        Same rows, order and index as ``df`` with two columns:
        ``group_col`` and ``<table_name>_timeratio_<time_col>_<calc_col>``.

    Notes
    -----
    NOTE(review): the previous body grouped by ``group_col`` but never
    applied an aggregation, so the per-row ratio is returned here. If a
    per-group statistic was intended, aggregate the result downstream.
    A ``time_col`` value of exactly 1 makes the denominator zero.
    """
    values = df[calc_col]
    if score_map is not None:
        values = values.map(score_map)

    name = table_name + '_timeratio_' + time_col + '_' + calc_col
    r = df[[group_col]].copy()
    r[name] = values / (df[time_col] - 1).abs()
    return set_dtype(r, df, {x: 'float' for x in r.columns})

# subtraction and division

In [None]:
def ratio_name(numerator, denominator):
    """Build the column label ``<numerator>_divide_<denominator>``."""
    prefix = numerator + '_divide_'
    return prefix + denominator


def ratio(df, numerator, denominator):
    """Return ``df[numerator]`` divided by ``df[denominator]``."""
    top = df[numerator]
    bottom = df[denominator]
    return top / bottom

def minus_name(col1, col2):
    """Build the column label ``<col1>_minus_<col2>``."""
    prefix = col1 + '_minus_'
    return prefix + col2


def minus(df, col1, col2):
    """Return ``df[col1]`` minus ``df[col2]``."""
    left = df[col1]
    right = df[col2]
    return left - right

# count

In [None]:
def count(df, group_col, calc_col, table_name):
    """Count the non-null ``calc_col`` entries within each ``group_col`` group.

    Returns a frame with ``group_col`` and ``<table_name>_count_<calc_col>``;
    the count column is cast through ``set_dtype`` using the ``'uint'`` kind.
    """
    out_name = table_name + '_count_' + calc_col
    grouped = df[[group_col, calc_col]].groupby(by=group_col)
    counts = grouped[[calc_col]].count().reset_index()
    # ``index=str`` turns the row labels into strings as well as renaming the column.
    counts = counts.rename(index=str, columns={calc_col: out_name})
    kinds = {}
    for column in counts.columns:
        kinds[column] = 'uint'
    counts = set_dtype(counts, df, kinds)
    gc.collect()
    return counts

# last C on bureau balance

In [None]:
def last_before_C(df_, group_col, value_col, time_col, score_map, table_name, c_value=7):
    """Find, per group, the first earlier value that differs from ``c_value``.

    Rows are ordered by ``time_col`` descending. Within each ``group_col``
    group the first row in that order is skipped without being inspected;
    among the remaining rows, the first one whose (mapped) ``value_col``
    differs from ``c_value`` is reported. Groups with no such row are absent
    from the result.

    Parameters
    ----------
    df_ : pandas.DataFrame
    group_col, value_col, time_col : str
        Column names in ``df_``.
    score_map : mapping, callable or None
        When not ``None``, applied to ``value_col`` with ``Series.map``.
    table_name : str
        Prefix for the generated column name.
    c_value : optional
        The mapped value to skip over. Defaults to 7.
        NOTE(review): presumably the score that ``score_map`` gives status
        'C' - confirm against the caller's mapping.

    Returns
    -------
    pandas.DataFrame
        Columns ``group_col`` and ``<table_name>_last_on_C_<value_col>``,
        with the value column cast to the dtype ``_set_type`` selects for
        the ``'uint'`` kind.

    Notes
    -----
    NOTE(review): the most recent row of every group is skipped even when
    its value is not ``c_value``; this is kept as it was - confirm intent.
    """
    name = table_name + '_last_on_C_' + value_col
    needed = list(dict.fromkeys([group_col, value_col, time_col]))
    df = df_[needed].sort_values(time_col, ascending=False)

    values = df[value_col]
    if score_map is not None:
        values = values.map(score_map)

    # 0 marks the most recent row of each group, 1 the next one, and so on.
    position = df.groupby(group_col).cumcount().to_numpy()
    differs = (values != c_value).to_numpy()
    keep = (position > 0) & differs

    hits = pd.DataFrame({group_col: df[group_col].to_numpy()[keep],
                         name: values.to_numpy()[keep]})
    # Rows are still in descending time order, so the first hit per group
    # is the one closest in time to the most recent row.
    tmp = hits.drop_duplicates(subset=group_col, keep='first').reset_index(drop=True)

    if tmp.empty:
        # Nothing to size the value column by; only restore the key dtype.
        return tmp.astype({group_col: df_[group_col].dtype})
    return tmp.astype({group_col: df_[group_col].dtype,
                       name: _set_type(tmp[name], 'uint')})

# numerical feature aggregation helpers

In [None]:
def one_third(series):
    """Return the 1/3 quantile of ``series``."""
    return series.quantile(1/3)


def two_third(series):
    """Return the 2/3 quantile of ``series``."""
    return series.quantile(2/3)


def max_minus_min(series):
    """Return the range of ``series`` (maximum minus minimum)."""
    return series.max() - series.min()


def two_minus_one_third(series):
    """Return the 2/3 quantile minus the 1/3 quantile of ``series``.

    Uses the same exact fractions as ``one_third`` and ``two_third`` so the
    three helpers agree with each other.
    """
    return series.quantile(2/3) - series.quantile(1/3)
def positive_count(series):
    """Return how many entries of ``series`` are strictly greater than 0."""
    return (series > 0).sum()


def negative_count(series):
    """Return how many entries of ``series`` are strictly less than 0."""
    return (series < 0).sum()


def standard_error(series):
    """Return the standard error of the mean of ``series``.

    The divisor is the number of non-missing entries, matching ``std()``,
    which also ignores missing values.
    """
    return series.std() / np.sqrt(series.count())


def normed_std(series):
    """Return the standard deviation of ``series`` divided by its mean."""
    return series.std() / series.mean()
def trimmed_mean_10_pct(series):
    """Return the mean of the non-missing values after trimming 10% from each end.

    Relies on ``scipy.stats`` being imported as ``stats`` at the top of the
    notebook.
    """
    return stats.trim_mean(series.dropna(), 0.1)


def trimmed_mean_25_pct(series):
    """Return the mean of the non-missing values after trimming 25% from each end.

    Relies on ``scipy.stats`` being imported as ``stats`` at the top of the
    notebook.
    """
    return stats.trim_mean(series.dropna(), 0.25)

# Empty module-level list; nothing in the cells shown here reads it.
extra_func = list()

def numerical_agg(df, gp_col, agg_col, extrafunc_list, table_name='bureau'):
    """Aggregate ``agg_col`` within each ``gp_col`` group.

    The standard aggregations ``max``, ``min``, ``std``, ``mean``, ``sum``
    and ``median`` are always computed; extra ones can be requested.

    Parameters
    ----------
    df : pandas.DataFrame
    gp_col : str or list of str
        Grouping column(s).
    agg_col : str
        Column to aggregate.
    extrafunc_list : iterable or None
        Extra aggregations. Each item is either a callable taking a Series,
        or the name of one of the helper functions defined in this notebook:
        'two_minus_one_third', 'positive_count', 'negative_count',
        'standard_error', 'trimmed_mean_10_pct', 'trimmed_mean_25_pct',
        'normed_std', 'max_minus_min', 'one_third', 'two_third'.
        Names of standard aggregations are ignored (already included).
        ``None`` means no extras.
    table_name : str, optional
        Prefix of the output column names. Defaults to ``'bureau'``.

    Returns
    -------
    pandas.DataFrame
        ``gp_col`` plus one column per aggregation named
        ``<table_name>_<agg_col>_<aggregation>``.

    Raises
    ------
    ValueError
        If a requested name is not one of the known helper functions.
    """
    helper_names = ('two_minus_one_third', 'positive_count', 'negative_count',
                    'standard_error', 'trimmed_mean_10_pct', 'trimmed_mean_25_pct',
                    'normed_std', 'max_minus_min', 'one_third', 'two_third')

    agg_list = ['max', 'min', 'std', 'mean', 'sum', 'median']
    seen = set(agg_list)

    for item in (extrafunc_list or []):
        if callable(item):
            label, func = getattr(item, '__name__', repr(item)), item
        else:
            label = item
            if label in seen:
                continue
            if label not in helper_names:
                raise ValueError('unknown aggregation function: %r' % (label,))
            # Look the helper up by name at module level rather than eval()-ing
            # an arbitrary string.
            func = globals()[label]
        if label in seen:
            continue
        seen.add(label)
        agg_list.append(func)

    _df = df.groupby(gp_col).agg({agg_col: agg_list})
    # Label from the actual column tuples; MultiIndex ``levels`` are not
    # guaranteed to be in column order.
    _df.columns = ['%s_%s_%s' % (table_name, pre, middle) for pre, middle in _df.columns]

    return _df.reset_index()

In [29]:
# Scratch check: `x + y` builds a new list, so assigning into `z`
# afterwards leaves `x` unchanged (the second print still shows [1]).
x = [1]
y = [2]
z = x + y
print(z)
z[0] = 5
print(x)

[1, 2]
[1]
