Forked from excellent kernel : https://www.kaggle.com/jsaguiar/updated-0-792-lb-lightgbm-with-simple-features

From Kaggler : https://www.kaggle.com/jsaguiar

In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Helper Functions
ToDo: Try doing a different type of encoding for the categorical variables

In [44]:
def one_hot_encoder(df, nan_as_category = True):
    # get original columns
    original_columns = list(df.columns)
    # get all of the categorical columns
    categorical_cols = [col for col in df.columns if df[col].dtype == "object"]
    # use pandas pd.get_dummies to turn the categorical columns into dummy variables
    df = pd.get_dummies(df, dummy_na = nan_as_category)
    # return the dataframe and the list of new columns
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

# processing the main training set

In [47]:
def application_main_datasets(num_rows = None, nan_as_category=False):    
    "processing the training and test datasets, cleaning and feature engineering"
    # load training
    df = pd.read_csv("data/application_train.csv.zip", nrows=num_rows)
    # load test
    df_test = pd.read_csv("data/application_test.csv.zip", nrows=num_rows)
    # concatenate
    df = df.append(df_test).reset_index()
    
    # TODO understand this docs, live thing
    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]
    
    # NaN values for DAYS_EMPLOYED = 365243
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
    
    # create a map for income by organisation type that will let us have a column that gives
    # the median income from a type of organsiation -- can be used to benchmark that person's
    # income relative to their field...feel like this could be better based on job
    inc_org_map = df[['AMT_INCOME_TOTAL','ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
    
    ## lots of column engineering
    # new credit to annuity ratio
    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    # new credit to goods ratio
    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # new doc ind kurtosis
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    # new live ind sum
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    # new inc per child
    df['NEW_INC_PER_CHILD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
    # new inc by org
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_org_map)
    # new emply to birth ratio
    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    # annuity to income ratio
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])
    # sources prod
    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
    df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    
    # categorical features
    # binary encode (factorize) columsn w/ 2 values
    for binary_features in ['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY']:
        df[binary_features], uniques = pd.factorize(df[binary_features])
    # one hot encode the other categorical features
    df, catcols = one_hot_encoder(df, nan_as_category)
    
    # clean up the environment
    del df_test
    gc.collect()

    # return dataframe with at least SK_ID_CURR and target
    return df

In [48]:
res = application_main_datasets(num_rows=10000)

In [50]:
res.head()

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,...,0,0,0,0,0,0,0,0,0,1
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


# Processing Bureau and bureau_balance

In [None]:
def bureau_and_balance(num_rows = None, nan_as_category = True):
    # read bureau and bureau_balance datasets
    
    # one hot encode bureau_balance (one_hot_encoder funtion returns two arguments)
    
    # one hot encode bureau
    
    # bureau_balance aggregations - months balance --> min, max, size
    ## mean for categorical columns
    ## apply aggregations with .groupby().agg(agg_dict)
    ## fancy column renaming thing
    ## join bureau with bb_agg 
    ## drop the SK_ID_BUREAU column
    ## clean up the env
    
    # Burea and bureau_balance aggregations
    
    
# return dataframe with at least SK_ID_CURR

# Processing previous applications

In [None]:

# return dataframe with at least SK_ID_CURR

# Processing POS CASH balance

In [3]:

# return dataframe with at least SK_ID_CURR

# Processing Installments

In [4]:

# return dataframe with at least SK_ID_CURR

# Processing credit_card_balance

In [5]:

# return dataframe with at least SK_ID_CURR

# ML algs

In [None]:
# join all of the datasets into one big data set and run learning