# Initial Submission


In [1]:
import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc

import lightgbm as lgb
from sklearn.model_selection import train_test_split
# from sklearn.metrics import roc_auc_score, f1_score
from imblearn.over_sampling import RandomOverSampler

from catboost import CatBoostClassifier


# SET HERE
train_directory = '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/'
test_directory = '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/'

In [2]:
def aggregate(df):
    '''
    Defines aggregation style for group 1 and 2 datasets. take mean of numerics and max + min of strings
    '''
    
    num_cols = [c for c in df.columns if c[-1] in ['P', 'A']]
    other_cols = [c for c in df.columns if c[-1] not in ['P', 'A']]

    num_agg_mean = [pl.mean(c).alias('mean_' + c) for c in num_cols]
    num_agg_max = [pl.max(c).alias('max_' + c) for c in num_cols]
    num_agg_min = [pl.min(c).alias('min_' + c) for c in num_cols]
    str_agg_max = [pl.max(c).alias('max_' + c) for c in other_cols if c not in ['case_id', 'num_group1', 'num_group2']]
    str_agg_min = [pl.min(c).alias('min_' + c) for c in other_cols if c not in ['case_id', 'num_group1', 'num_group2']]
    
#     agg = num_agg_mean + num_agg_max + num_agg_min + str_agg_max + str_agg_min
    agg = num_agg_mean + str_agg_max
    return agg

    
def set_datatypes(df):
    '''
    Tests column data type and reformats.
    '''
    
    for c in df.columns:
        if c in ['case_id', 'WEEK_NUM', 'num_group1', 'num_group2']: # excl MONTH
            df = df.with_columns(pl.col(c).cast(pl.Int64))
        # elif col in ['date_decision']:
        #     df = df.with_columns(pl.col(c).cast(pl.Date))
        elif c[-1] in ['P', 'A'] or c == 'target':
            df = df.with_columns(pl.col(c).cast(pl.Float64))
        elif c[-1] == 'M' or c == 'MONTH':
            df = df.with_columns(pl.col(c).cast(pl.String))
        elif c[-1] == 'D' or c == 'date_decision':
            df = df.with_columns(pl.col(c).cast(pl.Date).dt.truncate('1mo'))
        # else:
        #     print('column {c} is unknown datatype'.format(c=c))

    return df


def reduce_columns(df):
    '''
    Tests for columns with many nulls or string columns with only 1 or many many values.
    '''
    
    for c in df.columns:
        p_null = df[c].is_null().mean() >= 0.70
        uniq = df[c].n_unique() == 1 or df[c].n_unique() > 200
        
        if c in ['target', 'case_id', 'MONTH']:
            pass
        elif p_null:
            df = df.drop(c)
        elif c[-1] == 'M' and uniq:
            df = df.drop(c)
        elif c[-1] == 'D' or c in ['WEEK_NUM', 'date_decision']:
            df = df.drop(c) # for now
    
    return df


def load_from_parquet(path, source):
    '''
    Loads a parquet file at a path and does some formatting. If path includes a set of tables,
    load each and then concat them together. Also determine if the table is of depth 0. if 
    not, perform aggregation.
    '''

    # if split into multiple tables, first combine. tested, and separate files shouldn't have
    # any overlap with case_id
    if type(path) == list:
        d0 = 'static_0' in path[0] or 'static_cb_0' in path[0] or '_base' in path[0]
        tot = []
        for t in path:
            d = pl.read_parquet(source+t)
            d = set_datatypes(d)
            if not d0:
                d = d.group_by('case_id').agg(aggregate(d))
            tot.append(d)

        # combine
        df = pl.concat(tot, how='vertical_relaxed')

    else:
        d0 = 'static_0' in path or 'static_cb_0' in path or '_base' in path
        df = pl.read_parquet(source+path)
        df = set_datatypes(df)
        if not d0:
            df = df.group_by('case_id').agg(aggregate(df))

    # only need to do this for training. later, i'll make sure train and test have the same cols
    if 'train' in source:
        df = reduce_columns(df) # do this after aggregation, if it occurs

    return df

In [3]:
train_tables = [
    'train_base.parquet',
    ['train_applprev_1_0.parquet', 'train_applprev_1_1.parquet'],
    'train_applprev_2.parquet',
    ['train_credit_bureau_a_1_0.parquet', 'train_credit_bureau_a_1_1.parquet', 'train_credit_bureau_a_1_2.parquet', 'train_credit_bureau_a_1_3.parquet'],
    ['train_credit_bureau_a_2_0.parquet', 'train_credit_bureau_a_2_1.parquet', 'train_credit_bureau_a_2_2.parquet', 'train_credit_bureau_a_2_3.parquet', 'train_credit_bureau_a_2_4.parquet', 'train_credit_bureau_a_2_5.parquet', 'train_credit_bureau_a_2_6.parquet', 'train_credit_bureau_a_2_7.parquet', 'train_credit_bureau_a_2_8.parquet', 'train_credit_bureau_a_2_9.parquet', 'train_credit_bureau_a_2_10.parquet'],
    'train_credit_bureau_b_1.parquet',
    'train_credit_bureau_b_2.parquet',
    'train_debitcard_1.parquet',
    'train_deposit_1.parquet',
    'train_other_1.parquet',
    'train_person_1.parquet',
    'train_person_2.parquet',
    ['train_static_0_0.parquet', 'train_static_0_1.parquet'],
    'train_static_cb_0.parquet',
    'train_tax_registry_a_1.parquet',
    'train_tax_registry_b_1.parquet',
    'train_tax_registry_c_1.parquet'
]

test_tables = [
    'test_base.parquet',
    ['test_applprev_1_0.parquet', 'test_applprev_1_1.parquet', 'test_applprev_1_2.parquet'],
    'test_applprev_2.parquet',
    ['test_credit_bureau_a_1_0.parquet', 'test_credit_bureau_a_1_1.parquet', 'test_credit_bureau_a_1_2.parquet', 'test_credit_bureau_a_1_3.parquet', 'test_credit_bureau_a_1_4.parquet'],
    ['test_credit_bureau_a_2_0.parquet', 'test_credit_bureau_a_2_1.parquet', 'test_credit_bureau_a_2_2.parquet', 'test_credit_bureau_a_2_3.parquet', 'test_credit_bureau_a_2_4.parquet', 'test_credit_bureau_a_2_5.parquet', 'test_credit_bureau_a_2_6.parquet', 'test_credit_bureau_a_2_7.parquet', 'test_credit_bureau_a_2_8.parquet', 'test_credit_bureau_a_2_9.parquet', 'test_credit_bureau_a_2_10.parquet', 'test_credit_bureau_a_2_11.parquet'],
    'test_credit_bureau_b_1.parquet',
    'test_credit_bureau_b_2.parquet',
    'test_debitcard_1.parquet',
    'test_deposit_1.parquet',
    'test_other_1.parquet',
    'test_person_1.parquet',
    'test_person_2.parquet',
    ['test_static_0_0.parquet', 'test_static_0_1.parquet', 'test_static_0_2.parquet'],
    'test_static_cb_0.parquet',
    'test_tax_registry_a_1.parquet',
    'test_tax_registry_b_1.parquet',
    'test_tax_registry_c_1.parquet'
]

In [4]:
# actually load data from location

# start with training data
train_data = load_from_parquet(train_tables[0], train_directory)
for t in train_tables[1:]:
    # print('\n', t)
    train_data = train_data.join(load_from_parquet(t, train_directory), on='case_id', how='left')

gc.collect()
    
# test data
test_data = load_from_parquet(test_tables[0], test_directory)
for t in test_tables[1:]:
    # print('\n', t)
    test_data = test_data.join(load_from_parquet(t, test_directory), on='case_id', how='left')

# make sure test and training have same columns
test_data = test_data.select([c for c in train_data.columns if c != 'target'])

gc.collect()

print('train data shape:\t', train_data.shape)
print('test data shape:\t', test_data.shape)

train data shape:	 (1526659, 325)
test data shape:	 (10, 324)


In [5]:
# convert datasets to pandas, using category dtype where relevant

train_data = train_data.to_pandas()#.sample(750000)
obj_cols = list(train_data.select_dtypes('object').columns)
train_data[obj_cols] = train_data[obj_cols].astype('category')

test_data = test_data.to_pandas()
test_data[obj_cols] = test_data[obj_cols].astype('category')

In [6]:
# fit model
# train_data = train_data.to_pandas()
# test_data = test_data.to_pandas()
# cat_features = train_data.select_dtypes(include=['object', 'category']).columns

# # Fill NaN values with a placeholder string such as 'missing'
# train_data[cat_features] = train_data[cat_features].fillna('missing')
# test_data[cat_features] = test_data[cat_features].fillna('missing')

# # Convert all categorical features to type 'category'
# train_data[cat_features] = train_data[cat_features].astype('category')
# test_data[cat_features] = test_data[cat_features].astype('category')

x_train, x_val, y_train, y_val = train_test_split(
    train_data.drop(columns=['target', 'case_id']),
    train_data['target'], 
    test_size=0.3, 
    random_state=0
)

del train_data
gc.collect()

# ros = RandomOverSampler(random_state=0, sampling_strategy=0.3)
# x_train, y_train = ros.fit_resample(x_train, y_train)

# params = {
#     "boosting_type": "gbdt",
#     "metric": "auc",
#     "max_depth": 10,  
#     "learning_rate": 0.05,
#     "n_estimators": 2000,  
#     "colsample_bytree": 0.8,
#     "colsample_bynode": 0.8,
#     "reg_alpha": 0.1,
#     "reg_lambda": 10,
#     'num_leaves':64
# }

m = lgb.LGBMClassifier()
m.fit(x_train, y_train, eval_set=[(x_val, y_val)])

# cat_features_indices = [train_data.columns.get_loc(c) for c in cat_features if c in train_data]

# m = CatBoostClassifier(
#     iterations=3000, 
#     learning_rate=0.03, 
#     depth=6,
#     cat_features=cat_features_indices,
#     eval_metric='AUC',
#     verbose=300
# )
# m.fit(x_train, y_train, eval_set=[(x_val, y_val)])

# print('mean AUC score: {s}'.format(s=np.mean([np.mean(m.evals_result_['valid_0']['auc']) for m in models])))
#print('mean LL score: {s}'.format(s=np.mean(m.evals_result_['valid_0']['binary_logloss'])))

[LightGBM] [Info] Number of positive: 33603, number of negative: 1035058
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.753610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38271
[LightGBM] [Info] Number of data points in the train set: 1068661, number of used features: 321
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031444 -> initscore=-3.427597
[LightGBM] [Info] Start training from score -3.427597


In [7]:
# lgb.plot_importance(m, importance_type="split", figsize=(10,50))
# plt.show()

In [8]:
# prepare to make predictions
inds = test_data['case_id']
test = test_data.drop(columns=['case_id'])

# make predictions on trained model
predictions = m.predict_proba(test)[:, 1]

out = pd.DataFrame(columns=['case_id', 'score'])
out['case_id'] = inds
out['score'] = predictions
out = out.set_index('case_id')

out.to_csv('submission.csv')