In [None]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')
# os.system(f' cp "drive/My Drive/train_1.csv" "train_1.csv" ')
# os.system(f' cp "drive/My Drive/train_2.csv" "train_2.csv" ')

In [None]:
import pandas as pd
import numpy as np
import polars as pl
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
import lightgbm as lgb
import gc
from sklearn.metrics import roc_auc_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Categorical Features:
## Simple, Value counts, Mean target

In [None]:
# Data source: https://www.kaggle.com/competitions/playground-series-s4e1/overview
data = pd.read_csv('train_1.csv')

# Use the first group-aware fold as the hold-out test set; rows are grouped by CustomerId.
gkf = GroupKFold(n_splits = 5, shuffle = True, random_state = 228)
train_index, test_index = next(iter(gkf.split(data, data['Exited'], groups = data['CustomerId'])))

train = data.loc[train_index, :].reset_index(drop = True)
test = data.loc[test_index, :].reset_index(drop = True)

In [None]:
data.head(5)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [None]:
# Stratified, group-aware CV folds over the training part only, materialized as a
# list of (train_index, validation_index) pairs so the same folds can be reused.
gkf = StratifiedGroupKFold(n_splits = 5, shuffle = True, random_state = 228)
split_list = list(gkf.split(train, train['Exited'], groups = train['CustomerId']))

In [None]:
# Stack train on top of test: rows [0, ltr) are train, rows [ltr, end) are test.
ltr = len(train)
data = pd.concat([train, test], ignore_index = True)

In [None]:
def lgb_train(train, test, target, split_list, param,
              num_boost_round = 900, early_stopping_rounds = 400):
    """Train one LightGBM booster per cross-validation fold.

    Parameters
    ----------
    train : array indexable by integer index arrays
        Feature matrix; rows are selected with the indices from ``split_list``.
    test : unused
        Kept only so existing call sites keep working.
    target : array indexable by integer index arrays
        Labels aligned row-by-row with ``train``.
    split_list : iterable of (train_index, valid_index)
        Fold definitions; each pair selects the fitting and validation rows.
    param : dict
        Parameters forwarded to ``lgb.train``.
    num_boost_round : int, default 900
        Upper bound on boosting rounds per fold.
    early_stopping_rounds : int, default 400
        Value passed to ``lgb.early_stopping``.

    Returns
    -------
    list
        The boosters returned by ``lgb.train``, in fold order.
    """
    bst_list = []
    for train_index, valid_index in split_list:

        tr = lgb.Dataset(train[train_index], target[train_index])
        te = lgb.Dataset(train[valid_index], target[valid_index], reference=tr)

        bst = lgb.train(param, tr, num_boost_round=num_boost_round,
                        callbacks = [lgb.early_stopping(early_stopping_rounds), lgb.log_evaluation(-1)],
                        valid_sets = [te])
        bst_list.append(bst)

        # Drop the references first, then collect: collecting while `tr`/`te`
        # are still bound cannot free this fold's datasets.
        del tr, te
        gc.collect()

    return bst_list

In [None]:
# LightGBM parameters shared by the experiments in this notebook.
param_lgb = dict(
    objective = 'binary',
    verbosity = -1,
    boosting_type = 'gbdt',
    random_state = 42,
    lambda_l1 = 1.5,
    lambda_l2 = 0.5,
    learning_rate = 0.1,
    num_leaves = 16,
    metric = 'auc',
)

In [None]:
# Numeric feature columns fed to the model as-is. Each column is listed once:
# a repeated name would hand the model the same feature twice.
num_cols = ['IsActiveMember', 'HasCrCard', 'NumOfProducts', 'Balance', 'Tenure', 'Age', 'CreditScore', 'EstimatedSalary']
# Categorical columns that the encoding sections below turn into numeric features.
cat_cols = ['Geography', 'Gender']

## simple encoding

In [None]:
# Label encoding: replace each category with its integer category code.
add_cols = [f'type1_{col}' for col in cat_cols]
for col, new_col in zip(cat_cols, add_cols):
    data[new_col] = data[col].astype('category').cat.codes

In [None]:
data[[f'type1_Geography', 'Geography']].head(5)

Unnamed: 0,type1_Geography,Geography
0,0,France
1,0,France
2,0,France
3,2,Spain
4,0,France


In [None]:
# Feature set for this experiment: label-encoded categoricals plus the numeric columns.
train_cols = add_cols + num_cols
print(len(train_cols))
# Name of the label column used by the training and scoring cells below.
target_col = 'Exited'

11


In [None]:
# Fit one booster per CV fold; the fold indices only address the train rows (< ltr).
features = data[train_cols].values
bst_list = lgb_train(features, features, data[target_col].values, split_list, param_lgb)

# Average the fold models' predictions on the hold-out rows and score them.
holdout = data.loc[ltr:, train_cols]
pred = []
for bst in bst_list:
    pred.append(bst.predict(holdout))

roc_auc_score(data[target_col][ltr:], np.mean(pred, axis = 0))

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[168]	valid_0's auc: 0.889145
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[131]	valid_0's auc: 0.8863
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[116]	valid_0's auc: 0.890692
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[152]	valid_0's auc: 0.88864
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[158]	valid_0's auc: 0.890274


0.8904393267774321

## value counts

In [None]:
# Frequency encoding: replace each category with how many rows of `data`
# (train and test stacked) carry that value.
add_cols_vc = [f'type2_{col}' for col in cat_cols]
for col, new_col in zip(cat_cols, add_cols_vc):
    counts = data[col].value_counts()
    data[new_col] = data[col].map(counts)

In [None]:
data[[f'type2_Geography', 'Geography']].head(5)

Unnamed: 0,type2_Geography,Geography
0,94215,France
1,94215,France
2,94215,France
3,36213,Spain
4,94215,France


In [None]:
train_cols = add_cols_vc + num_cols
print(len(train_cols))

11


In [None]:
# Fit one booster per CV fold; the fold indices only address the train rows (< ltr).
features = data[train_cols].values
bst_list = lgb_train(features, features, data[target_col].values, split_list, param_lgb)

# Average the fold models' predictions on the hold-out rows and score them.
holdout = data.loc[ltr:, train_cols]
pred = []
for bst in bst_list:
    pred.append(bst.predict(holdout))

roc_auc_score(data[target_col][ltr:], np.mean(pred, axis = 0))

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[119]	valid_0's auc: 0.889216
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[137]	valid_0's auc: 0.886327
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[127]	valid_0's auc: 0.890888
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[201]	valid_0's auc: 0.888827
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[136]	valid_0's auc: 0.890525


0.8907425726389944

## mean target

In [None]:
# https://youtu.be/g335THJxkto

In [None]:
# Mean-target encoding.
# Train rows: each validation fold is encoded with target means computed on the
# other folds (out-of-fold), so a row never sees its own label.
# Test rows (index >= ltr): encoded with target means over all train rows.
add_cols_mean = []
for col in cat_cols:
    feat = f'type3_{col}'
    data[feat] = -1.

    for train_index, test_index in split_list:
        fold_means = data.loc[train_index].groupby(col)[target_col].mean()
        data.loc[test_index, feat] = data.loc[test_index, col].map(fold_means)

    train_means = data.loc[:ltr - 1].groupby(col)[target_col].mean()
    data.loc[ltr:, feat] = data.loc[ltr:, col].map(train_means)

    add_cols_mean.append(feat)

In [None]:
# Per-category target mean over the train rows only (the mapping applied to the test rows).
# The column is named explicitly rather than read from a loop variable left over
# from an earlier cell, so this cell does not depend on hidden kernel state.
data.loc[:ltr - 1].groupby('Gender')[target_col].mean()

Unnamed: 0_level_0,Exited
Gender,Unnamed: 1_level_1
Female,0.278753
Male,0.159897


In [None]:
data[[f'type3_Geography', 'Geography']].head(5)

Unnamed: 0,type3_Geography,Geography
0,0.1658,France
1,0.16561,France
2,0.165518,France
3,0.171618,Spain
4,0.16561,France


In [None]:
train_cols = add_cols_mean + num_cols

In [None]:
# Fit one booster per CV fold; the fold indices only address the train rows (< ltr).
features = data[train_cols].values
bst_list = lgb_train(features, features, data[target_col].values, split_list, param_lgb)

# Average the fold models' predictions on the hold-out rows and score them.
holdout = data.loc[ltr:, train_cols]
pred = []
for bst in bst_list:
    pred.append(bst.predict(holdout))

roc_auc_score(data[target_col][ltr:], np.mean(pred, axis = 0))

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[162]	valid_0's auc: 0.889064
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[136]	valid_0's auc: 0.88634
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[121]	valid_0's auc: 0.891023
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[163]	valid_0's auc: 0.888491
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[156]	valid_0's auc: 0.890592


0.8906697622369084

In [None]:
train_cols = add_cols_mean + num_cols + add_cols + add_cols_vc

In [None]:
# Fit one booster per CV fold; the fold indices only address the train rows (< ltr).
features = data[train_cols].values
bst_list = lgb_train(features, features, data[target_col].values, split_list, param_lgb)

# Average the fold models' predictions on the hold-out rows and score them.
# A plain loop is used so that `bst` still refers to the last fold's model afterwards.
holdout = data.loc[ltr:, train_cols]
pred = []
for bst in bst_list:
    pred.append(bst.predict(holdout))

roc_auc_score(data[target_col][ltr:], np.mean(pred, axis = 0))

Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[121]	valid_0's auc: 0.889039
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[156]	valid_0's auc: 0.886341
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[155]	valid_0's auc: 0.89097
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[178]	valid_0's auc: 0.888442
Training until validation scores don't improve for 400 rounds
Early stopping, best iteration is:
[149]	valid_0's auc: 0.890356


0.8906323634076718

In [None]:
# Gain-based importance of `bst` (the last fold model left by the loop above), largest first.
sorted(zip(bst.feature_importance(importance_type = 'gain'), train_cols), reverse = True)

[(90545.48879224062, 'Age'),
 (78067.42267239094, 'NumOfProducts'),
 (21147.22176808119, 'IsActiveMember'),
 (17179.174762904644, 'Balance'),
 (7355.695566177368, 'type3_Geography'),
 (6760.478400945663, 'type3_Gender'),
 (2340.596083521843, 'EstimatedSalary'),
 (2251.0562502145767, 'CreditScore'),
 (798.2799395918846, 'Tenure'),
 (612.5638509988785, 'HasCrCard'),
 (57.09199023246765, 'type1_Geography'),
 (6.573440074920654, 'type1_Gender'),
 (1.6648000478744507, 'type2_Geography'),
 (0.0, 'type2_Gender'),
 (0.0, 'Tenure')]

## One-hot encoding (OHE)

In [None]:
pd.get_dummies(data, columns = cat_cols)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,...,type1_Gender,type2_Geography,type2_Gender,type3_Geography,type3_Gender,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0,15674932,Okwudilichukwu,668,33.0,3,0.00,2,1.0,0.0,...,1,94215,93150,0.165800,0.159948,True,False,False,False,True
1,1,15749177,Okwudiliolisa,627,33.0,1,0.00,2,1.0,1.0,...,1,94215,93150,0.165610,0.159908,True,False,False,False,True
2,3,15741417,Kao,581,34.0,2,148882.54,1,1.0,1.0,...,1,94215,93150,0.165518,0.160342,True,False,False,False,True
3,4,15766172,Chiemenam,716,33.0,5,0.00,2,1.0,1.0,...,1,36213,93150,0.171618,0.160395,False,False,True,False,True
4,6,15692819,Ch'ang,593,30.0,8,144772.69,1,1.0,0.0,...,0,94215,71884,0.165610,0.279461,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165015,15589296,Walker,586,48.0,5,118022.74,1,0.0,0.0,...,0,36213,71884,0.171618,0.278753,False,False,True,True,False
165030,165019,15604072,Lo Duca,719,32.0,6,134937.10,1,0.0,1.0,...,0,94215,71884,0.165592,0.278753,True,False,False,True,False
165031,165025,15687079,T'ang,635,38.0,9,0.00,2,1.0,1.0,...,0,94215,71884,0.165592,0.278753,True,False,False,True,False
165032,165032,15689614,Hsiung,554,30.0,7,161533.00,1,0.0,1.0,...,0,36213,71884,0.171618,0.278753,False,False,True,True,False


# Filling missing values

## mean, median

In [None]:
data['NumOfProducts'].fillna(data['NumOfProducts'].mean()).head(5)

Unnamed: 0,NumOfProducts
0,2
1,2
2,1
3,2
4,1


# Numeric Features:
## Rolling, Diff, Groupby; Mean, Std, Max

In [None]:
# https://www.kaggle.com/competitions/tabular-playground-series-apr-2022/overview
train = pd.read_csv('train_2.csv')
train.head(5)

Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
1,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
2,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147
3,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241
4,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,0.560109,-0.541985,-0.9,1.055636,0.812631,0.123457,-0.223359


In [None]:
# Centered 3-step rolling mean of sensor_00, computed independently within each sequence.
def _centered_mean(s):
    return s.rolling(3, min_periods = 1, center = True).mean()

train['sensor_00_mean'] = train.groupby('sequence')['sensor_00'].transform(_centered_mean)

# train.groupby(['sequence'], group_keys = False)['sensor_00'].apply(
#                     lambda x:x.rolling(3, min_periods = 1).std())
# train.groupby(['sequence'], group_keys = False)['sensor_00'].apply(
#                     lambda x:x.rolling(3, min_periods = 1).max())

In [None]:
train['sensor_00_mean'].head(5)

Unnamed: 0,sensor_00_mean
0,-0.32187
1,-0.105616
2,0.134209
3,0.374034
4,-0.092736


In [None]:
# Same centered rolling mean, but taken over the series shifted by one step
# within each sequence.
def _shifted_centered_mean(s):
    return s.shift(1).rolling(3, min_periods = 1, center = True).mean()

train['sensor_00_mean_shift'] = train.groupby('sequence')['sensor_00'].transform(_shifted_centered_mean)
train['sensor_00_mean_shift'].head(5)

Unnamed: 0,sensor_00_mean_shift
0,-0.196291
1,-0.32187
2,-0.105616
3,0.134209
4,0.374034


In [None]:
# Step-to-step change of sensor_00 within each sequence (first step of a sequence is NaN).
train['sensor_00_diff'] = train.groupby('sequence')['sensor_00'].diff()

In [None]:
train['sensor_00_diff'].head(5)

Unnamed: 0,sensor_00_diff
0,
1,-0.251159
2,0.774343
3,0.196291
4,-0.251159


In [None]:
# Centered 3-step rolling standard deviation of the per-sequence differences.
def _centered_std(s):
    return s.rolling(3, min_periods = 1, center = True).std()

train['sensor_00_diff_std'] = train.groupby('sequence')['sensor_00_diff'].transform(_centered_std)

train['sensor_00_diff_std'].head(5)

Unnamed: 0,sensor_00_diff_std
0,
1,0.72514
2,0.514135
3,0.514135
4,0.793158


## groupby

In [None]:
# Mean of sensor_00 for every (subject, step) pair, joined back onto each row.
tmp_col = (
    train.groupby(['subject', 'step'])['sensor_00']
         .mean()
         .rename('sensor_00_mean_feat')
         .reset_index()
)
train = train.merge(tmp_col, how = 'left', on = ['subject', 'step'])
train['sensor_00_mean_feat']

Unnamed: 0,sensor_00_mean_feat
0,-0.163803
1,-0.024336
2,0.048382
3,-0.003069
4,-0.065547
...,...
1558075,-0.045750
1558076,-0.149614
1558077,0.105448
1558078,-0.027357


In [None]:
train['sensor_00_mean_feat'] - train['sensor_00_mean']

Unnamed: 0,0
0,0.158067
1,0.081280
2,-0.085828
3,-0.377103
4,0.027189
...,...
1558075,0.188923
1558076,0.094333
1558077,-0.171213
1558078,0.118187


## Polars

In [None]:
import polars as pl
# https://www.kaggle.com/code/chumajin/chumajin-s-room-polars-basic-english-ver

In [None]:
train_pl = pl.read_csv('train_2.csv')

In [None]:
%%time
# Eager polars: trailing 3-step rolling mean per sequence for sensor_00..sensor_08.
train_pl = pl.read_csv('train_2.csv')
rolling_exprs = [
    pl.col(f'sensor_0{c}')
      .rolling_mean(3, min_periods = 1, center = False)
      .over('sequence')
      .alias(f'sensor_0{c}_mean')
    for c in range(9)
]
train_pl = train_pl.with_columns(rolling_exprs)

CPU times: user 3.79 s, sys: 1.16 s, total: 4.95 s
Wall time: 3 s


In [None]:
%%time
# pandas baseline for the timing comparison: centered 3-step rolling mean per
# sequence for sensor_00..sensor_08.
for c in range(9):
    sensor = f'sensor_0{c}'
    rolled = train.groupby('sequence')[sensor].rolling(3, min_periods = 1, center = True).mean()
    # groupby().rolling() returns a result indexed by (sequence, original row index).
    # Drop the group level so the values align with train's own index on assignment.
    # The previous version relabeled those index levels as subject/step and merged
    # on them, which joined unrelated rows.
    train[f'{sensor}_mean'] = rolled.reset_index(level = 0, drop = True)

CPU times: user 19 s, sys: 4.98 s, total: 24 s
Wall time: 30.9 s


In [None]:
%%time
# Lazy polars: build the whole query first, then execute it once with collect().
rolling_exprs = [
    pl.col(f'sensor_0{c}')
      .rolling_mean(3, min_periods = 1, center = False)
      .over('sequence')
      .alias(f'sensor_0{c}_mean')
    for c in range(9)
]
train_pl = pl.scan_csv('train_2.csv').with_columns(rolling_exprs)

train_pl = train_pl.collect()

CPU times: user 3.15 s, sys: 373 ms, total: 3.52 s
Wall time: 2.22 s


## numba