## AMEX Default Competition - Feature Engineering


- This notebook does the following:

    - aggregating data by customer_ID
        - get the min, max, mean, std, first, last for each numeric features
            - then create a feature that checks if the last number if with 1.5 standard deviation of mean
        - get the unique count for categorical features 

    
- the raw data used for this notebook is generated by:
    - Process Amex Train Data to Parquet Format: https://www.kaggle.com/code/xxxxyyyy80008/process-amex-train-data-to-parquet-format
    - datasets can be accessed here:
        - train file: https://www.kaggle.com/datasets/xxxxyyyy80008/amex-train-20220706
        - test file: https://www.kaggle.com/datasets/xxxxyyyy80008/amex-test-20020706
    
- this notebook also used some insights from this notebook:
     - AMEX - Train Data EDA - Dask for Fast Analysis: https://www.kaggle.com/code/xxxxyyyy80008/amex-train-data-eda-dask-for-fast-analysis
     
- the data output of this notebook can be accessed here:
     - train data: https://www.kaggle.com/datasets/xxxxyyyy80008/amex-agg-data-rev2
   



In [1]:
import numpy as np
import pandas as pd
import gc
import copy
import os
import sys

from pathlib import Path
from datetime import datetime, date, time, timedelta
from dateutil import relativedelta

import pyarrow.parquet as pq
import pyarrow as pa

import dask.dataframe as dd

In [2]:
import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_rows = 100
pd.options.display.max_columns = 100


import pytorch_lightning as pl
random_seed=1234
pl.seed_everything(random_seed)

Global seed set to 1234


1234

## aggregate by customer id

In [3]:
all_cols = ['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_63', 'D_64', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'D_66', 'B_20', 'D_68', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'B_30', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'B_36', 'B_37', 'R_26', 'R_27', 'B_38', 'D_108', 'D_109', 'D_110', 'D_111', 'B_39', 'D_112', 'B_40', 'S_27', 'D_113', 'D_114', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_127', 'D_128', 'D_129', 'B_41', 'B_42', 'D_130', 'D_131', 'D_132', 'D_133', 'R_28', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145']

id_feats = ['customer_ID']
date_col =  'S_2'
num_cat_feats = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_66', 'D_68', 'B_31']
str_cat_feats = ['D_63', 'D_64', ]
float_feats = ['B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17', 'B_18', 'B_19', 'B_2', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26', 'B_27', 'B_28', 'B_29', 'B_3', 'B_30', 'B_32', 'B_33', 'B_36', 'B_37', 'B_38', 'B_39', 'B_4', 'B_40', 'B_41', 'B_42', 'B_5', 'B_6', 'B_7', 'B_8', 'B_9', 'D_102', 'D_103', 'D_104', 'D_105', 'D_106', 'D_107', 'D_108', 'D_109', 'D_110', 'D_111', 'D_112', 'D_113', 'D_114', 'D_115', 'D_116', 'D_117', 'D_118', 'D_119', 'D_120', 'D_121', 'D_122', 'D_123', 'D_124', 'D_125', 'D_126', 'D_127', 'D_128', 'D_129', 'D_130', 'D_131', 'D_132', 'D_133', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144', 'D_145', 'D_39', 'D_41', 'D_42', 'D_43', 'D_44', 'D_45', 'D_46', 'D_47', 'D_48', 'D_49', 'D_50', 'D_51', 'D_52', 'D_53', 'D_54', 'D_55', 'D_56', 'D_58', 'D_59', 'D_60', 'D_61', 'D_62', 'D_65', 'D_66', 'D_68', 'D_69', 'D_70', 'D_71', 'D_72', 'D_73', 'D_74', 'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83', 'D_84', 'D_86', 'D_87', 'D_88', 'D_89', 'D_91', 'D_92', 'D_93', 'D_94', 'D_96', 'P_2', 'P_3', 'P_4', 'R_1', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17', 'R_18', 'R_19', 'R_2', 'R_20', 'R_21', 'R_22', 'R_23', 'R_24', 'R_25', 'R_26', 'R_27', 'R_28', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'S_11', 'S_12', 'S_13', 'S_15', 'S_16', 'S_17', 'S_18', 'S_19', 'S_20', 'S_22', 'S_23', 'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_9']
int_feats = ['B_31']

print(len(float_feats))

185


In [4]:
len(all_cols)

190

In [5]:
%%time
# train_file = '/kaggle/input/amex-train-20220706/train.parquet'
# test_file = '/kaggle/input/amex-test-20020706/amex_test_20220706.parquet'

train_file = 'amex/train.parquet'
test_file = 'amex/amex_test_20220706.parquet'

Wall time: 0 ns


## Train file processing

In [6]:
eps =  1e-8

In [7]:
dest_folder = 'amex/test'

In [8]:
stats_df = pd.read_csv(f'amex/train_agg_median.csv', index_col=0)
stats_df

Unnamed: 0_level_0,min,max,mean,std,first,last,mean2std,last2max,last2min,range
feat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B_1,0.008231,0.095357,0.038720,0.020624,0.027948,0.033219,1.0,0.402201,1.754015,0.065136
B_10,0.033851,0.272729,0.130737,0.027920,0.121036,0.096286,1.0,0.333524,0.516970,0.083319
B_11,0.003701,0.074862,0.027941,0.017387,0.015933,0.021256,1.0,0.430243,2.853625,0.054636
B_12,0.008455,0.032243,0.020392,0.005491,0.016311,0.017545,1.0,0.243477,0.526921,0.017443
B_13,0.010968,0.053904,0.033493,0.011981,0.028056,0.028014,1.0,0.308872,1.011068,0.033255
...,...,...,...,...,...,...,...,...,...,...
S_5,0.002369,0.062134,0.024643,0.018744,0.009989,0.012489,1.0,0.575202,3.867261,0.054910
S_6,0.000837,0.009933,0.006124,0.003102,0.007524,0.006561,1.0,0.437023,4.407298,0.009150
S_7,0.101917,0.258384,0.166948,0.048958,0.147449,0.137705,1.0,0.258638,0.230850,0.138686
S_8,0.005128,0.599643,0.284040,0.128605,0.323709,0.318551,1.0,0.389655,1.428870,0.355751


In [17]:
%%time
agg_files = []
stats = []
for c in float_feats:
    df = dd.read_parquet(test_file, columns=['customer_ID', 'S_2',c], engine='pyarrow')
    x = df.compute().sort_values(by='S_2', ascending=True).groupby("customer_ID").agg({c: ['min', 'max', 'mean', 'std', 'first','last']})
    x.columns = [f'{c1}__{c2}' for c1, c2 in x.columns]
    
    x[f'{c}__mean2std'] = (x[f'{c}__last'] >= (x[f'{c}__mean']-1.5*x[f'{c}__std'])) & (x[f'{c}__last'] <= (x[f'{c}__mean']+1.5*x[f'{c}__std']))
    x[f'{c}__mean2std'] = x[f'{c}__mean2std'].astype(int)
    
    x[f'{c}__last2max'] = (x[f'{c}__max']-x[f'{c}__last'])/(x[f'{c}__max'])
    x[f'{c}__last2min'] = (x[f'{c}__last']-x[f'{c}__min'])/(x[f'{c}__min'])
    x[f'{c}__range'] = (x[f'{c}__max']-x[f'{c}__min'])
    
    
    agg_cols = [f'{c}__min', f'{c}__max', f'{c}__mean', f'{c}__std', f'{c}__first', f'{c}__last', f'{c}__mean2std', 
                f'{c}__last2max', f'{c}__last2min', f'{c}__range']
    x.replace([-np.inf, np.inf], np.nan, inplace=True)
    
    if x.isna().sum().sum()>0:
        if stats_df[stats_df.index==c].shape[0]>0:
            train_median = stats_df[stats_df.index==c].iloc[0]
            for stats_c_ in train_median.index.values:
                x[f'{c}__{stats_c_}'] = x[f'{c}__{stats_c_}'].fillna(value=train_median[stats_c_])
        else:
            print(c)
            for col_ in agg_cols:
                x[col_] = x[col_].fillna(value=x[col_].median())
        
    x[agg_cols] = np.float32(x[agg_cols].values)
    pq.write_table(pa.Table.from_pandas(x),  f'{dest_folder}/{c}.parquet', compression = 'GZIP')
    agg_files.append(f'{c}.parquet')
    
    #--calculate the stats ------------------------------------------------------------------

    stats.append([c] + x[agg_cols].median().values.tolist())

Wall time: 1h 29min 39s


In [18]:
stats_cols = ['feat', 'min', 'max', 'mean', 'std', 'first','last', 
              'mean2std', 'last2max', 'last2min', 'range']

stats_df = pd.DataFrame(stats, columns= stats_cols )


stats_df.to_csv(f'{dest_folder}/test_agg_median.csv', index=False)

In [19]:
%%time
for c in str_cat_feats:
    df = dd.read_parquet(test_file, columns=['customer_ID', c], engine='pyarrow')
    df[c] = df[c].fillna(value='NA') 
    x0 = df.compute().groupby("customer_ID")[c].value_counts().to_frame()
    x0=x0.unstack().fillna(0)
    x0.columns = [f'{c0}={c1}' for c0, c1 in x0.columns]
    x0[x0.columns.tolist()] = np.float32(x0[x0.columns.tolist()].values)
    
    x1 = df.compute().groupby("customer_ID").agg({c: ['nunique']})
    x1.columns = [f'{c1}__{c2}' for c1, c2 in x1.columns]
    x1[x1.columns.tolist()] = np.int32(x1[x1.columns.tolist()].values)
    
    df = dd.read_parquet(test_file, columns=['customer_ID', 'S_2',c], engine='pyarrow')
    x2 = df.compute().sort_values(by='S_2', ascending=True).groupby("customer_ID").agg({c: ['last']})
    x2.columns = [f'{c1}__{c2}' for c1, c2 in x2.columns]
    col_ = x2.columns[0]
    x2[col_] = x2[col_].fillna(value='NA')
    dummies_ = pd.get_dummies(x2[col_])
    dummy_feats_ =  [f'{col_}={cc}' for cc in dummies_.columns]
    dummies_.columns = dummy_feats_
    x2[dummy_feats_] = dummies_.values
    x2.drop(columns=[col_], inplace=True)
    x2.fillna(value=0, inplace=True)
    
    x = x0.merge(x1, left_index=True, right_index=True, how='left')
    x = x.merge(x2, left_index=True, right_index=True, how='left')

    pq.write_table(pa.Table.from_pandas(x),  f'{dest_folder}/{c}.parquet', compression = 'GZIP')
    agg_files.append(f'{c}.parquet')

Wall time: 1min 24s


In [20]:
%%time
for c in int_feats + num_cat_feats:
    df = dd.read_parquet(test_file, columns=['customer_ID', c], engine='pyarrow')
    df[c] = df[c].fillna(value=999) 
    x0 = df.compute().groupby("customer_ID")[c].value_counts().to_frame()
    x0=x0.unstack().fillna(0)
    x0.columns = [f'{c0}={c1}' for c0, c1 in x0.columns]
    x0[x0.columns.tolist()] = np.float32(x0[x0.columns.tolist()].values)
    
    x1 = df.compute().groupby("customer_ID").agg({c: ['nunique']})
    x1.columns = [f'{c1}__{c2}' for c1, c2 in x1.columns]
    x1[x1.columns.tolist()] = np.int32(x1[x1.columns.tolist()].values)
    
    df = dd.read_parquet(test_file, columns=['customer_ID', 'S_2',c], engine='pyarrow')
    df[c] = df[c].fillna(value=999) 
    x2 = df.compute().sort_values(by='S_2', ascending=True).groupby("customer_ID").agg({c: ['last']})
    x2.columns = [f'{c1}__{c2}' for c1, c2 in x2.columns]
    
    x = x0.merge(x1, left_index=True, right_index=True, how='left')
    x = x.merge(x2, left_index=True, right_index=True, how='left')

    pq.write_table(pa.Table.from_pandas(x),  f'{dest_folder}/{c}.parquet', compression = 'GZIP')
    agg_files.append(f'{c}.parquet')

Wall time: 7min 1s


In [21]:
def cal_days(v):
    m0 = v['S_2=min']
    m1 = v['S_2=max']
    if m1 is np.nan:
        m1 = m0
    
    return (datetime.strptime(m1, '%Y-%m-%d') - datetime.strptime(m0, '%Y-%m-%d')).days

        

In [22]:
%%time
for c in ['S_2']:
    df = dd.read_parquet(test_file, columns=['customer_ID', c], engine='pyarrow')
    x = df.compute().groupby("customer_ID").agg({c: ['min', 'max', 'count']})
    x.columns = [f'{c1}={c2}' for c1, c2 in x.columns]   
    
    days = []
    for _, row in x.iterrows():
        days.append(cal_days(row))
    x['days']=days
    
    pq.write_table(pa.Table.from_pandas(x[['S_2=count', 'days']]), f'{dest_folder}/{c}.parquet', compression = 'GZIP')
    agg_files.append(f'{c}.parquet')

Wall time: 2min 37s


## combine all files

In [23]:
len(agg_files), agg_files[:1], len(set(agg_files))

(199, ['B_1.parquet'], 189)

In [24]:
agg_files = list(set(agg_files))

In [25]:
%%time
# df = pd.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')
df = pd.read_csv('amex/sample_submission.csv')

df = df[['customer_ID']].copy(deep=True)

for i, file in enumerate(agg_files):
    df_ = pd.read_parquet(f'{dest_folder}/{file}').reset_index()
    df = df.merge(df_,on=['customer_ID'], how='left')
        
    del df_
    gc.collect()
        
print(i)
pq.write_table(pa.Table.from_pandas(df), f'{dest_folder}/agg_test_all_rev3.parquet', compression = 'GZIP')
# del df
# gc.collect()

188
Wall time: 15min 57s


## log features

In [None]:
%%time
df = pd.read_parquet(f'{dest_folder}/agg_test_all_rev3.parquet', engine='pyarrow')


In [26]:
float_feats_ = list(set(float_feats)-set(num_cat_feats)-set(int_feats))
len(float_feats_)

176

In [27]:
all_cols2 = df.columns.tolist()
all_cols2.sort()
len(all_cols2)

1849

In [28]:
na_cnt = df.isna().sum()
na_cnt[na_cnt>0]

Series([], dtype: int64)

In [29]:
%%time
eps =  1e-8
log_feats = []

for c in all_cols2:
    
    if c in ['customer_ID', 'target']:
        continue
    
    if df[c].dtype not in ['float64', 'float32']:
        continue
        
    if '__' in c:
        c0, c1 = c.split('__')
        if (c0 in float_feats_) & (c1 in ['last', 'mean']):
            if df[c].min()>0:
                df[f'{c}__log'] = np.log(df[c].values + eps)
                df[f'{c}__log'].replace([-np.inf, np.inf], np.nan, inplace=True)
                df[f'{c}__log'] = df[f'{c}__log'].fillna(df[f'{c}__log'].median())
                log_feats.append(f'{c}__log')
            
            

Wall time: 4.49 s


In [33]:
na_cnt = df.isna().sum()
na_cnt[na_cnt>0]

Series([], dtype: int64)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 924621 entries, 0 to 924620
Columns: 2116 entries, customer_ID to S_9__mean__log
dtypes: float32(2090), int32(13), int64(2), object(1), uint8(10)
memory usage: 7.3+ GB


In [32]:
%%time
pq.write_table(pa.Table.from_pandas(df), f'{dest_folder}/agg_test_all_rev3.parquet', compression = 'GZIP')

Wall time: 6min 7s


## correlation with target

In [37]:
all_test_cols = df.columns.tolist()
all_test_cols.sort()
len(all_test_cols)

2116

In [38]:
corr_target = pd.read_csv('amex/train/corr_w_target.csv')

In [40]:
all_train_feats = corr_target['feat'].values.tolist()
all_train_feats.sort()
len(all_train_feats)

2120

In [41]:
set(all_train_feats)-set(all_test_cols)

{'B_28__mean__log', 'D_64=-1', 'D_64__last=-1', 'D_66=0.0', 'D_68=0.0'}

In [42]:
both_feats = list(set(all_train_feats) & set(all_test_cols))
len(both_feats)

2115