In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import gc
import numpy as np
import os
import sys
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from tqdm import tqdm

In [10]:
from pathlib import Path
rootpath = Path.cwd().parent
sys.path.append(os.path.join(rootpath))

In [46]:
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import plot_missing_proportion_barchart, get_cols
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, SPECIAL_INT_COLUMNS, BINARY_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, CONFIRM_DROP_FEATURES
)

In [41]:
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"

In [42]:
RAW_TRAIN_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "train_parquet")
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "test_parquet")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [43]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read Data

In [15]:
train = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train_data.parquet")
# test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")

Shape of data: (5531451, 191)


In [17]:
train.head(3)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,...,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,...,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,...,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,...,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,...,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827,0


In [22]:
def insert_row_number(df):
    if "row_number" not in df.columns:
        df.insert(1, "row_number", train.groupby("customer_ID")["S_2"].rank(method="first", ascending=False).astype(int))
    if "row_number_inv" not in df.columns:
        df.insert(1, "row_number_inv", train.groupby("customer_ID")["S_2"].rank(method="first", ascending=True).astype(int))
    return df

In [24]:
train = insert_row_number(train)

In [26]:
train.head(3)

Unnamed: 0,customer_ID,row_number_inv,row_number,S_2,P_2,D_39,B_1,B_2,R_1,S_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,1,13,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,...,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2,12,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,...,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,3,11,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,...,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603,0


### Check Data Type

In [29]:
train.dtypes.value_counts()

float32           185
int64               4
object              3
datetime64[ns]      1
dtype: int64

In [None]:
def set_binary_columns(df, column_list=BINARY_COLUMNS):
    for column in tqdm(column_list):
        if df[column].dtype != "float16":
            df[column] = df[column].round().astype("float16")

101548

In [50]:
for column in tqdm(BINARY_COLUMNS):
    print(train[column].dtype, train[column].isnull().sum())

 18%|█████████████████████████▊                                                                                                                    | 6/33 [00:00<00:00, 51.70it/s]

float32 22268
int64 0
float32 0
float32 2016
float32 0
float32 0
float32 0
float32 0
float32 0
float32 0
float32 0


 52%|████████████████████████████████████████████████████████████████████████▋                                                                    | 17/33 [00:00<00:00, 48.08it/s]

float32 0
float32 0
float32 128703
float32 0
float32 0
float32 0
float32 0
float32 2016
float32 0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33/33 [00:00<00:00, 55.66it/s]

float32 0
float32 0
float32 0
float32 101548
float32 1597
float32 0
float32 101548
float32 101548
float32 101548
float32 5336752
float32 101548
float32 40632
float32 101548





In [44]:
SPECIAL_INT_COLUMNS

['target', 'row_number', 'row_number_inv']

In [31]:
train.select_dtypes("int64")

Unnamed: 0,row_number_inv,row_number,B_31,target
0,1,13,1,0
1,2,12,1,0
2,3,11,1,0
3,4,10,1,0
4,5,9,1,0
...,...,...,...,...
5531446,9,5,1,0
5531447,10,4,1,0
5531448,11,3,1,0
5531449,12,2,1,0


In [None]:
for column in tqdm(BINARY_COLUMNS):
    if df[column].dtype != "float16":
        df[column] = df[column].round().astype("float16")

In [None]:
def process_and_feature_engineer(df):
    all_cols = [c for c in list(df.columns) if c not in ['customer_ID', 'S_2']]
    cat_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]
    num_features = [col for col in all_cols if col not in cat_features]

    test_num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    
    test_num_agg2 = df.groupby("customer_ID")[num_features].agg(['first', 'last'])
    test_num_agg2.columns = ['_'.join(x) for x in test_num_agg2.columns]

    test_cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

    df = pd.concat([test_num_agg, test_num_agg2, test_cat_agg], axis=1).reset_index()
    print(df.columns)
    del test_num_agg, test_num_agg2, test_cat_agg
    print('shape after engineering', df.shape )
    
    min_features = [col for col in df.columns if col.endswith("_min")]
    max_features = [col for col in df.columns if col.endswith("_max")]
    for mini, maxi in zip(min_features, max_features):
        df.loc[:, maxi.replace("_max", "_range")] = df[maxi] - df[mini]
    
    first_features = [col for col in df.columns if col.endswith("_first")]
    last_features = [col for col in df.columns if col.endswith("_last")]
    for first, last in zip(first_features, last_features):
        df.loc[:, first.replace("_first", "_disp")] = df[last] - df[first]
        
    mean_features = [col for col in df.columns if col.endswith("_mean")]
    std_features = [col for col in df.columns if col.endswith("_std")]
    for mean, std in zip(mean_features, std_features):
        # CV stands for dispersion ratio => Coefficient of Variation
        df.loc[:, mean.replace("_mean", "_cv")] = df[std] / df[mean]
    
    range_features = [col for col in df.columns if col.endswith("_range")]
    for range_, std in zip(range_features, std_features):
        # DR stands for dispersion ratio => Standard Deviation / Range
        df.loc[:, range_.replace("_range", "_dr")] = df[std] / df[range_]
    return df