In [67]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [68]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
# Notebook-wide matplotlib theme: blue axes background, white text, and a colour
# cycle whose first entry is replaced by '#ffd700' (remaining entries unchanged).
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(
        color=['#ffd700', *plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]]
    ),
    'text.color': 'w',
})

In [69]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    filter_df_for_feature, get_specific_row_df, get_agg_df,
    process_data
)

In [70]:
# Project directories, relative to the notebook's working directory.
# NOTE(review): RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from
# utils.constants earlier in this notebook; the two assignments below shadow
# those imported values — confirm the override is intentional.
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [71]:
# Sub-directories of RAW_DATA_PATH, one per (split, storage format) combination.
# The generator yields train_parquet, train_pickle, test_parquet, test_pickle,
# in that order, matching the names on the left-hand side.
(
    RAW_TRAIN_PARQUET_PATH,
    RAW_TRAIN_PICKLE_PATH,
    RAW_TEST_PARQUET_PATH,
    RAW_TEST_PICKLE_PATH,
) = (
    os.path.join(RAW_DATA_PATH, f"{split}_{storage}")
    for split in ("train", "test")
    for storage in ("parquet", "pickle")
)

In [72]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Feature Engineering on Train

In [73]:
%%time
# Load the raw training data from the pickle under RAW_TRAIN_PICKLE_PATH using
# the project helper `read_file` (imported from utils.extraction_helpers).
# The captured output below reports a shape of (5531451, 193).
raw_train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")

Shape of data: (5531451, 193)
CPU times: user 2.22 s, sys: 2.19 s, total: 4.41 s
Wall time: 6.85 s


In [74]:
%%time
# Read the training labels CSV; its "target" column is attached to the
# engineered frame later in the notebook.
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

CPU times: user 305 ms, sys: 70 ms, total: 375 ms
Wall time: 454 ms


In [75]:
%%time
# Build the aggregate feature frame from the raw rows via the project helper
# `get_agg_df` (utils.feature_engineering_helpers).
# Presumably one row per customer with per-feature statistics (the preview
# further down shows *_avg and *_std columns) — see the helper for details.
train_agg = get_agg_df(raw_train)

CPU times: user 21.4 s, sys: 10.4 s, total: 31.8 s
Wall time: 39.6 s


In [76]:
%%time
train_agg["num_statements"] = (
    raw_train.loc[raw_train["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

CPU times: user 149 ms, sys: 884 ms, total: 1.03 s
Wall time: 1.55 s


In [77]:
# Preview the first three rows of the aggregate frame (including num_statements).
train_agg.head(3)

Unnamed: 0,D_41_avg,D_71_avg,R_1_avg,D_132_avg,S_12_avg,D_39_avg,D_66_avg,D_82_avg,D_127_avg,R_14_avg,...,D_113_std,R_28_std,R_2_std,D_76_std,D_81_std,D_138_std,R_7_std,D_93_std,D_131_std,num_statements
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.0,0.214785,0.004509,,0.247988,0.230769,-1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,13
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.0,0.011508,0.006246,,0.181548,7.153846,-1.0,-1.0,0.076923,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,13
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.0,0.012292,0.006621,,0.190945,0.0,-1.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,13


In [78]:
%%time
# Build the positional-row feature frame via the project helper
# `get_specific_row_df` (utils.feature_engineering_helpers). The preview below
# shows *_last and *_first columns indexed by customer_ID; later cells also read
# *_second_last and *_third_last columns, presumably produced here as well.
train_last_etc = get_specific_row_df(raw_train)

CPU times: user 2.34 s, sys: 4.25 s, total: 6.58 s
Wall time: 11 s


In [79]:
# Preview the first three rows of the positional-row feature frame.
train_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_first,D_137_first,D_138_first,D_139_first,D_140_first,D_141_first,D_142_first,D_143_first,D_144_first,D_145_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,,,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,,0.060646,...,-1,-1,-1,0,0,0.0,,0,2.7e-05,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,,,...,-1,-1,-1,0,0,0.0,,0,0.002738,0


In [80]:
%%time
train_agg = train_last_etc.merge(train_agg, left_index=True, right_index=True, how="inner")
del train_last_etc

CPU times: user 213 ms, sys: 849 ms, total: 1.06 s
Wall time: 2.03 s


In [81]:
# Feature-name lists used by the feature-engineering loop below.
# They are built in raw_train's column order instead of via list(set(...)):
# set iteration order for strings changes between kernel sessions, which would
# make the order of the engineered columns (and of the saved frame)
# non-reproducible from run to run.
_excluded = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
# Raw columns that are neither categorical nor non-feature columns.
numeric_columns = [c for c in dict.fromkeys(raw_train.columns) if c not in _excluded]
# Numeric columns followed by every categorical column (no overlap by construction).
all_columns = numeric_columns + list(dict.fromkeys(CATEGORY_COLUMNS))

In [82]:
# Shape of the merged frame before the derived features are added.
train_agg.shape

(458913, 1461)

In [None]:
%%time
for col in tqdm(numeric_columns):
    train_agg[f"{col}_range"] = train_agg[f"{col}_max"] - train_agg[f"{col}_min"]
    train_agg[f"{col}_displacement"] = train_agg[f"{col}_last"] - train_agg[f"{col}_first"]
    train_agg[f"{col}_velocity"] = train_agg[f"{col}_displacement"] / np.log(train_agg["num_statements"])
    train_agg[f"{col}_sprint"] = train_agg[f"{col}_last"] - train_agg[f"{col}_second_last"]
    train_agg[f"{col}_previous_sprint"] = train_agg[f"{col}_second_last"] - train_agg[f"{col}_third_last"]
    train_agg[f"{col}_last_minus_avg"] = train_agg[f"{col}_last"] - train_agg[f"{col}_avg"]

 11%|█████████████▎                                                                                                        | 20/177 [00:46<05:25,  2.07s/it]

In [None]:
# train_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [18]:
# Shape after adding the derived features.
# NOTE(review): the execution counter of this cell is out of sequence with the
# cells around it, so the captured output may come from an earlier run.
train_agg.shape

(458913, 2158)

In [36]:
# backup = train_agg.copy()

#### Drop Average Columns

In [54]:
# Drop the "_avg" columns whose base feature name is not listed in MEAN_FEATURES.
# get_cols is a project helper (utils.eda_helpers); presumably it returns the
# column names matching "_avg" while skipping those matching `excludes`.
avg_cols = [
    name
    for name in get_cols(train_agg, "_avg", excludes="last_minus")
    if name.replace("_avg", "") not in MEAN_FEATURES
]
train_agg = train_agg.drop(columns=avg_cols, errors="ignore")
train_agg.shape

(458913, 2112)

#### Drop Minimum Columns

In [55]:
# Drop the "_min" columns whose base feature name is not listed in MIN_FEATURES.
# get_cols is a project helper (utils.eda_helpers); presumably it returns the
# column names matching "_min" while skipping those matching `excludes`.
min_cols = [
    name
    for name in get_cols(train_agg, "_min", excludes="last_minus")
    if name.replace("_min", "") not in MIN_FEATURES
]
train_agg = train_agg.drop(columns=min_cols, errors="ignore")
train_agg.shape

(458913, 2008)

#### Drop Maximum Columns

In [56]:
# Drop the "_max" columns whose base feature name is not listed in MAX_FEATURES.
# get_cols is a project helper (utils.eda_helpers); presumably it returns the
# column names matching "_max" while skipping those matching `excludes`.
max_cols = [
    name
    for name in get_cols(train_agg, "_max", excludes="last_minus")
    if name.replace("_max", "") not in MAX_FEATURES
]
train_agg = train_agg.drop(columns=max_cols, errors="ignore")
train_agg.shape

(458913, 1941)

#### Drop First Columns

In [57]:
# Drop the "_first" columns whose base feature name is not listed in FIRST_FEATURES.
# get_cols is a project helper (utils.eda_helpers); presumably it returns the
# column names matching "_first" while skipping those matching `excludes`.
first_cols = [
    name
    for name in get_cols(train_agg, "_first", excludes="last_minus")
    if name.replace("_first", "") not in FIRST_FEATURES
]
train_agg = train_agg.drop(columns=first_cols, errors="ignore")
train_agg.shape

(458913, 1825)

#### Drop Last Columns

In [58]:
# Drop the "_last" columns whose base feature name is not listed in LAST_FEATURES.
# "last_minus" and "second_last" columns are excluded from the candidates.
# NOTE(review): "_third_last" columns are not excluded; if get_cols matches them
# they are reduced to "<feature>_third" by the replace below and therefore
# dropped — confirm this is intended.
last_cols = [
    name
    for name in get_cols(train_agg, "_last", excludes=["last_minus", "second_last"])
    if name.replace("_last", "") not in LAST_FEATURES
]
train_agg = train_agg.drop(columns=last_cols, errors="ignore")
train_agg.shape

(458913, 1779)

#### Drop Range Columns

In [59]:
# Drop the "_range" columns whose base feature name is not listed in RANGE_FEATURES.
# The `excludes` list is the same one used for the "_last" columns above;
# presumably it has no effect on "_range" names — verify against get_cols.
range_cols = [
    name
    for name in get_cols(train_agg, "_range", excludes=["last_minus", "second_last"])
    if name.replace("_range", "") not in RANGE_FEATURES
]
train_agg = train_agg.drop(columns=range_cols, errors="ignore")
train_agg.shape

(458913, 1678)

In [None]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# train_agg = train_agg.loc[:, feature_list]

In [60]:
# Move the index into a regular column; if the index was unnamed, pandas calls
# the new column "index", which is renamed to "customer_ID" here.
# Not idempotent: running this cell twice adds a second index column.
train_agg = train_agg.reset_index().rename(columns={"index": "customer_ID"})

In [61]:
# Remove every column listed in NON_FEATURE_COLUMNS; names that are not present
# are skipped silently because of errors='ignore'.
# NOTE(review): the recorded shapes suggest exactly one column is removed here,
# presumably the customer_ID column created in the previous cell — confirm.
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [62]:
# Add a column of standard-normal noise (presumably a random baseline feature —
# confirm with downstream usage). The generator is seeded so that the frame
# saved at the end of this section is identical across runs.
DUMMY_SEED = 42
train_agg["dummy"] = np.random.default_rng(DUMMY_SEED).standard_normal(train_agg.shape[0])

In [63]:
# Shape after dropping non-feature columns and adding the "dummy" column.
train_agg.shape

(458913, 1679)

In [64]:
# Attach the label column from the labels CSV.
# NOTE(review): `.values` discards the index of `labels`, so the assignment is
# purely positional. It is only correct if the rows of `labels` are in the same
# customer order as the rows of train_agg — TODO confirm (e.g. by comparing
# customer IDs before they are dropped).
train_agg["target"] = labels["target"].values
train_agg.shape

(458913, 1680)

In [65]:
# Count columns per dtype as a quick sanity check of the final frame.
train_agg.dtypes.value_counts()

float32     830
float64     655
int8        140
int16        31
category      5
category      4
category      3
category      3
category      2
category      2
category      2
int64         2
category      1
dtype: int64

In [66]:
# Persist the engineered training frame (features plus "dummy" and "target")
# under PROCESSED_DATA_PATH.
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg_finer.pkl")

### Feature Engineering on Test

In [None]:
# test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")