In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [2]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
# Plot theme applied to every figure in this notebook: blue axes background,
# white text, and the default colour cycle with its first entry swapped for
# yellow (the remaining default colours are kept as they are).
plt.rcParams.update({
    'axes.facecolor': '#0057b8',  # blue
    'axes.prop_cycle': cycler(
        color=['#ffd700', *plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]]
    ),
    'text.color': 'w',
})

In [3]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    filter_df_for_feature, get_specific_row_df, get_agg_df
)

In [4]:
# Project directories, expressed relative to the notebook's working directory.
# NOTE(review): RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from
# utils.constants in an earlier cell; the two assignments below rebind those
# names, so the values used from here on are the literals in this cell.
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"
DROP_FEATURES_PATH = "../dropped_features"

In [5]:
# Sub-directories of RAW_DATA_PATH, one per split (train/test) and per
# storage format (parquet/pickle).
RAW_TRAIN_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "train_parquet")
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "test_parquet")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [6]:
%load_ext autoreload
%autoreload

### Feature Engineering on Train

In [9]:
%%time
# Load the raw training data through the project's read_file helper.
# NOTE(review): the .pkl extension suggests pickle deserialisation — only load
# files that come from a trusted source.
raw_train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")

Shape of data: (5531451, 193)
CPU times: user 2.06 s, sys: 1.62 s, total: 3.69 s
Wall time: 4.85 s


In [10]:
%%time
# Training labels; the "target" column is attached to the engineered training
# frame further down in this notebook.
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

CPU times: user 258 ms, sys: 33.2 ms, total: 291 ms
Wall time: 290 ms


In [11]:
%%time
# Aggregate features computed by the project's get_agg_df helper.  The
# displayed result is indexed by customer ID and contains columns such as
# *_avg and *_std; see get_agg_df for the full list of aggregates.
train_agg = get_agg_df(raw_train)

CPU times: user 19.6 s, sys: 6.89 s, total: 26.5 s
Wall time: 28.2 s


In [12]:
%%time
# Number of statements per customer, derived from the rows where
# row_number == 1 as row_number + row_number_inv - 1.
# NOTE(review): the result is assigned by position, not by customer ID, so it
# assumes these rows appear in the same customer order as train_agg's index —
# confirm against get_agg_df.
train_agg["num_statements"] = (
    raw_train.loc[raw_train["row_number"].eq(1), ["row_number", "row_number_inv"]]
    .sum(axis=1)
    .sub(1)
    .to_numpy()
)

CPU times: user 134 ms, sys: 499 ms, total: 633 ms
Wall time: 586 ms


In [13]:
train_agg.head(3)

Unnamed: 0,R_9_avg,D_80_avg,D_113_avg,D_128_avg,D_50_avg,S_5_avg,B_23_avg,S_20_avg,D_51_avg,D_56_avg,...,B_42_std,S_6_std,D_131_std,B_29_std,S_8_std,S_16_std,B_36_std,R_21_std,D_82_std,num_statements
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,-1.0,0.384615,0.0,1.004154,0.150326,0.029112,0.026177,0.0,2.923077,0.158571,...,,0.0,0.0,,429.583519,0.003047,0.003333,0.0,0.0,13
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,-1.0,0.0,0.0,1.002631,,0.016785,0.013286,0.0,1.153846,0.705671,...,,0.0,0.0,,772.374544,0.003125,0.002787,0.0,0.0,13
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,-1.0,0.0,0.0,0.0,,0.005948,0.023436,0.0,0.615385,0.208154,...,,0.0,0.0,,0.0,0.002772,0.002501,0.0,0.0,13


In [14]:
%%time
# Snapshot features taken from specific statements, via the project's
# get_specific_row_df helper.  The displayed result is indexed by customer_ID
# and includes *_last and *_first columns.
train_last_etc = get_specific_row_df(raw_train)

CPU times: user 1.93 s, sys: 1.47 s, total: 3.4 s
Wall time: 4.04 s


In [15]:
train_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_first,D_137_first,D_138_first,D_139_first,D_140_first,D_141_first,D_142_first,D_143_first,D_144_first,D_145_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,,,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,,0.060646,...,-1,-1,-1,0,0,0.0,,0,2.7e-05,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,,,...,-1,-1,-1,0,0,0.0,,0,0.002738,0


In [16]:
%%time
# Inner join of the snapshot features with the aggregate features on the
# shared index; the snapshot columns come first in the result.
train_agg = pd.merge(
    train_last_etc,
    train_agg,
    how="inner",
    left_index=True,
    right_index=True,
)
# The snapshot frame is no longer needed once merged.
del train_last_etc

CPU times: user 200 ms, sys: 490 ms, total: 690 ms
Wall time: 833 ms


In [17]:
# Numeric feature names: every raw column that is neither categorical nor a
# non-feature column.  The raw column order is preserved on purpose: iterating
# a set of strings yields an order that can change between kernel sessions
# (string hashing is randomised), which would make the order of the derived
# columns differ between the train and test runs of this notebook.
_non_numeric_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = [
    c for c in dict.fromkeys(raw_train.columns) if c not in _non_numeric_columns
]
# Numeric features followed by the categorical ones, without duplicates.
all_columns = list(dict.fromkeys([*numeric_columns, *CATEGORY_COLUMNS]))

In [18]:
train_agg.shape

(458913, 1461)

In [19]:
%%time
# Derived per-feature columns, built from the aggregate and snapshot columns:
#   _range            max - min
#   _displacement     last - first
#   _velocity         displacement / log(num_statements)
#   _sprint           last - second_last
#   _previous_sprint  second_last - third_last
#   _last_minus_avg   last - avg
#
# The new columns are collected in a dict and attached with a single concat.
# Inserting ~1000 columns one at a time leaves the frame heavily fragmented
# (pandas emits a PerformanceWarning for this, which is silenced globally in
# this notebook).  Column names, order and values are the same as before.

# Loop-invariant denominator of the velocity features.
# NOTE: log(1) == 0, so customers with a single statement divide by zero here;
# this is left unchanged to keep the feature values identical.
log_num_statements = np.log(train_agg["num_statements"])

derived = {}
for col in tqdm(numeric_columns):
    displacement = train_agg[f"{col}_last"] - train_agg[f"{col}_first"]
    derived[f"{col}_range"] = train_agg[f"{col}_max"] - train_agg[f"{col}_min"]
    derived[f"{col}_displacement"] = displacement
    derived[f"{col}_velocity"] = displacement / log_num_statements
    derived[f"{col}_sprint"] = train_agg[f"{col}_last"] - train_agg[f"{col}_second_last"]
    derived[f"{col}_previous_sprint"] = train_agg[f"{col}_second_last"] - train_agg[f"{col}_third_last"]
    derived[f"{col}_last_minus_avg"] = train_agg[f"{col}_last"] - train_agg[f"{col}_avg"]

train_agg = pd.concat([train_agg, pd.DataFrame(derived)], axis=1)
# Release the temporary column store.
del derived

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:06<00:00, 27.18it/s]

CPU times: user 5.21 s, sys: 1.07 s, total: 6.28 s
Wall time: 6.54 s





In [20]:
# train_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [21]:
train_agg.shape

(458913, 2523)

In [106]:
# backup = train_agg.copy()

#### Drop Average Columns

In [107]:
# Keep an "_avg" column only when its base feature is listed in MEAN_FEATURES;
# every other "_avg" column returned by get_cols is dropped.
# (excludes="last_minus" presumably keeps the derived *_last_minus_avg columns
# out of the match — confirm against get_cols.)
avg_cols = [
    name
    for name in get_cols(train_agg, "_avg", excludes="last_minus")
    if name.replace("_avg", "") not in MEAN_FEATURES
]
train_agg = train_agg.drop(columns=avg_cols, errors="ignore")
train_agg.shape

(458913, 2477)

#### Drop Minimum Columns

In [108]:
# Keep a "_min" column only when its base feature is listed in MIN_FEATURES;
# every other "_min" column returned by get_cols is dropped.
min_cols = [
    name
    for name in get_cols(train_agg, "_min", excludes="last_minus")
    if name.replace("_min", "") not in MIN_FEATURES
]
train_agg = train_agg.drop(columns=min_cols, errors="ignore")
train_agg.shape

(458913, 2373)

#### Drop Maximum Columns

In [109]:
# Keep a "_max" column only when its base feature is listed in MAX_FEATURES;
# every other "_max" column returned by get_cols is dropped.
max_cols = [
    name
    for name in get_cols(train_agg, "_max", excludes="last_minus")
    if name.replace("_max", "") not in MAX_FEATURES
]
train_agg = train_agg.drop(columns=max_cols, errors="ignore")
train_agg.shape

(458913, 2306)

#### Drop First Columns

In [None]:
# Keep a "_first" column only when its base feature is listed in
# FIRST_FEATURES; every other "_first" column returned by get_cols is dropped.
first_cols = [
    name
    for name in get_cols(train_agg, "_first", excludes="last_minus")
    if name.replace("_first", "") not in FIRST_FEATURES
]
train_agg = train_agg.drop(columns=first_cols, errors="ignore")
train_agg.shape

(458913, 2190)

#### Drop Last Columns

In [None]:
# Keep a "_last" column only when its base feature is listed in LAST_FEATURES;
# every other "_last" column returned by get_cols is dropped.
# NOTE(review): only "last_minus" and "second_last" are excluded from the
# match.  If get_cols matches by substring, *_third_last columns are selected
# too and dropped here (their stripped name ends in "_third") — confirm this
# is intended.
last_cols = [
    name
    for name in get_cols(train_agg, "_last", excludes=["last_minus", "second_last"])
    if name.replace("_last", "") not in LAST_FEATURES
]
train_agg = train_agg.drop(columns=last_cols, errors="ignore")
train_agg.shape

(458913, 1956)

#### Drop Range Columns

In [113]:
# Keep a "_range" column only when its base feature is listed in
# RANGE_FEATURES; every other "_range" column returned by get_cols is dropped.
range_cols = [
    name
    for name in get_cols(train_agg, "_range", excludes=["last_minus", "second_last"])
    if name.replace("_range", "") not in RANGE_FEATURES
]
train_agg = train_agg.drop(columns=range_cols, errors="ignore")
train_agg.shape

(458913, 1855)

#### Drop Noob Features (Based on previous experiment)

In [119]:
# Names of the features to discard, read from the "feature" column of the
# dropped-features CSV.
noob_features = (
    pd.read_csv(f"{DROP_FEATURES_PATH}/noob_features.csv")
    ["feature"]
    .tolist()
)

In [120]:
# Remove the listed features; errors="ignore" skips names that are not
# present in the frame (e.g. already removed by an earlier drop step).
train_agg = train_agg.drop(columns=noob_features, errors="ignore")
train_agg.shape

(458913, 1626)

In [121]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# train_agg = train_agg.loc[:, feature_list]

In [122]:
# Move the index into a regular column.  If the index is unnamed, reset_index
# calls the new column "index", which is then renamed to customer_ID.
train_agg = train_agg.reset_index().rename(columns={"index": "customer_ID"})

In [123]:
# Drop whichever non-feature columns are present; errors='ignore' skips names
# that do not exist in the frame.
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [62]:
# Disabled: this unseeded random "dummy" column has no counterpart in the test
# pipeline (the matching test cell is commented out) and is not part of the
# saved training frame (the recorded shape stays at 1626 columns before the
# target is added).  Running it top-to-bottom would make the train and test
# schemas diverge.
# train_agg["dummy"] = np.random.randn(train_agg.shape[0])

In [124]:
train_agg.shape

(458913, 1626)

In [125]:
# Attach the label column.
# NOTE(review): the assignment is positional (.values), so it assumes the rows
# of `labels` are in the same customer order as the rows of train_agg —
# confirm, since no customer-ID join is performed here.
train_agg["target"] = labels["target"].values
train_agg.shape

(458913, 1627)

In [126]:
train_agg.dtypes.value_counts()

float32     896
float64     580
int8         99
int16        30
category      4
category      4
category      3
category      3
category      2
category      2
category      2
int64         2
dtype: int64

In [127]:
# Persist the engineered training frame (features plus the target column).
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg_finer.pkl")

### Feature Engineering on Test

In [8]:
# Free memory before the test-set pass.  `test` is not defined anywhere above
# in this notebook, so a bare `del test` raises NameError on a fresh kernel;
# remove the name only if it happens to exist, then collect garbage.
globals().pop("test", None)
gc.collect();

In [9]:
%%time
# Load the raw test data through the project's read_file helper.
# NOTE(review): the .pkl extension suggests pickle deserialisation — only load
# files that come from a trusted source.
raw_test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")

Shape of data: (11363762, 192)


In [10]:
%%time
# Aggregate features computed by the project's get_agg_df helper.  The
# displayed result is indexed by customer ID and contains columns such as
# *_avg and *_std; see get_agg_df for the full list of aggregates.
test_agg = get_agg_df(raw_test)

CPU times: user 42.4 s, sys: 28.8 s, total: 1min 11s
Wall time: 1min 38s


In [12]:
%%time
# Number of statements per customer, derived from the rows where
# row_number == 1 as row_number + row_number_inv - 1.
# NOTE(review): the result is assigned by position, not by customer ID, so it
# assumes these rows appear in the same customer order as test_agg's index —
# confirm against get_agg_df.
test_agg["num_statements"] = (
    raw_test.loc[raw_test["row_number"].eq(1), ["row_number", "row_number_inv"]]
    .sum(axis=1)
    .sub(1)
    .to_numpy()
)

CPU times: user 282 ms, sys: 2.28 s, total: 2.56 s
Wall time: 6.5 s


In [13]:
test_agg.head(3)

Unnamed: 0,S_15_avg,B_27_avg,D_78_avg,D_96_avg,B_8_avg,D_131_avg,D_66_avg,S_27_avg,D_125_avg,D_61_avg,...,D_70_std,B_36_std,B_11_std,D_121_std,B_32_std,D_79_std,R_13_std,D_62_std,D_105_std,num_statements
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,7.555556,0.00572,0.0,0.0,1.006641,0.0,-1.0,0.299671,-0.444444,0.597552,...,0.333333,0.00377,0.003372,0.002904,0.0,0.333333,0.0,0.01713,,9
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,4.0,0.005269,0.0,0.461538,1.004206,0.0,-1.0,0.455466,0.0,0.070053,...,0.0,0.003279,0.061976,0.007938,0.0,0.0,0.0,0.003154,,13
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,6.153846,0.002974,0.0,0.0,1.004163,0.0,1.0,0.325662,0.0,0.317425,...,0.50637,0.002576,0.007783,0.00614,0.0,0.0,0.0,0.184829,0.059204,13


In [14]:
%%time
# Snapshot features taken from specific statements, via the project's
# get_specific_row_df helper.  The displayed result is indexed by customer_ID
# and includes *_last and *_first columns.
test_last_etc = get_specific_row_df(raw_test)

CPU times: user 4.15 s, sys: 8.31 s, total: 12.5 s
Wall time: 17.3 s


In [15]:
test_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_first,D_137_first,D_138_first,D_139_first,D_140_first,D_141_first,D_142_first,D_143_first,D_144_first,D_145_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.56893,4,0.010779,1.009347,0.0,0.149413,0.0,0.003576,0.103745,0.007398,...,-1,-1,-1,-1,0,,,-1,0.008281,-1
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.841177,4,0.016562,1.009245,0.0,0.112195,0.0,0.011386,,,...,-1,-1,-1,0,0,0.0,,0,0.008436,0
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.697522,0,0.001484,0.810072,0.0,0.166165,0.0,0.015938,,0.105303,...,-1,-1,-1,1,0,0.886598,0.103164,1,0.322121,1


In [16]:
%%time
# Inner join of the snapshot features with the aggregate features on the
# shared index; the snapshot columns come first in the result.
test_agg = pd.merge(
    test_last_etc,
    test_agg,
    how="inner",
    left_index=True,
    right_index=True,
)
# The snapshot frame is no longer needed once merged.
del test_last_etc

CPU times: user 423 ms, sys: 1.77 s, total: 2.19 s
Wall time: 4.54 s


In [17]:
# Numeric feature names: every raw column that is neither categorical nor a
# non-feature column.  The raw column order is preserved on purpose: iterating
# a set of strings yields an order that can change between kernel sessions
# (string hashing is randomised), which would make the order of the derived
# columns differ between the train and test runs of this notebook.
_non_numeric_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = [
    c for c in dict.fromkeys(raw_test.columns) if c not in _non_numeric_columns
]
# Numeric features followed by the categorical ones, without duplicates.
all_columns = list(dict.fromkeys([*numeric_columns, *CATEGORY_COLUMNS]))

In [18]:
test_agg.shape

(924621, 1461)

In [19]:
%%time
# Derived per-feature columns, built from the aggregate and snapshot columns:
#   _range            max - min
#   _displacement     last - first
#   _velocity         displacement / log(num_statements)
#   _sprint           last - second_last
#   _previous_sprint  second_last - third_last
#   _last_minus_avg   last - avg
#
# The new columns are collected in a dict and attached with a single concat.
# Inserting ~1000 columns one at a time leaves the frame heavily fragmented
# (pandas emits a PerformanceWarning for this, which is silenced globally in
# this notebook).  Column names, order and values are the same as before.

# Loop-invariant denominator of the velocity features.
# NOTE: log(1) == 0, so customers with a single statement divide by zero here;
# this is left unchanged to keep the feature values identical.
log_num_statements = np.log(test_agg["num_statements"])

derived = {}
for col in tqdm(numeric_columns):
    displacement = test_agg[f"{col}_last"] - test_agg[f"{col}_first"]
    derived[f"{col}_range"] = test_agg[f"{col}_max"] - test_agg[f"{col}_min"]
    derived[f"{col}_displacement"] = displacement
    derived[f"{col}_velocity"] = displacement / log_num_statements
    derived[f"{col}_sprint"] = test_agg[f"{col}_last"] - test_agg[f"{col}_second_last"]
    derived[f"{col}_previous_sprint"] = test_agg[f"{col}_second_last"] - test_agg[f"{col}_third_last"]
    derived[f"{col}_last_minus_avg"] = test_agg[f"{col}_last"] - test_agg[f"{col}_avg"]

test_agg = pd.concat([test_agg, pd.DataFrame(derived)], axis=1)
# Release the temporary column store.
del derived

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:08<00:00, 20.67it/s]

CPU times: user 6.18 s, sys: 1.89 s, total: 8.07 s
Wall time: 8.6 s





In [20]:
# test_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [21]:
test_agg.shape

(924621, 2523)

In [22]:
# backup = test_agg.copy()

#### Drop Average Columns

In [23]:
# Keep an "_avg" column only when its base feature is listed in MEAN_FEATURES;
# every other "_avg" column returned by get_cols is dropped.
# (excludes="last_minus" presumably keeps the derived *_last_minus_avg columns
# out of the match — confirm against get_cols.)
avg_cols = [
    name
    for name in get_cols(test_agg, "_avg", excludes="last_minus")
    if name.replace("_avg", "") not in MEAN_FEATURES
]
test_agg = test_agg.drop(columns=avg_cols, errors="ignore")
test_agg.shape

(924621, 2477)

#### Drop Minimum Columns

In [24]:
# Keep a "_min" column only when its base feature is listed in MIN_FEATURES;
# every other "_min" column returned by get_cols is dropped.
min_cols = [
    name
    for name in get_cols(test_agg, "_min", excludes="last_minus")
    if name.replace("_min", "") not in MIN_FEATURES
]
test_agg = test_agg.drop(columns=min_cols, errors="ignore")
test_agg.shape

(924621, 2373)

#### Drop Maximum Columns

In [25]:
# Keep a "_max" column only when its base feature is listed in MAX_FEATURES;
# every other "_max" column returned by get_cols is dropped.
max_cols = [
    name
    for name in get_cols(test_agg, "_max", excludes="last_minus")
    if name.replace("_max", "") not in MAX_FEATURES
]
test_agg = test_agg.drop(columns=max_cols, errors="ignore")
test_agg.shape

(924621, 2306)

#### Drop First Columns

In [26]:
# Keep a "_first" column only when its base feature is listed in
# FIRST_FEATURES; every other "_first" column returned by get_cols is dropped.
first_cols = [
    name
    for name in get_cols(test_agg, "_first", excludes="last_minus")
    if name.replace("_first", "") not in FIRST_FEATURES
]
test_agg = test_agg.drop(columns=first_cols, errors="ignore")
test_agg.shape

(924621, 2190)

#### Drop Last Columns

In [27]:
# Keep a "_last" column only when its base feature is listed in LAST_FEATURES;
# every other "_last" column returned by get_cols is dropped.
# NOTE(review): only "last_minus" and "second_last" are excluded from the
# match.  If get_cols matches by substring, *_third_last columns are selected
# too and dropped here (their stripped name ends in "_third") — confirm this
# is intended.
last_cols = [
    name
    for name in get_cols(test_agg, "_last", excludes=["last_minus", "second_last"])
    if name.replace("_last", "") not in LAST_FEATURES
]
test_agg = test_agg.drop(columns=last_cols, errors="ignore")
test_agg.shape

(924621, 1956)

#### Drop Range Columns

In [28]:
# Keep a "_range" column only when its base feature is listed in
# RANGE_FEATURES; every other "_range" column returned by get_cols is dropped.
range_cols = [
    name
    for name in get_cols(test_agg, "_range", excludes=["last_minus", "second_last"])
    if name.replace("_range", "") not in RANGE_FEATURES
]
test_agg = test_agg.drop(columns=range_cols, errors="ignore")
test_agg.shape

(924621, 1855)

#### Drop Noob Features (Based on previous experiment)

In [29]:
# Names of the features to discard, read from the "feature" column of the
# dropped-features CSV.
noob_features = (
    pd.read_csv(f"{DROP_FEATURES_PATH}/noob_features.csv")
    ["feature"]
    .tolist()
)

In [30]:
# Remove the listed features; errors="ignore" skips names that are not
# present in the frame (e.g. already removed by an earlier drop step).
test_agg = test_agg.drop(columns=noob_features, errors="ignore")
test_agg.shape

(924621, 1626)

In [31]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# test_agg = test_agg.loc[:, feature_list]

In [32]:
# Move the index into a regular column.  If the index is unnamed, reset_index
# calls the new column "index", which is then renamed to customer_ID.
test_agg = test_agg.reset_index().rename(columns={"index": "customer_ID"})

In [33]:
# Drop whichever non-feature columns are present; errors='ignore' skips names
# that do not exist in the frame.
test_agg = test_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [34]:
# test_agg["dummy"] = np.random.randn(test_agg.shape[0])

In [35]:
test_agg.shape

(924621, 1626)

In [36]:
test_agg.dtypes.value_counts()

float32     896
float64     580
int8         96
int16        33
category      4
category      4
category      3
category      3
category      2
category      2
category      2
int64         1
dtype: int64

In [37]:
# Persist the engineered test frame (same feature pipeline as the training
# frame, without a target column).
test_agg.to_pickle(f"{PROCESSED_DATA_PATH}/test_agg_finer.pkl")