In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import optuna
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from lightgbm import LGBMClassifier, log_evaluation
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.calibration import CalibrationDisplay
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, fbeta_score, make_scorer
from tqdm import tqdm

In [2]:
from matplotlib.ticker import MaxNLocator
from matplotlib.colors import ListedColormap
from cycler import cycler
from IPython.display import display
from colorama import Fore, Back, Style
# Plot theme: blue axes background, yellow as the first colour of the line cycle
# (the remaining default cycle colours are kept), and white text.
_default_cycle_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.rcParams.update({
    'axes.facecolor': '#0057b8',
    'axes.prop_cycle': cycler(color=['#ffd700'] + _default_cycle_colors[1:]),
    'text.color': 'w',
})

In [3]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    filter_df_for_feature, get_specific_row_df, get_agg_df,
    process_data
)

In [4]:
# Project directory locations, written relative to the notebook's working directory.
# RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from utils.constants in an
# earlier cell; the assignments below replace those imported values for this notebook.
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [5]:
# Sub-directories of RAW_DATA_PATH holding the train/test data in parquet and pickle form.
(
    RAW_TRAIN_PARQUET_PATH,
    RAW_TRAIN_PICKLE_PATH,
    RAW_TEST_PARQUET_PATH,
    RAW_TEST_PICKLE_PATH,
) = (
    os.path.join(RAW_DATA_PATH, folder)
    for folder in ("train_parquet", "train_pickle", "test_parquet", "test_pickle")
)

In [6]:
%load_ext autoreload
%autoreload

### Feature Engineering on Train

In [12]:
%%time
# Load the raw training data (train_data.pkl) through the project's read_file helper.
# NOTE(review): the "Shape of data" line in the output is presumably printed by
# read_file itself — confirm in utils.extraction_helpers.
raw_train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")

Shape of data: (5531451, 193)
CPU times: user 2.01 s, sys: 1.58 s, total: 3.58 s
Wall time: 4.61 s


In [13]:
%%time
# Read the training labels CSV; its "target" column is attached to the features later on.
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

CPU times: user 249 ms, sys: 38.7 ms, total: 287 ms
Wall time: 287 ms


In [15]:
%%time
# Build the aggregate feature frame from the raw rows with the project's get_agg_df helper.
# NOTE(review): the preview below shows *_avg and *_std columns indexed by customer ID;
# the full list of aggregations is defined in utils.feature_engineering_helpers.
train_agg = get_agg_df(raw_train)

CPU times: user 19.3 s, sys: 4.87 s, total: 24.2 s
Wall time: 24.2 s


In [20]:
%%time
train_agg["num_statements"] = (
    raw_train.loc[raw_train["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

CPU times: user 121 ms, sys: 27.1 ms, total: 148 ms
Wall time: 142 ms


In [23]:
# Preview the first three rows of the aggregate frame.
train_agg.head(3)

Unnamed: 0,D_75_avg,R_16_avg,D_83_avg,P_4_avg,B_12_avg,B_2_avg,S_25_avg,D_131_avg,D_96_avg,B_26_avg,...,R_2_std,B_4_std,D_132_std,D_140_std,D_128_std,B_37_std,R_24_std,D_71_std,R_7_std,num_statements
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.461538,0.0,0.0,0.0,0.125683,1.005086,0.974669,0.0,0.0,0.004408,...,0.0,2.44425,,0.0,0.003293,0.006662,0.0,0.13233,0.0,13
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.0,0.076923,0.0,0.0,0.025823,0.991083,0.975606,0.0,0.0,0.005267,...,0.0,0.800641,,0.0,0.002154,0.027278,0.0,0.003392,0.0,13
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.230769,0.0,0.0,0.0,0.011541,0.815677,0.973897,0.0,0.0,0.00621,...,0.0,1.69085,,0.0,0.0,0.003195,0.0,0.002854,0.0,13


In [18]:
%%time
# Extract row-specific snapshot features with the project's get_specific_row_df helper.
# NOTE(review): the preview below shows *_last and *_first columns; later cells also read
# *_second_last — confirm which frame supplies it.
train_last_etc = get_specific_row_df(raw_train)

CPU times: user 1.5 s, sys: 762 ms, total: 2.26 s
Wall time: 2.24 s


In [22]:
# Preview the first three rows of the snapshot-feature frame.
train_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_first,D_137_first,D_138_first,D_139_first,D_140_first,D_141_first,D_142_first,D_143_first,D_144_first,D_145_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,,,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,,0.060646,...,-1,-1,-1,0,0,0.0,,0,2.7e-05,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,,,...,-1,-1,-1,0,0,0.0,,0,0.002738,0


In [25]:
%%time
train_agg = train_last_etc.merge(train_agg, left_index=True, right_index=True, how="inner")
del train_last_etc

CPU times: user 133 ms, sys: 166 ms, total: 298 ms
Wall time: 324 ms


In [26]:
# Base feature names: every raw_train column that is neither categorical nor a
# non-feature column. Built in raw_train's own column order rather than via set
# arithmetic, because set iteration order for strings changes between kernel sessions
# and would make the order of the derived columns (and the saved frame) irreproducible.
_excluded_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = [col for col in raw_train.columns if col not in _excluded_columns]
# Numeric features followed by the categorical ones, de-duplicated with order preserved.
all_columns = list(dict.fromkeys([*numeric_columns, *CATEGORY_COLUMNS]))

In [27]:
# Row/column count after joining the snapshot features with the aggregates.
train_agg.shape

(458913, 1273)

In [28]:
%%time
for col in tqdm(numeric_columns):
    train_agg[f"{col}_range"] = train_agg[f"{col}_max"] - train_agg[f"{col}_min"]
    train_agg[f"{col}_displacement"] = train_agg[f"{col}_last"] - train_agg[f"{col}_first"]
    train_agg[f"{col}_velocity"] = train_agg[f"{col}_displacement"] / np.log(train_agg["num_statements"])
    train_agg[f"{col}_sprint"] = train_agg[f"{col}_last"] - train_agg[f"{col}_second_last"]
    train_agg[f"{col}_last_minus_avg"] = train_agg[f"{col}_last"] - train_agg[f"{col}_avg"]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:05<00:00, 32.71it/s]

CPU times: user 4.42 s, sys: 855 ms, total: 5.27 s
Wall time: 5.45 s





In [29]:
# Row/column count after adding the derived per-feature columns.
train_agg.shape

(458913, 2158)

In [None]:
# NOTE(review): disabled feature-selection step, kept for reference only. Before
# re-enabling it, note that it reads `train.columns` although no `train` frame is
# defined in the cells above, and that it builds `*_speed` names whereas the
# derived-feature loop in this notebook creates `*_sprint` columns — verify every
# suffix against the columns actually present in train_agg.
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# train_agg = train_agg.loc[:, feature_list]

In [30]:
# Move the index into a regular column. reset_index names that column "index" when
# the index has no name, so it is renamed to "customer_ID" here.
train_agg = train_agg.reset_index().rename(columns={"index": "customer_ID"})

In [31]:
# Remove whichever NON_FEATURE_COLUMNS are present; errors='ignore' skips names that
# are not columns of the frame instead of raising.
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [32]:
# Add a column of independent standard-normal noise, one draw per row.
# NOTE(review): presumably a baseline for judging feature importance — confirm.
# The generator is seeded so that Restart & Run All writes the same frame every time;
# unseeded draws would change the saved features on each run.
DUMMY_FEATURE_SEED = 42
train_agg["dummy"] = np.random.default_rng(DUMMY_FEATURE_SEED).standard_normal(train_agg.shape[0])

In [33]:
# Row/column count after the index reset, non-feature drop and dummy column.
train_agg.shape

(458913, 2159)

In [34]:
# Attach the label column. `.values` discards the labels' index, so the assignment is
# purely positional and relies on `labels` rows being in the same order as the rows
# of `train_agg`.
# NOTE(review): nothing in this cell verifies that ordering — confirm both are ordered
# by customer ID before trusting the target column.
train_agg["target"] = labels["target"].values
train_agg.shape

(458913, 2160)

In [35]:
# Tally how many columns use each dtype.
train_agg.dtypes.value_counts()

float32     930
float64     691
int8        450
int16        54
category      9
category      6
category      3
category      3
category      3
category      3
category      3
category      3
int64         2
dtype: int64

In [36]:
# Persist the engineered training frame (features plus the "dummy" and "target"
# columns added above) as a pickle under PROCESSED_DATA_PATH.
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg.pkl")

### Feature Engineering on Test

In [None]:
# test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")