In [59]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [60]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    filter_df_for_feature, 
    get_specific_row_df, 
    get_agg_df, get_ma_df
)

In [61]:
# Directory layout, relative to the notebook's working directory.
# NOTE(review): RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from
# utils.constants in the import cell; these assignments override the imported values.
# NOTE(review): DROP_FEATURES_PATH is used further down (noob_features.csv) but is
# not defined in this cell or imported anywhere visible — confirm where it comes from.
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [62]:
# Sub-directories of RAW_DATA_PATH that hold the pickled raw train / test frames.
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [63]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Feature Engineering on Train

In [6]:
%%time
# Load the raw train frame (pickle) and the label table (CSV) through the project
# helper read_file, which prints the shape of what it loaded.
raw_train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")
labels = read_file(f"{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (5531451, 193)
Shape of data: (458913, 2)
CPU times: user 7.24 s, sys: 4.44 s, total: 11.7 s
Wall time: 11.9 s


In [7]:
%%time
# Aggregate the raw train rows with the project helper get_agg_df.
# Presumably this yields one row per customer_ID with the `_avg`/`_min`/`_max`/`_std`
# columns consumed by the derived-feature loop below — TODO confirm in the helper.
train_agg = get_agg_df(raw_train)

CPU times: user 36.3 s, sys: 16.4 s, total: 52.7 s
Wall time: 57.7 s


In [10]:
%%time
train_agg["num_statements"] = (
    raw_train.loc[raw_train["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

CPU times: user 2.36 s, sys: 3.19 s, total: 5.55 s
Wall time: 6.38 s


In [11]:
%%time
# Extract specific-row features with the project helper; the preview below shows
# `_last` and `_first` columns indexed by customer_ID.
train_last_etc = get_specific_row_df(raw_train)

CPU times: user 8.14 s, sys: 5.53 s, total: 13.7 s
Wall time: 15.1 s


In [12]:
# Preview the first three rows of the specific-row features.
train_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_first,D_137_first,D_138_first,D_139_first,D_140_first,D_141_first,D_142_first,D_143_first,D_144_first,D_145_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.934745,0,0.009382,1.007647,0.006104,0.135021,0.0,0.007174,,,...,,,,0.0,0.0,0.0,,0.0,0.00061,0.0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.880519,6,0.034684,1.004028,0.006911,0.165509,0.0,0.005068,,0.060646,...,,,,0.0,0.0,0.0,,0.0,2.7e-05,0.0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,,,...,,,,0.0,0.0,0.0,,0.0,0.002738,0.0


In [13]:
%%time
train_agg = train_last_etc.merge(train_agg, left_index=True, right_index=True, how="inner")
del train_last_etc

CPU times: user 1.7 s, sys: 2.94 s, total: 4.64 s
Wall time: 6.32 s


In [65]:
%%time
# Build the moving-average features with the project helper get_ma_df.
# Presumably the source of the `_ma3_r1`/`_ma3_r2`/`_ma3_r3` columns used by the
# derived-feature loop below — TODO confirm in the helper.
train_ma_df = get_ma_df(raw_train)

CPU times: user 24.5 s, sys: 19.1 s, total: 43.6 s
Wall time: 50.7 s


In [66]:
%%time
train_agg = train_agg.merge(train_ma_df, left_index=True, right_index=True, how="inner")
del train_ma_df

CPU times: user 4.21 s, sys: 13.6 s, total: 17.8 s
Wall time: 26.2 s


In [67]:
# Numeric feature names = raw columns that are neither categorical nor non-feature.
# Built in raw_train's own column order: list(set(...)) has an arbitrary order,
# which made the order of the derived feature columns differ from run to run.
excluded_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = [
    c for c in dict.fromkeys(raw_train.columns) if c not in excluded_columns
]
# Numeric columns followed by the categorical ones (same members as the former set
# union, now with a deterministic order).
all_columns = numeric_columns + list(dict.fromkeys(CATEGORY_COLUMNS))

In [70]:
%%time
for col in tqdm(numeric_columns):
    train_agg[f"{col}_range"] = train_agg[f"{col}_max"] - train_agg[f"{col}_min"]
    train_agg[f"{col}_displacement"] = train_agg[f"{col}_last"] - train_agg[f"{col}_first"]
    train_agg[f"{col}_velocity"] = train_agg[f"{col}_displacement"] / np.log(train_agg["num_statements"])
    train_agg[f"{col}_sprint"] = train_agg[f"{col}_last"] - train_agg[f"{col}_second_last"]
    train_agg[f"{col}_previous_sprint"] = train_agg[f"{col}_second_last"] - train_agg[f"{col}_third_last"]
    train_agg[f"{col}_acceleration"] = train_agg[f"{col}_sprint"] / (train_agg[f"{col}_previous_sprint"] * train_agg[f"{col}_std"])
    train_agg[f"{col}_last_minus_avg"] = train_agg[f"{col}_last"] - train_agg[f"{col}_avg"]
    train_agg[f"{col}_ma3_r1_r2"] = train_agg[f"{col}_ma3_r1"] / train_agg[f"{col}_ma3_r2"]
    train_agg[f"{col}_ma3_r1_r3"] = train_agg[f"{col}_ma3_r1"] / train_agg[f"{col}_ma3_r3"]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:10<00:00, 16.83it/s]

CPU times: user 5.11 s, sys: 3.86 s, total: 8.97 s
Wall time: 10.5 s





In [71]:
# Frame size after adding the derived features.
train_agg.shape

(458913, 3585)

In [40]:
# Scratch variable; the threshold loop further down reuses `column` as its loop
# variable and therefore overwrites this value.
column = "D_39"

In [15]:
# Load the LightGBM feature-importance summary: one row per base feature, one
# column per aggregation type (see the preview below).
lgbm_feature_imp = read_file(f"{PROCESSED_DATA_PATH}/lgbm_feature_imp_summary.csv")

Shape of data: (188, 14)


In [25]:
# plot_heatmap(lgbm_feature_imp.describe(), annot=True)

In [26]:
# Display the importance table (pandas truncates the middle rows).
lgbm_feature_imp

Unnamed: 0,feature,first,second_last,last,avg,max,min,range,std,displacement,sprint,previous_sprint,last_minus_avg,velocity
0,B_1,75.2,83.6,235.2,62.6,,90.4,70.0,85.6,72.0,179.6,168.6,121.6,87.6
1,B_10,105.2,118.8,140.2,76.8,,121.2,92.4,95.6,115.0,164.0,164.2,144.4,143.6
2,B_11,74.0,111.2,181.6,77.4,,102.8,58.0,82.4,73.6,173.0,173.0,103.8,81.8
3,B_12,115.0,90.0,99.8,105.2,108.6,,78.6,110.2,103.2,148.2,183.6,131.4,116.2
4,B_13,133.6,99.2,93.6,94.2,84.6,106.2,88.4,101.4,145.2,182.0,176.2,136.4,138.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,S_5,147.8,102.0,143.0,116.8,117.2,128.6,81.2,108.4,95.0,210.8,176.2,133.0,105.8
184,S_6,,,,15.6,,,,18.6,0.6,,0.4,9.4,2.6
185,S_7,218.6,187.2,222.0,189.0,197.0,186.2,149.4,151.6,150.4,244.6,190.8,196.6,142.0
186,S_8,,43.6,64.8,162.6,91.0,,89.2,114.8,81.4,39.2,55.4,113.0,94.6


In [None]:
# Minimum importance a base feature needs to be kept for an aggregation type.
global_threshold = 20

# Map each aggregation type (every column after "feature") to the base features
# whose importance reaches the threshold; NaN importances never pass the comparison.
features_dict = {}
for column in lgbm_feature_imp.columns[1:]:
    passes_threshold = lgbm_feature_imp[column] >= global_threshold
    features_dict[column] = lgbm_feature_imp.loc[passes_threshold, "feature"].tolist()

In [None]:
# Mean importance of the "first" aggregation relative to the mean over all
# aggregation types. numeric_only=True skips the string "feature" column, which
# otherwise makes DataFrame.mean() raise TypeError on pandas >= 2.0.
lgbm_feature_imp["first"].mean() / lgbm_feature_imp.mean(numeric_only=True).mean()

In [None]:
# Mean importance of the "last" aggregation relative to the mean over all
# aggregation types. numeric_only=True skips the string "feature" column, which
# otherwise makes DataFrame.mean() raise TypeError on pandas >= 2.0.
lgbm_feature_imp["last"].mean() / lgbm_feature_imp.mean(numeric_only=True).mean()

In [None]:
# 30th percentile of the "first" importances. The column contains NaN (see the
# table preview), and np.percentile returns NaN in that case; nanpercentile
# ignores the missing entries.
np.nanpercentile(lgbm_feature_imp["first"], 30)

In [None]:
# Preview the first three rows of the full train feature frame.
train_agg.head(3)

In [None]:
# train_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [None]:
# Frame size before the column-pruning cells below.
train_agg.shape

In [None]:
# backup = train_agg.copy()

#### Drop Average Columns

In [None]:
# Drop every "_avg" column whose base feature is not listed in MEAN_FEATURES.
# excludes="last_minus" is presumably there to leave the "<col>_last_minus_avg"
# columns alone — confirm against get_cols.
avg_cols = [
    name
    for name in get_cols(train_agg, "_avg", excludes="last_minus")
    if name.replace("_avg", "") not in MEAN_FEATURES
]
train_agg = train_agg.drop(columns=avg_cols, errors="ignore")
train_agg.shape

#### Drop Minimum Columns

In [None]:
# Drop every "_min" column whose base feature is not listed in MIN_FEATURES.
# excludes="last_minus" presumably keeps "<col>_last_minus_avg" (which contains
# "_min" as a substring) out of the candidates — confirm against get_cols.
min_cols = [
    name
    for name in get_cols(train_agg, "_min", excludes="last_minus")
    if name.replace("_min", "") not in MIN_FEATURES
]
train_agg = train_agg.drop(columns=min_cols, errors="ignore")
train_agg.shape

#### Drop Maximum Columns

In [None]:
# Drop every "_max" column whose base feature is not listed in MAX_FEATURES.
max_cols = [
    name
    for name in get_cols(train_agg, "_max", excludes="last_minus")
    if name.replace("_max", "") not in MAX_FEATURES
]
train_agg = train_agg.drop(columns=max_cols, errors="ignore")
train_agg.shape

#### Drop First Columns

In [None]:
# Drop every "_first" column whose base feature is not listed in FIRST_FEATURES.
first_cols = [
    name
    for name in get_cols(train_agg, "_first", excludes="last_minus")
    if name.replace("_first", "") not in FIRST_FEATURES
]
train_agg = train_agg.drop(columns=first_cols, errors="ignore")
train_agg.shape

#### Drop Last Columns

In [None]:
# Drop every "_last" column whose base feature is not listed in LAST_FEATURES.
# NOTE(review): "<col>_third_last" also contains "_last" and is not in `excludes`;
# if get_cols matches by substring those columns are dropped here as well
# ("<col>_third" is never a base feature name) — confirm this is intended.
last_cols = [
    name
    for name in get_cols(train_agg, "_last", excludes=["last_minus", "second_last"])
    if name.replace("_last", "") not in LAST_FEATURES
]
train_agg = train_agg.drop(columns=last_cols, errors="ignore")
train_agg.shape

#### Drop Range Columns

In [None]:
# Drop every "_range" column whose base feature is not listed in RANGE_FEATURES.
range_cols = [
    name
    for name in get_cols(train_agg, "_range", excludes=["last_minus", "second_last"])
    if name.replace("_range", "") not in RANGE_FEATURES
]
train_agg = train_agg.drop(columns=range_cols, errors="ignore")
train_agg.shape

#### Drop Noob Features (Based on previous experiment)

In [None]:
# Column names to discard, read from the "feature" column of noob_features.csv.
# NOTE(review): DROP_FEATURES_PATH is not defined or imported anywhere visible in
# this notebook; on a fresh kernel this cell would raise NameError — confirm.
noob_features_df = pd.read_csv(f"{DROP_FEATURES_PATH}/noob_features.csv")
noob_features = noob_features_df["feature"].values.tolist()

In [None]:
# Drop the listed columns; errors="ignore" skips names that are not in the frame.
train_agg = train_agg.drop(columns=noob_features, errors="ignore")
train_agg.shape

In [None]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# train_agg = train_agg.loc[:, feature_list]

In [None]:
# Move the index into a regular column. reset_index names that column "index" only
# when the index is unnamed; if the index is already named (train_last_etc's index
# displays as customer_ID above) the rename is a no-op.
# Not idempotent: running this cell twice adds a second index column.
train_agg = train_agg.reset_index().rename(columns={"index": "customer_ID"})

In [None]:
# Remove non-feature columns; errors='ignore' skips names that are not in the frame.
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [None]:
# Standard-normal noise column (presumably a baseline for judging feature
# importance — TODO confirm). Drawn from a seeded generator so the saved frame is
# identical on every run; the unseeded np.random.randn call was not reproducible.
dummy_rng = np.random.default_rng(42)
train_agg["dummy"] = dummy_rng.standard_normal(train_agg.shape[0])

In [None]:
# Frame size before the target column is attached.
train_agg.shape

In [None]:
# Attach the labels positionally (`.values` discards the labels' index).
# NOTE(review): this is only correct if `labels` rows are in exactly the same
# customer order as train_agg; nothing here checks that — confirm, or merge on
# customer_ID instead.
train_agg["target"] = labels["target"].values
train_agg.shape

In [None]:
# Number of columns per dtype in the final train frame.
train_agg.dtypes.value_counts()

In [None]:
# Persist the final train feature frame.
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg_finer.pkl")

### Feature Engineering on Test

In [None]:
%%time
# Load the raw test frame (pickle) through the project helper read_file.
raw_test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")

In [None]:
%%time
# Aggregate the raw test rows with the same project helper used for train.
test_agg = get_agg_df(raw_test)

In [None]:
%%time
test_agg["num_statements"] = (
    raw_test.loc[raw_test["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

In [None]:
# Preview the first three rows of the test aggregates.
test_agg.head(3)

In [None]:
%%time
# Extract specific-row features for test with the same project helper used for train.
test_last_etc = get_specific_row_df(raw_test)

In [None]:
# Preview the first three rows of the specific-row features.
test_last_etc.head(3)

In [None]:
%%time
test_agg = test_last_etc.merge(test_agg, left_index=True, right_index=True, how="inner")
del test_last_etc

In [None]:
# Numeric feature names = raw columns that are neither categorical nor non-feature.
# Built in raw_test's own column order: list(set(...)) has an arbitrary order,
# which made the order of the derived feature columns differ from run to run.
excluded_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = [
    c for c in dict.fromkeys(raw_test.columns) if c not in excluded_columns
]
# Numeric columns followed by the categorical ones (same members as the former set
# union, now with a deterministic order).
all_columns = numeric_columns + list(dict.fromkeys(CATEGORY_COLUMNS))

In [None]:
# Frame size before the derived features are added.
test_agg.shape

In [None]:
%%time
for col in tqdm(numeric_columns):
    test_agg[f"{col}_range"] = test_agg[f"{col}_max"] - test_agg[f"{col}_min"]
    test_agg[f"{col}_displacement"] = test_agg[f"{col}_last"] - test_agg[f"{col}_first"]
    test_agg[f"{col}_velocity"] = test_agg[f"{col}_displacement"] / np.log(test_agg["num_statements"])
    test_agg[f"{col}_sprint"] = test_agg[f"{col}_last"] - test_agg[f"{col}_second_last"]
    test_agg[f"{col}_previous_sprint"] = test_agg[f"{col}_second_last"] - test_agg[f"{col}_third_last"]
    test_agg[f"{col}_last_minus_avg"] = test_agg[f"{col}_last"] - test_agg[f"{col}_avg"]

In [None]:
# test_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [None]:
# Frame size before the column-pruning cells below.
test_agg.shape

In [None]:
# backup = test_agg.copy()

#### Drop Average Columns

In [None]:
# Drop every "_avg" column whose base feature is not listed in MEAN_FEATURES.
# excludes="last_minus" is presumably there to leave the "<col>_last_minus_avg"
# columns alone — confirm against get_cols.
avg_cols = [
    name
    for name in get_cols(test_agg, "_avg", excludes="last_minus")
    if name.replace("_avg", "") not in MEAN_FEATURES
]
test_agg = test_agg.drop(columns=avg_cols, errors="ignore")
test_agg.shape

#### Drop Minimum Columns

In [None]:
# Drop every "_min" column whose base feature is not listed in MIN_FEATURES.
# excludes="last_minus" presumably keeps "<col>_last_minus_avg" (which contains
# "_min" as a substring) out of the candidates — confirm against get_cols.
min_cols = [
    name
    for name in get_cols(test_agg, "_min", excludes="last_minus")
    if name.replace("_min", "") not in MIN_FEATURES
]
test_agg = test_agg.drop(columns=min_cols, errors="ignore")
test_agg.shape

#### Drop Maximum Columns

In [None]:
# Drop every "_max" column whose base feature is not listed in MAX_FEATURES.
max_cols = [
    name
    for name in get_cols(test_agg, "_max", excludes="last_minus")
    if name.replace("_max", "") not in MAX_FEATURES
]
test_agg = test_agg.drop(columns=max_cols, errors="ignore")
test_agg.shape

#### Drop First Columns

In [None]:
# Drop every "_first" column whose base feature is not listed in FIRST_FEATURES.
first_cols = [
    name
    for name in get_cols(test_agg, "_first", excludes="last_minus")
    if name.replace("_first", "") not in FIRST_FEATURES
]
test_agg = test_agg.drop(columns=first_cols, errors="ignore")
test_agg.shape

#### Drop Last Columns

In [None]:
# Drop every "_last" column whose base feature is not listed in LAST_FEATURES.
# NOTE(review): "<col>_third_last" also contains "_last" and is not in `excludes`;
# if get_cols matches by substring those columns are dropped here as well
# ("<col>_third" is never a base feature name) — confirm this is intended.
last_cols = [
    name
    for name in get_cols(test_agg, "_last", excludes=["last_minus", "second_last"])
    if name.replace("_last", "") not in LAST_FEATURES
]
test_agg = test_agg.drop(columns=last_cols, errors="ignore")
test_agg.shape

#### Drop Range Columns

In [None]:
# Drop every "_range" column whose base feature is not listed in RANGE_FEATURES.
range_cols = [
    name
    for name in get_cols(test_agg, "_range", excludes=["last_minus", "second_last"])
    if name.replace("_range", "") not in RANGE_FEATURES
]
test_agg = test_agg.drop(columns=range_cols, errors="ignore")
test_agg.shape

#### Drop Noob Features (Based on previous experiment)

In [None]:
# Column names to discard, read from the "feature" column of noob_features.csv.
# NOTE(review): DROP_FEATURES_PATH is not defined or imported anywhere visible in
# this notebook; on a fresh kernel this cell would raise NameError — confirm.
noob_features_df = pd.read_csv(f"{DROP_FEATURES_PATH}/noob_features.csv")
noob_features = noob_features_df["feature"].values.tolist()

In [None]:
# Drop the listed columns; errors="ignore" skips names that are not in the frame.
test_agg = test_agg.drop(columns=noob_features, errors="ignore")
test_agg.shape

In [None]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# test_agg = test_agg.loc[:, feature_list]

In [None]:
# Move the index into a regular column. reset_index names that column "index" only
# when the index is unnamed; if the index already has a name the rename is a no-op.
# Not idempotent: running this cell twice adds a second index column.
test_agg = test_agg.reset_index().rename(columns={"index": "customer_ID"})

In [None]:
# Remove non-feature columns; errors='ignore' skips names that are not in the frame.
test_agg = test_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [None]:
# test_agg["dummy"] = np.random.randn(test_agg.shape[0])

In [None]:
# Final test frame size.
test_agg.shape

In [None]:
# Number of columns per dtype in the final test frame.
test_agg.dtypes.value_counts()

In [None]:
# Persist the final test feature frame.
test_agg.to_pickle(f"{PROCESSED_DATA_PATH}/test_agg_finer.pkl")