In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    filter_df_for_feature, 
    get_specific_row_df, 
    get_agg_df, get_ma_df
)

In [3]:
# Directory layout used by this notebook (relative paths).
# NOTE(review): RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from
# utils.constants in the import cell; these assignments override the imported values.
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [4]:
# Sub-directories of RAW_DATA_PATH that hold the pickled train / test data.
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [5]:
%load_ext autoreload
%autoreload

## Feature Engineering on Train

### Read master data & train labels

In [None]:
%%time
# Load the pickled raw training data and the training labels CSV.
raw_train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")
labels = read_file(f"{RAW_DATA_PATH}/train_labels.csv")

In [None]:
# numeric_columns = raw_train.select_dtypes(np.number).drop(columns=NON_FEATURE_COLUMNS, errors='ignore').columns.tolist()

In [None]:
# raw_train.loc[:, ["customer_ID"] + numeric_columns].groupby(["customer_ID"]).diff()

In [None]:
%%time
raw_train["P_200"] = raw_train["P_2"] - raw_train.groupby("customer_ID")["P_2"].transform(lambda x: x.ewm(span=13).mean()).values
raw_train["P_300"] = raw_train["P_3"] - raw_train.groupby("customer_ID")["P_3"].transform(lambda x: x.ewm(span=13).mean()).values
raw_train["S_300"] = raw_train["S_3"] - raw_train.groupby("customer_ID")["S_3"].transform(lambda x: x.ewm(span=13).mean()).values
raw_train["S_1200"] = raw_train["S_12"] - raw_train.groupby("customer_ID")["S_12"].transform(lambda x: x.ewm(span=13).mean()).values

In [None]:
%%time
raw_train["P_22"] = raw_train.groupby("customer_ID")["P_2"].pct_change()
raw_train["P_33"] = raw_train.groupby("customer_ID")["P_3"].pct_change()
raw_train["S_33"] = raw_train.groupby("customer_ID")["S_3"].pct_change()
raw_train["S_1212"] = raw_train.groupby("customer_ID")["S_12"].pct_change()

### Get simple aggregation values (average, min, max, std, skew)

In [None]:
%%time
# Build the aggregated feature frame with the project helper get_agg_df.
train_agg = get_agg_df(raw_train)

In [None]:
# Sanity check: dimensions of the aggregated frame.
train_agg.shape

In [None]:
# Preview the first rows of the aggregated frame.
train_agg.head()

### Calculate number of statements (data availability for each client)

In [None]:
%%time
train_agg["num_statements"] = (
    raw_train.loc[raw_train["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

### Get Last, Second-Last, Third-Last, First

In [None]:
%%time
# Extract specific rows (last, second-last, third-last, first - per the section
# heading) through the project helper get_specific_row_df.
train_last_etc = get_specific_row_df(raw_train)

In [None]:
# Preview the extracted rows.
train_last_etc.head(3)

In [None]:
%%time
# Index-aligned inner join: only index values present in both frames survive.
train_agg = train_last_etc.merge(train_agg, left_index=True, right_index=True, how="inner")
# Release the intermediate frame now that it has been merged.
del train_last_etc

### Get Moving Average columns (MA3_R1, MA3_R2, MA3_R3, MA3_R4)

In [None]:
%%time
# Build the moving-average feature frame through the project helper get_ma_df.
train_ma_df = get_ma_df(raw_train)

In [None]:
%%time
# Index-aligned inner join of the moving-average columns onto the feature frame.
train_agg = train_agg.merge(train_ma_df, left_index=True, right_index=True, how="inner")
# Release the intermediate frame now that it has been merged.
del train_ma_df

In [None]:
# Numeric feature columns: every raw column that is neither categorical nor a
# non-feature column. The raw column order is preserved (instead of going
# through a set difference) so the list - and therefore the order in which the
# crossed features are appended below - is the same on every run; iteration
# order of a set of strings changes between Python processes.
excluded_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = list(
    dict.fromkeys(name for name in raw_train.columns if name not in excluded_columns)
)
# Numeric columns followed by the categorical ones, without duplicates.
all_columns = list(dict.fromkeys([*numeric_columns, *CATEGORY_COLUMNS]))
# The raw frame is no longer needed after this point; free its memory.
del raw_train

### Feature Crossing between aggregated features

In [None]:
%%time
for col in tqdm(numeric_columns):
    train_agg[f"{col}_range"] = train_agg[f"{col}_max"] - train_agg[f"{col}_min"]
    train_agg[f"{col}_displacement"] = train_agg[f"{col}_last"] - train_agg[f"{col}_first"]
    train_agg[f"{col}_velocity"] = train_agg[f"{col}_displacement"] / np.log(train_agg["num_statements"])
    train_agg[f"{col}_sprint"] = train_agg[f"{col}_last"] - train_agg[f"{col}_second_last"]
    train_agg[f"{col}_previous_sprint"] = train_agg[f"{col}_second_last"] - train_agg[f"{col}_third_last"]
    train_agg[f"{col}_acceleration"] = train_agg[f"{col}_sprint"] / (train_agg[f"{col}_previous_sprint"] * train_agg[f"{col}_std"]).replace(
        [np.inf, -np.inf], np.nan
    )
    train_agg[f"{col}_last_minus_avg"] = train_agg[f"{col}_last"] - train_agg[f"{col}_avg"]
    train_agg[f"{col}_coef_var"] = (train_agg[f"{col}_std"] / train_agg[f"{col}_avg"]).replace([np.inf, -np.inf], np.nan)
    # train_agg[f"{col}_skew_std_ratio"] = (train_agg[f"{col}_skew"] / train_agg[f"{col}_std"]).replace([np.inf, -np.inf], np.nan)
    train_agg[f"{col}_ma3_r1_r2"] = train_agg[f"{col}_ma3_r1"] / train_agg[f"{col}_ma3_r2"]
    train_agg[f"{col}_ma3_r1_r3"] = train_agg[f"{col}_ma3_r1"] / train_agg[f"{col}_ma3_r3"]
    train_agg[f"{col}_ma3_r1_r4"] = train_agg[f"{col}_ma3_r1"] / train_agg[f"{col}_ma3_r4"]

In [None]:
# num_statements was only needed as the velocity denominator; remove it from the feature frame.
train_agg = train_agg.drop(columns=["num_statements"])

### Final Preparation before export processed data

In [None]:
%%time
# Move the index into a regular column; the rename only takes effect when the
# index is unnamed (reset_index then produces a column called "index").
train_agg = train_agg.reset_index().rename(columns={"index": "customer_ID"})

In [None]:
# Sanity check: dimensions after resetting the index.
train_agg.shape

In [None]:
# Remove every non-feature column that is present; missing ones are skipped.
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [None]:
# Attach the target by position (row i of labels -> row i of train_agg).
# NOTE(review): presumably both frames are ordered by customer_ID; nothing
# here verifies that - confirm before relying on the labels.
train_agg["target"] = labels["target"].values
train_agg.shape

In [None]:
# Sanity check: dimensions of the final frame.
train_agg.shape

In [None]:
# Persist the processed training features (including target) as a pickle.
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg_complete2.pkl")

In [None]:
# train_agg["dummy"] = np.random.randn(train_agg.shape[0])

In [None]:
# train_agg.shape

In [None]:
# Count how many columns there are of each dtype.
train_agg.dtypes.value_counts()

### END

In [None]:
# plot_heatmap(lgbm_feature_imp.describe(), annot=True)

In [None]:
# lgbm_feature_imp

In [None]:
# NOTE(review): lgbm_feature_imp is not defined in any cell visible in this
# notebook; this cell needs it to exist in the kernel already.
# For every column after the first, keep the features whose value in that
# column is at least global_threshold.
global_threshold = 20
features_dict = {
    importance_col: lgbm_feature_imp.loc[
        lgbm_feature_imp[importance_col] >= global_threshold, "feature"
    ].tolist()
    for importance_col in lgbm_feature_imp.columns[1:]
}

In [None]:
# Mean of the "first" column relative to the mean over all numeric columns.
# numeric_only=True skips the string "feature" column, which DataFrame.mean()
# no longer drops silently on pandas >= 2.0.
lgbm_feature_imp["first"].mean() / lgbm_feature_imp.mean(numeric_only=True).mean()

In [None]:
# Mean of the "last" column relative to the mean over all numeric columns.
# numeric_only=True skips the string "feature" column, which DataFrame.mean()
# no longer drops silently on pandas >= 2.0.
lgbm_feature_imp["last"].mean() / lgbm_feature_imp.mean(numeric_only=True).mean()

In [None]:
# 30th percentile of the "first" column.
np.percentile(lgbm_feature_imp["first"], 30)

## Feature Engineering on Test

In [6]:
%%time
# Load the pickled raw test data.
raw_test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")

Shape of data: (11363762, 192)
CPU times: user 3.18 s, sys: 4.01 s, total: 7.18 s
Wall time: 9.16 s


In [7]:
%%time
raw_test["P_200"] = raw_test["P_2"] - raw_test.groupby("customer_ID")["P_2"].transform(lambda x: x.ewm(span=13).mean()).values
raw_test["P_300"] = raw_test["P_3"] - raw_test.groupby("customer_ID")["P_3"].transform(lambda x: x.ewm(span=13).mean()).values
raw_test["S_300"] = raw_test["S_3"] - raw_test.groupby("customer_ID")["S_3"].transform(lambda x: x.ewm(span=13).mean()).values
raw_test["S_1200"] = raw_test["S_12"] - raw_test.groupby("customer_ID")["S_12"].transform(lambda x: x.ewm(span=13).mean()).values

CPU times: user 3min 34s, sys: 5.94 s, total: 3min 40s
Wall time: 3min 37s


In [8]:
%%time
raw_test["P_22"] = raw_test.groupby("customer_ID")["P_2"].pct_change()
raw_test["P_33"] = raw_test.groupby("customer_ID")["P_3"].pct_change()
raw_test["S_33"] = raw_test.groupby("customer_ID")["S_3"].pct_change()
raw_test["S_1212"] = raw_test.groupby("customer_ID")["S_12"].pct_change()

CPU times: user 5.39 s, sys: 335 ms, total: 5.73 s
Wall time: 5.59 s


In [9]:
%%time
# Build the per-customer aggregated feature frame with the project helper get_agg_df.
test_agg = get_agg_df(raw_test)

Average done
Minimum done
Maximum done
Standard Deviation done
CPU times: user 41.2 s, sys: 28.5 s, total: 1min 9s
Wall time: 1min 27s


In [10]:
%%time
test_agg["num_statements"] = (
    raw_test.loc[raw_test["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

CPU times: user 475 ms, sys: 2.9 s, total: 3.37 s
Wall time: 7.12 s


In [11]:
# Preview the aggregated frame, now including num_statements.
test_agg.head(3)

Unnamed: 0_level_0,B_3_avg,D_54_avg,B_23_avg,D_112_avg,B_29_avg,D_138_avg,D_51_avg,R_20_avg,D_96_avg,R_3_avg,...,D_42_std,D_61_std,B_4_std,B_9_std,D_142_std,R_28_std,R_16_std,D_115_std,S_22_std,num_statements
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.009732,1.0,0.254266,1.0,0.003894,,0.111111,0.0,0.0,1.0,...,0.006688,0.016234,0.600925,0.003084,,0.0,0.0,0.004236,0.024732,9
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.00616,1.0,0.022771,1.0,,,0.461538,0.0,0.461538,0.0,...,,0.029479,2.785033,0.077545,,0.0,0.0,0.008609,0.022792,13
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.008552,1.0,0.050163,0.849524,0.002994,,0.0,0.0,0.0,0.384615,...,,0.180088,12.873646,0.002235,0.015235,0.0,0.0,0.084998,0.371791,13


In [12]:
%%time
# Extract the last / first / second-last / third-last entries through the
# project helper get_specific_row_df.
test_last_etc = get_specific_row_df(raw_test)

Last entry done
First entry done
Second last entry done
Third last entry done
CPU times: user 5.32 s, sys: 15.2 s, total: 20.5 s
Wall time: 33.1 s


In [13]:
# Preview the extracted rows.
test_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_144_first,D_145_first,P_200_first,P_300_first,S_300_first,S_1200_first,P_22_first,P_33_first,S_33_first,S_1212_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.56893,4,0.010779,1.009347,0.0,0.149413,0.0,0.003576,0.103745,0.007398,...,0.008281,,0.0,,0.0,0.0,,,,
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.841177,4,0.016562,1.009245,0.0,0.112195,0.0,0.011386,,,...,0.008436,0.0,0.0,0.0,0.0,0.0,,,,
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.697522,0,0.001484,0.810072,0.0,0.166165,0.0,0.015938,,0.105303,...,0.322121,1.0,0.0,0.0,0.0,0.0,,,,


In [14]:
%%time
# Index-aligned inner join: only index values present in both frames survive.
test_agg = test_last_etc.merge(test_agg, left_index=True, right_index=True, how="inner")
# Release the intermediate frame now that it has been merged.
del test_last_etc

CPU times: user 524 ms, sys: 2.26 s, total: 2.78 s
Wall time: 5.71 s


In [15]:
%%time
# Build the moving-average feature frame through the project helper get_ma_df.
test_ma_df = get_ma_df(raw_test)

MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done
CPU times: user 14.6 s, sys: 26.6 s, total: 41.2 s
Wall time: 1min 1s


In [16]:
%%time
# Index-aligned inner join of the moving-average columns onto the feature frame.
test_agg = test_agg.merge(test_ma_df, left_index=True, right_index=True, how="inner")
# Release the intermediate frame now that it has been merged.
del test_ma_df

CPU times: user 683 ms, sys: 4.58 s, total: 5.26 s
Wall time: 12 s


In [17]:
# Numeric feature columns: every raw column that is neither categorical nor a
# non-feature column. The raw column order is preserved (instead of going
# through a set difference) so the list - and therefore the order in which the
# crossed features are appended below - is the same on every run; iteration
# order of a set of strings changes between Python processes.
excluded_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = list(
    dict.fromkeys(name for name in raw_test.columns if name not in excluded_columns)
)
# Numeric columns followed by the categorical ones, without duplicates.
all_columns = list(dict.fromkeys([*numeric_columns, *CATEGORY_COLUMNS]))

In [18]:
# Sanity check: dimensions before the crossed features are added.
test_agg.shape

(924621, 2265)

In [19]:
%%time
for col in tqdm(numeric_columns):
    test_agg[f"{col}_range"] = test_agg[f"{col}_max"] - test_agg[f"{col}_min"]
    test_agg[f"{col}_displacement"] = test_agg[f"{col}_last"] - test_agg[f"{col}_first"]
    test_agg[f"{col}_velocity"] = test_agg[f"{col}_displacement"] / np.log(test_agg["num_statements"])
    test_agg[f"{col}_sprint"] = test_agg[f"{col}_last"] - test_agg[f"{col}_second_last"]
    test_agg[f"{col}_previous_sprint"] = test_agg[f"{col}_second_last"] - test_agg[f"{col}_third_last"]
    test_agg[f"{col}_acceleration"] = test_agg[f"{col}_sprint"] / (test_agg[f"{col}_previous_sprint"] * test_agg[f"{col}_std"]).replace(
        [np.inf, -np.inf], np.nan
    )
    test_agg[f"{col}_last_minus_avg"] = test_agg[f"{col}_last"] - test_agg[f"{col}_avg"]
    test_agg[f"{col}_coef_var"] = (test_agg[f"{col}_std"] / test_agg[f"{col}_avg"]).replace([np.inf, -np.inf], np.nan)
    test_agg[f"{col}_ma3_r1_r2"] = test_agg[f"{col}_ma3_r1"] / test_agg[f"{col}_ma3_r2"]
    test_agg[f"{col}_ma3_r1_r3"] = test_agg[f"{col}_ma3_r1"] / test_agg[f"{col}_ma3_r3"]
    test_agg[f"{col}_ma3_r1_r4"] = test_agg[f"{col}_ma3_r1"] / test_agg[f"{col}_ma3_r4"]
    gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 185/185 [00:24<00:00,  7.60it/s]

CPU times: user 17.4 s, sys: 4.32 s, total: 21.7 s
Wall time: 24.4 s





In [20]:
%%time
test_agg = test_agg.reset_index().rename(columns={"index": "customer_ID"})
test_agg = test_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

CPU times: user 5.81 s, sys: 49.4 s, total: 55.2 s
Wall time: 3min 31s


In [21]:
# Sanity check: dimensions of the final test frame.
test_agg.shape

(924621, 4300)

In [22]:
# Count how many columns there are of each dtype.
test_agg.dtypes.value_counts()

float64     2184
float32     1843
int8         198
int16         30
category      12
category      12
category       4
category       4
category       4
category       4
category       4
int64          1
dtype: int64

In [23]:
# Persist the processed test features as a pickle.
test_agg.to_pickle(f"{PROCESSED_DATA_PATH}/test_agg_complete2.pkl")