In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols, insert_row_number
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    feature_gen_pipeline,
    filter_df_for_feature, 
    get_specific_row_df, 
    get_agg_df, get_ma_df
)
from utils.impute_helpers import impute_col

In [3]:
# Project directories, relative to the notebook's working directory.
# NOTE(review): RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from
# utils.constants earlier in the notebook; the assignment here rebinds them —
# confirm which definition is meant to win.
(
    RAW_DATA_PATH,
    PROCESSED_DATA_PATH,
    EVALUATION_DATA_PATH,
    MODELS_PATH,
    EXP_PATH,
) = (
    "../raw_data",
    "../processed_data",
    "../evaluation_data",
    "../models",
    "../experiments",
)

In [4]:
# Sub-folders of RAW_DATA_PATH holding the raw train/test extracts.
RAW_TRAIN_PICKLE_PATH, RAW_TEST_PICKLE_PATH, RAW_TRAIN_PARQUET_PATH = (
    os.path.join(RAW_DATA_PATH, folder)
    for folder in ("train_pickle", "test_pickle", "train_parquet")
)

In [5]:
%load_ext autoreload
%autoreload

## Feature Engineering on Train

### Read master data & train labels

In [6]:
%%time
# Load the raw training table (parquet) and the label table (csv) through the
# project's read_file helper; the "Shape of data" lines in the output appear
# to be printed by that helper.
raw_train = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train_bdpsr.parquet")
labels = read_file(f"{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (5531451, 189)
Shape of data: (458913, 2)
CPU times: user 5.68 s, sys: 4.61 s, total: 10.3 s
Wall time: 5.89 s


In [8]:
# Cast the categorical feature columns to pandas' "category" dtype.
# Assign column by column instead of through `.loc[:, cols] = ...`: a
# full-slice `.loc` assignment writes the new values into the existing
# columns (keeping their old dtype where it can), so the cast can be lost
# without any error. Plain column assignment replaces the column, dtype
# included.
for category_col in CATEGORY_COLUMNS:
    raw_train[category_col] = raw_train[category_col].astype("category")

In [10]:
%%time
# Project helper, called for its effect on raw_train (return value unused).
# NOTE(review): a later cell filters on a "row_number" column with value 1 to
# pick the "last" row per customer — presumably this helper adds that column
# in place, numbered from the most recent statement; confirm in
# utils.eda_helpers.
insert_row_number(raw_train)

Done insertion
CPU times: user 10.7 s, sys: 177 ms, total: 10.9 s
Wall time: 10.9 s


In [11]:
# Most recent value of every feature: select rows through the "row_number"
# column (value 1) and suffix the feature names with "last".
# NOTE(review): exact filtering/renaming semantics live in
# utils.feature_engineering_helpers.filter_df_for_feature — confirm there.
last_df = filter_df_for_feature(raw_train, cond_col="row_number", equal_to=1, rename_suffix="last")

In [12]:
# Eyeball the result: one row per customer_ID, columns named <feature>_last.
last_df

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_last,D_137_last,D_138_last,D_139_last,D_140_last,D_141_last,D_142_last,D_143_last,D_144_last,D_145_last
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,0.934745,0,0.01,1.00,0.01,0.135,0.0,0.01,,,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,0.880519,6,0.03,1.00,0.01,0.166,0.0,0.01,,0.061,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.880875,0,0.00,0.81,0.01,,0.0,0.01,,,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,0.621776,0,0.01,1.00,0.01,0.288,0.0,0.01,,0.046,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,0.871900,0,0.01,0.82,0.00,,0.0,0.01,,0.045,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff41c8a52833b56430603969b9ca48d208e7c192c6a4081a6acc28cf4f8af7,0.844229,15,0.03,1.00,0.00,0.129,0.0,0.01,,0.113,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
ffff518bb2075e4816ee3fe9f3b152c57fc0e6f01bf7fdd3e5b57cfcbee30286,0.831279,1,0.29,0.06,0.01,,0.0,0.23,,0.135,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
ffff9984b999fccb2b6127635ed0736dda94e544e67e026eee4d20f680639ff6,0.800522,9,0.02,1.00,0.00,0.067,0.0,0.01,,0.050,...,,,,0.0,0.0,0.000000,,0.0,0.1,0.0
ffffa5c46bc8de74f5a4554e74e239c8dee6b9baf388145b2c3d01967fcce461,0.754129,0,0.02,0.71,0.00,0.409,0.0,0.05,,0.046,...,,,,1.0,0.0,0.949723,0.45,1.0,0.1,2.0


In [20]:
# Pairwise correlations between the "last statement" features.
# Restrict to numeric/bool columns explicitly: non-numeric columns cannot be
# correlated, and pandas >= 2.0 may raise on them instead of silently
# dropping them as older versions did.
corr_df = last_df.select_dtypes(include=["number", "bool"]).corr()

In [31]:
# Base feature names used as numerators of the pairwise ratio features built
# further down (<i>_last / <d>_last), grouped by feature family.
# NOTE(review): the name suggests features expected to increase with risk —
# confirm the selection criterion.
increase = [
    # B_*
    "B_1", "B_3", "B_4", "B_7", "B_9", "B_11", "B_16", "B_20", "B_21", "B_23",
    "B_24", "B_28", "B_29", "B_40", "B_41",
    # D_*
    "D_39", "D_41", "D_42", "D_43", "D_48", "D_53", "D_55", "D_58", "D_59",
    "D_61", "D_72", "D_111", "D_113", "D_124", "D_136", "D_145",
    # S_*
    "S_3", "S_7", "S_9",
    # R_*
    "R_1", "R_11",
]

In [32]:
# Base feature names used as denominators of the pairwise ratio features built
# further down (<i>_last / <d>_last), grouped by feature family.
# NOTE(review): the name suggests features expected to decrease with risk —
# confirm the selection criterion.
decrease = [
    # B_*
    "B_2", "B_6", "B_10", "B_18", "B_42",
    # D_*
    "D_47", "D_50", "D_51", "D_54", "D_56", "D_62", "D_76", "D_77", "D_91",
    "D_112", "D_115", "D_132",
    # P_*
    "P_2", "P_3",
]

In [33]:
# Second "increase" list, grouped by feature family. It is not referenced in
# the visible part of this notebook.
# NOTE(review): the "_bin" suffix presumably marks binary-like features —
# confirm.
increase_bin = [
    # B_*
    "B_8", "B_32", "B_36",
    # D_*
    "D_103", "D_116", "D_120", "D_130", "D_135", "D_137", "D_139", "D_140",
    "D_143",
    # S_*
    "S_20", "S_26",
    # R_*
    "R_2", "R_4", "R_7", "R_15", "R_19", "R_21", "R_22", "R_23", "R_24",
    "R_25", "R_28",
]

In [34]:
# Second "decrease" list, grouped by feature family. It is not referenced in
# the visible part of this notebook.
# NOTE(review): the "_bin" suffix presumably marks binary-like features —
# confirm.
decrease_bin = [
    # B_*
    "B_31", "B_33",
    # D_*
    "D_66", "D_86", "D_93", "D_94", "D_96", "D_109", "D_114", "D_127",
    "D_128", "D_129",
    # S_*
    "S_6", "S_18",
    # R_*
    "R_12",
]

In [35]:
# Sizes of the two lists; their product (36 * 19 = 684) is the number of
# ratio features generated from them.
len(increase), len(decrease)

(36, 19)

In [36]:
from itertools import product

In [37]:
# Build one ratio feature per (increase, decrease) pair:
#     <i>_<d>_ratio = <i>_last / <d>_last
# The columns are computed first and attached with a single concat, because
# inserting several hundred columns one at a time fragments the DataFrame
# (pandas' PerformanceWarning, hidden here by the global warnings filter).
# Materialising the pairs as a list also lets tqdm show a real progress bar.
# A zero denominator yields +/-inf; those values are left as they are.
ratio_pairs = list(product(increase, decrease))
ratio_features = {
    f"{i}_{d}_ratio": last_df[f"{i}_last"] / last_df[f"{d}_last"]
    for i, d in tqdm(ratio_pairs)
}
# Dropping ratio columns left over from a previous run keeps the cell
# re-runnable without producing duplicate column names.
last_df = pd.concat(
    [
        last_df.drop(columns=list(ratio_features), errors="ignore"),
        pd.DataFrame(ratio_features),
    ],
    axis=1,
)

684it [00:01, 657.29it/s] 


In [38]:
# Attach the target by customer_ID rather than by row position, so that a
# difference in row order between `labels` and `last_df` cannot silently pair
# customers with the wrong label.
# NOTE(review): assumes `labels` has a "customer_ID" column matching the
# customer_ID index of last_df — confirm against train_labels.csv.
target_by_customer = labels.set_index("customer_ID")["target"]
last_df["target"] = target_by_customer.reindex(last_df.index).to_numpy()
assert last_df["target"].notna().all(), "some customers in last_df have no label"


In [39]:
# Size of last_df as reported by sys.getsizeof (bytes), scaled to GB (1e9).
sys.getsizeof(last_df) / 1e9

2.094481072

In [40]:
# (rows, columns) after adding the ratio features and the target.
last_df.shape

(458913, 871)

In [41]:
# Persist the engineered training frame under PROCESSED_DATA_PATH.
last_df.to_pickle(f"{PROCESSED_DATA_PATH}/train_last.pkl")

## Feature Engineering on Test

In [6]:
%%time
# Load the raw test table from its pickle through the project's read_file
# helper.
# NOTE(review): the execution counters restart here, so this section was run
# in a separate kernel session from the train section. The file is a pickle —
# only load files produced by this project.
raw_test = read_file(f"{RAW_TEST_PICKLE_PATH}/raw_test_data.pkl")

Shape of data: (11363762, 192)
CPU times: user 3.45 s, sys: 4.06 s, total: 7.51 s
Wall time: 10.1 s


In [7]:
%%time
# Per-customer aggregates from the project helper; per its log output it
# computes average, minimum, maximum and standard deviation (the displayed
# columns carry _avg ... _std suffixes and the index is customer_ID).
test_agg = get_agg_df(raw_test)

Average done
Minimum done
Maximum done
Standard Deviation done
CPU times: user 1min 25s, sys: 25.5 s, total: 1min 50s
Wall time: 2min 4s


In [8]:
%%time
# On each row where row_number == 1, row_number + row_number_inv - 1 is taken
# as that customer's statement count.
# NOTE(review): presumably row_number counts from the latest statement and
# row_number_inv from the earliest — confirm where these columns are created.
# The values are assigned positionally, so this relies on the filtered rows
# of raw_test being in the same customer order as test_agg — TODO confirm.
is_latest_row = raw_test["row_number"] == 1
position_counters = raw_test.loc[is_latest_row, ["row_number", "row_number_inv"]]
test_agg["num_statements"] = (position_counters.sum(axis=1) - 1).values

CPU times: user 348 ms, sys: 2.15 s, total: 2.5 s
Wall time: 5.78 s


In [9]:
# Peek at the first rows of the aggregate frame (now including num_statements).
test_agg.head(3)

Unnamed: 0_level_0,D_61_avg,D_139_avg,S_18_avg,D_75_avg,D_141_avg,D_145_avg,D_138_avg,D_96_avg,B_29_avg,B_28_avg,...,B_31_std,B_36_std,D_131_std,R_18_std,D_42_std,D_91_std,R_28_std,S_5_std,B_23_std,num_statements
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.597552,0.0,0.0,3.0,0.0,0.0,,0.0,0.003894,0.14522,...,0.0,0.0,0.0,0.0,0.006688,0.0,0.0,0.00365,0.331327,9
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.070053,0.0,0.0,0.0,0.0,0.0,,0.461538,,0.03378,...,0.0,0.0,0.0,0.0,,0.518875,0.0,0.016564,0.016254,13
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.317425,1.0,0.0,0.615385,0.887847,1.0,,0.0,0.002994,0.147394,...,0.0,0.0,0.0,0.0,,0.0,0.0,0.009604,0.03296,13


In [10]:
# (customers, aggregate columns + num_statements).
test_agg.shape

(924621, 717)

In [11]:
%%time
# Per-customer values taken from specific statements; per the helper's log
# output these are the last, first, second-last and third-last entries.
test_last_etc = get_specific_row_df(raw_test)

Last entry done
First entry done
Second last entry done
Third last entry done
CPU times: user 4.97 s, sys: 12.4 s, total: 17.4 s
Wall time: 26 s


In [12]:
# Peek at the specific-row features (columns suffixed _last ... _first).
test_last_etc.head(3)

Unnamed: 0_level_0,P_2_last,D_39_last,B_1_last,B_2_last,R_1_last,S_3_last,D_41_last,B_3_last,D_42_last,D_43_last,...,D_136_first,D_137_first,D_138_first,D_139_first,D_140_first,D_141_first,D_142_first,D_143_first,D_144_first,D_145_first
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.56893,4,0.010779,1.009347,0,0.149413,0.0,0.003576,0.103745,0.007398,...,,,,,0.0,,,,0.008281,
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.841177,4,0.016562,1.009245,0,0.112195,0.0,0.011386,,,...,,,,0.0,0.0,0.0,,0.0,0.008436,0.0
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.697522,0,0.001484,0.810072,0,0.166165,0.0,0.015938,,0.105303,...,,,,1.0,0.0,0.886598,0.103164,1.0,0.322121,1.0


In [13]:
%%time
# Combine the specific-row features (left) with the aggregates (right) on
# their indexes, keeping only customers present in both, then release the
# intermediate frame.
test_agg = pd.merge(
    test_last_etc,
    test_agg,
    how="inner",
    left_index=True,
    right_index=True,
)
del test_last_etc

CPU times: user 456 ms, sys: 1.87 s, total: 2.32 s
Wall time: 4.98 s


In [14]:
%%time
# Moving-average features from the project helper; its log output reports
# "MA3" for recency 1 through 4 (columns <feature>_ma3_r1 ... _ma3_r4 are
# used further down).
test_ma_df = get_ma_df(raw_test)

MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done
CPU times: user 13.8 s, sys: 22.1 s, total: 35.9 s
Wall time: 52.2 s


In [15]:
%%time
# Append the moving-average features to the running feature frame (index
# join, customers present in both), then release the intermediate frame.
test_agg = pd.merge(
    test_agg,
    test_ma_df,
    how="inner",
    left_index=True,
    right_index=True,
)
del test_ma_df

CPU times: user 543 ms, sys: 4.02 s, total: 4.56 s
Wall time: 17.5 s


In [16]:
# Base feature names to derive features from: every raw column that is
# neither categorical nor a non-feature column.
# Sort the results: iterating a set of strings follows hash order, which
# changes between kernel sessions, so an unsorted list would make the order
# in which derived columns are created (and saved) irreproducible.
excluded_columns = set(CATEGORY_COLUMNS) | set(NON_FEATURE_COLUMNS)
numeric_columns = sorted(set(raw_test.columns) - excluded_columns)
all_columns = sorted(set(numeric_columns) | set(CATEGORY_COLUMNS))
# The raw frame is no longer needed once the column names are captured.
del raw_test

In [17]:
# (customers, columns) after merging specific-row and moving-average features.
test_agg.shape

(924621, 2185)

In [18]:
%%time
# Derive twelve extra features per numeric base column from the aggregate,
# specific-row and moving-average columns already present in test_agg.
#
# Fix: in `_acceleration` the inf -> NaN replacement used to be applied to the
# denominator only (`a / (b * c).replace(...)`), so a zero denominator still
# produced +/-inf in the result. It now wraps the whole quotient, in the same
# way `_coef_var` already does.
# NOTE(review): if the training features were generated with the old
# expression, regenerate them so train and test stay consistent.
inf_values = [np.inf, -np.inf]
num_statements = test_agg["num_statements"]
# Loop-invariant, so computed once instead of once per column.
log_num_statements = np.log(num_statements)

for col in tqdm(numeric_columns):
    # Spread and net change over the customer's history.
    test_agg[f"{col}_range"] = test_agg[f"{col}_max"] - test_agg[f"{col}_min"]
    test_agg[f"{col}_displacement"] = test_agg[f"{col}_last"] - test_agg[f"{col}_first"]
    test_agg[f"{col}_velocity"] = test_agg[f"{col}_displacement"] / log_num_statements

    # Change over the most recent step and over the step before it.
    test_agg[f"{col}_sprint"] = test_agg[f"{col}_last"] - test_agg[f"{col}_second_last"]
    test_agg[f"{col}_previous_sprint"] = test_agg[f"{col}_second_last"] - test_agg[f"{col}_third_last"]
    test_agg[f"{col}_acceleration"] = (
        test_agg[f"{col}_sprint"] / (test_agg[f"{col}_previous_sprint"] * test_agg[f"{col}_std"])
    ).replace(inf_values, np.nan)

    # Latest value and dispersion relative to the customer's average.
    test_agg[f"{col}_last_minus_avg"] = test_agg[f"{col}_last"] - test_agg[f"{col}_avg"]
    test_agg[f"{col}_coef_var"] = (test_agg[f"{col}_std"] / test_agg[f"{col}_avg"]).replace(inf_values, np.nan)

    # Ratios of the first moving-average column to the other three, and a
    # trend figure normalised by the number of statements.
    test_agg[f"{col}_ma3_r1_r2"] = test_agg[f"{col}_ma3_r1"] / test_agg[f"{col}_ma3_r2"]
    test_agg[f"{col}_ma3_r1_r3"] = test_agg[f"{col}_ma3_r1"] / test_agg[f"{col}_ma3_r3"]
    test_agg[f"{col}_ma3_r1_r4"] = test_agg[f"{col}_ma3_r1"] / test_agg[f"{col}_ma3_r4"]
    test_agg[f"{col}_general_trend"] = 100 * (test_agg[f"{col}_ma3_r1"] - test_agg[f"{col}_ma3_r4"]) / num_statements

# One collection pass after the loop; the per-column temporaries are released
# by reference counting, so collecting on every iteration only cost time.
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 179/179 [00:28<00:00,  6.33it/s]

CPU times: user 19 s, sys: 4.79 s, total: 23.8 s
Wall time: 28.3 s





In [19]:
# Force a garbage-collection pass; the displayed number is the count of
# unreachable objects found.
gc.collect()

18

In [20]:
%%time
# Move the index into a regular column, naming it "customer_ID" if
# reset_index produced the default "index" name, then remove any non-feature
# columns that are present (missing ones are ignored).
# NOTE(review): the column count stays at 4333 although reset_index adds a
# column, so exactly one column is dropped here — presumably customer_ID
# itself is listed in NON_FEATURE_COLUMNS; confirm that the saved frame is
# meant to have no customer identifier.
test_agg = test_agg.reset_index().rename(columns={"index": "customer_ID"})
test_agg = test_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

CPU times: user 4.96 s, sys: 43.1 s, total: 48.1 s
Wall time: 3min 13s


In [21]:
# Final (customers, columns) of the test feature frame.
test_agg.shape

(924621, 4333)

In [22]:
# Number of columns per dtype. The repeated "category" rows in the output are
# distinct categorical dtypes (different category sets), counted separately.
test_agg.dtypes.value_counts()

float64     2404
float32     1634
int8         228
int16         30
category      12
category      12
category       4
category       4
category       4
int64          1
dtype: int64

In [23]:
%%time
# Persist the engineered test frame under PROCESSED_DATA_PATH.
test_agg.to_pickle(f"{PROCESSED_DATA_PATH}/test_agg_complete2.pkl")

CPU times: user 4.28 ms, sys: 9.05 s, total: 9.05 s
Wall time: 44.4 s
