In [54]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [55]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, 
    get_cols
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    filter_df_for_feature, get_specific_row_df, get_agg_df
)

In [56]:
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [57]:
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [58]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Feature Engineering on Train

In [None]:
%%time
raw_train = read_file(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")
labels = read_file(f"{RAW_DATA_PATH}/train_labels.csv")

In [None]:
%%time
train_agg = get_agg_df(raw_train)

In [None]:
train_agg

In [None]:
lgbm_feature_imp = read_file(f"{PROCESSED_DATA_PATH}/lgbm_feature_imp_summary.csv")

In [49]:
global_threshold = 20
features_dict = {}
for column in lgbm_feature_imp.columns[1:]:
    features_dict[column] = lgbm_feature_imp.loc[lgbm_feature_imp[column] >= global_threshold]["feature"].tolist()

In [41]:
lgbm_feature_imp["first"].mean() / lgbm_feature_imp.mean().mean()

1.4942603446184535

In [37]:
lgbm_feature_imp["last"].mean() / lgbm_feature_imp.mean().mean()

1.03384708238144

In [29]:
np.percentile(lgbm_feature_imp["first"], 30)

0       75.2
1      105.2
2       74.0
3      115.0
4      133.6
       ...  
183    147.8
184      NaN
185    218.6
186      NaN
187     71.6
Name: first, Length: 188, dtype: float64

In [27]:
lgbm_feature_imp.describe()

Unnamed: 0,first,second_last,last,avg,max,min,range,std,displacement,sprint,previous_sprint,last_minus_avg,velocity
count,72.0,147.0,136.0,129.0,108.0,72.0,76.0,159.0,132.0,122.0,177.0,156.0,139.0
mean,142.961111,77.881633,98.911765,98.79845,99.988889,125.297222,115.692105,81.830189,70.851515,103.832787,71.821469,82.360256,73.528058
std,46.689549,68.400788,83.396413,69.701939,66.757213,66.607752,32.384679,65.464909,59.408641,87.508195,85.265922,68.919613,61.963931
min,52.8,0.0,0.8,4.2,0.8,6.0,58.0,0.8,0.2,0.0,0.0,0.4,0.0
25%,103.35,9.8,21.3,37.2,35.0,82.7,90.85,15.6,9.25,8.5,0.0,13.6,9.1
50%,143.7,76.8,88.1,93.0,105.9,131.4,115.8,82.4,71.6,137.4,13.4,77.4,59.4
75%,175.65,132.4,151.25,149.0,144.35,157.15,134.2,142.6,120.4,172.35,169.6,134.5,129.1
max,248.6,239.2,379.0,340.6,246.6,305.2,211.2,235.2,187.8,401.6,285.2,298.4,201.8


In [None]:
%%time
train_agg["num_statements"] = (
    raw_train.loc[raw_train["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

In [None]:
train_agg.head(3)

In [None]:
%%time
train_last_etc = get_specific_row_df(raw_train)

In [None]:
train_last_etc.head(3)

In [None]:
%%time
train_agg = train_last_etc.merge(train_agg, left_index=True, right_index=True, how="inner")
del train_last_etc

In [None]:
numeric_columns = list(set(raw_train.columns) - set(CATEGORY_COLUMNS) - set(NON_FEATURE_COLUMNS))
all_columns = list(set(numeric_columns).union(set(CATEGORY_COLUMNS)))

In [None]:
train_agg.shape

In [None]:
%%time
for col in tqdm(numeric_columns):
    train_agg[f"{col}_range"] = train_agg[f"{col}_max"] - train_agg[f"{col}_min"]
    train_agg[f"{col}_displacement"] = train_agg[f"{col}_last"] - train_agg[f"{col}_first"]
    train_agg[f"{col}_velocity"] = train_agg[f"{col}_displacement"] / np.log(train_agg["num_statements"])
    train_agg[f"{col}_sprint"] = train_agg[f"{col}_last"] - train_agg[f"{col}_second_last"]
    train_agg[f"{col}_previous_sprint"] = train_agg[f"{col}_second_last"] - train_agg[f"{col}_third_last"]
    train_agg[f"{col}_last_minus_avg"] = train_agg[f"{col}_last"] - train_agg[f"{col}_avg"]

In [None]:
# train_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [None]:
train_agg.shape

In [None]:
# backup = train_agg.copy()

#### Drop Average Columns

In [None]:
avg_cols = get_cols(train_agg, "_avg", excludes="last_minus")
avg_cols = [c for c in avg_cols if c.replace("_avg", "") not in MEAN_FEATURES]
train_agg = train_agg.drop(columns=avg_cols, errors="ignore")
train_agg.shape

#### Drop Minimum Columns

In [None]:
min_cols = get_cols(train_agg, "_min", excludes="last_minus")
min_cols = [c for c in min_cols if c.replace("_min", "") not in MIN_FEATURES]
train_agg = train_agg.drop(columns=min_cols, errors="ignore")
train_agg.shape

#### Drop Maximum Columns

In [None]:
max_cols = get_cols(train_agg, "_max", excludes="last_minus")
max_cols = [c for c in max_cols if c.replace("_max", "") not in MAX_FEATURES]
train_agg = train_agg.drop(columns=max_cols, errors="ignore")
train_agg.shape

#### Drop First Columns

In [None]:
first_cols = get_cols(train_agg, "_first", excludes="last_minus")
first_cols = [c for c in first_cols if c.replace("_first", "") not in FIRST_FEATURES]
train_agg = train_agg.drop(columns=first_cols, errors="ignore")
train_agg.shape

#### Drop Last Columns

In [None]:
last_cols = get_cols(train_agg, "_last", excludes=["last_minus", "second_last"])
last_cols = [c for c in last_cols if c.replace("_last", "") not in LAST_FEATURES]
train_agg = train_agg.drop(columns=last_cols, errors="ignore")
train_agg.shape

#### Drop Range Columns

In [None]:
range_cols = get_cols(train_agg, "_range", excludes=["last_minus", "second_last"])
range_cols = [c for c in range_cols if c.replace("_range", "") not in RANGE_FEATURES]
train_agg = train_agg.drop(columns=range_cols, errors="ignore")
train_agg.shape

#### Drop Noob Features (Based on previous experiment)

In [None]:
noob_features = pd.read_csv(f"{DROP_FEATURES_PATH}/noob_features.csv")["feature"].values.tolist()

In [None]:
train_agg = train_agg.drop(columns=noob_features, errors="ignore")
train_agg.shape

In [None]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# train_agg = train_agg.loc[:, feature_list]

In [None]:
train_agg = train_agg.reset_index().rename(columns={"index": "customer_ID"})

In [None]:
train_agg = train_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [None]:
train_agg["dummy"] = np.random.randn(train_agg.shape[0])

In [None]:
train_agg.shape

In [None]:
train_agg["target"] = labels["target"].values
train_agg.shape

In [None]:
train_agg.dtypes.value_counts()

In [None]:
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg_finer.pkl")

### Feature Engineering on Test

In [None]:
%%time
raw_test = read_file(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")

In [None]:
%%time
test_agg = get_agg_df(raw_test)

In [None]:
%%time
test_agg["num_statements"] = (
    raw_test.loc[raw_test["row_number"] == 1][["row_number", "row_number_inv"]].sum(axis=1) - 1
).reset_index(drop=True).values

In [None]:
test_agg.head(3)

In [None]:
%%time
test_last_etc = get_specific_row_df(raw_test)

In [None]:
test_last_etc.head(3)

In [None]:
%%time
test_agg = test_last_etc.merge(test_agg, left_index=True, right_index=True, how="inner")
del test_last_etc

In [None]:
numeric_columns = list(set(raw_test.columns) - set(CATEGORY_COLUMNS) - set(NON_FEATURE_COLUMNS))
all_columns = list(set(numeric_columns).union(set(CATEGORY_COLUMNS)))

In [None]:
test_agg.shape

In [None]:
%%time
for col in tqdm(numeric_columns):
    test_agg[f"{col}_range"] = test_agg[f"{col}_max"] - test_agg[f"{col}_min"]
    test_agg[f"{col}_displacement"] = test_agg[f"{col}_last"] - test_agg[f"{col}_first"]
    test_agg[f"{col}_velocity"] = test_agg[f"{col}_displacement"] / np.log(test_agg["num_statements"])
    test_agg[f"{col}_sprint"] = test_agg[f"{col}_last"] - test_agg[f"{col}_second_last"]
    test_agg[f"{col}_previous_sprint"] = test_agg[f"{col}_second_last"] - test_agg[f"{col}_third_last"]
    test_agg[f"{col}_last_minus_avg"] = test_agg[f"{col}_last"] - test_agg[f"{col}_avg"]

In [None]:
# test_agg[[f"{col}_previous_sprint", f"{col}_sprint"]]

In [None]:
test_agg.shape

In [None]:
# backup = test_agg.copy()

#### Drop Average Columns

In [None]:
avg_cols = get_cols(test_agg, "_avg", excludes="last_minus")
avg_cols = [c for c in avg_cols if c.replace("_avg", "") not in MEAN_FEATURES]
test_agg = test_agg.drop(columns=avg_cols, errors="ignore")
test_agg.shape

#### Drop Minimum Columns

In [None]:
min_cols = get_cols(test_agg, "_min", excludes="last_minus")
min_cols = [c for c in min_cols if c.replace("_min", "") not in MIN_FEATURES]
test_agg = test_agg.drop(columns=min_cols, errors="ignore")
test_agg.shape

#### Drop Maximum Columns

In [None]:
max_cols = get_cols(test_agg, "_max", excludes="last_minus")
max_cols = [c for c in max_cols if c.replace("_max", "") not in MAX_FEATURES]
test_agg = test_agg.drop(columns=max_cols, errors="ignore")
test_agg.shape

#### Drop First Columns

In [None]:
first_cols = get_cols(test_agg, "_first", excludes="last_minus")
first_cols = [c for c in first_cols if c.replace("_first", "") not in FIRST_FEATURES]
test_agg = test_agg.drop(columns=first_cols, errors="ignore")
test_agg.shape

#### Drop Last Columns

In [None]:
last_cols = get_cols(test_agg, "_last", excludes=["last_minus", "second_last"])
last_cols = [c for c in last_cols if c.replace("_last", "") not in LAST_FEATURES]
test_agg = test_agg.drop(columns=last_cols, errors="ignore")
test_agg.shape

#### Drop Range Columns

In [None]:
range_cols = get_cols(test_agg, "_range", excludes=["last_minus", "second_last"])
range_cols = [c for c in range_cols if c.replace("_range", "") not in RANGE_FEATURES]
test_agg = test_agg.drop(columns=range_cols, errors="ignore")
test_agg.shape

#### Drop Noob Features (Based on previous experiment)

In [None]:
noob_features = pd.read_csv(f"{DROP_FEATURES_PATH}/noob_features.csv")["feature"].values.tolist()

In [None]:
test_agg = test_agg.drop(columns=noob_features, errors="ignore")
test_agg.shape

In [None]:
# feature_list = (
#     [c + "_avg" for c in MEAN_FEATURES] +
#     [c + "_min" for c in MIN_FEATURES] +
#     [c + "_max" for c in MAX_FEATURES] +
#     [c + "_first" for c in FIRST_FEATURES] +
#     [c + "_last" for c in LAST_FEATURES] +
#     [c + "_range" for c in RANGE_FEATURES] +
#     [c + "_velocity" for c in VELOCITY_FEATURES] +
#     [c + "_speed" for c in SPEED_FEATURES] + 
#     [c for c in train.columns if c.endswith(("_lag1_diff", "_last_lift"))]
# )
# test_agg = test_agg.loc[:, feature_list]

In [None]:
test_agg = test_agg.reset_index().rename(columns={"index": "customer_ID"})

In [None]:
test_agg = test_agg.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

In [None]:
# test_agg["dummy"] = np.random.randn(test_agg.shape[0])

In [None]:
test_agg.shape

In [None]:
test_agg.dtypes.value_counts()

In [None]:
test_agg.to_pickle(f"{PROCESSED_DATA_PATH}/test_agg_finer.pkl")