In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, plot_target_check,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [3]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [4]:
%load_ext autoreload
%autoreload

In [5]:
# Start pandarallel with a fixed pool of 16 workers, no progress bar, and
# pipe-based data transfer instead of the in-memory filesystem.
pandarallel.initialize(
    nb_workers=16,
    progress_bar=False,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [6]:
# Wall-clock start of the run (seconds since the epoch).
# NOTE(review): START is not read again in the visible cells — confirm it is still needed.
START = time.time()

### Read Data

In [7]:
# Training labels via the project's read_file helper; the logged shape is
# (458913, 2). The "target" column is attached to the aggregated train frame later.
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [8]:
%%time
# Load the interim v6 parquet files: one train file and the test set in two parts.
# Logged shapes: train (5531451, 186), test1 (5681079, 185), test2 (5682683, 185).
train = read_file(f"../{INTERIM_DATA_PATH}/v6/train_parquet/train_all_variables.parquet")
test1 = read_file(f"../{INTERIM_DATA_PATH}/v6/test_parquet/test_all_variables1.parquet")
test2 = read_file(f"../{INTERIM_DATA_PATH}/v6/test_parquet/test_all_variables2.parquet")

Shape of data: (5531451, 186)
Shape of data: (5681079, 185)
Shape of data: (5682683, 185)
CPU times: user 16.8 s, sys: 18.8 s, total: 35.6 s
Wall time: 20.8 s


In [9]:
# References (not copies) to the three frames, so the feature-engineering
# loops below can add columns to each of them in place.
df_list = [train, test1, test2]

### Feature Crossing

In [10]:
# Hand-crafted cross features (ratios and signed sums of raw columns), added
# in place to the train frame and both test frames. Results are rounded to
# 1 or 2 decimals.
# NOTE(review): only B_9323 and R_324 replace a zero denominator (with 0.005);
# the other ratios can produce +/-inf or NaN when the denominator is 0 or
# missing — confirm that is intended.
for df in df_list:
    df["DP_392"] = (df["D_39"] / -df["P_2"]).round(1)
    df["BP_92"] = (-df["P_2"] / df["B_9"]).round(1)
    # Assigned exactly once (a duplicated identical assignment was dropped).
    df["PR_21"] = (-df["P_2"] / df["R_1"]).round(1)
    df["B_9323"] = (df["B_9"] / df[["B_3", "B_23"]].sum(axis=1).replace(0, 0.005)).round(2)
    df["B_331825"] = (-df["B_33"] - df["B_18"] + df["S_25"]).round(1)
    df["B_19204"] = (df["B_19"] - df["B_20"] + df["B_4"]).round(1)
    df["R_324"] = (df["R_3"] / (df["R_2"] + df["R_4"]).replace(0, 0.005)).round(2)
    df["DP_483"] = (df["D_48"] / df["P_3"]).round(2)
    df["DP_553"] = (df["D_55"] / df["P_3"]).round(2)
    df["DP_394"] = (df["D_39"] / df["P_4"]).round(2)
    df["BP_94"] = (df["B_9"] / df["P_4"]).round(2)
    # PR_41 must not duplicate BP_94 (B_9 / P_4). Following the PR_21 naming
    # pattern (P_2 over R_1) it is built from P_4 and R_1.
    # NOTE(review): formula inferred from the column name — confirm the intended definition.
    df["PR_41"] = (df["P_4"] / df["R_1"]).round(2)

In [13]:
# NOTE(review): `missing_columns` is not defined in any visible cell (the
# execution counts jump from 10 to 13), so this cell raises NameError on a
# fresh "Restart & Run All" unless the defining cell is restored.
len(missing_columns)

139

In [14]:
# train.to_parquet(f"./train_imputed.parquet")

In [15]:
# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

1064

In [17]:
# Slice of the train frame restricted to the columns returned by
# get_cols(train, "B_") — presumably the columns whose name contains "B_";
# verify against the helper. This includes the B_* cross features created above.
temp = train.loc[:, get_cols(train, "B_")]

In [26]:
# Names of the columns in `temp` that have more than 150 distinct non-null
# values. The per-column distinct count is computed once and reused for both
# the mask and the selection, because it is expensive on a multi-million-row frame.
balance_nunique = temp.nunique()
more_unique_balance_cols = balance_nunique[balance_nunique > 150].index.tolist()

In [29]:
# Keep only column names of at most 4 characters.
# NOTE(review): presumably this is meant to keep raw columns such as "B_1".."B_40"
# and drop the longer cross-feature names created earlier (e.g. "B_9323",
# "B_19204") — confirm; filtering on name length is brittle.
diff_cols = [col for col in more_unique_balance_cols if len(col) <= 4]

### Calculate diff for each column

In [30]:
# Per-customer first difference of every selected column, stored as "<col>_diff".
# The first row of each customer has no predecessor and therefore gets NaN.
# NOTE(review): assumes rows are already in chronological order within each
# customer_ID — confirm upstream.
for df in df_list:
    # Build the groupby once per frame so the customer_ID grouping is reused
    # for every column instead of being recomputed on each iteration.
    # The progress bar now restarts for each frame.
    by_customer = df.groupby("customer_ID")
    for col in tqdm(diff_cols):
        df[col + "_diff"] = by_customer[col].diff()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [14:11<00:00, 53.24s/it]


### Transform Train Set

In [31]:
%%time
# Aggregate the row-level train frame with feature_gen_pipeline (imported from
# feature_engineering_helpers). Its log lists the stages it ran: average, min,
# max, standard deviation, last/first/second-last/third-last entry and moving
# averages. The result has 458913 rows, the same as the labels file.
# NOTE(review): `keep_column` is not used in the visible cells and is rebound
# by the later test run — confirm whether it is needed.
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [01:01<00:00,  3.20it/s]

CPU times: user 1min 30s, sys: 44.4 s, total: 2min 14s
Wall time: 2min 34s





In [32]:
# Attach the labels positionally: `.values` bypasses pandas index alignment.
# NOTE(review): this is only correct if train_agg rows are in exactly the same
# customer order as train_labels.csv — confirm, or join on the customer key.
train_agg["target"] = labels["target"].values
# Report the in-memory size (bytes converted to GB) and the shape.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 16.268403197 GB, Shape: (458913, 5788)


In [33]:
# Checkpoint of the aggregated train frame before the convert/clip/round steps.
# NOTE(review): the same path is written again after those steps, so this
# copy is overwritten — confirm the intermediate write is still wanted.
train_agg.to_parquet("./train_agg.parquet")

In [34]:
# Apply convert_all (from feature_engineering_helpers); its progress bar covers 3066 items.
# The name is rebound, so re-running this cell feeds already-converted data back in.
# NOTE(review): presumably a dtype down-cast — verify in the helper.
train_agg = convert_all(train_agg)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3066/3066 [00:39<00:00, 78.36it/s]


In [35]:
# Apply clip_all (from feature_engineering_helpers); it logs two passes (579 and 274 items).
# NOTE(review): which columns are clipped and to what bounds is decided inside
# the helper — verify that the test set receives the same treatment.
train_agg = clip_all(train_agg)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 579/579 [00:12<00:00, 44.64it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:05<00:00, 47.27it/s]


In [36]:
# Apply round_all (from feature_engineering_helpers); its progress bar covers 5744 items.
# NOTE(review): the rounding precision is defined inside the helper — not visible here.
train_agg = round_all(train_agg)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5744/5744 [00:21<00:00, 265.05it/s]


In [37]:
# Size and shape after convert/clip/round: the logged size is about 10.8 GB
# (down from about 16.3 GB) with the shape unchanged.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 10.833037625 GB, Shape: (458913, 5788)


In [38]:
# (rows, columns) of the final train frame; repeats the shape printed just above.
train_agg.shape

(458913, 5788)

In [39]:
# Persist the final (converted, clipped, rounded) train features; this
# replaces the file written before those steps.
train_agg.to_parquet("./train_agg.parquet")

In [40]:
# `del` only unbinds names. df_list still holds a reference to the row-level
# train frame, so that entry is removed first (matched by identity, because
# `==` on DataFrames is element-wise); otherwise the memory cannot be reclaimed.
# NOTE(review): `temp` still holds a slice of train's B_ columns — delete it too if unused.
df_list[:] = [frame for frame in df_list if frame is not train]
del train_agg, train
gc.collect();

### Transform Test

In [43]:
# Stack the two test parts into one frame with a fresh 0..n-1 index.
# The inputs stay alive until they are released, so peak memory briefly
# holds both the parts and the combined frame.
test = pd.concat([test1, test2], ignore_index=True)

In [44]:
# The two parts now live on inside `test`. Release every remaining reference
# so the originals can actually be freed: the entries in df_list, the loop
# variable `df` left bound to the last frame by the earlier `for df in df_list`
# loops, and the names themselves.
df_list.clear()
df = None
del test1, test2
gc.collect();

In [45]:
%%time
# Same aggregation pipeline as for train, applied to the combined test frame;
# the logged result has 924621 rows and 5787 columns (train has one more: target).
# NOTE(review): this rebinds `keep_column` from the train run — confirm the two
# are expected to match, since nothing here checks it.
test_agg, keep_column = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [01:14<00:00,  2.66it/s]

CPU times: user 2min 11s, sys: 1min 47s, total: 3min 58s
Wall time: 5min





In [46]:
# In-memory size (bytes converted to GB) and shape before convert/clip/round.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 32.792222825 GB, Shape: (924621, 5787)


In [47]:
# Checkpoint of the aggregated test frame before the convert/clip/round steps.
# NOTE(review): the same path is written again after those steps, so this
# copy is overwritten — confirm the intermediate write is still wanted.
test_agg.to_parquet("./test_agg.parquet")

In [48]:
# Apply convert_all to the test features. Its progress bar covers 3072 items
# here versus 3066 for train.
# NOTE(review): presumably the helper selects columns from the data itself —
# verify that train and test end up with matching dtypes.
test_agg = convert_all(test_agg)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3072/3072 [01:23<00:00, 36.74it/s]


In [49]:
# Apply clip_all to the test features. The two passes cover 590 and 288 items
# here versus 579 and 274 for train.
# NOTE(review): the differing counts suggest the clipped column set (and maybe
# the bounds) is derived from each frame separately — confirm train and test
# are transformed consistently.
test_agg = clip_all(test_agg)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 590/590 [00:26<00:00, 22.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 288/288 [00:11<00:00, 24.35it/s]


In [50]:
# Apply round_all to the test features; its progress bar covers 5743 items
# (one fewer than train, which also carries the target column).
test_agg = round_all(test_agg)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5743/5743 [00:47<00:00, 121.17it/s]


In [51]:
# Size and shape after convert/clip/round: the logged size is about 21.8 GB
# (down from about 32.8 GB) with the shape unchanged.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 21.833614733 GB, Shape: (924621, 5787)


In [52]:
# (rows, columns) of the final test frame; repeats the shape printed just above.
test_agg.shape

(924621, 5787)

In [53]:
# Persist the final (converted, clipped, rounded) test features; this
# replaces the file written before those steps.
test_agg.to_parquet("./test_agg.parquet")