In [29]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [76]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, check_psi,
    get_cols, insert_row_number, plot_train_test_distribution, plot_target_check
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import split_public_private

In [77]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
pandarallel.initialize(nb_workers=16, progress_bar=False, use_memory_fs=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [33]:
START = time.time()

### Read Data

In [34]:
%%time
train = read_file(f"./train_parquet/train_all_variables.parquet")
test = read_file(f"./test_parquet/test_all_variables.parquet")

Shape of data: (5531451, 159)
Shape of data: (11363762, 157)
CPU times: user 46.8 s, sys: 29.8 s, total: 1min 16s
Wall time: 8.76 s


In [35]:
%%time
public_test, private_test = split_public_private(test)

Public size: 5719469, Private size: 5644293
CPU times: user 7.86 s, sys: 1.84 s, total: 9.7 s
Wall time: 9.58 s


In [36]:
train.shape, public_test.shape, private_test.shape

((5531451, 159), (5719469, 157), (5644293, 157))

In [37]:
gc.collect()

574

In [68]:
df_list = [train, public_test, private_test]

In [39]:
# for df in df_list:
#     df["D_135"] = df["D_135"].fillna(-1)
#     df["D_137"] = df["D_137"].fillna(-1)

In [40]:
train = train.drop(columns="temp", errors="ignore")

In [41]:
for col in train.columns[5:]:
    if train[col].min() != public_test[col].min():
        print(col, train[col].min(), public_test[col].min())

In [42]:
for col in train.columns[5:]:
    if train[col].min() != private_test[col].min():
        print(col, train[col].min(), private_test[col].min())

In [43]:
for col in train.columns[5:]:
    if train[col].max() != public_test[col].max():
        print(col, train[col].max(), public_test[col].max())

In [44]:
for col in train.columns[5:]:
    if train[col].max() != private_test[col].max():
        print(col, train[col].max(), private_test[col].max())

In [48]:
for col in train.columns[5:]:
    train_pub, train_private = check_psi(df_list, col)
    if train_pub > 0.1 or train_private > 0.1:
        print(col, train_pub, train_private)

In [70]:
for col in CATEGORY_COLUMNS:
    a = set(train[col].unique()) - set(private_test[col].unique())
    if len(a) > 0:
        for df in df_list:
            df[col] = df[col].fillna(0)

### Feature Crossing

In [73]:
for df in tqdm(df_list):
    df["DP_392"] = (df["D_39"] / -df["P_2"]).round(1)
    df["BP_92"] = (-df["P_2"] / df["B_9"]).round(1)
    df["PR_21"] = (-df["P_2"] / df["R_1"]).round(1)
    df["B_9323"] = (df["B_9"] / df[["B_3", "B_23"]].sum(axis=1).replace(0, 0.005)).round(2)
    df["B_19204"] = (df["B_19"] - df["B_20"] + df["B_4"]).round(1)
    df["R_324"] = (df["R_3"] / (df["R_2"] + df["R_4"]).replace(0, 0.005)).round(2)
    df["DP_483"] = (df["D_48"] / df["P_3"]).round(2)
    df["DP_553"] = (df["D_55"] / df["P_3"]).round(2)
    df["DP_394"] = (df["D_39"] / df["P_4"]).round(2)
    df["BP_94"] = (df["B_9"] / df["P_4"]).round(2)
    df["PR_41"] = (df["B_9"] / df["P_4"]).round(2)

100%|██████████| 3/3 [00:07<00:00,  2.37s/it]


In [126]:
for df in tqdm(df_list):
    df["RD_744"] = df["R_7"] - df["D_44"]
    df["PB_33"] = df["P_3"] / df["B_3"].replace(0, 0.005)
    df["SR_261"] = df["S_26"] - df["R_1"]
    df["RB_111"] = df["R_10"] * df["B_11"]
    df["R_30"] = df["R_3"] - df["R_10"]
    df["PB_320"] = df["P_3"] / df["B_20"].replace(0, 0.005)

100%|██████████| 3/3 [00:00<00:00,  6.64it/s]


### Calculate diff for each column

In [134]:
temp = train.loc[:, get_cols(train, "B_")]
more_unique_balance_cols = temp.nunique()[temp.nunique() > 150].index.tolist()
diff_cols = [col for col in more_unique_balance_cols if len(col) <= 3]

In [135]:
len(diff_cols)

4

In [136]:
for col in tqdm(diff_cols):
    for df in df_list:
        df[col + "_diff"] = df.groupby("customer_ID")[col].diff()

100%|██████████| 4/4 [14:30<00:00, 217.64s/it]


In [137]:
%%time
train = df_list[0]
test = pd.concat(df_list[1:], axis=0)
test = test.sort_values(by=["customer_ID", "S_2"]).reset_index(drop=True)

CPU times: user 16.3 s, sys: 6.31 s, total: 22.6 s
Wall time: 22.6 s


In [144]:
train.drop(columns="temp", errors="ignore", inplace=True)

In [145]:
set(train.columns) - set(test.columns)

{'target'}

In [146]:
train.to_parquet(f"./train_parquet/final_train.parquet")
test.to_parquet(f"./train_parquet/final_test.parquet")