In [39]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [40]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, check_psi, describe_all,
    get_cols, insert_row_number, plot_train_test_distribution, plot_target_check
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import split_public_private

In [41]:
%load_ext autoreload
# With no argument, %autoreload performs a single reload of imported modules right now.
# NOTE(review): `%autoreload 2` is the usual setting when modules should be reloaded
# before every cell execution — confirm which behaviour is intended.
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# Initialise pandarallel with 16 workers, no progress bar, and use_memory_fs=False
# (the log below reports pipe-based transfer between the main process and the workers).
pandarallel.initialize(nb_workers=16, progress_bar=False, use_memory_fs=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [43]:
# Wall-clock timestamp taken before the data is loaded.
START = time.time()

### Read Data

In [44]:
%%time
train = read_file(f"./train_parquet/train_all_variables.parquet")
test = read_file(f"./test_parquet/test_all_variables.parquet")

Shape of data: (5531451, 158)
Shape of data: (11363762, 157)
CPU times: user 14.5 s, sys: 18.5 s, total: 33 s
Wall time: 23.2 s


In [45]:
%%time
# Split the test frame into its public and private parts with the project helper
# (it prints both sizes, see the output below).
public_test, private_test = split_public_private(test)

Public size: 5719469, Private size: 5644293
CPU times: user 4.16 s, sys: 8.46 s, total: 12.6 s
Wall time: 18.4 s


In [46]:
# (rows, columns) of the three frames; train carries one more column than the test splits.
train.shape, public_test.shape, private_test.shape

((5531451, 158), (5719469, 157), (5644293, 157))

In [47]:
# Force a garbage-collection pass; the integer displayed below is gc.collect()'s return value.
gc.collect()

529

In [48]:
# The list holds references to the same frame objects (not copies), so columns assigned
# through df_list also appear on train / public_test / private_test.
df_list = [train, public_test, private_test]

In [49]:
# for df in df_list:
#     df["D_135"] = df["D_135"].fillna(-1)
#     df["D_137"] = df["D_137"].fillna(-1)

In [50]:
# Ask the project helper get_cols for columns selected by "temp" and print them if any
# come back. NOTE(review): presumably get_cols matches on a name substring — confirm.
wrong_cols = get_cols(train, "temp")
if len(wrong_cols) > 0:
    print(wrong_cols)
    # train = train.drop(columns="temp", errors="ignore")

In [51]:
# Report every column (the first five are skipped) whose minimum differs between
# train and the public test split.
for col in train.columns[5:]:
    train_min = train[col].min()
    public_min = public_test[col].min()
    if train_min != public_min:
        print(col, train_min, public_min)

In [52]:
# Report every column (the first five are skipped) whose minimum differs between
# train and the private test split.
for col in train.columns[5:]:
    train_min = train[col].min()
    private_min = private_test[col].min()
    if train_min != private_min:
        print(col, train_min, private_min)

In [53]:
# Report every column (the first five are skipped) whose maximum differs between
# train and the public test split.
for col in train.columns[5:]:
    train_max = train[col].max()
    public_max = public_test[col].max()
    if train_max != public_max:
        print(col, train_max, public_max)

In [54]:
# Report every column (the first five are skipped) whose maximum differs between
# train and the private test split.
for col in train.columns[5:]:
    train_max = train[col].max()
    private_max = private_test[col].max()
    if train_max != private_max:
        print(col, train_max, private_max)

In [55]:
# for col in train.columns[5:]:
#     train_pub, train_private = check_psi(df_list, col)
#     if train_pub > 0.1 or train_private > 0.1:
#         print(col, train_pub, train_private)

In [56]:
# Show the last 60 column names of train.
train.columns[-60:]

Index(['D_63', 'D_64', 'D_65', 'D_68', 'D_69', 'D_70', 'D_71', 'D_72', 'D_74',
       'D_75', 'D_76', 'D_77', 'D_78', 'D_79', 'D_80', 'D_81', 'D_82', 'D_83',
       'D_84', 'D_87', 'D_91', 'D_99', 'D_888', 'D_813', 'D_102', 'D_104',
       'D_105', 'D_106', 'D_107', 'D_109', 'D_110', 'D_111', 'D_112', 'D_113',
       'D_114', 'D_115', 'D_117', 'D_118', 'D_119', 'D_120', 'D_122', 'D_123',
       'D_124', 'D_125', 'D_126', 'D_128', 'D_129', 'D_130', 'D_131', 'D_132',
       'D_133', 'D_134', 'D_136', 'D_141', 'D_142', 'D_144', 'D_145', 'D_183',
       'D_1314', 'D_1343'],
      dtype='object')

In [57]:
# Print the sorted distinct values of D_117 for each frame so they can be compared by eye.
for df in df_list:
    d117_levels = df["D_117"].unique()
    print(sorted(d117_levels))

[0, 1, 2, 3, 4, 5, 6]
[0, 1, 2, 3, 4, 5, 6]
[0, 1, 2, 3, 4, 5, 6]


In [58]:
# Print each categorical column that has a level in train which never occurs in the
# private test split.
for col in CATEGORY_COLUMNS:
    train_levels = pd.Series(train[col].unique())
    # isin() matches NaN with NaN. A plain set difference always keeps NaN (NaN != NaN),
    # which would flag a column even when both frames contain missing values.
    unseen_in_private = ~train_levels.isin(private_test[col].unique())
    if unseen_in_private.any():
        print(col)
        # Remedy left disabled by the author (previously inside a no-op loop over df_list):
        # df[col] = df[col].fillna(0)

### Feature Crossing

In [59]:
# First batch of crossed features, written in place on every frame in df_list.
# Each result is rounded to the number of decimals shown on its line.
# Only B_9323 and R_324 replace a zero denominator (with 0.005); the other
# denominators are used as-is.
for df in tqdm(df_list):
    df["DP_392"] = (df["D_39"] / -df["P_2"]).round(1)
    df["BP_92"] = (-df["P_2"] / df["B_9"]).round(1)
    df["PR_21"] = (-df["P_2"] / df["R_1"]).round(1)
    df["B_9323"] = (df["B_9"] / df[["B_3", "B_23"]].sum(axis=1).replace(0, 0.005)).round(2)
    df["B_19204"] = (df["B_19"] - df["B_20"] + df["B_4"]).round(1)
    df["R_324"] = (df["R_3"] / (df["R_2"] + df["R_4"]).replace(0, 0.005)).round(2)
    df["DP_483"] = (df["D_48"] / df["P_3"]).round(2)
    df["DP_553"] = (df["D_55"] / df["P_3"]).round(2)
    df["DP_394"] = (df["D_39"] / df["P_4"]).round(2)
    df["BP_94"] = (df["B_9"] / df["P_4"]).round(2)
    # Previously a verbatim copy of BP_94 (B_9 / P_4). Following the PR_21 naming
    # pattern (P_2 over R_1), PR_41 is P_4 over R_1.
    df["PR_41"] = (df["P_4"] / df["R_1"]).round(2)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:14<00:00,  4.90s/it]


In [60]:
# Second batch of crossed features, written in place on every frame in df_list.
# Unlike the first batch, these values are stored without rounding.
for df in tqdm(df_list):
    # Zeros in the B_3 / B_20 denominators are swapped for 0.005 before dividing.
    b3_denominator = df["B_3"].replace(0, 0.005)
    b20_denominator = df["B_20"].replace(0, 0.005)
    p3 = df["P_3"]

    df["RD_744"] = df["R_7"].sub(df["D_44"])
    df["PB_33"] = p3.div(b3_denominator)
    df["SR_261"] = df["S_26"].sub(df["R_1"])
    df["RB_111"] = df["R_10"].mul(df["B_11"])
    df["R_30"] = df["R_3"].sub(df["R_10"])
    df["PB_320"] = p3.div(b20_denominator)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  7.32it/s]


### Calculate per-customer diff for selected balance columns

In [61]:
# Candidate columns: whatever get_cols(train, "B_") returns.
temp = train.loc[:, get_cols(train, "B_")]
# Count distinct values once and reuse the result; the original scanned the frame twice.
unique_counts = temp.nunique()
more_unique_balance_cols = unique_counts[unique_counts > 150].index.tolist()
# Keep only names of at most three characters, e.g. "B_4" (see the displayed result).
diff_cols = [col for col in more_unique_balance_cols if len(col) <= 3]

In [62]:
# Show how many columns were selected for differencing, and which ones.
len(diff_cols), diff_cols

(4, ['B_4', 'B_5', 'B_6', 'B_9'])

In [63]:
# Per-customer first difference of every selected column, stored as "<col>_diff".
# NOTE(review): diff() follows the existing row order inside each customer_ID group;
# this assumes rows are already in chronological order per customer — TODO confirm.
if diff_cols:
    for df in tqdm(df_list):
        # Group once per frame and diff all columns together, instead of rebuilding
        # the groupby for every (column, frame) pair.
        grouped_diff = df.groupby("customer_ID", sort=False)[diff_cols].diff()
        for col in diff_cols:
            df[col + "_diff"] = grouped_diff[col]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:32<00:00, 53.04s/it]


In [64]:
%%time
# Re-bind train to the (mutated) first frame, then rebuild test from the remaining frames.
train = df_list[0]
# Stack the remaining frames row-wise; re-binding `test` here releases the frame loaded earlier.
test = pd.concat(df_list[1:], axis=0)
# Order rows by customer and then S_2, and renumber the index from 0.
test = test.sort_values(by=["customer_ID", "S_2"]).reset_index(drop=True)

CPU times: user 8.68 s, sys: 20.1 s, total: 28.7 s
Wall time: 43.5 s


In [65]:
# Remove a stray "temp" column from train if present (errors="ignore" makes this a no-op
# otherwise). Done in place, so the frame referenced by df_list[0] is affected as well.
train.drop(columns="temp", errors="ignore", inplace=True)

In [66]:
# Columns present in train but absent from test; the output below shows only 'target'.
set(train.columns) - set(test.columns)

{'target'}

In [67]:
# Persist the engineered frames as parquet files.
# NOTE(review): the test frame is written under ./train_parquet/ although the raw test
# file was read from ./test_parquet/ — confirm the target directory is intended.
train.to_parquet("./train_parquet/final_train.parquet")
test.to_parquet("./train_parquet/final_test.parquet")