In [73]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [74]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS

In [75]:
from feature_engineering_helpers import feature_gen_pipeline

In [76]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
pandarallel.initialize(nb_workers=16, progress_bar=False, use_memory_fs=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [78]:
START = time.time()

In [79]:
def combine_binary_features(train, test, final_col_name, col_list, sufficient_count=500, drop_ori=False):
    train["temp"] = train[col_list].astype(str).parallel_apply(lambda x: ''.join(x), axis=1)
    test["temp"] = test[col_list].astype(str).parallel_apply(lambda x: ''.join(x), axis=1)
    le = LabelEncoder()
    le.fit(train["temp"])
    le_dict = dict(zip(le.classes_, le.transform(le.classes_)))
    train_df = train.groupby("temp").agg(
        target_count=("target", "count")
    )
    less_count_cat_codes = train_df.loc[train_df["target_count"] < sufficient_count].index.tolist()
    le_dict = {k: v if k not in less_count_cat_codes else np.nan for k, v in le_dict.items()}
    train[final_col_name] = train["temp"].map(le_dict).astype("category")
    test[final_col_name] = test["temp"].map(le_dict).astype("category")
    if drop_ori:
        train.drop(columns=col_list, inplace=True)
        test.drop(columns=col_list, inplace=True)
    return train, test

### Read Data

In [8]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [None]:
%%time
train = read_file(f"../{INTERIM_DATA_PATH}/v2/train_parquet/train_all_variables.parquet")
test1 = read_file(f"../{INTERIM_DATA_PATH}/v2/test_parquet/test_all_variables1.parquet")
test2 = read_file(f"../{INTERIM_DATA_PATH}/v2/test_parquet/test_all_variables2.parquet")

In [None]:
test = pd.concat([test1, test2], ignore_index=True)
del test1, test2

In [None]:
gc.collect()

In [None]:
df_list = [train, test]

In [None]:
for df in df_list:
    df["D_135"] = df["D_135"].fillna(-1)
    df["D_137"] = df["D_137"].fillna(-1)

In [None]:
%%time
n = train.nunique()
binary_features = n[n == 2].index.tolist()

### Group Features

### Risk Binary

In [None]:
risk_binary_features = [c for c in binary_features if c.startswith("R_")]

In [None]:
sparse_risk_binary_features = train.loc[:, risk_binary_features].sum().sort_values().iloc[:13].index.tolist()
print(sorted(sparse_risk_binary_features))

In [None]:
%%time
train, test = combine_binary_features(
    train, 
    test, 
    final_col_name="R_binaries", 
    col_list=sparse_risk_binary_features, 
    sufficient_count=300, 
    drop_ori=True
)

In [None]:
plot_train_test_distribution([train, test], "R_binaries", without_drop_tail=True, is_category=True)

In [None]:
train.shape, test.shape

In [None]:
set(train.columns) - set(test.columns)

### Spend Binary

In [None]:
spend_binary_features = [c for c in binary_features if c.startswith("S_")]
print(spend_binary_features)

In [None]:
%%time
train, test = combine_binary_features(
    train, 
    test, 
    final_col_name="S_binaries", 
    col_list=spend_binary_features, 
    sufficient_count=300
)

In [None]:
plot_train_test_distribution([train, test], "S_binaries", without_drop_tail=True, is_category=True)

### Balance Binary

In [None]:
balance_binary_features = [c for c in binary_features if c.startswith("B_")]
print(balance_binary_features)

In [None]:
sparse_balance_binary_features = ['B_27', 'B_31', 'B_32']

In [None]:
for col in sparse_balance_binary_features:
    print(train.groupby(col)['target'].mean())

In [None]:
%%time
train, test = combine_binary_features(
    train, 
    test, 
    final_col_name="B_binaries", 
    col_list=balance_binary_features, 
    sufficient_count=500,
    drop_ori=True
)

In [None]:
plot_train_test_distribution([train, test], "B_binaries", without_drop_tail=True, is_category=True)

In [None]:
train.shape, test.shape

### Delinquency Binaries

In [None]:
delinquency_binary_features = [c for c in binary_features if c.startswith("D_")]
print(delinquency_binary_features)

In [None]:
sparse_delinquency_binary_features = train.loc[:, delinquency_binary_features].sum().sort_values().iloc[:8].index.tolist()
print(sorted(sparse_delinquency_binary_features))

In [None]:
# for col in sparse_delinquency_binary_features:
#     print(train.groupby(col)['target'].mean())

In [None]:
# %%time
# train, test = combine_binary_features(
#     train, 
#     test, 
#     final_col_name="D_binaries", 
#     col_list=sparse_delinquency_binary_features, 
#     sufficient_count=500,
#     drop_ori=True
# )

In [None]:
# plot_train_test_distribution([train, test], "D_binaries", without_drop_tail=True, is_category=True)

In [None]:
train.shape, test.shape

In [None]:
train.to_parquet("./merged_train.parquet")
test.to_parquet("./merged_test.parquet")

### Aggregation

#### Train

In [8]:
%%time
train = read_file("./merged_train.parquet")

Shape of data: (5531451, 184)
CPU times: user 5.54 s, sys: 5.89 s, total: 11.4 s
Wall time: 6.48 s


In [9]:
bin_cols = get_cols(train, "_binaries")

In [10]:
train.loc[:, bin_cols] = train.loc[:, bin_cols].astype("category")

In [11]:
%%time
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Skewness done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA2 for least Recency done
MA3 for Recency 1 done
MA3 for Recency 2 done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 167/167 [00:36<00:00,  4.54it/s]

CPU times: user 3min 36s, sys: 32 s, total: 4min 8s
Wall time: 4min 19s





In [17]:
train_agg["target"] = labels["target"].values
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 14.181277593 GB, Shape: (458913, 5068)


In [18]:
float64_columns = train_agg.select_dtypes("float64").columns.tolist()
for col in tqdm(float64_columns):
    temp = train_agg[col].astype(np.float32).values
    if (temp - train_agg[col]).abs().max() < 1e-4:
        train_agg[col] = train_agg[col].astype(np.float32)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2652/2652 [00:34<00:00, 77.85it/s]


In [20]:
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 9.382883265 GB, Shape: (458913, 5068)


In [21]:
number_columns = train_agg.select_dtypes(np.number).columns.tolist()
for col in tqdm(number_columns):
    temp = train_agg[col].round(3)
    if (temp - train_agg[col]).abs().max() < 1e-4:
        train_agg[col] = train_agg[col].round(3)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5012/5012 [00:18<00:00, 273.08it/s]


In [22]:
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 9.382883265 GB, Shape: (458913, 5068)


In [23]:
train_agg = train_agg.loc[:, keep_column]

In [24]:
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 8.760597237 GB, Shape: (458913, 4732)


In [25]:
train_agg.to_pickle(f"./train_agg.pkl")

In [26]:
del train_agg, train

### Test 1

In [80]:
%%time
test = read_file("./merged_test.parquet")

Shape of data: (11363762, 183)
CPU times: user 11.5 s, sys: 18 s, total: 29.4 s
Wall time: 23.2 s


In [81]:
bin_cols = get_cols(test, "_binaries")

In [82]:
test1 = test.iloc[:5681884]
test2 = test.iloc[5681884:]

In [83]:
del test
del test2

In [84]:
test1.loc[:, bin_cols] = test1.loc[:, bin_cols].astype("category")

In [85]:
%%time
test_agg, keep_column = feature_gen_pipeline(test1)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Skewness done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA2 for least Recency done
MA3 for Recency 1 done
MA3 for Recency 2 done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 167/167 [00:46<00:00,  3.63it/s]

CPU times: user 3min 49s, sys: 47.9 s, total: 4min 37s
Wall time: 4min 56s





In [86]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 17.136510259 GB, Shape: (462379, 5067)


In [87]:
float64_columns = test_agg.select_dtypes("float64").columns.tolist()
for col in tqdm(float64_columns):
    temp = test_agg[col].astype(np.float32).values
    if (temp - test_agg[col]).abs().max() < 1e-4:
        test_agg[col] = test_agg[col].astype(np.float32)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4194/4194 [01:21<00:00, 51.55it/s]


In [88]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 9.490611115 GB, Shape: (462379, 5067)


In [89]:
test_agg = test_agg.loc[:, keep_column]

In [90]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 8.865474707 GB, Shape: (462379, 4732)


In [91]:
test_agg.to_pickle(f"./test_agg1.pkl")

In [92]:
del test_agg

#### Test 2

In [57]:
%%time
test = read_file("./merged_test.parquet")

Shape of data: (11363762, 183)
CPU times: user 11 s, sys: 22.4 s, total: 33.4 s
Wall time: 25.7 s


In [58]:
bin_cols = get_cols(test, "_binaries")

In [59]:
test1 = test.iloc[:568184]
test2 = test.iloc[5681884:]

In [60]:
del test
del test1

In [61]:
test = test2.copy()
del test2

In [62]:
test.loc[:, bin_cols] = test.loc[:, bin_cols].astype("category")

In [63]:
%%time
test_agg, keep_column = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Skewness done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA2 for least Recency done
MA3 for Recency 1 done
MA3 for Recency 2 done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 167/167 [00:45<00:00,  3.70it/s]

CPU times: user 3min 48s, sys: 47.7 s, total: 4min 36s
Wall time: 4min 55s





In [64]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 17.131437834 GB, Shape: (462242, 5067)


In [65]:
float64_columns = test_agg.select_dtypes("float64").columns.tolist()
for col in tqdm(float64_columns):
    temp = test_agg[col].astype(np.float32).values
    if (temp - test_agg[col]).abs().max() < 1e-4:
        test_agg[col] = test_agg[col].astype(np.float32)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4194/4194 [01:19<00:00, 52.58it/s]


In [66]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 9.463767538 GB, Shape: (462242, 5067)


In [67]:
%%time
test_agg = test_agg.loc[:, keep_column]

In [68]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 8.84251429 GB, Shape: (462242, 4732)


In [69]:
test_agg.to_pickle(f"./test_agg2.pkl")

In [70]:
del test_agg

### END

In [71]:
END = time.time()

In [72]:
print(f"{END - START:.2f} seconds elapsed")

1190.71 seconds elapsed
