In [67]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [95]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [96]:
from feature_engineering_helpers import feature_gen_pipeline

In [97]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [71]:
# Start the pandarallel worker pool: 16 workers, no progress bars, and data
# exchanged with the workers without the in-memory file system.
pandarallel.initialize(
    nb_workers=16,
    progress_bar=False,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [72]:
# Wall-clock timestamp taken before any data is read
# (presumably used later to report total notebook runtime — confirm).
START = time.time()

### Read Data

In [48]:
# Load the training labels CSV from the raw data folder (path is relative to the
# parent directory of this notebook).
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [11]:
%%time
# Load the v5 interim parquet files. The test set is stored as two separate
# files, which are combined into a single frame in a later cell.
train = read_file(f"../{INTERIM_DATA_PATH}/v5/train_parquet/train_all_variables.parquet")
test1 = read_file(f"../{INTERIM_DATA_PATH}/v5/test_parquet/test_all_variables1.parquet")
test2 = read_file(f"../{INTERIM_DATA_PATH}/v5/test_parquet/test_all_variables2.parquet")

Shape of data: (5531451, 189)
Shape of data: (5681079, 188)
Shape of data: (5682683, 188)
CPU times: user 16.3 s, sys: 17 s, total: 33.3 s
Wall time: 18.6 s


In [12]:
%%time
# Stack the two test halves into one frame; ignore_index=True gives the result a
# fresh 0..n-1 row index instead of keeping the per-file indices.
test = pd.concat([test1, test2], ignore_index=True)
# Drop the names of the two halves so only the combined frame stays referenced.
del test1, test2

In [38]:
# Force a garbage-collection pass; the cell output is the number of unreachable
# objects the collector found.
gc.collect()

180

In [39]:
# Keep both frames in one list.
# NOTE(review): this list holds its own references to `train` and `test`, so a
# later `del train` will not let the frame be freed while `df_list` is alive —
# confirm the list is actually needed.
df_list = [train, test]

In [40]:
# %%time
# n = train.nunique()
# binary_features = n[n == 2].index.tolist()

### Transform Train Set

In [73]:
%%time
# Run the feature-engineering pipeline on the raw training rows. It returns the
# aggregated feature frame plus a second value stored as `keep_column`
# (presumably the list of retained feature columns — TODO confirm in
# feature_engineering_helpers).
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 173/173 [00:50<00:00,  3.40it/s]

CPU times: user 1min 15s, sys: 35.4 s, total: 1min 50s
Wall time: 2min 6s





In [74]:
# Attach the label column by position, not by key: `.values` discards the index
# of `labels`, so row i of `train_agg` receives the target of row i of `labels`.
# NOTE(review): this is only correct if feature_gen_pipeline returns its rows in
# the same order as train_labels.csv — TODO confirm, or join on the customer key.
train_agg["target"] = labels["target"].values
# Report the in-memory size (bytes / 1e9, i.e. GB) and the shape of the frame.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 14.388695549 GB, Shape: (458913, 5063)


#### Reduce data size

In [75]:
# Count columns per dtype before downcasting, to see how many are still float64.
train_agg.dtypes.value_counts()

float64     2767
float32     1992
int32        258
category      12
category       8
category       4
category       4
category       4
category       4
category       4
category       4
int64          2
dtype: int64

In [76]:
# Downcast float64 columns to float32 wherever that is (almost) lossless.
# A column is converted only when the largest absolute difference between the
# float32 and the float64 representation is below 1e-4. `max()` skips NaN, so
# missing values do not block the conversion; an all-NaN column yields NaN, fails
# the comparison and is left as float64.
float64_columns = train_agg.select_dtypes("float64").columns.tolist()
for col in tqdm(float64_columns):
    original = train_agg[col]
    downcast = original.astype(np.float32)
    # Subtracting the float64 original promotes the result back to float64, so
    # the error itself is measured at full precision.
    max_abs_error = (downcast.values - original).abs().max()
    if max_abs_error < 1e-4:
        # Reuse the cast that was just checked instead of casting a second time.
        train_agg[col] = downcast

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2767/2767 [00:32<00:00, 86.15it/s]


In [77]:
# Count columns per dtype again to see how many float64 columns were downcast.
train_agg.dtypes.value_counts()

float32     4723
int32        258
float64       36
category      12
category       8
category       4
category       4
category       4
category       4
category       4
category       4
int64          2
dtype: int64

In [78]:
# Report the in-memory size (bytes / 1e9, i.e. GB) and shape after downcasting.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 9.375529937 GB, Shape: (458913, 5063)


#### Clipping

In [93]:
# Find the columns whose largest value exceeds 1e3; these are the candidates for
# upper clipping. numeric_only=True keeps the categorical columns out of the
# reduction: they cannot be compared with 1e3 anyway, and current pandas versions
# raise a TypeError when asked for the max of an unordered categorical.
max_ = train_agg.max(numeric_only=True)
max_columns = max_[max_ > 1e3].index.tolist()

In [98]:
# Cap the upper tail of every column flagged in `max_columns`.
# The cap is the 99.9th percentile if that is at most 1e3, otherwise the 99th
# percentile if that is at most 1e3, otherwise the fixed value 1e3.
for col in tqdm(max_columns):
    observed = train_agg[col].dropna()  # percentiles are taken over non-missing values
    max_threshold1 = np.percentile(observed, 99.9)
    max_threshold2 = np.percentile(observed, 99)
    if max_threshold1 <= 1e3:
        cap = max_threshold1
    elif max_threshold2 <= 1e3:
        cap = max_threshold2
    else:
        cap = 1e3
    train_agg = clip_col(train_agg, col, top_value=cap, add_new_col=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 349/349 [00:06<00:00, 51.17it/s]


In [99]:
# Find the columns whose smallest value is below -1e3; these are the candidates
# for lower clipping. numeric_only=True keeps the categorical columns out of the
# reduction: they cannot be compared with -1e3 anyway, and current pandas versions
# raise a TypeError when asked for the min of an unordered categorical.
min_ = train_agg.min(numeric_only=True)
min_columns = min_[min_ < -1e3].index.tolist()

In [100]:
# Floor the lower tail of every column flagged in `min_columns`.
# The floor is the 0.1th percentile if that is at least -1e3, otherwise the 1st
# percentile if that is at least -1e3, otherwise the fixed value -1e3.
for col in tqdm(min_columns):
    observed = train_agg[col].dropna()  # percentiles are taken over non-missing values
    min_threshold1 = np.percentile(observed, 0.1)
    min_threshold2 = np.percentile(observed, 1)
    if min_threshold1 >= -1e3:
        floor = min_threshold1
    elif min_threshold2 >= -1e3:
        floor = min_threshold2
    else:
        floor = -1e3
    train_agg = clip_col(train_agg, col, btm_value=floor, add_new_col=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 107/107 [00:02<00:00, 43.63it/s]


#### Rounding

In [103]:
# Round numeric columns to 3 decimals, but only where rounding moves no value by
# 1e-4 or more, i.e. where the column was effectively stored at that precision
# already. `max()` skips NaN, so missing values do not affect the check.
number_columns = train_agg.select_dtypes(np.number).columns.tolist()
for col in tqdm(number_columns):
    rounded = train_agg[col].round(3)
    max_abs_shift = (rounded - train_agg[col]).abs().max()
    if max_abs_shift < 1e-4:
        # Reuse the rounded series that was just checked instead of rounding again.
        train_agg[col] = rounded

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5019/5019 [00:17<00:00, 293.54it/s]


In [104]:
# Report size and shape after clipping and rounding. Rounding changes values,
# not dtypes, so the size printed here matches the post-downcast figure.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 9.375529937 GB, Shape: (458913, 5063)


In [106]:
# Final (rows, columns) check before the frame is written to disk.
train_agg.shape

(458913, 5063)

In [108]:
# Persist the engineered training features next to this notebook. The path has
# no placeholders, so a plain string literal is used instead of an f-string.
train_agg.to_parquet("./train_agg.parquet")

In [109]:
# Drop the notebook-level names of the two training frames once the result is on disk.
# NOTE(review): `df_list` (built earlier as [train, test]) still references
# `train`, so this `del` alone does not let that frame be freed — confirm whether
# `df_list` is still needed.
del train_agg, train