In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [3]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [4]:
%load_ext autoreload
%autoreload

In [5]:
# Configure pandarallel: 16 worker processes, no progress bars, and no
# memory file system for transferring data to the workers.
pandarallel.initialize(
    nb_workers=16,
    progress_bar=False,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [6]:
# Wall-clock timestamp (seconds since the epoch) taken before any data is loaded.
START = time.time()

### Read Data

#### Impute Train

In [7]:
# Load the training labels CSV from the raw data directory; its "target" column
# is attached to the aggregated training frame later in the notebook.
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [79]:
%%time
# Load the training frame for the imputation step. The cached, already-imputed
# parquet is preferred when it exists (same behaviour as before); on a fresh
# checkout it is absent, so fall back to the raw v5 parquet and let the cells
# below impute it.
TRAIN_IMPUTED_PATH = "./train_imputed.parquet"
TRAIN_RAW_PATH = f"../{INTERIM_DATA_PATH}/v5/train_parquet/train_all_variables.parquet"
train = read_file(
    TRAIN_IMPUTED_PATH if os.path.exists(TRAIN_IMPUTED_PATH) else TRAIN_RAW_PATH
)

Shape of data: (5531451, 189)
CPU times: user 4.59 s, sys: 8.31 s, total: 12.9 s
Wall time: 8.04 s


In [80]:
# Per-column null counts for the training frame, then the names of the columns
# that contain at least one null; the cell displays how many there are.
missing = train.isna().sum()
missing_columns = missing.index[missing.gt(0)].tolist()
len(missing_columns)

38

In [42]:
%%time
# Fill gaps strictly within each customer's history: back-fill first, then
# forward-fill whatever is still missing. Both passes are grouped by customer_ID.
# A plain .ffill() chained onto the grouped back-fill operates on the whole
# column, so a customer with no observed value would inherit the value of the
# rows just above it (another customer). With both passes grouped, such rows
# stay NaN.
train_customer_key = train["customer_ID"]
for col in tqdm(missing_columns):
    backfilled = train[col].groupby(train_customer_key).bfill()
    train[col] = backfilled.groupby(train_customer_key).ffill()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [01:35<00:00,  1.21it/s]

CPU times: user 1min 14s, sys: 18.9 s, total: 1min 32s
Wall time: 1min 35s





In [43]:
# Number of columns selected for imputation. missing_columns is not recomputed
# after the fill loop, so this is the pre-imputation count.
len(missing_columns)

116

In [86]:
# train.to_parquet(f"./train_imputed.parquet")

In [45]:
# Force a garbage-collection pass before loading the test data; the displayed
# value is the number of unreachable objects found.
gc.collect()

8358

#### Impute Test

In [83]:
%%time
# The test set is stored as two parquet files; read both, stack them into a
# single frame with a fresh index, then release the separate parts.
test_parts = [
    read_file(f"../{INTERIM_DATA_PATH}/v5/test_parquet/test_all_variables1.parquet"),
    read_file(f"../{INTERIM_DATA_PATH}/v5/test_parquet/test_all_variables2.parquet"),
]
test = pd.concat(test_parts, ignore_index=True)
del test_parts

Shape of data: (5681079, 188)
Shape of data: (5682683, 188)
CPU times: user 12.2 s, sys: 22.9 s, total: 35.1 s
Wall time: 28.4 s


In [84]:
# Per-column null counts for the test frame, then the names of the columns that
# contain at least one null; the cell displays how many there are.
missing = test.isna().sum()
missing_columns = missing.index[missing.gt(0)].tolist()
len(missing_columns)

120

In [48]:
%%time
# Fill gaps strictly within each customer's history: back-fill first, then
# forward-fill whatever is still missing. Both passes are grouped by customer_ID.
# A plain .ffill() chained onto the grouped back-fill operates on the whole
# column, so a customer with no observed value would inherit the value of the
# rows just above it (another customer). With both passes grouped, such rows
# stay NaN.
test_customer_key = test["customer_ID"]
for col in tqdm(missing_columns):
    backfilled = test[col].groupby(test_customer_key).bfill()
    test[col] = backfilled.groupby(test_customer_key).ffill()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 120/120 [02:40<00:00,  1.34s/it]

CPU times: user 2min 30s, sys: 8.95 s, total: 2min 39s
Wall time: 2min 40s





In [85]:
# test.to_parquet(f"./test_imputed.parquet")

In [50]:
# Keep both frames in one list.
# NOTE(review): df_list is not referenced again in the visible cells, and it
# holds an extra reference to each frame — confirm it is still needed.
df_list = [train, test]

In [51]:
# %%time
# n = train.nunique()
# binary_features = n[n == 2].index.tolist()

### Calculate diff for each column

In [None]:
%%time
# Reload both imputed frames from the parquet files in the working directory.
train = read_file("./train_imputed.parquet")
test = read_file("./test_imputed.parquet")

In [81]:
# Treat every column from position 5 onward as a base feature; the first five
# columns of the frame are left out.
base_feature_column = train.columns[5:].tolist()

In [82]:
# Remove the categorical columns; names that are not present in the frame are
# skipped silently instead of raising.
train = train.drop(CATEGORY_COLUMNS, axis="columns", errors="ignore")

In [None]:
# Replace each base feature with its change since the customer's previous row.
# base_feature_column is captured before the categorical columns are dropped,
# so only columns still present in `train` are processed (indexing a dropped
# column would raise KeyError). The first row of every customer becomes NaN.
# Columns are overwritten in place, so re-running this cell would difference
# the already-differenced values.
diff_columns = [col for col in base_feature_column if col in train.columns]
for col in diff_columns:
    train[col] = train.groupby("customer_ID")[col].diff()

Index(['P_2', 'S_3', 'P_3', 'S_5', 'S_6', 'S_7', 'S_8', 'S_12', 'S_13', 'S_15',
       ...
       'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143', 'D_144',
       'D_145', 'D_43_'],
      dtype='object', length=185)

In [72]:
# Preview the first 26 rows of customer_ID next to the "D_43_" column.
train[["customer_ID", "D_43_"]].head(26)

Unnamed: 0,customer_ID,D_43_
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
5,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
6,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
7,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
8,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
9,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,


In [70]:
# Preview the first 26 rows of customer_ID next to the "D_43" column.
train[["customer_ID", "D_43"]].head(26)

Unnamed: 0,customer_ID,D_43
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
5,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
6,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
7,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
8,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,
9,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,


### Transform Train Set

In [33]:
%%time
# Run the project's feature-generation pipeline on the training frame. It
# returns two values, bound here to train_agg and keep_column.
# NOTE(review): presumably one aggregated row per customer — confirm in
# feature_engineering_helpers.
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 173/173 [01:40<00:00,  1.72it/s]

CPU times: user 2min 38s, sys: 1min 21s, total: 3min 59s
Wall time: 4min 22s





In [34]:
# Attach the labels as a new "target" column, then report the frame's size
# (sys.getsizeof bytes / 1e9) and shape.
# NOTE(review): the assignment is positional (.values drops the index), so it
# assumes train_agg rows are in the same order as `labels` — confirm, or join
# on the customer identifier instead.
train_agg["target"] = labels["target"].values
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 14.388695549 GB, Shape: (458913, 5063)


In [35]:
# Apply the project's convert_all helper and rebind train_agg to its result.
# NOTE(review): presumably converts column dtypes to shrink the frame — confirm
# in feature_engineering_helpers.
train_agg = convert_all(train_agg)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2767/2767 [02:14<00:00, 20.57it/s]


In [36]:
# Apply the project's clip_all helper and rebind train_agg to its result.
# NOTE(review): presumably clips outlying values per column — confirm the
# bounds in feature_engineering_helpers.
train_agg = clip_all(train_agg)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 349/349 [00:11<00:00, 29.48it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 107/107 [00:03<00:00, 30.12it/s]


In [37]:
# Apply the project's round_all helper and rebind train_agg to its result.
# NOTE(review): presumably rounds numeric columns to a fixed precision —
# confirm in feature_engineering_helpers.
train_agg = round_all(train_agg)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5019/5019 [00:39<00:00, 125.59it/s]


In [38]:
# Report the frame's size (sys.getsizeof bytes / 1e9) and shape again, after
# the convert / clip / round steps.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 9.375529937 GB, Shape: (458913, 5063)


In [25]:
# Display the (rows, columns) shape of the aggregated training frame.
train_agg.shape

(458913, 5063)

In [40]:
# train_agg.to_parquet(f"./train_agg.parquet")

In [44]:
# del train_agg, train

### Transform Test

In [45]:
%%time
# Run the same feature-generation pipeline on the test frame.
# Note: this rebinds keep_column, replacing the value returned by the training
# run of the pipeline.
test_agg, keep_column = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 173/173 [02:13<00:00,  1.30it/s]

CPU times: user 5min 10s, sys: 7min 2s, total: 12min 13s
Wall time: 14min 55s





In [46]:
# Report the aggregated test frame's size (sys.getsizeof bytes / 1e9) and shape
# before the convert / clip / round steps.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 29.004975209 GB, Shape: (924621, 5062)


In [None]:
# Apply the project's convert_all helper to the test frame, mirroring the
# training-side call.
# NOTE(review): presumably converts column dtypes to shrink the frame — confirm
# in feature_engineering_helpers.
test_agg = convert_all(test_agg)

 28%|█████████████████████████████████▉                                                                                        | 772/2773 [03:01<07:30,  4.44it/s]

In [None]:
# Apply the project's clip_all helper to the test frame, mirroring the
# training-side call.
# NOTE(review): if the clipping bounds are derived from the frame being
# clipped, train and test would be clipped differently — confirm in
# feature_engineering_helpers.
test_agg = clip_all(test_agg)

In [None]:
# Apply the project's round_all helper to the test frame, mirroring the
# training-side call.
test_agg = round_all(test_agg)

In [None]:
# Report the aggregated test frame's size (sys.getsizeof bytes / 1e9) and shape
# after the convert / clip / round steps.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

In [None]:
# Display the (rows, columns) shape of the aggregated test frame.
test_agg.shape

In [40]:
# test_agg.to_parquet(f"./test_agg.parquet")

In [39]:
# del test_agg, test