In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, plot_target_check,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [3]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [4]:
%load_ext autoreload
%autoreload

In [5]:
pandarallel.initialize(nb_workers=16, progress_bar=False, use_memory_fs=False)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [6]:
START = time.time()

### Read Data

In [7]:
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [8]:
%%time
train = read_file(f"./imputed_train.pkl")
test = read_file(f"./imputed_test.pkl")

Shape of data: (5531451, 157)
Shape of data: (11363762, 156)
CPU times: user 4.74 s, sys: 12.6 s, total: 17.4 s
Wall time: 23.2 s


### Transform Train Set

In [9]:
%%time
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [00:31<00:00,  4.53it/s]

CPU times: user 52.3 s, sys: 17.1 s, total: 1min 9s
Wall time: 1min 13s





In [10]:
train_agg["target"] = labels["target"].values
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 11.785740533 GB, Shape: (458913, 4185)


In [11]:
train_agg = convert_all(train_agg)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2221/2221 [00:20<00:00, 107.35it/s]


In [22]:
# m = MinMaxScaler()

In [7]:
train_agg = read_file("./train_agg.pkl")

Shape of data: (458913, 4186)


In [15]:
second_third_last_columns = get_cols(train_agg, ["second_last", "third_last"])

In [51]:
ma_noob_cols = get_cols(train_agg, ["_ma2_r2", "_ma2_r3", "ma3_r2", "ma2_r1_r3"])

In [52]:
%%time
train_agg.drop(columns=second_third_last_columns + ma_noob_cols, errors="ignore", inplace=True)

CPU times: user 284 ms, sys: 1.81 s, total: 2.09 s
Wall time: 3.21 s


In [53]:
train_agg.shape

(458913, 3310)

In [29]:
%%time
missing = train_agg.isnull().sum()
missing_columns = missing[missing > 0].index.tolist()
std_cols = get_cols(train_agg, ["std"])
missing_std_cols = list(set(missing_columns).intersection(std_cols))

CPU times: user 1.94 s, sys: 1.83 s, total: 3.78 s
Wall time: 5.19 s


In [30]:
for col in tqdm(missing_std_cols):
    train_agg[col] = train_agg[col].fillna(0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [06:32<00:00,  4.36s/it]


In [61]:
%%time
missing = train_agg.isnull().sum()
missing_columns = missing[missing > 0].index.tolist()

CPU times: user 1.7 s, sys: 1.25 s, total: 2.95 s
Wall time: 3.75 s


In [62]:
len(missing_columns)

860

In [63]:
for col in tqdm(missing_columns):
    try:
        train_agg[col] = train_agg[col].fillna(train_agg[col].mean())
    except:
        train_agg[col] = train_agg[col].fillna(train_agg[col].mode()[0])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 860/860 [28:07<00:00,  1.96s/it]


In [64]:
train_agg.to_pickle("./train_agg.pkl")

In [67]:
# del train_agg, train

In [68]:
gc.collect()

21398

### Transform Test

In [18]:
%%time
test_agg, keep_column = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [00:53<00:00,  2.66it/s]

CPU times: user 1min 40s, sys: 1min 10s, total: 2min 50s
Wall time: 3min 42s





In [19]:
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 23.760524417 GB, Shape: (924621, 4184)


In [20]:
test_agg = convert_all(test_agg)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2227/2227 [00:46<00:00, 48.01it/s]


In [69]:
%%time
test_agg = read_file("./test_agg.pkl")

Shape of data: (924621, 4184)


In [70]:
second_third_last_columns = get_cols(test_agg, ["second_last", "third_last"])

In [71]:
ma_noob_cols = get_cols(test_agg, ["_ma2_r2", "_ma2_r3", "ma3_r2", "ma2_r1_r3"])

In [72]:
%%time
test_agg.drop(columns=second_third_last_columns + ma_noob_cols, errors="ignore", inplace=True)

CPU times: user 696 ms, sys: 4.88 s, total: 5.58 s
Wall time: 10.1 s


In [73]:
test_agg.shape

(924621, 3308)

In [75]:
%%time
missing = test_agg.isnull().sum()
missing_columns = missing[missing > 0].index.tolist()
std_cols = get_cols(test_agg, ["std"])
missing_std_cols = list(set(missing_columns).intersection(std_cols))

CPU times: user 3.25 s, sys: 2.52 s, total: 5.77 s
Wall time: 6.99 s


In [76]:
for col in tqdm(missing_std_cols):
    test_agg[col] = test_agg[col].fillna(0)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [21:42<00:00,  9.11s/it]


In [77]:
%%time
missing = test_agg.isnull().sum()
missing_columns = missing[missing > 0].index.tolist()

CPU times: user 3.5 s, sys: 5.69 s, total: 9.19 s
Wall time: 24.8 s


In [78]:
len(missing_columns)

1020

In [79]:
for col in tqdm(missing_columns):
    try:
        test_agg[col] = test_agg[col].fillna(test_agg[col].mean())
    except:
        test_agg[col] = test_agg[col].fillna(test_agg[col].mode()[0])

 30%|██████████████████████████████████████▏                                                                                          | 302/1020 [36:19<1:26:20,  7.22s/it]


KeyboardInterrupt: 

In [81]:
test_agg.to_pickle("./test_agg.pkl")

In [80]:
test_agg.head()

Unnamed: 0_level_0,P_2_last,S_3_last,P_3_last,S_5_last,S_6_last,S_7_last,S_8_last,S_12_last,S_13_last,S_15_last,...,D_111_previous_sprint,D_111_acceleration,D_111_range,D_111_displacement,D_111_displacement_ratio,D_111_velocity,D_111_last_minus_avg,D_111_last_minus_midpoint,D_111_coef_var,D_111_trend_index
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.56,0.14,0.56,0.01,0,0.15,0.46,0.18,6.0,0.4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.0,0.0
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.84,0.11,0.52,0.0,0,0.08,0.76,0.19,2.0,0.2,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.0,0.0
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.69,0.16,0.56,0.0,0,0.17,0.12,0.18,1.0,0.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.0,0.0
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.51,0.18,0.62,0.32,0,0.55,0.46,0.25,9.0,0.4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.0,0.0
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.25,0.16,0.56,0.06,0,0.11,0.24,-1.0,1.0,0.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.0,0.0
