In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, plot_target_check,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [3]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [4]:
%load_ext autoreload
%autoreload

In [5]:
# Start pandarallel's worker pool: 16 workers, no progress bar, and without
# the in-memory filesystem for transferring data to the workers.
pandarallel.initialize(
    use_memory_fs=False,
    progress_bar=False,
    nb_workers=16,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [6]:
# Wall-clock timestamp (seconds since the epoch) taken at the start of the run.
START = time.time()

### Read Data

In [7]:
# Load the training labels CSV from the raw-data directory.
# RAW_DATA_PATH is not defined in this notebook; presumably it comes from the
# `utils.constants` star import — verify.
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [8]:
# Mean of the `target` column; if the labels are 0/1 this is the positive-class rate.
labels["target"].mean()

0.2589336105100531

In [9]:
%%time
# Load the v1 aggregated training frame from the processed-data directory.
# Timed because the load is slow (roughly 1.5 minutes of wall time in the recorded output).
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v1/train_agg.pkl")

Shape of data: (458913, 4083)
Shape of data: (924621, 4081)
CPU times: user 10.6 s, sys: 38 s, total: 48.6 s
Wall time: 1min 27s


In [10]:
# Load the precomputed split definition from the interim-data directory.
# joblib.load unpickles the file, so it must only be pointed at trusted data.
split_indices = joblib.load(f"../{INTERIM_DATA_PATH}/split_indices.pkl")

In [11]:
# Show which splits the file defines (a held-out id entry plus the fold entries).
split_indices.keys()

dict_keys(['held_out_ids', 'fold0', 'fold1', 'fold2', 'fold3', 'fold4'])

In [12]:
# Attach the label column by position: `.values` strips the index, so pandas
# performs no index alignment here.
# NOTE(review): this assumes train_agg rows are in the same order as
# train_labels.csv — confirm.
train_agg["target"] = labels["target"].values
# Report the in-memory size (in GB) and the shape of the training frame.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 11.402159586 GB, Shape: (458913, 4083)


In [13]:
# Copy customer_ID from the labels frame by position (`.values` bypasses index alignment).
# NOTE(review): assumes train_agg and labels share the same row order — confirm.
train_agg["customer_ID"] = labels["customer_ID"].values

In [14]:
# Persist the held-out rows: every row whose customer_ID is listed under
# "held_out_ids" in the split definition, with a fresh 0..n-1 index.
held_out_mask = train_agg["customer_ID"].isin(split_indices["held_out_ids"])
train_held_out = train_agg.loc[held_out_mask].reset_index(drop=True)
train_held_out.to_pickle("./held_out.pkl")

# Write one validation frame per fold. The folds are taken from the keys of
# the split definition rather than a hard-coded count, so a split file with a
# different number of folds is handled without editing this cell. File names
# are unchanged: ./validation_fold0.pkl, ./validation_fold1.pkl, ...
fold_keys = [key for key in split_indices if key.startswith("fold")]
for fold_key in fold_keys:
    fold_mask = train_agg["customer_ID"].isin(split_indices[fold_key])
    fold_frame = train_agg.loc[fold_mask].reset_index(drop=True)
    fold_frame.to_pickle(f"./validation_{fold_key}.pkl")

In [15]:
# NOTE(review): the deletion below is left disabled, so train_agg (about
# 11.4 GB according to the size print earlier in this notebook) stays in
# memory while the test frame is loaded — confirm this is intended.
# del train_agg

### Transform Test

In [9]:
# Load the v1 aggregated test frame. reset_index() turns the existing index
# into a regular column (presumably customer_ID, which appears as the first
# column in the preview), which is why the frame gains one column after loading.
test_agg = read_file(f"../{PROCESSED_DATA_PATH}/v1/test_agg.pkl").reset_index()

Shape of data: (924621, 4081)


In [10]:
# Preview the first rows of the test frame.
test_agg.head()

Unnamed: 0,customer_ID,P_2_last,S_3_last,P_3_last,S_5_last,S_6_last,S_7_last,S_8_last,S_9_last,S_11_last,...,D_145_displacement,D_145_last_first_ratio,D_145_velocity,D_145_sprint,D_145_last_minus_avg,D_145_coef_var,D_145_ma3_r1_r2,D_145_ma2_r1_r2,D_145_ma2_r1_r3,D_145_general_trend
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.56,0.14,0.56,0.01,0,0.15,0.46,0.01,44,...,,,,0.0,0.0,,,,,0.0
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.84,0.11,0.52,0.0,0,0.08,0.76,0.01,3,...,0.0,,0.0,0.0,0.0,,,,,0.0
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.69,0.16,0.56,0.0,0,0.17,0.12,0.0,22,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.51,0.18,0.62,0.32,0,0.55,0.47,0.01,24,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.25,0.16,0.56,0.06,0,0.11,0.24,0.0,11,...,0.0,,0.0,0.0,0.0,,,,,0.0


In [11]:
# Report the in-memory size (in GB) and the shape of the test frame.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 29.242070886 GB, Shape: (924621, 4082)


In [12]:
# test_agg = clip_all(test_agg)

In [13]:
# test_agg = round_all(test_agg)

In [14]:
# Re-report size and shape. With the clip/round cells commented out, the frame
# is unchanged, so this matches the earlier print.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

Size: 29.242070886 GB, Shape: (924621, 4082)


In [15]:
# Shape of the test frame as (rows, columns).
test_agg.shape

(924621, 4082)

In [16]:
# Write the test frame to a pickle file in the notebook's working directory.
test_agg.to_pickle("./test_agg.pkl")