In [31]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [32]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, plot_target_check,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [33]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [34]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Start the pandarallel worker pool. The pool is capped at the number of CPUs
# reported by the OS so that machines with fewer than 16 cores are not
# oversubscribed; on hosts with >= 16 cores this is identical to nb_workers=16.
pandarallel.initialize(
    nb_workers=min(16, os.cpu_count() or 1),
    progress_bar=False,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [36]:
# Wall-clock timestamp (seconds since the epoch) taken at the start of the run.
START = time.time()

### Read Data

In [16]:
# Load the raw training labels; later cells read the `customer_ID` and
# `target` columns from this frame.
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [9]:
%%time
train = read_file(f"../{INTERIM_DATA_PATH}/v7/train_parquet/final_train.parquet")
test = read_file(f"../{INTERIM_DATA_PATH}/v7/test_parquet/final_test.parquet")

Shape of data: (5531451, 182)
Shape of data: (11363762, 181)
CPU times: user 58.8 s, sys: 34.6 s, total: 1min 33s
Wall time: 11.4 s


In [22]:
# Load the precomputed held-out / fold membership lists.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load split files produced by a trusted source.
split_indices = joblib.load(f"../{INTERIM_DATA_PATH}/split_indices.pkl")

In [23]:
# Show which splits are stored in the mapping.
split_indices.keys()

dict_keys(['held_out_ids', 'fold0', 'fold1', 'fold2', 'fold3', 'fold4'])

### Transform Train Set

In [17]:
%%time
# Run the feature-generation pipeline on the train rows. It returns two
# values: the generated frame and `keep_column`, which a later cell uses as a
# `.loc` column selector.
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|██████████| 168/168 [00:55<00:00,  3.03it/s]

CPU times: user 2min 6s, sys: 18.8 s, total: 2min 25s
Wall time: 2min 25s





In [26]:
%%time
# Move the current index back into regular column(s) and restore a default
# integer index.
train_agg = train_agg.reset_index()

CPU times: user 4.2 s, sys: 5.38 s, total: 9.57 s
Wall time: 9.57 s


In [38]:
# Keep only the columns listed in `keep_column` (all rows are retained).
train_agg = train_agg.loc[:, keep_column]

In [42]:
# Attach the label column by position: `.values` bypasses index alignment.
# NOTE(review): this assumes train_agg rows are in the same order as `labels`
# — confirm against the pipeline's output ordering.
train_agg["target"] = labels["target"].values

# Report the in-memory footprint (GB) together with the frame's shape.
train_agg_gb = sys.getsizeof(train_agg) / 1e9
print(f"Size: {train_agg_gb} GB, Shape: {train_agg.shape}")

Size: 12.412678968 GB, Shape: (458913, 4369)


In [40]:
# Apply the project helper `clip_all` and rebind the result.
# NOTE(review): presumably clips outlier values column by column — see
# feature_engineering_helpers for the exact rules.
train_agg = clip_all(train_agg)

100%|██████████| 217/217 [04:29<00:00,  1.24s/it]
100%|██████████| 133/133 [00:21<00:00,  6.24it/s]


In [43]:
# Apply the project helper `round_all` and rebind the result.
# NOTE(review): presumably rounds numeric columns — see
# feature_engineering_helpers for the exact rules.
train_agg = round_all(train_agg)

100%|██████████| 4369/4369 [14:59<00:00,  4.86it/s] 


In [44]:
# Re-check the in-memory footprint (GB) and shape after clipping and rounding.
train_agg_gb = sys.getsizeof(train_agg) / 1e9
print(f"Size: {train_agg_gb} GB, Shape: {train_agg.shape}")

Size: 12.412678968 GB, Shape: (458913, 4369)


In [47]:
# Attach customer IDs from `labels` by position (`.values` bypasses index
# alignment); the fold export below filters on this column.
# NOTE(review): relies on train_agg and labels sharing the same row order.
train_agg["customer_ID"] = labels["customer_ID"].values

In [48]:
# Write the held-out customers to their own parquet file.
held_out_mask = train_agg["customer_ID"].isin(split_indices["held_out_ids"])
train_held_out = train_agg.loc[held_out_mask].reset_index(drop=True)
train_held_out.to_parquet("./held_out.parquet")

# Write one parquet file per validation fold (fold0 .. fold4).
for i in range(5):
    fold_mask = train_agg["customer_ID"].isin(split_indices[f"fold{i}"])
    t = train_agg.loc[fold_mask].reset_index(drop=True)
    t.to_parquet(f"./validation_fold{i}.parquet")

In [49]:
# Disabled: uncomment to also persist the full aggregated training frame.
# train_agg.to_parquet(f"./train_agg.parquet")

In [50]:
# Drop the large train frames and force a collection pass so their memory is
# actually reclaimed before the (larger) test set is transformed.
del train_agg, train
gc.collect()

### Transform Test Set

In [None]:
%%time
test_agg, keep_column = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


  0%|          | 0/168 [00:00<?, ?it/s]

In [None]:
%%time
# Keep only the columns listed in `keep_column` (all rows are retained).
# Unlike the train path, the selection here happens before reset_index().
test_agg = test_agg.loc[:, keep_column]

In [None]:
%%time
# Move the current index back into regular column(s) and restore a default
# integer index.
test_agg = test_agg.reset_index()

In [None]:
# Report the in-memory footprint (GB) together with the frame's shape.
test_agg_gb = sys.getsizeof(test_agg) / 1e9
print(f"Size: {test_agg_gb} GB, Shape: {test_agg.shape}")

In [40]:
# Apply the same `clip_all` helper that was used on the train frame.
# NOTE(review): presumably clips outlier values column by column — see
# feature_engineering_helpers for the exact rules.
test_agg = clip_all(test_agg)

100%|██████████| 217/217 [04:29<00:00,  1.24s/it]
100%|██████████| 133/133 [00:21<00:00,  6.24it/s]


In [43]:
# Apply the same `round_all` helper that was used on the train frame.
# NOTE(review): presumably rounds numeric columns — see
# feature_engineering_helpers for the exact rules.
test_agg = round_all(test_agg)

100%|██████████| 4369/4369 [14:59<00:00,  4.86it/s] 


In [44]:
# Re-check the in-memory footprint (GB) and shape after clipping and rounding.
test_agg_gb = sys.getsizeof(test_agg) / 1e9
print(f"Size: {test_agg_gb} GB, Shape: {test_agg.shape}")

Size: 12.412678968 GB, Shape: (458913, 4369)


In [52]:
# Final (rows, columns) of the transformed test frame before it is saved.
test_agg.shape

(924621, 5787)

In [53]:
# Persist the transformed test frame next to the notebook.
test_agg.to_parquet("./test_agg.parquet")