In [17]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from tqdm import tqdm

In [18]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, plot_target_check,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [19]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [20]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Start pandarallel with 16 workers, the progress bar turned off, and the
# memory-file-system transfer disabled.
pandarallel.initialize(
    nb_workers=16,
    progress_bar=False,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [22]:
# Wall-clock timestamp (seconds since the epoch) taken at the start of the run.
# Nothing in the visible cells reads START — presumably it is used for a
# total-runtime report elsewhere; TODO confirm.
START = time.time()

### Read Data

In [7]:
# Training labels from the raw data directory; the "target" column is attached
# to the train feature frame later in the notebook.
labels_path = f"../{RAW_DATA_PATH}/train_labels.csv"
labels = read_file(labels_path)

Shape of data: (458913, 2)


In [8]:
%%time
train = read_file(f"../{INTERIM_DATA_PATH}/v7/train_parquet/train_all_variables.parquet")
test = read_file(f"../{INTERIM_DATA_PATH}/v7/test_parquet/test_all_variables.parquet")

Shape of data: (5531451, 158)
Shape of data: (11363762, 157)
CPU times: user 14.8 s, sys: 22.3 s, total: 37.1 s
Wall time: 34.4 s


### Transform Train Set

In [23]:
%%time
# Run the feature-generation pipeline on the train frame. It returns the
# resulting feature frame together with keep_column, which the next cell uses
# to select the columns to retain.
train_agg, keep_column = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 144/144 [00:41<00:00,  3.46it/s]

CPU times: user 1min 1s, sys: 47.3 s, total: 1min 48s
Wall time: 2min 42s





In [27]:
%%time
# Restrict the train feature frame to the columns listed in keep_column.
train_agg = train_agg.loc[:, keep_column]

In [28]:
# Attach the label column by position (.values discards the labels index).
# NOTE(review): this is only correct if train_agg rows are in the same order as
# the rows of train_labels.csv — confirm, or join on a shared key instead.
train_agg["target"] = labels["target"].values
# Report sys.getsizeof of the frame in GB (bytes / 1e9) alongside its shape.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 10.218093725 GB, Shape: (458913, 3637)


In [None]:
# train_agg = convert_all(train_agg)

In [22]:
# m = MinMaxScaler()

In [31]:
# Columns of train_agg selected by get_cols for the "second_last" and
# "third_last" patterns; these are dropped from both train and test further down.
second_third_last_columns = get_cols(train_agg, ["second_last", "third_last"])

In [36]:
# Columns of train_agg selected by get_cols for the listed MA patterns; these
# are dropped from both train and test further down.
# NOTE(review): "ma3_r2" lacks the leading underscore that the other two
# patterns have — confirm this is intentional given how get_cols matches names.
ma_noob_cols = get_cols(train_agg, ["_ma2_r2", "_ma2_r3", "ma3_r2"])

In [38]:
%%time
train_agg.drop(columns=second_third_last_columns + ma_noob_cols, errors="ignore", inplace=True)

CPU times: user 823 ms, sys: 4.83 s, total: 5.65 s
Wall time: 9.39 s


In [39]:
# Display the (rows, columns) of the train feature frame after the column drop.
train_agg.shape

(458913, 3475)

In [41]:
# Persist the train feature frame as a pickle in the current working directory.
train_agg.to_pickle("./train_agg.pkl")

In [43]:
# Drop the names of both train frames before the test set is transformed.
# After this cell, the train cells above cannot be re-run without reloading.
del train_agg, train

In [45]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return.
gc.collect()

18

### Transform Test Set

In [46]:
%%time
# Run the same feature-generation pipeline on the test frame.
# NOTE(review): this rebinds keep_column to the value returned for test; if the
# pipeline's column selection depends on the data, train and test could end up
# with different feature sets — confirm.
test_agg, keep_column = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for least Recency done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 144/144 [01:20<00:00,  1.78it/s]

CPU times: user 1min 50s, sys: 1min 35s, total: 3min 26s
Wall time: 6min 42s





In [None]:
# Restrict the test feature frame to the columns listed in keep_column
# (the value returned by the pipeline call on test).
test_agg = test_agg.loc[:, keep_column]

In [None]:
%%time
test_agg.drop(columns=second_third_last_columns + ma_noob_cols, errors="ignore", inplace=True)

In [None]:
# Report sys.getsizeof of the test feature frame in GB (bytes / 1e9) and its shape.
print(f"Size: {sys.getsizeof(test_agg) / 1e9} GB, Shape: {test_agg.shape}")

In [None]:
# Persist the test feature frame as a pickle in the current working directory.
test_agg.to_pickle("./test_agg.pkl")

In [None]:
# Preview the first rows of the final test feature frame.
test_agg.head()