In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from collections import defaultdict
from itertools import repeat, combinations
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap, plot_target_check,
    get_cols, insert_row_number, plot_train_test_distribution
)
from utils.extraction_helpers import read_file
from utils.feature_group import CATEGORY_COLUMNS
from utils.preprocess_helpers import clip_col

In [3]:
from feature_engineering_helpers import feature_gen_pipeline, clip_all, round_all, convert_all

In [4]:
%load_ext autoreload
%autoreload

In [5]:
# Configure pandarallel: 16 worker processes, no progress bar, and the standard
# pipe-based data transfer instead of the in-memory filesystem.
pandarallel.initialize(
    nb_workers=16,
    progress_bar=False,
    use_memory_fs=False,
)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [6]:
# Wall-clock timestamp (seconds since the epoch) taken before any data is loaded.
# Presumably used later to report total runtime; no use is visible in this section.
START = time.time()

### Read Data

In [7]:
# Load the training labels CSV from the raw-data directory.
# The columns used below are "customer_ID" and "target".
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (458913, 2)


In [8]:
# Mean of the target column; if target is a 0/1 flag this is the positive-class rate.
labels["target"].mean()

0.2589336105100531

In [9]:
%%time
# Load the v2 train_agg pickle from the processed-data directory; %%time reports how long the read takes.
train_agg = read_file(f"../{PROCESSED_DATA_PATH}/v2/train_agg.pkl")

Shape of data: (458913, 4083)
CPU times: user 4.28 s, sys: 17.6 s, total: 21.9 s
Wall time: 46 s


In [10]:
# Load the precomputed split definition from the interim-data directory.
# It is used below as a mapping from split name to the customer IDs in that split.
split_indices = joblib.load(f"../{INTERIM_DATA_PATH}/split_indices.pkl")

In [11]:
# Show which splits are available (the held-out set plus the individual folds).
split_indices.keys()

dict_keys(['held_out_ids', 'fold0', 'fold1', 'fold2', 'fold3', 'fold4'])

In [12]:
# Attach the label column positionally: .values drops the index, so no alignment on customer_ID happens.
# NOTE(review): this assumes train_agg rows are in the same order as train_labels.csv — confirm.
# NOTE(review): the shape printed by this cell equals the shape reported at load time, which suggests
# a "target" column may already exist in the pickle and is being overwritten here — confirm.
train_agg["target"] = labels["target"].values
# Report the in-memory size (in GB) and the shape of the training frame.
print(f"Size: {sys.getsizeof(train_agg) / 1e9} GB, Shape: {train_agg.shape}")

Size: 11.402159586 GB, Shape: (458913, 4083)


In [13]:
# Add (or overwrite) the customer_ID column positionally from the labels frame; it is the key
# used by the split cell below.
# NOTE(review): like the target assignment, this relies on train_agg and labels sharing row order — confirm.
train_agg["customer_ID"] = labels["customer_ID"].values

In [14]:
# Write the held-out rows and one frame per fold to pickles in the current directory.
# A row belongs to a split when its customer_ID is listed under that split's key in split_indices.
customer_ids = train_agg["customer_ID"]

train_held_out = train_agg.loc[customer_ids.isin(split_indices["held_out_ids"])].reset_index(drop=True)
train_held_out.to_pickle("./held_out.pkl")

# Take the fold names from split_indices itself rather than a hard-coded range(5), so a
# different number of folds needs no code change. With keys fold0..fold4 this writes the
# same files as before: ./validation_fold0.pkl ... ./validation_fold4.pkl.
fold_keys = [key for key in split_indices if str(key).startswith("fold")]
for fold_key in fold_keys:
    fold_frame = train_agg.loc[customer_ids.isin(split_indices[fold_key])].reset_index(drop=True)
    fold_frame.to_pickle(f"./validation_{fold_key}.pkl")

In [15]:
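# Uncomment to release train_agg (about 11.4 GB per the size printed above) before loading the test set.
# del train_agg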

### Transform Test

In [None]:
# Load the v2 test_agg pickle; reset_index() moves the existing index into a regular column
# (presumably customer_ID — confirm against the file).
test_agg = read_file(f"../{PROCESSED_DATA_PATH}/v2/test_agg.pkl").reset_index()

In [None]:
# test_agg = clip_all(test_agg)

In [None]:
# test_agg = round_all(test_agg)

In [None]:
# Report the in-memory size (in GB) and the shape of the test frame.
test_size_gb = sys.getsizeof(test_agg) / 1e9
test_shape = test_agg.shape
print(f"Size: {test_size_gb} GB, Shape: {test_shape}")

In [None]:
# Display (rows, columns) of the test frame as the cell output.
test_agg.shape

In [None]:
# Save the test frame next to the notebook. The clip/round steps above are commented out,
# so this is the loaded frame with only its index reset.
test_output_path = "./test_agg.pkl"
test_agg.to_pickle(test_output_path)