In [1]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
# Put the parent directory on the import path; presumably this is what makes the
# project-local `utils` package (imported in the next cell) resolvable — confirm layout.
sys.path.append("../")
import time
import warnings
# NOTE(review): this silences *every* warning for the whole kernel session, including
# pandas FutureWarning/DeprecationWarning messages about changed assignment semantics.
# Consider narrowing it to specific categories or modules.
warnings.simplefilter("ignore")
from itertools import repeat
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import (
    RAW_DATA_PATH, PROCESSED_DATA_PATH
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols, insert_row_number
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    feature_gen_pipeline,
    filter_df_for_feature, 
    get_specific_row_df, 
    get_agg_df, get_ma_df
)
from utils.impute_helpers import impute_col

In [3]:
# Project directory layout, expressed relative to the notebook's working directory.
# NOTE(review): RAW_DATA_PATH and PROCESSED_DATA_PATH are also imported from
# utils.constants in the import cell above; the assignments below rebind those names for
# the remainder of the notebook. Confirm both definitions agree, or keep only one source.
RAW_DATA_PATH = "../raw_data"
PROCESSED_DATA_PATH = "../processed_data"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"
EXP_PATH = "../experiments"

In [4]:
# Sub-directories of RAW_DATA_PATH holding the raw train/test data in pickle and parquet form.
# The names on the left are bound, in order, to the joined path of each sub-directory listed.
(
    RAW_TRAIN_PICKLE_PATH,
    RAW_TEST_PICKLE_PATH,
    RAW_TRAIN_PARQUET_PATH,
    RAW_TEST_PARQUET_PATH,
) = (
    os.path.join(RAW_DATA_PATH, subdir)
    for subdir in ("train_pickle", "test_pickle", "train_parquet", "test_parquet")
)

In [5]:
%load_ext autoreload
%autoreload

In [6]:
# Wall-clock timestamp (seconds since the epoch) taken before the feature-engineering work;
# subtracted from END at the bottom of the notebook to report total elapsed time.
START = time.time()

## Feature Engineering

### Read master data & train labels

In [7]:
%%time
# Load the raw training table (parquet) and the training labels (CSV) through the
# project's read_file helper. `%%time` must stay the first line of the cell.
raw_train = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train_psrbd.parquet")
labels = read_file(f"{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (5531451, 193)
Shape of data: (458913, 2)
CPU times: user 6.15 s, sys: 7.03 s, total: 13.2 s
Wall time: 8.44 s


In [8]:
# Cast the categorical feature columns to pandas' "category" dtype.
# Whole-column replacement (`df[cols] = ...`) is deliberate: on pandas >= 2.0,
# `df.loc[:, cols] = ...` writes the new values into the existing arrays and keeps their
# original dtype, so the cast would be silently discarded. The FutureWarning that
# announced this change is hidden by the global warnings filter in the import cell.
category_cols = list(CATEGORY_COLUMNS)
raw_train[category_cols] = raw_train[category_cols].astype("category")

In [9]:
%%time
# Run the project's feature-generation pipeline on the raw training rows. The progress
# messages it prints (averages, min/max, last/first entries, MA3 windows, ...) appear below.
# NOTE(review): the result has one row per entry in `labels` (458913) — presumably one
# row per customer; confirm against feature_gen_pipeline.
train_agg = feature_gen_pipeline(raw_train)

Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:18<00:00,  9.50it/s]


CPU times: user 1min 8s, sys: 49 s, total: 1min 57s
Wall time: 2min 26s


In [10]:
# Attach the label column. `.values` drops the index of `labels`, so this is a purely
# positional assignment: row i of train_agg receives label i.
# NOTE(review): assumes train_agg rows come out of the pipeline in the same order as the
# rows of train_labels.csv — TODO confirm (e.g. by comparing the customer identifiers).
train_agg["target"] = labels["target"].values

In [11]:
# Size of the aggregated training frame as reported by sys.getsizeof (bytes), scaled by 1e9
# to gigabytes, shown together with its (rows, columns) shape.
sys.getsizeof(train_agg) / 1e9, train_agg.shape

(11.985440005, (458913, 4294))

In [12]:
# Persist the aggregated training features (with the target column attached).
# Create the output directory first so that a missing folder cannot make the save fail
# after the expensive aggregation has already been computed.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
train_agg.to_pickle(f"{PROCESSED_DATA_PATH}/train_agg.pkl")

In [13]:
# Drop the training frames before the larger test set is loaded, then run a collection
# pass so that objects kept alive only by reference cycles are reclaimed as well.
# The trailing semicolon suppresses the integer that gc.collect() returns.
del raw_train, train_agg
gc.collect();

In [14]:
%%time
raw_test = read_file(f"{RAW_TEST_PARQUET_PATH}/test_psrbd.parquet")
raw_test.loc[:, CATEGORY_COLUMNS] = raw_test.loc[:, CATEGORY_COLUMNS].astype("category")
test_agg = feature_gen_pipeline(raw_test)

Shape of data: (11363762, 192)
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
Third last entry done
MA3 for Recency 1 done
MA3 for Recency 2 done
MA3 for Recency 3 done
MA3 for Recency 4 done


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:21<00:00,  8.31it/s]


In [15]:
# Size of the aggregated test frame as reported by sys.getsizeof (bytes), scaled by 1e9
# to gigabytes, shown together with its (rows, columns) shape.
sys.getsizeof(test_agg) / 1e9, test_agg.shape

(24.076214571, (924621, 4293))

In [16]:
# Persist the aggregated test features.
# Create the output directory first so that a missing folder cannot make the save fail
# after the expensive aggregation has already been computed.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
test_agg.to_pickle(f"{PROCESSED_DATA_PATH}/test_agg.pkl")

In [17]:
# Wall-clock timestamp taken after the last save; paired with START from the top of the notebook.
END = time.time()

In [18]:
# Report the total notebook runtime (START -> END) with two decimal places.
elapsed_seconds = END - START
print(f"{elapsed_seconds:.2f} seconds elapsed")

624.29 seconds elapsed
