In [3]:
import datetime
import gc
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scipy.stats
import seaborn as sns
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
from itertools import repeat, combinations
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [4]:
from utils.common import (
    sigmoid, pad_column_name
)
from utils.constants import *
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, plot_heatmap,
    get_cols, insert_row_number
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, FIRST_FEATURES,
    RANGE_FEATURES, VELOCITY_FEATURES, SPEED_FEATURES
)
from utils.feature_engineering_helpers import (
    feature_gen_pipeline,
    filter_df_for_feature, 
    get_specific_row_df, 
    get_agg_df, get_ma_df
)
from utils.impute_helpers import impute_col

In [5]:
%load_ext autoreload
%autoreload

In [6]:
# Wall-clock start marker (seconds since the epoch); a later cell subtracts it
# from END to report the notebook's total runtime.
START = time.time()

## Feature Engineering

In [10]:
%%time
train = read_file(f"../{INTERIM_TRAIN_PARQUET_PATH}/train_all_variables.parquet")
labels = read_file(f"../{RAW_DATA_PATH}/train_labels.csv")

Shape of data: (5531451, 190)
Shape of data: (458913, 2)
CPU times: user 5.6 s, sys: 5.33 s, total: 10.9 s
Wall time: 6.37 s


In [11]:
%%time
cat_columns = set(train.columns).intersection(set(CATEGORY_COLUMNS))
train.loc[:, CATEGORY_COLUMNS] = train.loc[:, CATEGORY_COLUMNS].astype("category")

CPU times: user 901 ms, sys: 1.52 s, total: 2.42 s
Wall time: 2.42 s


In [12]:
# Size of the training frame as reported by sys.getsizeof, converted from
# bytes to gigabytes (1e9 bytes).
sys.getsizeof(train) / 1e9

5.774837336

In [13]:
# Tally how many columns carry each dtype (run after the category cast in the
# previous cell).
train.dtypes.value_counts()

float32           89
float64           47
int32             39
category           3
int64              2
object             1
datetime64[ns]     1
category           1
category           1
category           1
category           1
category           1
category           1
category           1
category           1
dtype: int64

In [14]:
%%time
# Build the training feature table with the project's feature_gen_pipeline
# helper; its internals are not visible in this notebook.
train_agg = feature_gen_pipeline(train)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA2 for least Recency done
MA3 for Recency 1 done
MA3 for Recency 2 done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 176/176 [00:21<00:00,  8.26it/s]


CPU times: user 53.8 s, sys: 40.4 s, total: 1min 34s
Wall time: 1min 53s


In [15]:
# Attach the label column positionally: .values strips the index, so pandas
# performs no index alignment here.
# NOTE(review): this assumes train_agg rows are in exactly the same order as
# `labels` — confirm against feature_gen_pipeline, or join on the shared key.
train_agg["target"] = labels["target"].values

In [16]:
# Size of the feature table in gigabytes (bytes / 1e9) together with its
# (rows, columns) shape.
sys.getsizeof(train_agg) / 1e9, train_agg.shape

(11.419067786, (458913, 4082))

In [17]:
# Persist the labelled training features as a pickle in the current working
# directory. The path has no placeholders, so a plain string literal is used.
train_agg.to_pickle("./train_agg2.pkl")

In [19]:
# Drop the references to the training frames before the test data is loaded,
# then run an explicit collection pass so objects held only by reference
# cycles are reclaimed as well. The trailing semicolon keeps the collector's
# return value from being echoed as the cell output.
del train, train_agg
gc.collect();

### Test

In [20]:
%%time
test = read_file(f"../{INTERIM_TEST_PARQUET_PATH}/test_all_variables.parquet")

Shape of data: (11363762, 189)
CPU times: user 12 s, sys: 21.3 s, total: 33.2 s
Wall time: 28.3 s


In [21]:
# Size of the test frame as reported by sys.getsizeof, converted from bytes
# to gigabytes (1e9 bytes).
sys.getsizeof(test) / 1e9

16.466091282

In [22]:
# Tally how many columns of the test frame carry each dtype.
test.dtypes.value_counts()

float64           142
int32              44
object              1
datetime64[ns]      1
int64               1
dtype: int64

In [23]:
%%time
# Build the test feature table with the same feature_gen_pipeline helper used
# for the training data.
# NOTE(review): the training frame had CATEGORY_COLUMNS cast to "category"
# before this call, but no such cast is applied to `test` in the visible
# cells — confirm feature_gen_pipeline does not depend on that dtype.
test_agg = feature_gen_pipeline(test)

Done insertion
Average done
Minimum done
Maximum done
Standard Deviation done
Last entry done
First entry done
Second last entry done
MA2 for Recency 1 done
MA2 for Recency 2 done
MA2 for Recency 3 done
MA2 for least Recency done
MA3 for Recency 1 done
MA3 for Recency 2 done


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 176/176 [00:31<00:00,  5.63it/s]


CPU times: user 1min 43s, sys: 3min, total: 4min 44s
Wall time: 8min 18s


In [24]:
# Size of the test feature table in gigabytes (bytes / 1e9) together with its
# (rows, columns) shape.
sys.getsizeof(test_agg) / 1e9, test_agg.shape

(29.275887374, (924621, 4081))

In [25]:
# Persist the test features as a pickle in the current working directory.
# The path has no placeholders, so a plain string literal is used.
test_agg.to_pickle("./test_agg2.pkl")

In [26]:
# Drop the references to the test frames and run an explicit collection pass
# so objects held only by reference cycles are reclaimed as well. The trailing
# semicolon keeps the collector's return value out of the cell output.
del test_agg, test
gc.collect();

In [27]:
# Wall-clock end marker (seconds since the epoch), paired with START.
END = time.time()

In [28]:
# Report the total runtime between the START and END markers, rounded to two
# decimal places.
elapsed_seconds = END - START
print(f"{elapsed_seconds:.2f} seconds elapsed")

917.44 seconds elapsed
