In [15]:
import gc
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
pd.options.display.float_format = "{:,.4f}".format
import random
import sys
import time
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from pandarallel import pandarallel
from tqdm import tqdm

In [16]:
from pathlib import Path

# Treat the parent of the current working directory as the project root.
# NOTE(review): this assumes the notebook is launched from a direct
# sub-directory of the project (e.g. notebooks/) - confirm.
rootpath = Path.cwd().parent

# Make project-local packages (the `utils.*` imports that follow) importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_root_entry = os.fspath(rootpath)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

In [17]:
from utils.constants import *
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, plot_target_check, 
    plot_int_feature_distribution, plot_train_test_distribution, check_overlap_missing,
    insert_row_number, plot_sampled_time_series
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, BINARY_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, NON_FEATURE_COLUMNS
)
from utils.preprocess_helpers import *

In [18]:
# Load IPython's autoreload extension (re-running this cell only prints the
# "already loaded" notice).
%load_ext autoreload
# NOTE(review): with no argument, `%autoreload` performs a one-off reload of
# imported modules right now; if continuous reloading of the `utils` helpers
# before every cell is wanted, `%autoreload 2` is the usual setting - confirm.
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
# Run a full garbage-collection pass before loading the large frames; the value
# displayed is gc.collect()'s return (number of unreachable objects found).
gc.collect()

108

In [20]:
# Wall-clock timestamp (seconds since the epoch) taken before the heavy work.
# Presumably used later to report total runtime; it is not referenced in the
# cells visible here - confirm.
START = time.time()

### Combine Train Data

In [21]:
%%time
train_sp = read_file(f"{INTERIM_TRAIN_PARQUET_PATH}/spend_payment.parquet", replace_negative127=False)
train_r = read_file(f"{INTERIM_TRAIN_PARQUET_PATH}/risk.parquet", replace_negative127=False)
train_b = read_file(f"{INTERIM_TRAIN_PARQUET_PATH}/balance.parquet", replace_negative127=False)
train_d1 = read_file(f"{INTERIM_TRAIN_PARQUET_PATH}/delinquency_part1.parquet", replace_negative127=False)
train_d2 = read_file(f"{INTERIM_TRAIN_PARQUET_PATH}/delinquency_part2.parquet", replace_negative127=False)

Shape of data: (5531451, 27)
Shape of data: (5531451, 31)
Shape of data: (5531451, 42)
Shape of data: (5531451, 55)
Shape of data: (5531451, 47)
CPU times: user 4.71 s, sys: 2.44 s, total: 7.15 s
Wall time: 2.94 s


In [22]:
%%time
train = pd.concat([
    train_sp.set_index(["customer_ID", "S_2", "target"]),
    train_r.set_index(["customer_ID", "S_2", "target"]),
    train_b.set_index(["customer_ID", "S_2", "target"]),
    train_d1.set_index(["customer_ID", "S_2", "target"]),
    train_d2.set_index(["customer_ID", "S_2", "target"])
], axis=1).reset_index()

CPU times: user 7.56 s, sys: 6.52 s, total: 14.1 s
Wall time: 17.9 s


In [23]:
# Display (rows, columns) of the combined train frame as a quick sanity check
# on the concat.
train.shape

(5531451, 190)

In [24]:
# Swap every -127 value in the combined train frame for NaN.
# NOTE(review): -127 looks like a missing-value sentinel left in place by the
# replace_negative127=False reads - confirm.
train = train.replace(to_replace=-127, value=np.nan)

In [35]:
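# Persisting the combined train frame is currently disabled; uncomment the line
# below to write it to all_variables.parquet.
# train.to_parquet(f"{INTERIM_TRAIN_PARQUET_PATH}/all_variables.parquet")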

In [36]:
# train.head()

In [37]:
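# Releasing the train frames is currently disabled; uncomment the line below to
# drop them once they are no longer needed.
# del train, train_sp, train_r, train_b, train_d1, train_d2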

### Combine Test Data

In [28]:
%%time
test_sp = read_file(f"{INTERIM_TEST_PARQUET_PATH}/spend_payment.parquet", replace_negative127=False)
test_r = read_file(f"{INTERIM_TEST_PARQUET_PATH}/risk.parquet", replace_negative127=False)
test_b = read_file(f"{INTERIM_TEST_PARQUET_PATH}/balance.parquet", replace_negative127=False)
test_d1 = read_file(f"{INTERIM_TEST_PARQUET_PATH}/delinquency_part1.parquet", replace_negative127=False)
test_d2 = read_file(f"{INTERIM_TEST_PARQUET_PATH}/delinquency_part2.parquet", replace_negative127=False)

Shape of data: (11363762, 26)
Shape of data: (11363762, 30)
Shape of data: (11363762, 41)
Shape of data: (11363762, 54)
Shape of data: (11363762, 46)
CPU times: user 10.4 s, sys: 10.1 s, total: 20.5 s
Wall time: 10.8 s


In [29]:
%%time
test = pd.concat([
    test_sp.set_index(["customer_ID", "S_2"]),
    test_r.set_index(["customer_ID", "S_2"]),
    test_b.set_index(["customer_ID", "S_2"]),
    test_d1.set_index(["customer_ID", "S_2"]),
    test_d2.set_index(["customer_ID", "S_2"])
], axis=1).reset_index()

CPU times: user 15.8 s, sys: 21.1 s, total: 36.9 s
Wall time: 51.1 s


In [30]:
# Display (rows, columns) of the combined test frame as a quick sanity check
# on the concat.
test.shape

(11363762, 189)

In [31]:
# Swap every -127 value in the combined test frame for NaN.
# NOTE(review): -127 looks like a missing-value sentinel left in place by the
# replace_negative127=False reads - confirm.
test = test.replace(to_replace=-127, value=np.nan)

In [38]:
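# Persisting the combined test frame is currently disabled; uncomment the line
# below to write it to all_variables.parquet.
# test.to_parquet(f"{INTERIM_TEST_PARQUET_PATH}/all_variables.parquet")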

In [39]:
# test.head()

In [40]:
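# Releasing the test frames is currently disabled; uncomment the line below to
# drop them once they are no longer needed.
# del test, test_sp, test_r, test_b, test_d1, test_d2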