In [11]:
import gc
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
pd.options.display.float_format = "{:,.4f}".format
import random
import sys
sys.path.append("../../")
import time
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from pandarallel import pandarallel
from tqdm import tqdm

In [12]:
from pathlib import Path
# Append the parent of the current working directory to the import path
# (presumably so the project-local `utils` package resolves -- confirm layout).
rootpath = Path.cwd().parent
sys.path.append(os.fspath(rootpath))

In [13]:
from utils.constants import *
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, plot_target_check, 
    plot_int_feature_distribution, plot_train_test_distribution, check_overlap_missing,
    insert_row_number, plot_sampled_time_series
)
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, NON_FEATURE_COLUMNS
)
from utils.preprocess_helpers import *

In [14]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Force a garbage-collection pass before the large tables are loaded; the cell
# displays the number of unreachable objects the collector found.
gc.collect()

552

In [16]:
# Wall-clock timestamp (seconds since the epoch) taken before the data work begins;
# presumably used later to report total runtime -- not visible in this section.
START = time.time()

In [17]:
# Show the identifier/label columns that the cells below use as the shared join index.
NON_FEATURE_COLUMNS

['customer_ID', 'target', 'row_number', 'row_number_inv', 'S_2']

### Combine Train Data

In [18]:
%%time
train_sp = read_file(f"./train_parquet/train_spend_payment.parquet")
train_r = read_file(f"./train_parquet/train_risk.parquet")
train_b = read_file(f"./train_parquet/train_balance.parquet")
train_d1 = read_file(f"./train_parquet/train_delinquency_part1.parquet")
train_d2 = read_file(f"./train_parquet/train_delinquency_part2.parquet")

Shape of data: (5531451, 24)
Shape of data: (5531451, 33)
Shape of data: (5531451, 46)
Shape of data: (5531451, 55)
Shape of data: (5531451, 48)
CPU times: user 7.75 s, sys: 7.52 s, total: 15.3 s
Wall time: 13.2 s


In [19]:
%%time
train = pd.concat([
    train_sp.set_index(NON_FEATURE_COLUMNS),
    train_r.set_index(NON_FEATURE_COLUMNS),
    train_b.set_index(NON_FEATURE_COLUMNS),
    train_d1.set_index(NON_FEATURE_COLUMNS),
    train_d2.set_index(NON_FEATURE_COLUMNS)
], axis=1).reset_index()

CPU times: user 10.4 s, sys: 14 s, total: 24.4 s
Wall time: 43.6 s


In [20]:
# Sanity check: the row count should match the input tables (the join must not add or drop rows).
train.shape

(5531451, 186)

In [21]:
# Inspect the combined column set: non-feature columns first, then the feature columns.
train.columns

Index(['customer_ID', 'target', 'row_number', 'row_number_inv', 'S_2', 'P_2',
       'S_3', 'P_3', 'S_5', 'S_6',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=186)

In [22]:
%%time
train.to_parquet(f"./train_parquet/train_all_variables_.parquet")

CPU times: user 15.9 s, sys: 3.68 s, total: 19.6 s
Wall time: 19.4 s


In [23]:
# Preview the first rows of the combined training table.
train.head()

Unnamed: 0,customer_ID,target,row_number,row_number_inv,S_2,P_2,S_3,P_3,S_5,S_6,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,13,1,2017-03-09,0.93,0.12,0.73,0.02,0,...,,,,0.0,0.0,,,0.0,,0.0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,12,2,2017-04-07,0.93,0.12,0.72,0.03,0,...,,,,0.0,0.0,,,0.0,,0.0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,11,3,2017-05-28,0.95,0.12,0.73,0.04,0,...,,,,0.0,0.0,,,0.0,,0.0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,10,4,2017-06-13,0.96,0.11,0.74,0.03,0,...,,,,0.0,0.0,,,0.0,,0.0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,9,5,2017-07-16,0.94,0.11,0.69,0.05,0,...,,,,0.0,0.0,,,0.0,,0.0


In [24]:
# Drop the combined table and its five source tables so their memory can be
# reclaimed before the (larger) test data is loaded.
del train
del train_sp, train_r, train_b, train_d1, train_d2

### Combine Test Data

In [25]:
%%time
test_sp = read_file(f"./test_parquet/test_spend_payment.parquet")
test_r = read_file(f"./test_parquet/test_risk.parquet")
test_b = read_file(f"./test_parquet/test_balance.parquet")
test_d1 = read_file(f"./test_parquet/test_delinquency_part1.parquet")
test_d2 = read_file(f"./test_parquet/test_delinquency_part2.parquet")

Shape of data: (11363762, 23)
Shape of data: (11363762, 32)
Shape of data: (11363762, 45)
Shape of data: (11363762, 54)
Shape of data: (11363762, 47)
CPU times: user 15 s, sys: 18.3 s, total: 33.2 s
Wall time: 32.9 s


In [26]:
# Join keys for the test tables: every non-feature column except the label,
# which the test data does not carry.
# Compare by equality: `c not in "target"` is a *substring* test on the string
# "target" and would also drop any column whose name is a substring of it.
cols = [c for c in NON_FEATURE_COLUMNS if c != "target"]

In [27]:
# Project helper from utils.eda_helpers, applied to the risk table only; it prints
# "Done insertion" on completion.
# NOTE(review): presumably adds the row_number / row_number_inv columns in place --
# confirm, and confirm why the other four test tables do not need the same call.
insert_row_number(test_r)

Done insertion


In [28]:
%%time
test = pd.concat([
    test_sp.set_index(cols),
    test_r.set_index(cols),
    test_b.set_index(cols),
    test_d1.set_index(cols),
    test_d2.set_index(cols)
], axis=1).reset_index()

CPU times: user 20.3 s, sys: 28.1 s, total: 48.4 s
Wall time: 1min 32s


In [29]:
# Sanity check: the row count should match the input tables (the join must not add or drop rows).
test.shape

(11363762, 185)

In [30]:
# Distinct customer IDs in order of first appearance; the cell displays how many there are.
cid_list = pd.unique(test["customer_ID"]).tolist()
len(cid_list)

924621

In [31]:
# Midpoint of the customer list, rounded up so the first half takes the extra
# customer when the count is odd. Derived from the data instead of hard-coded,
# so it stays correct if the test set changes.
split_index = (len(cid_list) + 1) // 2

In [32]:
# Write the combined test table as two parquet files split by customer, so all
# rows of one customer land in the same file.
# The membership mask over the full table is computed once. Because cid_list is
# the list of distinct customer_IDs in `test`, the complement of the first-half
# mask is exactly the second half, and every row is written to exactly one file.
in_first_half = test["customer_ID"].isin(cid_list[:split_index])
test.loc[in_first_half].to_parquet("./test_parquet/test_all_variables1_.parquet")
test.loc[~in_first_half].to_parquet("./test_parquet/test_all_variables2_.parquet")

In [33]:
# Preview the first rows of the combined test table.
test.head()

Unnamed: 0,customer_ID,row_number,row_number_inv,S_2,P_2,S_3,P_3,S_5,S_6,S_7,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,9,1,2019-02-19,0.63,0.16,,0.0,1,0.1,...,,,,,0.0,,,,,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,8,2,2019-03-25,0.58,0.24,,0.0,1,0.16,...,,,,0.0,0.0,,,0.0,,0.0
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,7,3,2019-04-25,0.6,0.26,,0.0,0,0.21,...,,,,0.0,0.0,,,0.0,,0.0
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,6,4,2019-05-20,0.61,0.18,,0.0,0,0.19,...,,,,0.0,0.0,,,0.0,,0.0
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,5,5,2019-06-15,0.59,0.18,0.59,0.0,0,0.2,...,,,,0.0,0.0,,,0.0,,0.0


In [34]:
# Drop the combined test table and its five source tables so their memory can be reclaimed.
del test
del test_sp, test_r, test_b, test_d1, test_d2