### Split raw [train](https://www.kaggle.com/competitions/amex-default-prediction/data?select=train_data.csv) & [test](https://www.kaggle.com/competitions/amex-default-prediction/data?select=test_data.csv) data into 4 different parquet files
- Spend & Payment
- Risk
- Balance
- Delinquency
#### All the parquet files will contain the common columns **customer_ID** and **S_2** (statement_date)

In [1]:
import dask.dataframe as dd
import gc
import os
import pandas as pd
pd.options.display.float_format = "{:,.4f}".format
import sys
import time

In [2]:
from pathlib import Path
# Make the project root (the parent of the notebook's working directory)
# importable so the `utils` package resolves.
rootpath = Path.cwd().parent
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries; sys.path entries are plain strings, hence str().
if str(rootpath) not in sys.path:
    sys.path.append(str(rootpath))

In [3]:
from utils.constants import *
from utils.eda_helpers import get_cols
from utils.extraction_helpers import read_file

In [4]:
%load_ext autoreload
%autoreload

In [5]:
# Wall-clock timestamp taken before any data is processed; compared with END
# at the bottom of the notebook to report the total runtime.
START = time.time()

### Split Train data into 4 different parquet files

In [6]:
%%time
# Open the raw train CSV as a Dask dataframe. This call returns in
# milliseconds (see the timing below); the file is actually parsed by each
# later `.compute()` call.
train = dd.read_csv(f"{RAW_DATA_PATH}/train_raw/train_data.csv")

CPU times: user 10.7 ms, sys: 1.76 ms, total: 12.4 ms
Wall time: 14.6 ms


In [7]:
%%time
(
    train.loc[:, ["customer_ID"] + get_cols(train, ["S_", "P_"])].compute()
    .to_parquet(f"{RAW_DATA_PATH}/train_raw/spend_payment_only.parquet")
)

CPU times: user 1min 41s, sys: 22.7 s, total: 2min 4s
Wall time: 29.3 s


In [8]:
%%time
(
    train.loc[:, ["customer_ID"] + get_cols(train, "R_")].compute()
    .to_parquet(f"{RAW_DATA_PATH}/train_raw/risk_only.parquet")
)

CPU times: user 1min 41s, sys: 24.2 s, total: 2min 6s
Wall time: 30.6 s


In [9]:
# Run a full garbage-collection pass between the large compute()/to_parquet
# cells; the displayed integer is gc.collect()'s return value.
gc.collect()

18

In [10]:
%%time
(
    train.loc[:, ["customer_ID"] + get_cols(train, "B_")].compute()
    .to_parquet(f"{RAW_DATA_PATH}/train_raw/balance_only.parquet")
)

CPU times: user 1min 45s, sys: 28.6 s, total: 2min 14s
Wall time: 37.1 s


In [11]:
%%time
(
    train.loc[:, ["customer_ID"] + get_cols(train, "D_")].compute()
    .to_parquet(f"{RAW_DATA_PATH}/train_raw/delinquency_only.parquet")
)

CPU times: user 1min 47s, sys: 31.2 s, total: 2min 18s
Wall time: 43.6 s


In [12]:
# Garbage-collection pass after the last train split has been written;
# presumably meant to free memory before the test data is processed.
gc.collect()

18

In [13]:
# All four train splits are written; drop the name bound to the train Dask dataframe.
del train

### Split Test data into 4 different parquet files

In [14]:
%%time
# Open the raw test CSV as a Dask dataframe. This call returns in
# milliseconds (see the timing below); the file is actually parsed by each
# later `.compute()` call.
test = dd.read_csv(f"{RAW_DATA_PATH}/test_raw/test_data.csv")

CPU times: user 13.9 ms, sys: 2.44 ms, total: 16.3 ms
Wall time: 18.9 ms


In [15]:
%%time
(
    test.loc[:, ["customer_ID"] + get_cols(test, ["S_", "P_"])].compute()
    .to_parquet(f"{RAW_DATA_PATH}/test_raw/spend_payment_only.parquet")
)

CPU times: user 3min 28s, sys: 48.2 s, total: 4min 17s
Wall time: 1min 1s


In [16]:
%%time
# Test split 2/4: risk ("R_") columns.
# Every split is meant to carry both common key columns, customer_ID and S_2
# (statement date). Selecting only the "R_" group would drop S_2, so it is
# added explicitly; the filter keeps the column list free of duplicates no
# matter what get_cols returns.
risk_key_cols = ["customer_ID", "S_2"]
risk_cols = risk_key_cols + [
    col for col in get_cols(test, "R_") if col not in risk_key_cols
]
test.loc[:, risk_cols].compute().to_parquet(
    f"{RAW_DATA_PATH}/test_raw/risk_only.parquet"
)

CPU times: user 3min 30s, sys: 53.3 s, total: 4min 23s
Wall time: 1min 7s


In [17]:
# Run a full garbage-collection pass between the large compute()/to_parquet
# cells; the displayed integer is gc.collect()'s return value.
gc.collect()

18

In [18]:
%%time
# Test split 3/4: balance ("B_") columns.
# Every split is meant to carry both common key columns, customer_ID and S_2
# (statement date). Selecting only the "B_" group would drop S_2, so it is
# added explicitly; the filter keeps the column list free of duplicates no
# matter what get_cols returns.
balance_key_cols = ["customer_ID", "S_2"]
balance_cols = balance_key_cols + [
    col for col in get_cols(test, "B_") if col not in balance_key_cols
]
test.loc[:, balance_cols].compute().to_parquet(
    f"{RAW_DATA_PATH}/test_raw/balance_only.parquet"
)

CPU times: user 3min 31s, sys: 54.1 s, total: 4min 25s
Wall time: 1min 8s


In [19]:
%%time
# Test split 4/4: delinquency ("D_") columns.
# Every split is meant to carry both common key columns, customer_ID and S_2
# (statement date). Selecting only the "D_" group would drop S_2, so it is
# added explicitly; the filter keeps the column list free of duplicates no
# matter what get_cols returns.
delinquency_key_cols = ["customer_ID", "S_2"]
delinquency_cols = delinquency_key_cols + [
    col for col in get_cols(test, "D_") if col not in delinquency_key_cols
]
test.loc[:, delinquency_cols].compute().to_parquet(
    f"{RAW_DATA_PATH}/test_raw/delinquency_only.parquet"
)

CPU times: user 3min 42s, sys: 1min 11s, total: 4min 53s
Wall time: 1min 58s


In [20]:
# Garbage-collection pass after the last test split has been written.
gc.collect()

18

In [21]:
# All four test splits are written; drop the name bound to the test Dask dataframe.
del test

In [22]:
# Wall-clock timestamp taken after both train and test have been split;
# paired with START to compute the total runtime.
END = time.time()

In [23]:
# Report the total wall-clock time between START and END, to two decimals.
print(f"{END - START:.2f} seconds elapsed")

457.17 seconds elapsed
