### Reduce the data size by storing columns in data types that use fewer bytes
* float64
* float32
* int32
* int16
* int8
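#### TODO: `S_2` is missing from the other three DataFrames (risk, balance, delinquency); it is copied over from the spend & payment DataFrame before saving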

In [9]:
import gc
import numpy as np
import os
import pandas as pd
pd.options.display.float_format = "{:,.4f}".format
import sys
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [2]:
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably the repository root that holds the `utils` package - confirm).
rootpath = Path.cwd().parent
# Guard against stacking duplicate entries when the cell is re-run.
if str(rootpath) not in sys.path:
    sys.path.append(str(rootpath))

In [3]:
from utils.constants import *
from utils.eda_helpers import get_cols
from utils.extraction_helpers import read_file

In [4]:
%load_ext autoreload
%autoreload

In [5]:
def check_unique_value_counts(df, thr=2, skip_col=2):
    """Return the columns in which no single value occurs more than ``thr`` times.

    The first ``skip_col`` columns are ignored. Every remaining column whose
    most frequent value occurs more than ``thr`` times is printed together
    with that count and left out of the returned list.
    """
    low_repeat_cols = []
    for name in tqdm(df.columns[skip_col:]):
        top_count = df[name].value_counts().max()
        if top_count > thr:
            # Heavily repeated value: report it and leave the column out.
            print(name, top_count)
            continue
        low_repeat_cols.append(name)
    return low_repeat_cols

In [6]:
def round_four_decimals(df, columns):
    """Round the given ``columns`` of ``df`` to 4 decimal places.

    The values are written back into ``df`` through ``.loc`` and the same
    DataFrame object is returned.
    """
    rounded = df.loc[:, columns].round(4)
    df.loc[:, columns] = rounded
    print("Rounded all columns to 4 decimal places")
    return df

In [7]:
def reduce_data_size_to_32(df, skip_col=1, thr=1e-05, type_="train", label_encoders=None):
    """Check columns for a float32 cast and label-encode the ones that cannot be cast.

    Every column after the first ``skip_col`` is processed, except ``S_2``
    which is always skipped. For each column:

    * The element-wise difference between the column and its float32 cast is
      computed. If ``abs(min) + abs(max)`` of that difference exceeds ``thr``,
      a message is printed and the column is left unchanged.
    * Otherwise, if the column dtype is not float64, a message is printed and
      the column is cast to float32 and rounded to 3 decimals.
    * Otherwise (float64 within the threshold) the column is left unchanged.
    * If the cast or the subtraction raises ``TypeError``/``ValueError``
      (e.g. string values), the column is label-encoded instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is modified in place.
    skip_col : int
        Number of leading columns to ignore.
    thr : float
        Maximum tolerated ``abs(min) + abs(max)`` of the float32 cast error.
    type_ : str
        ``"train"`` fits a new ``LabelEncoder`` per non-castable column and
        stores it; any other value reuses the stored encoder.
    label_encoders : dict, optional
        Mapping of column name to fitted encoder. Defaults to the notebook-level
        ``label_enc_dict`` so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    KeyError
        In non-train mode, when no encoder is stored for a non-castable column.
    """
    for col in df.columns[skip_col:]:
        if col == "S_2":
            continue
        # The column is only read here, so no defensive copy is needed.
        values = df[col]
        try:
            diff_series = values - values.astype(np.float32)
            if abs(diff_series.min()) + abs(diff_series.max()) > thr:
                print(f"Failed to convert column {col}, difference exceeded threshold {thr}")
            elif values.dtype != np.float64:
                # NOTE(review): the message says "Stop converting" but the column IS
                # converted here, while float64 columns that pass the threshold are
                # never downcast. Behaviour is kept as found - confirm the intent.
                print(f"Stop converting column {col}, its original data format is not np.float64 / float64")
                df[col] = values.astype(np.float32).round(3)
        except (TypeError, ValueError):
            # Only cast/arithmetic failures fall back to label encoding; a bare
            # `except` here would also swallow KeyboardInterrupt and SystemExit.
            # The encoder store is resolved lazily so numeric-only frames never
            # need the notebook-level dict to exist.
            encoders = label_enc_dict if label_encoders is None else label_encoders
            if type_ == "train":
                label_enc = LabelEncoder()
                label_enc.fit(df[col])
                df[col] = label_enc.transform(df[col])
                encoders[col] = label_enc
                print(f"Column {col} is in string, use label encoding instead")
            else:
                if col not in encoders:
                    raise KeyError(
                        f"No label encoder stored for column {col}; "
                        f"process the train data with type_='train' first"
                    )
                df[col] = encoders[col].transform(df[col])
    return df

In [8]:
# Column name -> fitted encoder; filled by reduce_data_size_to_32 in train mode
# and read back in test mode.
label_enc_dict = dict()

### For Train

#### Spend & Payment

In [68]:
%%time
train_sp = read_file(f"{RAW_DATA_PATH}/train_raw/spend_payment_only.parquet")
train_sp = train_sp.reset_index(drop=True)

Shape of data: (5531451, 26)
CPU times: user 1.44 s, sys: 1.79 s, total: 3.23 s
Wall time: 2.85 s


In [69]:
# Only columns (past the first two) in which no value occurs more than twice
# are rounded; columns with repeated values are printed by the check and skipped.
safe_columns = check_unique_value_counts(train_sp)
train_sp = round_four_decimals(train_sp, safe_columns)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:11<00:00,  2.01it/s]


Rounded all columns to 4 decimal places


In [70]:
# Train mode (default): columns on which the float32 cast raises are label-encoded
# and the fitted encoder is stored in label_enc_dict. S_2 is skipped.
train_sp = reduce_data_size_to_32(train_sp)

Failed to convert column S_16, difference exceeded threshold 1e-05
Failed to convert column S_23, difference exceeded threshold 1e-05


#### Risk

In [71]:
%%time
train_r = read_file(f"{RAW_DATA_PATH}/train_raw/risk_only.parquet")
train_r = train_r.reset_index(drop=True)

Shape of data: (5531451, 29)
CPU times: user 1.1 s, sys: 1.87 s, total: 2.96 s
Wall time: 2.2 s


In [72]:
# Round only the risk columns in which no value occurs more than twice
# (the first two columns are never inspected).
safe_columns = check_unique_value_counts(train_r)
train_r = round_four_decimals(train_r, safe_columns)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:13<00:00,  1.97it/s]


Rounded all columns to 4 decimal places


In [73]:
# Train mode: prints each column whose float32 cast error exceeds the threshold.
train_r = reduce_data_size_to_32(train_r)

Failed to convert column R_7, difference exceeded threshold 1e-05
Failed to convert column R_14, difference exceeded threshold 1e-05


#### Balance

In [74]:
%%time
train_b = read_file(f"{RAW_DATA_PATH}/train_raw/balance_only.parquet")
train_b = train_b.reset_index(drop=True)

Shape of data: (5531451, 41)
CPU times: user 1.43 s, sys: 3.09 s, total: 4.52 s
Wall time: 3.37 s


In [75]:
# Round only the balance columns in which no value occurs more than twice;
# the check prints the columns it excludes together with their top count.
safe_columns = check_unique_value_counts(train_b)
train_b = round_four_decimals(train_b, safe_columns)

 69%|████████████████████████████████████████████████████████████████████████████████████████████▊                                         | 27/39 [00:14<00:06,  1.85it/s]

B_30 4710663
B_31 5514544


 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 34/39 [00:16<00:02,  2.35it/s]

B_38 1953232


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:17<00:00,  2.21it/s]


Rounded all columns to 4 decimal places


In [76]:
# Train mode: non-float64 columns that pass the threshold check are cast to
# float32 (and rounded to 3 decimals) by the function.
train_b = reduce_data_size_to_32(train_b)

Failed to convert column B_5, difference exceeded threshold 1e-05
Failed to convert column B_6, difference exceeded threshold 1e-05
Failed to convert column B_10, difference exceeded threshold 1e-05
Failed to convert column B_12, difference exceeded threshold 1e-05
Failed to convert column B_13, difference exceeded threshold 1e-05
Failed to convert column B_21, difference exceeded threshold 1e-05
Failed to convert column B_26, difference exceeded threshold 1e-05
Stop converting column B_31, its original data format is not np.float64 / float64
Failed to convert column B_40, difference exceeded threshold 1e-05


#### Delinquency

In [77]:
%%time
train_d = read_file(f"{RAW_DATA_PATH}/train_raw/delinquency_only.parquet")
train_d = train_d.reset_index(drop=True)

Shape of data: (5531451, 97)
CPU times: user 3.66 s, sys: 9.22 s, total: 12.9 s
Wall time: 11.2 s


In [78]:
# Round only the delinquency columns in which no value occurs more than twice;
# the check prints the columns it excludes together with their top count.
safe_columns = check_unique_value_counts(train_d)
train_d = round_four_decimals(train_d, safe_columns)

 24%|████████████████████████████████▍                                                                                                     | 23/95 [00:08<00:20,  3.54it/s]

D_63 4119621
D_64 2913244


 25%|█████████████████████████████████▊                                                                                                    | 24/95 [00:09<00:25,  2.75it/s]

D_66 617066
D_68 2782455


 45%|████████████████████████████████████████████████████████████▋                                                                         | 43/95 [00:17<00:24,  2.10it/s]

D_87 3865


 68%|███████████████████████████████████████████████████████████████████████████████████████████▋                                          | 65/95 [00:25<00:09,  3.29it/s]

D_114 3316478
D_116 5348109
D_117 1456084


 75%|████████████████████████████████████████████████████████████████████████████████████████████████████▏                                 | 71/95 [00:26<00:07,  3.39it/s]

D_120 4729723


 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 77/95 [00:29<00:06,  2.60it/s]

D_126 4262414


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:35<00:00,  2.66it/s]


Rounded all columns to 4 decimal places


In [79]:
# Train mode: columns that cannot be cast to float32 are label-encoded here and
# their encoders kept in label_enc_dict for the test pass.
train_d = reduce_data_size_to_32(train_d)

Failed to convert column D_50, difference exceeded threshold 1e-05
Failed to convert column D_61, difference exceeded threshold 1e-05
Column D_63 is in string, use label encoding instead
Column D_64 is in string, use label encoding instead
Failed to convert column D_65, difference exceeded threshold 1e-05
Failed to convert column D_69, difference exceeded threshold 1e-05
Failed to convert column D_106, difference exceeded threshold 1e-05


In [81]:
# Copy S_2 from the spend/payment frame into each other frame as its second
# column. Each frame is checked on its own: testing only train_r would make
# DataFrame.insert raise "already exists" if another frame already has S_2.
# Rows are matched by index, which every frame had reset after loading.
for frame in (train_r, train_b, train_d):
    if "S_2" not in frame.columns:
        frame.insert(1, "S_2", train_sp["S_2"])
# Drop the loop variable so it does not keep the last frame alive after `del`.
del frame

In [82]:
# (rows, columns) of the four train frames: spend/payment, risk, balance, delinquency.
tuple(frame.shape for frame in (train_sp, train_r, train_b, train_d))

((5531451, 26), (5531451, 30), (5531451, 42), (5531451, 98))

In [83]:
%%time
# Write the four processed train frames as parquet files under
# RAW_TRAIN_PARQUET_PATH (not defined in this notebook; presumably provided by
# the star import from utils.constants - confirm).
train_sp.to_parquet(f"{RAW_TRAIN_PARQUET_PATH}/raw_spend_payment.parquet")
train_r.to_parquet(f"{RAW_TRAIN_PARQUET_PATH}/raw_risk.parquet")
train_b.to_parquet(f"{RAW_TRAIN_PARQUET_PATH}/raw_balance.parquet")
train_d.to_parquet(f"{RAW_TRAIN_PARQUET_PATH}/raw_delinquency.parquet")

CPU times: user 18.4 s, sys: 3.2 s, total: 21.6 s
Wall time: 18.9 s


In [84]:
# Drop the references to the train frames so their memory can be reclaimed
# before the test files are loaded.
del train_sp, train_r, train_b, train_d

### For Test

#### Spend & Payment

In [43]:
%%time
test_sp = read_file(f"{RAW_DATA_PATH}/test_raw/spend_payment_only.parquet")
test_sp = test_sp.reset_index(drop=True)

Shape of data: (11363762, 26)
CPU times: user 2.92 s, sys: 2.15 s, total: 5.07 s
Wall time: 3.25 s


In [44]:
# Only columns (past the first two) in which no value occurs more than twice
# are rounded; the selection is recomputed on the test data itself.
safe_columns = check_unique_value_counts(test_sp)
test_sp = round_four_decimals(test_sp, safe_columns)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:27<00:00,  1.13s/it]


Rounded all columns to 4 decimal places


In [45]:
# type_="test": columns that cannot be cast are transformed with the encoders
# already stored in label_enc_dict; no new encoder is fitted.
test_sp = reduce_data_size_to_32(test_sp, type_="test")

Failed to convert column S_5, difference exceeded threshold 1e-05
Failed to convert column S_16, difference exceeded threshold 1e-05
Failed to convert column S_23, difference exceeded threshold 1e-05
Failed to convert column S_26, difference exceeded threshold 1e-05


#### Risk

In [46]:
%%time
test_r = read_file(f"{RAW_DATA_PATH}/test_raw/risk_only.parquet")
test_r = test_r.reset_index(drop=True)

Shape of data: (11363762, 29)
CPU times: user 2.33 s, sys: 4.48 s, total: 6.81 s
Wall time: 5.77 s


In [47]:
# Round only the risk columns in which no value occurs more than twice
# (the first two columns are never inspected).
safe_columns = check_unique_value_counts(test_r)
test_r = round_four_decimals(test_r, safe_columns)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:30<00:00,  1.14s/it]


Rounded all columns to 4 decimal places


In [48]:
# type_="test": reuses stored encoders; prints each column whose float32 cast
# error exceeds the threshold.
test_r = reduce_data_size_to_32(test_r, type_="test")

Failed to convert column R_7, difference exceeded threshold 1e-05
Failed to convert column R_14, difference exceeded threshold 1e-05


#### Balance

In [49]:
%%time
test_b = read_file(f"{RAW_DATA_PATH}/test_raw/balance_only.parquet")
test_b = test_b.reset_index(drop=True)

Shape of data: (11363762, 41)
CPU times: user 2.98 s, sys: 8.51 s, total: 11.5 s
Wall time: 8.69 s


In [50]:
# Round only the balance columns in which no value occurs more than twice.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt after
# the check finished, so the rounding step may not have completed for test_b -
# re-run this cell before relying on the saved test balance file.
safe_columns = check_unique_value_counts(test_b)
test_b = round_four_decimals(test_b, safe_columns)

 72%|████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 28/39 [00:32<00:09,  1.11it/s]

B_30 9618034
B_31 11324528


 87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 34/39 [00:37<00:04,  1.06it/s]

B_38 4133804


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:39<00:00,  1.02s/it]

KeyboardInterrupt



In [52]:
# type_="test": reuses stored encoders; non-float64 columns that pass the
# threshold check are cast to float32 (and rounded to 3 decimals) by the function.
test_b = reduce_data_size_to_32(test_b, type_="test")

Failed to convert column B_6, difference exceeded threshold 1e-05
Failed to convert column B_10, difference exceeded threshold 1e-05
Failed to convert column B_12, difference exceeded threshold 1e-05
Failed to convert column B_13, difference exceeded threshold 1e-05
Failed to convert column B_21, difference exceeded threshold 1e-05
Failed to convert column B_26, difference exceeded threshold 1e-05
Stop converting column B_31, its original data format is not np.float64 / float64
Failed to convert column B_40, difference exceeded threshold 1e-05


#### Delinquency

In [53]:
%%time
test_d = read_file(f"{RAW_DATA_PATH}/test_raw/delinquency_only.parquet")
test_d = test_d.reset_index(drop=True)

Shape of data: (11363762, 97)
CPU times: user 7.23 s, sys: 24.3 s, total: 31.5 s
Wall time: 37.2 s


In [54]:
# Round only the delinquency columns in which no value occurs more than twice;
# the check prints the columns it excludes together with their top count.
safe_columns = check_unique_value_counts(test_d)
test_d = round_four_decimals(test_d, safe_columns)

 23%|███████████████████████████████                                                                                                       | 22/95 [00:20<01:01,  1.18it/s]

D_63 8429213


 24%|████████████████████████████████▍                                                                                                     | 23/95 [00:20<00:47,  1.51it/s]

D_64 5993483


 27%|████████████████████████████████████▋                                                                                                 | 26/95 [00:22<00:33,  2.05it/s]

D_66 1349834
D_68 5623121


 45%|████████████████████████████████████████████████████████████▋                                                                         | 43/95 [00:39<00:56,  1.08s/it]

D_87 10972


 66%|████████████████████████████████████████████████████████████████████████████████████████▊                                             | 63/95 [00:55<00:22,  1.42it/s]

D_114 6957846


 71%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 67/95 [00:57<00:12,  2.16it/s]

D_116 11048854
D_117 3097065


 73%|█████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 69/95 [00:59<00:18,  1.39it/s]

D_120 8803450


 79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                            | 75/95 [01:05<00:19,  1.00it/s]

D_126 8628867


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [01:20<00:00,  1.18it/s]


Rounded all columns to 4 decimal places


In [56]:
# type_="test": columns that cannot be cast to float32 are transformed with the
# encoders stored in label_enc_dict under the same column name.
test_d = reduce_data_size_to_32(test_d, type_="test")

Failed to convert column D_50, difference exceeded threshold 1e-05
Failed to convert column D_61, difference exceeded threshold 1e-05
Failed to convert column D_65, difference exceeded threshold 1e-05
Failed to convert column D_69, difference exceeded threshold 1e-05
Failed to convert column D_106, difference exceeded threshold 1e-05
Failed to convert column D_123, difference exceeded threshold 1e-05


In [None]:
# Copy S_2 from the spend/payment frame into each other frame as its second
# column. Each frame is checked on its own: testing only test_r would make
# DataFrame.insert raise "already exists" if another frame already has S_2.
# Rows are matched by index, which every frame had reset after loading.
for frame in (test_r, test_b, test_d):
    if "S_2" not in frame.columns:
        frame.insert(1, "S_2", test_sp["S_2"])
# Drop the loop variable so it does not keep the last frame alive after `del`.
del frame

In [None]:
# (rows, columns) of the four test frames: spend/payment, risk, balance, delinquency.
tuple(frame.shape for frame in (test_sp, test_r, test_b, test_d))

In [66]:
%%time
# Write the four processed test frames as parquet files under
# RAW_TEST_PARQUET_PATH (not defined in this notebook; presumably provided by
# the star import from utils.constants - confirm).
test_sp.to_parquet(f"{RAW_TEST_PARQUET_PATH}/raw_spend_payment.parquet")
test_r.to_parquet(f"{RAW_TEST_PARQUET_PATH}/raw_risk.parquet")
test_b.to_parquet(f"{RAW_TEST_PARQUET_PATH}/raw_balance.parquet")
test_d.to_parquet(f"{RAW_TEST_PARQUET_PATH}/raw_delinquency.parquet")

CPU times: user 37.8 s, sys: 13.9 s, total: 51.8 s
Wall time: 47.9 s


In [67]:
# Drop the references to the test frames so their memory can be reclaimed.
del test_sp, test_r, test_b, test_d