### 1. Clean Parquet data from Kaggle (integer dtypes parquet)
- This notebook further cleans the data shared by a Kaggle user, without affecting data quality.

In [1]:
import pandas as pd
# Render floats in displayed output with thousands separators and four decimal places.
pd.options.display.float_format = "{:,.4f}".format
import matplotlib.pyplot as plt
import gc
import numpy as np
import os
import sys
import warnings
# NOTE(review): both calls below install a blanket "ignore" filter, so every warning
# (deprecation notices included) is silenced from this point on.
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")

In [2]:
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably the folder that holds the project-local `utils` package — confirm).
rootpath = Path.cwd().parent
sys.path.append(os.fspath(rootpath))

In [4]:
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, plot_target_check, 
    plot_int_feature_distribution, plot_train_test_distribution, check_overlap_missing,
    insert_row_number
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, BINARY_COLUMNS, 
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, NON_FEATURE_COLUMNS
)
from utils.constants import (
    PROCESSED_DATA_PATH
)
from utils.feature_engineering_helpers import integerize

In [5]:
# Directory locations, written relative to the notebook's working directory.
RAW_DATA_PATH = "../raw_data"
# NOTE(review): this rebinds the PROCESSED_DATA_PATH name imported from utils.constants
# in the import cell; confirm which value is meant to win.
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"

In [6]:
# Sub-folders of RAW_DATA_PATH: one per split (train/test) and storage format (parquet/pickle).
(
    RAW_TRAIN_PARQUET_PATH,
    RAW_TRAIN_PICKLE_PATH,
    RAW_TEST_PARQUET_PATH,
    RAW_TEST_PICKLE_PATH,
) = (
    os.path.join(RAW_DATA_PATH, subdir)
    for subdir in ("train_parquet", "train_pickle", "test_parquet", "test_pickle")
)

In [7]:
%load_ext autoreload
%autoreload

### Read Data

In [8]:
%%time
train = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train_bdpsr.parquet", replace_negative_one=True)
test = read_file(f"{RAW_TEST_PARQUET_PATH}/test_bdpsr.parquet", replace_negative_one=True)

Shape of data: (5531451, 189)
Shape of data: (11363762, 188)
CPU times: user 16.5 s, sys: 19.2 s, total: 35.7 s
Wall time: 23.5 s


In [9]:
# Training labels; they are merged onto `train` by customer_ID only when `train`
# does not already carry a "target" column.
labels = pd.read_csv(filepath_or_buffer=f"{RAW_DATA_PATH}/train_labels.csv")

In [10]:
%%time
if "target" not in train.columns:
    train = train.merge(labels, on="customer_ID", how="left")

CPU times: user 10 µs, sys: 35 µs, total: 45 µs
Wall time: 46.7 µs


In [12]:
%%time
train.loc[:, CATEGORY_COLUMNS] = train.loc[:, CATEGORY_COLUMNS].astype("category")
test.loc[:, CATEGORY_COLUMNS] = test.loc[:, CATEGORY_COLUMNS].astype("category")

CPU times: user 4.72 s, sys: 17.2 s, total: 21.9 s
Wall time: 30 s


In [13]:
%%time
insert_row_number(train)
insert_row_number(test)

Done insertion
Done insertion
CPU times: user 34.6 s, sys: 508 ms, total: 35.1 s
Wall time: 35.1 s


In [14]:
# Remove the "temp" column when present (presumably a scratch column left behind by
# insert_row_number — confirm). errors="ignore" leaves the columns unchanged if it is absent.
train = train.drop("temp", axis="columns", errors="ignore")
test = test.drop("temp", axis="columns", errors="ignore")

In [15]:
# Display (rows, columns) for both frames as a sanity check before saving.
(train.shape, test.shape)

((5531451, 191), (11363762, 190))

In [17]:
%%time
train.to_pickle(f"{RAW_TRAIN_PICKLE_PATH}/raw_train_bdpsr.pkl")
test.to_pickle(f"{RAW_TEST_PICKLE_PATH}/raw_test_bdpsr.pkl")

CPU times: user 823 ms, sys: 4.48 s, total: 5.3 s
Wall time: 7.6 s
