In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import gc
import numpy as np
import os
import sys
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from pandarallel import pandarallel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer

In [2]:
from pathlib import Path
# Put the parent of the notebook's working directory on the import path
# (presumably the repository root that holds the `utils` package imported
# in the next cell -- confirm).
rootpath = Path.cwd().parent
sys.path.append(str(rootpath))

In [3]:
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, single_col_target_check
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, BINARY_COLUMNS, ROUND_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, NON_FEATURE_COLUMNS
)
from utils.constants import (
    PROCESSED_DATA_PATH
)

In [4]:
# Project data/artifact directories, relative to the notebook's working directory.
RAW_DATA_PATH = "../raw_data"
# NOTE(review): this rebinds PROCESSED_DATA_PATH, which is also imported from
# utils.constants in the import cell -- confirm the two values agree, or drop one.
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"

In [5]:
# Per-format sub-directories of the raw data folder. In this notebook the
# parquet directories are the inputs and the pickle directories are the outputs.
RAW_TRAIN_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "train_parquet")
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "test_parquet")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [6]:
%load_ext autoreload
%autoreload

### Read Data

In [7]:
%%time
# Load the raw train and test tables from their parquet files via the project
# helper `read_file`.
# NOTE(review): the "Shape of data" lines in this cell's output presumably
# come from read_file itself -- confirm.
train = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train.parquet")
test = read_file(f"{RAW_TEST_PARQUET_PATH}/test.parquet")

Shape of data: (5531451, 190)
Shape of data: (11363762, 190)
CPU times: user 16 s, sys: 16.4 s, total: 32.4 s
Wall time: 21.9 s


In [8]:
# Training labels; joined onto `train` by customer_ID in the following cell.
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

In [9]:
# Attach the label columns to every row of the matching customer.
# - The guard keeps this cell idempotent: merging a second time would yield
#   suffixed `target_x` / `target_y` columns and break later `target` lookups.
# - validate="many_to_one" raises pandas' MergeError if a customer_ID occurs
#   more than once in `labels`, instead of silently duplicating train rows.
if "target" not in train.columns:
    train = train.merge(labels, on="customer_ID", how="left", validate="many_to_one")

#### Fix category columns

In [10]:
# Cast the categorical feature columns to pandas' `category` dtype.
# Use `df[cols] = ...` (column replacement) rather than `df.loc[:, cols] = ...`:
# since pandas 2.0 the `.loc` form writes the values into the existing columns
# in place and keeps their old dtype, so the cast would be silently discarded.
train[CATEGORY_COLUMNS] = train[CATEGORY_COLUMNS].astype("category")
test[CATEGORY_COLUMNS] = test[CATEGORY_COLUMNS].astype("category")

In [11]:
# Print every categorical column whose dtype differs between train and test
# (for `category` dtypes this means the two frames hold different category sets).
for col in CATEGORY_COLUMNS:
    if train[col].dtype == test[col].dtype:
        continue
    print(col)

D_64
D_68


#### D_64

In [None]:
# Share of each D_64 level in train (value_counts drops NaN by default).
train["D_64"].value_counts(normalize=True)

In [None]:
# Same distribution for test, to compare its D_64 levels with train's.
test["D_64"].value_counts(normalize=True)

In [None]:
# Number of missing D_64 values, displayed as a (train, test) tuple.
train["D_64"].isnull().sum(), test["D_64"].isnull().sum()

In [None]:
# Recode the value 1 as -1 in train's D_64 and re-cast to `category`.
# Only train is modified; test is left as loaded.
# NOTE(review): presumably 1 is the level that caused the train/test dtype
# mismatch reported by the check cell -- confirm against the value_counts output.
train["D_64"] = train["D_64"].replace(1, -1).astype("category")

#### D_68

In [None]:
# Share of each D_68 level in train (value_counts drops NaN by default).
train["D_68"].value_counts(normalize=True)

In [None]:
# Same distribution for test, to compare its D_68 levels with train's.
test["D_68"].value_counts(normalize=True)

In [None]:
# Mean `target` per D_68 level in train; used to decide which levels can be merged.
train.groupby("D_68")["target"].mean()

* Category `0` has a mean `target` similar to category `6`, so `0` is re-encoded as `6`.

In [None]:
# Recode the value 0 as 6 in train's D_68 and re-cast to `category`.
# Only train is modified; test is left as loaded.
train["D_68"] = train["D_68"].replace(0, 6).astype("category")

### Insert row_number

In [None]:
def insert_row_number(df):
    """Add per-customer rank columns to ``df`` in place and return ``None``.

    Rows are ranked within each ``customer_ID`` by ``S_2``; ties are broken by
    row order (``method="first"``).

    * ``row_number_inv`` -- 1 for the smallest ``S_2`` of a customer, counting up.
    * ``row_number``     -- 1 for the largest ``S_2`` of a customer, counting up.

    A column that is already present is left untouched, so calling the function
    again is harmless. Each new column is inserted at position 1, which leaves
    ``row_number`` at index 1 and ``row_number_inv`` at index 2 when both are
    added in the same call.
    """
    # (column name, rank ascending?) -- the order fixes the final column layout.
    rank_specs = (("row_number_inv", True), ("row_number", False))
    for column, ascending in rank_specs:
        if column in df.columns:
            continue
        ranks = df.groupby("customer_ID")["S_2"].rank(method="first", ascending=ascending)
        df.insert(1, column, ranks.astype(int))
    print("Done insertion")

In [None]:
# Adds row_number / row_number_inv to `train` in place (the function returns None).
insert_row_number(train)

In [None]:
# Spot-check the two new columns, which sit directly after the first column.
train.head(3)

In [None]:
# Adds row_number / row_number_inv to `test` in place (the function returns None).
insert_row_number(test)

In [None]:
# Spot-check the two new columns, which sit directly after the first column.
test.head(3)

In [None]:
# Shapes after the label merge (train only) and the two row-number columns (both frames).
train.shape, test.shape

### Impute D_88

In [None]:
# Replace missing D_88 values with 0 in both train and test.
for frame in (train, test):
    frame["D_88"] = frame["D_88"].fillna(0)

### Export data out

In [None]:
# Persist the cleaned frames as pickles for later notebooks.
# NOTE(review): to_pickle does not create missing parent directories, so
# RAW_TRAIN_PICKLE_PATH and RAW_TEST_PICKLE_PATH must already exist or this
# cell fails after all the processing above has run.
train.to_pickle(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")
test.to_pickle(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")