In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import gc
import numpy as np
import os
import sys
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings("ignore")
from pandarallel import pandarallel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm
from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer

In [2]:
from pathlib import Path

# Make the parent of the current working directory importable.
# NOTE(review): this relies on the kernel's working directory being the folder that
# holds this notebook — confirm before running it from elsewhere.
rootpath = Path.cwd().parent
# Guard against duplicates so that re-running this cell leaves sys.path unchanged.
if str(rootpath) not in sys.path:
    sys.path.append(str(rootpath))

In [3]:
from utils.eval_helpers import (
    plot_roc_curves, plot_feature_importance, 
    amex_metric, get_final_metric_df, amex_metric_np, lgb_amex_metric
)
from utils.eda_helpers import (
    plot_missing_proportion_barchart, get_cols, plot_scatterplot, single_col_target_check
)
from utils.extraction_helpers import read_file
from utils.feature_group import (
    CATEGORY_COLUMNS, CONTINUOUS_COLUMNS, BINARY_COLUMNS, ROUND_COLUMNS,
    MEAN_FEATURES, MIN_FEATURES, MAX_FEATURES, LAST_FEATURES, NON_FEATURE_COLUMNS
)
from utils.constants import (
    PROCESSED_DATA_PATH
)

In [4]:
# Project directories, written relative to the notebook's working directory.
RAW_DATA_PATH = "../raw_data"
# NOTE(review): this rebinds PROCESSED_DATA_PATH, which is also imported from
# utils.constants in an earlier cell — confirm the two values agree, or drop one.
PROCESSED_DATA_PATH = "../processed_data"
SUBMISSION_DATA_PATH = "../submissions"
EVALUATION_DATA_PATH = "../evaluation_data"
MODELS_PATH = "../models"

In [5]:
# Sub-folders of RAW_DATA_PATH: the parquet folders are read from when loading the
# data, the pickle folders are written to when exporting at the end of the notebook.
RAW_TRAIN_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "train_parquet")
RAW_TRAIN_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "train_pickle")
RAW_TEST_PARQUET_PATH = os.path.join(RAW_DATA_PATH, "test_parquet")
RAW_TEST_PICKLE_PATH = os.path.join(RAW_DATA_PATH, "test_pickle")

In [6]:
%load_ext autoreload
%autoreload

### Read Data

In [7]:
%%time
# Load the raw train and test parquet files through the project's read_file helper
# (the recorded output shows a "Shape of data" line printed for each frame).
train = read_file(f"{RAW_TRAIN_PARQUET_PATH}/train.parquet")
test = read_file(f"{RAW_TEST_PARQUET_PATH}/test.parquet")

Shape of data: (5531451, 190)
Shape of data: (11363762, 190)
CPU times: user 16 s, sys: 16.4 s, total: 32.4 s
Wall time: 21.9 s


In [8]:
labels = pd.read_csv(f"{RAW_DATA_PATH}/train_labels.csv")

In [9]:
# Attach the target to every statement row of its customer.
# validate="many_to_one" makes pandas raise MergeError if a customer_ID appears more
# than once in `labels`, instead of silently duplicating train rows.
# The guard keeps a re-run of this cell from producing target_x / target_y columns.
if "target" not in train.columns:
    train = train.merge(labels, on="customer_ID", how="left", validate="many_to_one")

### Fix category columns

In [10]:
# Cast with plain column assignment rather than `df.loc[:, cols] = ...`.
# Since pandas 2.0 the `.loc[:, cols]` form first tries to write the new values into
# the existing columns in place, which keeps their original dtype and silently
# discards the categorical cast. Column assignment replaces the columns outright.
train[list(CATEGORY_COLUMNS)] = train[list(CATEGORY_COLUMNS)].astype("category")
test[list(CATEGORY_COLUMNS)] = test[list(CATEGORY_COLUMNS)].astype("category")

In [11]:
# Print every categorical column whose dtype differs between train and test.
for col in CATEGORY_COLUMNS:
    if train[col].dtype == test[col].dtype:
        continue
    print(col)

D_64
D_68


#### D_64

In [12]:
train["D_64"].value_counts(normalize=True)

0     0.526669
3     0.275416
2     0.151879
-1    0.039310
1     0.006726
Name: D_64, dtype: float64

In [13]:
test["D_64"].value_counts(normalize=True)

0     0.527421
3     0.259844
2     0.179485
-1    0.033249
Name: D_64, dtype: float64

In [14]:
# Count of null D_64 values in train and in test, shown as a pair.
train["D_64"].isnull().sum(), test["D_64"].isnull().sum()

(0, 0)

In [15]:
# Category 1 occurs only in train (about 0.67% of rows) and never in test, so it is
# folded into -1; afterwards both frames show the same four categories.
# NOTE(review): why -1 is the receiving bucket is not stated in this notebook —
# presumably it is the missing/unknown code; confirm.
train["D_64"] = train["D_64"].replace(1, -1).astype("category")

In [16]:
train["D_64"].value_counts(normalize=True)

0     0.526669
3     0.275416
2     0.151879
-1    0.046036
Name: D_64, dtype: float64

In [17]:
test["D_64"].value_counts(normalize=True)

0     0.527421
3     0.259844
2     0.179485
-1    0.033249
Name: D_64, dtype: float64

#### D_68

In [18]:
train["D_68"].value_counts(normalize=True)

6     0.503024
5     0.217250
3     0.087580
4     0.086268
2     0.039793
-1    0.039140
1     0.024066
0     0.002879
Name: D_68, dtype: float64

In [19]:
test["D_68"].value_counts(normalize=True)

6     0.494829
5     0.228375
4     0.091344
3     0.088224
2     0.042881
-1    0.030667
1     0.023679
Name: D_68, dtype: float64

In [20]:
# Mean target per D_68 category, used to decide which category the train-only
# value 0 should be merged into.
train.groupby("D_68")["target"].mean()

D_68
-1    0.414493
0     0.179717
1     0.403990
2     0.392902
3     0.369780
4     0.343603
5     0.262219
6     0.174952
Name: target, dtype: float64

* Category 0 does not appear in test, and its mean target (≈0.180) is close to that of category 6 (≈0.175), so encode 0 as 6.

In [21]:
# Category 0 is absent from test, and its mean target (about 0.180) is closest to
# that of category 6 (about 0.175), so relabel 0 as 6.
train["D_68"] = train["D_68"].replace(0, 6).astype("category")

### Insert row_number

In [22]:
def insert_row_number(df):
    """Add per-customer statement counters to ``df`` in place.

    Two integer columns are inserted at position 1, each only if it is not
    already present:

    * ``row_number_inv`` — rank of ``S_2`` within each ``customer_ID`` in
      ascending order (1 = smallest ``S_2``).
    * ``row_number`` — rank of ``S_2`` within each ``customer_ID`` in
      descending order (1 = largest ``S_2``).

    Ties are broken by order of appearance (``method="first"``). The frame is
    modified in place and nothing is returned; a confirmation line is printed.
    """
    # (column name, ascending?) in insertion order. Each insert goes to position 1,
    # so the column inserted last ends up in front of the earlier one.
    rank_specs = (("row_number_inv", True), ("row_number", False))
    for column_name, ascending in rank_specs:
        if column_name in df.columns:
            continue
        ranks = df.groupby("customer_ID")["S_2"].rank(method="first", ascending=ascending)
        df.insert(1, column_name, ranks.astype(int))
    print("Done insertion")

In [23]:
insert_row_number(train)

Done insertion


In [24]:
train.head(3)

Unnamed: 0,customer_ID,row_number,row_number_inv,S_2,P_2,D_39,B_1,B_2,R_1,S_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,13,1,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,...,-1,-1,0,0,0.0,,0,0.00061,0,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,12,2,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,...,-1,-1,0,0,0.0,,0,0.005492,0,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,11,3,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,...,-1,-1,0,0,0.0,,0,0.006986,0,0


In [25]:
insert_row_number(test)

Done insertion


In [26]:
test.head(3)

Unnamed: 0,customer_ID,row_number,row_number_inv,S_2,P_2,D_39,B_1,B_2,R_1,S_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,9,1,2019-02-19,0.631315,0,0.010728,0.814497,0.0,0.168651,...,-1,-1,-1,-1,0,,,-1,0.008281,-1
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,8,2,2019-03-25,0.587042,0,0.011026,0.810848,0.0,0.241389,...,-1,-1,-1,0,0,0.0,,0,0.003753,0
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,7,3,2019-04-25,0.609056,0,0.01639,1.00462,0.0,0.266976,...,-1,-1,-1,0,0,0.0,,0,0.002156,0


In [27]:
train.shape, test.shape

((5531451, 193), (11363762, 192))

### Export data out

In [35]:
# Create the output folders first so that a missing directory cannot abort the
# export after all of the preprocessing above has already run.
for out_dir in (RAW_TRAIN_PICKLE_PATH, RAW_TEST_PICKLE_PATH):
    os.makedirs(out_dir, exist_ok=True)
train.to_pickle(f"{RAW_TRAIN_PICKLE_PATH}/train_data.pkl")
test.to_pickle(f"{RAW_TEST_PICKLE_PATH}/test_data.pkl")