In [1]:
import numpy as np
import pandas as pd
import os
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
import gc
from math import sqrt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

import math as mt
from math import *
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict


import joblib
import random
import itertools
import scipy as sp

from itertools import combinations
import warnings; warnings.filterwarnings('ignore')

In [3]:
def get_difference(data, num_features):
    df1 = []
    customer_ids = []
    for customer_id, df in tqdm(data.groupby(['customer_ID'])):
        diff_df1 = df[num_features].diff(1).iloc[[-1]].values.astype(np.float32)
        df1.append(diff_df1)
        customer_ids.append(customer_id)
    df1 = np.concatenate(df1, axis = 0)
    df1 = pd.DataFrame(df1, columns = [col + '_diff1' for col in df[num_features].columns])
    df1['customer_ID'] = customer_ids
    return df1

In [6]:
def read_preprocess_data():

    train = pd.read_parquet('../src/data/raw/train.parquet')
    aug_train = pd.read_parquet('../src/data/raw/aug_trn.parquet')
    train = pd.concat([train, aug_train], axis=0).sort_values(['customer_ID','S_2'])
    
    del aug_train
    gc.collect()
    
#     train.describe()
#     train.plot()
    features = train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
#     features.describe()
#     features.plot()
    
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    num_features = [col for col in features if col not in cat_features]
    
    print('Starting training feature engineer...')
    train_num_agg = train.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
#     train_num_agg.describe()
#     train_num_agg.plot()
    train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
    train_num_agg.reset_index(inplace = True)

    # Lag Features
    for col in train_num_agg:
        if 'last' in col and col.replace('last', 'first') in train_num_agg:
            train_num_agg[col + '_lag_sub'] = train_num_agg[col] - train_num_agg[col.replace('last', 'first')]
            train_num_agg[col + '_lag_div'] = train_num_agg[col] / train_num_agg[col.replace('last', 'first')]

    train_cat_agg = train.groupby("customer_ID")[cat_features].agg(['count', 'first', 'last', 'nunique'])
#     train_num_agg.describe()
#     train_num_agg.plot()
    train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
    train_cat_agg.reset_index(inplace = True)
    
    train_labels = pd.read_csv('../src/data/raw/train_labels.csv')
#     train_labels.describe()
#     train_labels.plot()
    # Transform float64 columns to float32
    cols = list(train_num_agg.dtypes[train_num_agg.dtypes == 'float64'].index)
    for col in tqdm(cols):
        train_num_agg[col] = train_num_agg[col].astype(np.float32)
    # Transform int64 columns to int32
    cols = list(train_cat_agg.dtypes[train_cat_agg.dtypes == 'int64'].index)
    for col in tqdm(cols):
        train_cat_agg[col] = train_cat_agg[col].astype(np.int32)
    # Get the difference
    train_diff = get_difference(train, num_features)
    train = train_num_agg.merge(train_cat_agg, how = 'inner', on = 'customer_ID').merge(train_diff, how = 'inner', on = 'customer_ID').merge(train_labels, how = 'inner', on = 'customer_ID')
#     train.describe()
#     train.plot()
    del train_num_agg, train_cat_agg, train_diff
    gc.collect()
    
    train.to_parquet('../src/data/processed/train_v3.parquet')

    # Test FE
    test = pd.read_parquet('../src/data/raw/test.parquet')    
    aug_test = pd.read_parquet('../src/data/raw/aug_tst.parquet')
    test = pd.concat([test, aug_test], axis=0).sort_values(['customer_ID','S_2'])
    
    del aug_test
    gc.collect()
    
    print('Starting test feature engineer...')
    test_num_agg = test.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
#     test_num_agg.describe()
#     test_num_agg.plot()
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    test_num_agg.reset_index(inplace = True)

    # Lag Features
    for col in test_num_agg:
        if 'last' in col and col.replace('last', 'first') in test_num_agg:
            test_num_agg[col + '_lag_sub'] = test_num_agg[col] - test_num_agg[col.replace('last', 'first')]
            test_num_agg[col + '_lag_div'] = test_num_agg[col] / test_num_agg[col.replace('last', 'first')]

    test_cat_agg = test.groupby("customer_ID")[cat_features].agg(['count', 'first', 'last', 'nunique'])
#     test_num_agg.describe()
#     test_num_agg.plot()
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    test_cat_agg.reset_index(inplace = True)
    # Transform float64 columns to float32
    cols = list(test_num_agg.dtypes[test_num_agg.dtypes == 'float64'].index)
    for col in tqdm(cols):
        test_num_agg[col] = test_num_agg[col].astype(np.float32)
    # Transform int64 columns to int32
    cols = list(test_cat_agg.dtypes[test_cat_agg.dtypes == 'int64'].index)
    for col in tqdm(cols):
        test_cat_agg[col] = test_cat_agg[col].astype(np.int32)
    # Get the difference
    test_diff = get_difference(test, num_features)
    test = test_num_agg.merge(test_cat_agg, how = 'inner', on = 'customer_ID').merge(test_diff, how = 'inner', on = 'customer_ID')
#     test.describe()
#     test.plot()
    del test_num_agg, test_cat_agg, test_diff
    gc.collect()
    
    test.to_parquet('../src/data/processed/test_v3.parquet')
    

In [7]:
# Read & Preprocess Data
read_preprocess_data()

Starting training feature engineer...


100%|█████████████████████████████████████████| 345/345 [01:25<00:00,  4.04it/s]
100%|██████████████████████████████████████████| 22/22 [00:00<00:00, 650.10it/s]
100%|██████████████████████████████████| 458913/458913 [12:48<00:00, 597.16it/s]


Starting test feature engineer...


100%|█████████████████████████████████████████| 345/345 [02:52<00:00,  1.99it/s]
100%|██████████████████████████████████████████| 22/22 [00:00<00:00, 352.07it/s]
100%|██████████████████████████████████| 924621/924621 [26:01<00:00, 592.30it/s]


In [None]:
# !kaggle datasets init -p ../src/data/processed

In [None]:
# !kaggle datasets create -p ../src/data/processed