In [1]:
from scipy.stats import linregress
import gc
import os
import joblib
import random
import warnings
import itertools
import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
from itertools import combinations
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from sklearn.preprocessing import LabelEncoder
import warnings; warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold, train_test_split

def get_difference(data, num_features):
    """Return, per customer, the last first-order difference of each numeric feature.

    For every ``customer_ID`` group in ``data`` the value is
    ``last_row - previous_row`` for each column in ``num_features``; a customer
    with a single row gets NaN because there is no previous row.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing a ``customer_ID`` column and the ``num_features`` columns.
        Rows are differenced in the order they appear inside each group.
    num_features : list of str
        Names of the columns to difference.

    Returns
    -------
    pd.DataFrame
        One row per customer (in groupby key order) with float32 columns named
        ``<feature>_diff1`` followed by a ``customer_ID`` column. Empty input
        yields an empty frame with the same columns.
    """
    num_features = list(num_features)
    diff_rows = []
    customer_ids = []
    # Group by the scalar key, not by a one-element list: with a list key newer
    # pandas yields 1-tuples such as ('abc',), which would be stored in the
    # customer_ID column and silently break later merges on that column.
    for customer_id, group in tqdm(data.groupby('customer_ID')):
        last_diff = group[num_features].diff(1).iloc[[-1]].values.astype(np.float32)
        diff_rows.append(last_diff)
        customer_ids.append(customer_id)
    if diff_rows:
        values = np.concatenate(diff_rows, axis = 0)
    else:
        # np.concatenate refuses an empty list; return a well-formed empty result.
        values = np.empty((0, len(num_features)), dtype = np.float32)
    # Column names come from the argument, not from the loop variable, so they
    # are defined even when the loop body never ran.
    result = pd.DataFrame(values, columns = [col + '_diff1' for col in num_features])
    result['customer_ID'] = customer_ids
    return result

In [2]:
# ---- Training feature engineering ----
# Load the raw training rows. Every column except the customer key and `S_2`
# is treated as a model feature.
train = pd.read_parquet('../src/data/raw/train.parquet')
features = train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
# Columns handled as categorical; everything else in `features` is numeric.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

num_features = [col for col in features if col not in cat_features]
print('Starting training feature engineer...')
# Per-customer summary statistics of the numeric columns
# ('quantile' with no argument is the 0.5 quantile, i.e. the median).
train_num_agg = train.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last','quantile'])
train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
train_num_agg.reset_index(inplace = True)

# Lag Features: for each <feature>_last column, the difference and the ratio
# against the matching <feature>_first column.
for col in train_num_agg:
    first_col = col.replace('last', 'first')
    if 'last' in col and first_col in train_num_agg:
        train_num_agg[col + '_lag_sub'] = train_num_agg[col] - train_num_agg[first_col]
        train_num_agg[col + '_lag_div'] = train_num_agg[col] / train_num_agg[first_col]

# Per-customer first / last / distinct-count of the categorical columns.
train_cat_agg = train.groupby("customer_ID")[cat_features].agg(['first', 'last', 'nunique'])
train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
train_cat_agg.reset_index(inplace = True)

train_labels = pd.read_csv('../src/data/raw/train_labels.csv')
# Transform float64 columns to float32
cols = list(train_num_agg.dtypes[train_num_agg.dtypes == 'float64'].index)
for col in tqdm(cols):
    train_num_agg[col] = train_num_agg[col].astype(np.float32)

# Transform int64 columns to int32
cols = list(train_cat_agg.dtypes[train_cat_agg.dtypes == 'int64'].index)
for col in tqdm(cols):
    train_cat_agg[col] = train_cat_agg[col].astype(np.int32)
# Get the difference
train_diff = get_difference(train, num_features)
# Inner joins: only customers present in every piece (including the labels) survive.
train1 = train_num_agg.merge(train_cat_agg, how = 'inner', on = 'customer_ID').merge(train_diff, how = 'inner', on = 'customer_ID').merge(train_labels, how = 'inner', on = 'customer_ID')
# train1 = train_num_agg.merge(train_diff, how = 'inner', on = 'customer_ID').merge(train_labels, how = 'inner', on = 'customer_ID')
del train_num_agg, train_cat_agg, train_diff
gc.collect()

# Aggregates over the last 3 rows of each customer. The key column is added via
# fresh lists instead of append()/remove() on the shared lists, so an exception
# or interrupt in this section cannot leave 'customer_ID' stuck inside
# num_features / cat_features and break a re-run of this or the test cell.
num_cols_with_key = num_features + ['customer_ID']
cat_cols_with_key = cat_features + ['customer_ID']
train_num_agg = train[num_cols_with_key].groupby("customer_ID").tail(3).groupby("customer_ID").agg(['mean', 'min', 'max', 'quantile'])
# Resulting names look like <feature>_L3M<stat>.
train_num_agg.columns = ['_L3M'.join(x) for x in train_num_agg.columns]
train_num_agg.reset_index(inplace = True)
train_cat_agg = train[cat_cols_with_key].groupby("customer_ID").tail(3).groupby("customer_ID").agg([ 'min', 'max'])
train_cat_agg.columns = ['_L3M'.join(x) for x in train_cat_agg.columns]
train_cat_agg.reset_index(inplace = True)

train2 = train_num_agg.merge(train_cat_agg, how = 'inner', on = 'customer_ID')
del train_num_agg, train_cat_agg
# NOTE: from here on `train` is the engineered per-customer frame, not the raw rows.
train = train1.merge(train2, how = 'left', on = 'customer_ID')
gc.collect()

# Attach the precomputed encoding columns stored alongside the processed data.
train_cat_enc=pd.read_parquet('../src/data/processed/train_cat_enc_5fold_seed42.parquet')
train=train.merge(train_cat_enc, how='left',on='customer_ID')
print(train.shape)
train.to_parquet('../src/data/processed/train_fe_rich.parquet')
del train
gc.collect()

Starting training feature engineer...


100%|█████████████████████████████████████████| 522/522 [03:24<00:00,  2.56it/s]
100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 467.25it/s]
100%|██████████████████████████████████| 458913/458913 [12:48<00:00, 596.80it/s]


(458913, 2539)


0

In [3]:
# Test FE
# Mirrors the training feature engineering. This cell relies on `num_features`
# and `cat_features` as defined by the training cell, so that cell must have run.
test = pd.read_parquet('../src/data/raw/test.parquet')
print('Starting test feature engineer...')
# Per-customer summary statistics of the numeric columns
# ('quantile' with no argument is the 0.5 quantile, i.e. the median).
test_num_agg = test.groupby("customer_ID")[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last','quantile'])
test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
test_num_agg.reset_index(inplace = True)

# Lag Features: for each <feature>_last column, the difference and the ratio
# against the matching <feature>_first column.
for col in test_num_agg:
    first_col = col.replace('last', 'first')
    if 'last' in col and first_col in test_num_agg:
        test_num_agg[col + '_lag_sub'] = test_num_agg[col] - test_num_agg[first_col]
        test_num_agg[col + '_lag_div'] = test_num_agg[col] / test_num_agg[first_col]

# Per-customer first / last / distinct-count of the categorical columns.
test_cat_agg = test.groupby("customer_ID")[cat_features].agg(['first', 'last', 'nunique'])
test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
test_cat_agg.reset_index(inplace = True)

# Transform float64 columns to float32
cols = list(test_num_agg.dtypes[test_num_agg.dtypes == 'float64'].index)
for col in tqdm(cols):
    test_num_agg[col] = test_num_agg[col].astype(np.float32)
# Transform int64 columns to int32
cols = list(test_cat_agg.dtypes[test_cat_agg.dtypes == 'int64'].index)
for col in tqdm(cols):
    test_cat_agg[col] = test_cat_agg[col].astype(np.int32)
# Get the difference
test_diff = get_difference(test, num_features)
test1 = test_num_agg.merge(test_cat_agg, how = 'inner', on = 'customer_ID').merge(test_diff, how = 'inner', on = 'customer_ID')
# test1 = test_num_agg.merge(test_diff, how = 'inner', on = 'customer_ID')
del test_num_agg, test_cat_agg, test_diff
gc.collect()

# Aggregates over the last 3 rows of each customer. The key column is added via
# fresh lists instead of append()/remove() on the shared lists, so an exception
# or interrupt in this section cannot leave 'customer_ID' stuck inside
# num_features / cat_features and break a re-run.
num_cols_with_key = num_features + ['customer_ID']
cat_cols_with_key = cat_features + ['customer_ID']
test_num_agg = test[num_cols_with_key].groupby("customer_ID").tail(3).groupby("customer_ID").agg(['mean', 'min', 'max', 'quantile'])
# Resulting names look like <feature>_L3M<stat>.
test_num_agg.columns = ['_L3M'.join(x) for x in test_num_agg.columns]
test_num_agg.reset_index(inplace = True)
test_cat_agg = test[cat_cols_with_key].groupby("customer_ID").tail(3).groupby("customer_ID").agg([ 'min', 'max'])
test_cat_agg.columns = ['_L3M'.join(x) for x in test_cat_agg.columns]
test_cat_agg.reset_index(inplace = True)

test2 = test_num_agg.merge(test_cat_agg, how = 'inner', on = 'customer_ID')
del test_num_agg, test_cat_agg
# NOTE: from here on `test` is the engineered per-customer frame, not the raw rows.
test = test1.merge(test2, how = 'left', on = 'customer_ID')
gc.collect()


Starting test feature engineer...


100%|█████████████████████████████████████████| 522/522 [06:25<00:00,  1.35it/s]
100%|██████████████████████████████████████████| 11/11 [00:00<00:00, 257.07it/s]
100%|██████████████████████████████████| 924621/924621 [26:34<00:00, 580.05it/s]


FileNotFoundError: [Errno 2] No such file or directory: '../src/data/processed/test_cat_enc_5fold_seed42.parquet'

In [4]:
# Join the stored test-set encoding columns onto the engineered test frame,
# report the resulting shape, then write the frame to disk.
test_cat_enc = pd.read_parquet('../src/data/processed/test_cat_enc_5fold_seed42.parquet')
test = pd.merge(test, test_cat_enc, on='customer_ID', how='left')

print(test.shape)
test.to_parquet(path='../src/data/processed/test_fe_rich.parquet')

(924621, 2538)


In [5]:
# Reload the saved training features and strip the four pred_fm10_* columns.
f = ['pred_fm10_0', 'pred_fm10_1', 'pred_fm10_2', 'pred_fm10_3','customer_ID']
train = pd.read_parquet('../src/data/processed/train_fe_rich.parquet').drop(
    columns=['pred_fm10_0', 'pred_fm10_1', 'pred_fm10_2', 'pred_fm10_3']
)
print(train.shape)

(458913, 2535)


In [8]:
# Load the stored training encodings and left-join them onto `train` by customer.
# NOTE(review): the recorded output of this cell shows the same shape as before
# the merge — TODO confirm which columns the file contributed when it was run.
train_cat_enc = pd.read_parquet('../src/data/processed/train_cat_enc_5fold_seed42.parquet')
train = pd.merge(train, train_cat_enc, on='customer_ID', how='left')

print(train.shape)

(458913, 2535)


In [14]:
# Attach the four pred_fm10_* columns to `train`, matched by customer_ID.
# pd.concat(axis=1) pairs rows by index label, i.e. by position here, so it is
# only correct when both frames happen to share the same row order; a keyed
# merge does not depend on that.
pred_cols = ['pred_fm10_0', 'pred_fm10_1', 'pred_fm10_2', 'pred_fm10_3']
train = (
    # Drop any copies already present (e.g. from an earlier merge or a re-run of
    # this cell) so the result never carries duplicate column names.
    train.drop(columns=pred_cols, errors='ignore')
    # validate: fail loudly if the encoding frame has repeated customer_IDs,
    # which would otherwise silently multiply rows.
    .merge(train_cat_enc[['customer_ID'] + pred_cols], how='left', on='customer_ID', validate='many_to_one')
)
print(train.shape)

(458913, 2539)


In [15]:
# Persist the current training feature frame, overwriting the earlier file.
train.to_parquet(path='../src/data/processed/train_fe_rich.parquet')