In [None]:
import numpy as np
import pandas as pd

In [None]:
# Raw training statements; the path is relative to the notebook's working directory.
train_parquet = '../data/raw/train.parquet'
df = pd.read_parquet(train_parquet)
print(df.shape)  # quick size check before any processing
df.head()

In [None]:
def hex_to_int(hex_string):
    """Parse the trailing 16 hexadecimal digits of ``hex_string`` as an integer.

    Strings shorter than 16 characters are parsed in full, because the slice
    then returns the whole string.

    Args:
        hex_string: Text whose last (up to) 16 characters are hex digits.

    Returns:
        The non-negative Python ``int`` encoded by those digits.

    Raises:
        ValueError: If the sliced text is not valid base-16 (e.g. empty).
    """
    return int(hex_string[-16:], base=16)

# Guarded so the cell can be re-run: after the first run customer_ID is already
# int64, and hex_to_int would raise TypeError when it tries to slice an int.
if not pd.api.types.is_integer_dtype(df['customer_ID']):
    df['customer_ID'] = df['customer_ID'].apply(hex_to_int).astype('int64')
df['S_2'] = pd.to_datetime(df['S_2'])

# NA data contains signal: it cannot just be dropped.
# We impute an integer for compatibility with categorical columns.
# We want the value to be out of range for most features.
df = df.fillna(-127)
df.head()

In [None]:
# Disabled alternative (per-customer aggregation): everything between the triple
# quotes below is a bare string literal, so none of it executes. It is kept for
# reference only; the statements after it are what actually build `features`.
''' Feature aggregation based on https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793, too expensive for data augmentation.
def process_and_feature_engineer(df):

    cs = [c for c in list(df.columns) if c not in ['customer_ID', 'S_2']]
    cat_features = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'D_87']
    num_features = [c for c in cs if c not in cat_features]

    # Careful: std assumes 1 dof and will return NaN if just one value
    test_num_agg = df.groupby('customer_ID')[num_features].agg(['mean', 'std', 'min', 'max', 'last']).fillna(0)
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]

    test_cat_agg = df.groupby('customer_ID')[cat_features].agg(['last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

    df = pd.concat([test_num_agg, test_cat_agg], axis=1)
    del test_num_agg, test_cat_agg    
    return df

features = process_and_feature_engineer(df)
print(features.shape)
'''

# Row label of each customer's latest statement, i.e. the row with the largest S_2.
most_recent_indices = df.groupby('customer_ID')['S_2'].idxmax()

# Keep only those rows and key the resulting table by customer.
features = df.loc[most_recent_indices].set_index('customer_ID')
print(features.shape)

In [None]:
# Load the labels and give them the same integer customer key as `features`.
targets = pd.read_csv('../data/raw/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].apply(hex_to_int).astype('int64')
targets = targets.set_index('customer_ID')

# validate='one_to_one' makes pandas raise MergeError if a customer_ID repeats
# on either side (e.g. two IDs sharing the same last 16 hex digits) instead of
# silently duplicating feature rows.
train_data = features.merge(
    targets, left_index=True, right_index=True, how='left', validate='one_to_one'
)

# A left join leaves NaN for customers without a label; report that explicitly
# rather than relying on the integer cast below to fail.
n_unlabelled = int(train_data['target'].isna().sum())
if n_unlabelled:
    raise ValueError(f'{n_unlabelled} customers in features have no row in train_labels.csv')

train_data['target'] = train_data['target'].astype('int8')
print(train_data.shape)
train_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Persist the merged table (features plus target) before converting to arrays.
train_data.to_csv('../data/processed/train.csv')

# customer_ID is the index, so it is not part of the feature matrix; the
# statement date and the label are dropped explicitly.
X = train_data.drop(['S_2', 'target'], axis=1).values
y = train_data['target'].values

# Disabled experiment kept for reference: the block below is a bare string
# literal, so none of it executes.
# NOTE(review): at this point X is the ndarray produced by `.values`, so
# `X.columns` would fail if this were re-enabled as-is.
''' In practice, categorical columns have minimal impact on model performance.
feature_cols = list(X.columns)
cat_features = ['B_30', 'B_31', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68', 'D_87']
num_features = [c for c in feature_cols if c not in cat_features]
X = X[num_features]
'''

# Hold out 100000 rows, stratified on the label; everything else is the training set.
X_train, X_, y_train, y_ = train_test_split(
    X, y, test_size=100000, random_state=42, stratify=y
)
# Halve the hold-out, again stratified: 50000 rows for model validation and
# 50000 rows for data valuation.
X_test, X_v, y_test, y_v = train_test_split(
    X_, y_, test_size=0.5, random_state=42, stratify=y_
)

# np.savez appends the .npz extension to each file name.
np.savez('../data/processed/train', x=X_train, y=y_train)
np.savez('../data/processed/test', x=X_test, y=y_test)
np.savez('../data/processed/v', x=X_v, y=y_v)