# Feature Engineering

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    precision_recall_curve,
    average_precision_score,
    confusion_matrix,
    ConfusionMatrixDisplay
)

In [6]:
ROOT = Path(os.path.abspath('')).resolve().parents[0]
DATA = ROOT / "data"
INTERIM_DATA = DATA / "interim"

In [9]:
X_train = pd.read_parquet(INTERIM_DATA / 'x_train_split.parquet.gzip')
X_val = pd.read_parquet(INTERIM_DATA / 'x_val_split.parquet.gzip')
X_test = pd.read_parquet(INTERIM_DATA / 'x_test_split.parquet.gzip')

In [10]:
AMOUNT_FEATURES = {
    'tx_amount': set(),
    'tx_amount_log': set()
}
WINDOWS = ['7d', '14d', '21d', '30d']
SECTOR_FEATURES = set()
CUSTOMER_FEATURES = set()
DATE_FEATURES = set()
ROLLING_FEATURES = set()
RISK_FEATURES = set()
TOP_PERCENTILE_FEATURES = []

## Common features

### Date-related features

In [11]:
DATE_FEATURES = DATE_FEATURES.union([
    'day_of_week',
    'hour',
    'month',
    'is_month_start',
    'is_month_end',
    'is_weekend'
])
DATE_FEATURES

{'day_of_week',
 'hour',
 'is_month_end',
 'is_month_start',
 'is_weekend',
 'month'}

## Rolling features

### Difference in time between transactions

In [13]:
for df in X_train, X_val, X_test:
    df['secs_since_prev_tx'] = (
        df.groupby('customer_id')['tx_datetime']
        .diff()
        .dt.total_seconds()
        .fillna(-1)
    )
SECTOR_FEATURES.add('secs_since_prev_tx')

In [14]:
X_train = X_train.sort_values(['customer_id','tx_datetime']).reset_index(drop=True)
X_val = X_val.sort_values(['customer_id','tx_datetime']).reset_index(drop=True)
X_test = X_test.sort_values(['customer_id','tx_datetime']).reset_index(drop=True)

### Datetime intervals' aggregations for window

For each client and interval window (`7d`, `14d`, `21d`, `30d`), calculate interval mean, standard deviation, zscore and burstiness of seconds since previous transaction:

In [15]:
eps = 1e-6
for df in X_train, X_val, X_test:
       for window in WINDOWS:
              mean_col = f'interval_mean_{window}'
              std_col  = f'interval_std_{window}'
              mean_vals = (
                     df.groupby('customer_id', group_keys=False)
                            .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
                                         .rolling(window).mean())
              ).fillna(0).to_numpy()

              std_vals = (
                     df.groupby('customer_id', group_keys=False)
                            .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
                                          .shift()
                                          .rolling(window).std())
              ).fillna(0).to_numpy()

              df[f'interval_mean_{window}'] = mean_vals
              df[f'interval_std_{window}']  = std_vals
              df[f'interval_zscore_{window}'] = (
                     (df['secs_since_prev_tx'] - df[mean_col]) / (df[std_col] + eps)
              )

              df[f'interval_burstiness_{window}'] = (
                     df[std_col] / (df[mean_col] + eps)
              )
              ROLLING_FEATURES = ROLLING_FEATURES.union([
                     mean_col,
                     std_col,
                     f'interval_zscore_{window}',
                     f'interval_burstiness_{window}'
              ])

  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx']
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_prev_tx'].shift()
  .apply(lambda d: d.set_index('tx_datetime')['secs_since_pr

In [16]:
ROLLING_FEATURES

{'interval_burstiness_14d',
 'interval_burstiness_21d',
 'interval_burstiness_30d',
 'interval_burstiness_7d',
 'interval_mean_14d',
 'interval_mean_21d',
 'interval_mean_30d',
 'interval_mean_7d',
 'interval_std_14d',
 'interval_std_21d',
 'interval_std_30d',
 'interval_std_7d',
 'interval_zscore_14d',
 'interval_zscore_21d',
 'interval_zscore_30d',
 'interval_zscore_7d'}

### Number of transactions in window

In [17]:
for df in X_train, X_val, X_test:
    for window in WINDOWS + ['1h', '1d']:
        vals = (
            df.groupby('customer_id', group_keys=False)
              .apply(lambda d: d.set_index('tx_datetime')['ones']
                                .shift() # Exclude current transaction
                                .rolling(window)
                                .count())
        ).to_numpy()
        df[f'tx_count_{window}'] = vals
        ROLLING_FEATURES.add(f'tx_count_{window}')

  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']
  .apply(lambda d: d.set_index('tx_datetime')['ones']


### Number of client's transactions in sector

In [18]:
for df in X_train, X_val, X_test:
    for window in WINDOWS + ['1h', '1d']:
        vals = (
            df.groupby(['customer_id','sector_id'])
            .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
                                .shift()
                                .rolling(window, closed='both')
                                .count())
        ).to_numpy()
        df[f'tx_count_sector_{window}'] = vals
        ROLLING_FEATURES.add(f'tx_count_sector_{window}')

  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount']
  .apply(lambda d: d.set_index('tx_datetime')['tx_amount

In [19]:
ROLLING_FEATURES

{'interval_burstiness_14d',
 'interval_burstiness_21d',
 'interval_burstiness_30d',
 'interval_burstiness_7d',
 'interval_mean_14d',
 'interval_mean_21d',
 'interval_mean_30d',
 'interval_mean_7d',
 'interval_std_14d',
 'interval_std_21d',
 'interval_std_30d',
 'interval_std_7d',
 'interval_zscore_14d',
 'interval_zscore_21d',
 'interval_zscore_30d',
 'interval_zscore_7d',
 'tx_count_14d',
 'tx_count_1d',
 'tx_count_1h',
 'tx_count_21d',
 'tx_count_30d',
 'tx_count_7d',
 'tx_count_sector_14d',
 'tx_count_sector_1d',
 'tx_count_sector_1h',
 'tx_count_sector_21d',
 'tx_count_sector_30d',
 'tx_count_sector_7d'}

### Transaction's amount relatively to history

In [20]:
for df in X_train, X_val, X_test:
    for amount_col in AMOUNT_FEATURES.keys():
        exp_mean = (df.groupby('customer_id', group_keys=False)[amount_col]
                    .apply(lambda s: s.shift().expanding().mean()))
        exp_std  = (df.groupby('customer_id', group_keys=False)[amount_col]
                    .apply(lambda s: s.shift().expanding().std()))

        df['customer_exp_mean'] = exp_mean.to_numpy()
        df['customer_exp_std']  = exp_std.to_numpy()

        df['ratio_to_customer_mean']  = df[amount_col] / (df['customer_exp_mean'] + eps)
        df['zscore_amount_customer']  = (df[amount_col] - df['customer_exp_mean']) / (df['customer_exp_std'] + eps)
        for window in WINDOWS:
            mean_vals = (df.groupby('customer_id', group_keys=False)
                       .apply(lambda d: d.set_index('tx_datetime')[amount_col]
                                         .rolling(window, closed='left').mean())
                    ).to_numpy()
            df[f'{amount_col}_mean_{window.lower()}'] = mean_vals

            median_vals = (df.groupby('customer_id', group_keys=False)
                            .apply(lambda d: d.set_index('tx_datetime')[amount_col]
                                            .rolling(window, closed='left').median())
                        ).to_numpy()
            df[f'{amount_col}_median_{window.lower()}'] = median_vals

            std_vals = (df.groupby('customer_id', group_keys=False)
                        .apply(lambda d: d.set_index('tx_datetime')[amount_col]
                                            .rolling(window, closed='left').std())
                    ).to_numpy()
            df[f'{amount_col}_std_{window.lower()}'] = std_vals
            AMOUNT_FEATURES[amount_col] = AMOUNT_FEATURES[amount_col].union([
                f'{amount_col}_mean_{window.lower()}',
                f'{amount_col}_median_{window.lower()}',
                f'{amount_col}_std_{window.lower()}'
            ])

        fill_cols = ['customer_exp_mean','customer_exp_std','ratio_to_customer_mean','zscore_amount_customer']
        df[fill_cols] = df[fill_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
        AMOUNT_FEATURES[amount_col] = AMOUNT_FEATURES[amount_col].union(fill_cols)



  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambda d: d.set_index('tx_datetime')[amount_col]
  .apply(lambd

## Sector-related features

In [21]:
for df in X_train, X_val, X_test:
    last_seen = (
        df.groupby(['customer_id','sector_id'])['tx_datetime']
          .shift()
    )
    df[f'is_new_sector_30d'] = (
        (last_seen.isna()) | 
        ((df['tx_datetime'] - last_seen) > pd.Timedelta('30d'))
    ).astype(int)
    df['sector_switch_flag'] = (
        df.groupby('customer_id')['sector_id'].diff().ne(0).astype(int)
    )
    for window in WINDOWS:
        unique_sectors_vals = (
            df.groupby('customer_id', group_keys=False)
              .apply(lambda d: d.set_index('tx_datetime')['sector_id']
                                .shift()                          # исключаем текущую транзакцию
                                .rolling(window, closed='left')
                                .apply(lambda x: x.nunique(), raw=False))
        ).fillna(0).to_numpy()
        df[f'unique_sectors_{window}'] = unique_sectors_vals

        sectors_switches = (
            df.groupby('customer_id', group_keys=False)
              .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
                                .shift()
                                .rolling(window, closed='left').sum())
        ).fillna(0).to_numpy()
        df[f'sector_switches_{window}'] = sectors_switches
        SECTOR_FEATURES = SECTOR_FEATURES.union([
            f'unique_sectors_{window}',
            f'sector_switches_{window}'
        ])
SECTOR_FEATURES = SECTOR_FEATURES.union(['sector_switch_flag', f'is_new_sector_30d'])

  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['sector_switch_flag']
  .apply(lambda d: d.set_index('tx_datetime')['sector_id']
  .apply(lambda d: d.set_index('tx_datetime')['secto

## Risk-related features

In [22]:
def _delayed_risk_group(g: pd.DataFrame, time_col: str, target_col: str,
                        w_days: int, delay_days: int) -> pd.Series:
    g = g.sort_values(time_col)
    s = g.set_index(time_col)[target_col].astype(int)
  
    wd_sum = s.rolling(f'{w_days + delay_days}D', closed='left').sum()
    d_sum  = s.rolling(f'{delay_days}D',       closed='left').sum()
    fraud_window = wd_sum - d_sum

    wd_cnt = s.rolling(f'{w_days + delay_days}D', closed='left').count()
    d_cnt  = s.rolling(f'{delay_days}D',       closed='left').count()
    cnt_window = wd_cnt - d_cnt

    risk = (fraud_window / cnt_window.replace(0, np.nan)).fillna(0.0)
    return pd.Series(risk.values, index=g.index)

def add_delayed_risk_features(df: pd.DataFrame,
                              windows=(7, 30),
                              delay_days=7,
                              group_cols=('sector_id',),
                              time_col='tx_datetime',
                              target_col='tx_fraud',
                              prefix='sector') -> tuple[pd.DataFrame, list[str]]:
    feats = []

    # порядок и типы
    df = df.sort_values([*group_cols, time_col]).reset_index(drop=True)
    df[time_col] = pd.to_datetime(df[time_col], errors='coerce')
    df[target_col] = df[target_col].astype(int)

    for w in windows:
        col = f'{prefix}_risk_{w}d_delay{delay_days}d'
        vals = (
            df.groupby(list(group_cols), group_keys=False)
              .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
        ).to_numpy()
        df[col] = vals
        feats.append(col)

    return df, feats

In [23]:
X_train, sector_feats_tr = add_delayed_risk_features(
    X_train, windows=(7,30), delay_days=7,
    group_cols=('sector_id',), prefix='sector'
)
X_val,   sector_feats_va = add_delayed_risk_features(
    X_val,   windows=(7,30), delay_days=7,
    group_cols=('sector_id',), prefix='sector'
)
X_test,  sector_feats_te = add_delayed_risk_features(
    X_test,  windows=(7,30), delay_days=7,
    group_cols=('sector_id',), prefix='sector'
)

X_train, custsec_feats_tr = add_delayed_risk_features(
    X_train, windows=(7,30), delay_days=7,
    group_cols=('customer_id','sector_id'), prefix='customer_sector'
)
X_val,   custsec_feats_va = add_delayed_risk_features(
    X_val,   windows=(7,30), delay_days=7,
    group_cols=('customer_id','sector_id'), prefix='customer_sector'
)
X_test,  custsec_feats_te = add_delayed_risk_features(
    X_test,  windows=(7,30), delay_days=7,
    group_cols=('customer_id','sector_id'), prefix='customer_sector'
)

RISK_FEATURES = RISK_FEATURES.union(sector_feats_tr + custsec_feats_tr)

  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _delayed_risk_group(g, time_col, target_col, w_days=w, delay_days=delay_days))
  .apply(lambda g: _

Define helper function for calculating rolling mean value by window

In [24]:
def rolling_mean_timebased(df, window, column_name, agg_name):
    s = (df.set_index('tx_datetime')[column_name]
           .rolling(window, closed='left')
           .agg(agg_name))
    return s.reset_index(drop=False).set_index(df.index).iloc[:, 1]

### Aggregations of transaction amount by given window

In [25]:

eps = 1e-6
for df in [X_train, X_val, X_test]:
    for window in WINDOWS:
        for col in AMOUNT_FEATURES.keys():
            mean_ = (
                df.groupby('customer_id')
                    .apply(lambda d: rolling_mean_timebased(d, window, col, 'mean')).fillna(0)
                    .to_numpy()
            )
            std_ = (
                df.groupby('customer_id')
                    .apply(lambda d: rolling_mean_timebased(d, window, col, 'std')).fillna(0)
                    .to_numpy()
            )
            median_ = (
                df.groupby('customer_id')
                    .apply(lambda d: rolling_mean_timebased(d, window, col, 'median')).fillna(0)
                    .to_numpy()
            )
            sum_ = (
                df.groupby('customer_id')
                    .apply(lambda d: rolling_mean_timebased(d, window, col, 'sum')).fillna(0)
                    .to_numpy()
            )
            count_ = (
                df.groupby('customer_id')
                    .apply(lambda d: rolling_mean_timebased(d, window, col, 'count')).fillna(0)
                    .to_numpy()
            )
            df[f'{col}_mean_{window}'] = mean_
            df[f'{col}_std_{window}'] = std_
            df[f'{col}_median_{window}'] = median_
            df[f'{col}_sum_{window}'] = sum_
            df[f'{col}_count_{window}'] = count_

            df[f'{col}_zscore_{window}'] = (df[col] - mean_) / (std_ + eps)
            df[f'{col}_ration_to_mean_{window}'] = df[col] / (mean_ + eps)

            AMOUNT_FEATURES[col] = AMOUNT_FEATURES[col].union([
                f'{col}_mean_{window}',
                f'{col}_std_{window}',
                f'{col}_median_{window}',
                f'{col}_sum_{window}',
                f'{col}_count_{window}',
                f'{col}_zscore_{window}',
                f'{col}_ration_to_mean_{window}'
            ])

  .apply(lambda d: rolling_mean_timebased(d, window, col, 'mean')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'std')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'median')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'sum')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'count')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'mean')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'std')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'median')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'sum')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'count')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'mean')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'std')).fillna(0)
  .apply(lambda d: rolling_mean_timebased(d, window, col, 'median')).fillna

### Delay of fraud transaction for sector

In [26]:
def delayed_sector_stats_group(g: pd.DataFrame, time_col: str, target_col: str,
                               delay_days: int, w_days: int) -> pd.DataFrame:
    g = g.sort_values(time_col)
    y = g[target_col].astype(int)
    s = pd.Series(y.values, index=pd.to_datetime(g[time_col].values))

    wd = s.rolling(f'{w_days + delay_days}D', closed='left')
    d  = s.rolling(f'{delay_days}D',          closed='left')

    fraud_window = wd.sum() - d.sum()
    cnt_window   = wd.count() - d.count()

    risk = (fraud_window / cnt_window.replace(0, np.nan)).fillna(0.0)
    count = cnt_window.fillna(0.0)

    return pd.DataFrame({
        f'risk_{w_days}d_delay{delay_days}d': risk.to_numpy(),
        f'tx_{w_days}d_delay{delay_days}d': count.to_numpy()
    }, index=g.index)

In [27]:
delay_period = 7
for df in [X_train, X_val, X_test]:
    for w in [7, 14, 21, 30]:
        df_stats = (df.groupby('sector_id', group_keys=False)
                    .apply(lambda g: delayed_sector_stats_group(
                        g, time_col='tx_datetime', target_col='tx_fraud',
                        delay_days=delay_period, w_days=w)))
        for col in df_stats.columns:
            df[f'sector_{col}'] = df_stats[col].to_numpy()
            SECTOR_FEATURES.add(f'sector_{col}')

  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(
  .apply(lambda g: delayed_sector_stats_group(


### Average and std transaction amount for customer within window

In [28]:
for df in [X_train, X_val, X_test]:
    for col in  AMOUNT_FEATURES.keys():
        for window in WINDOWS:
                tx_window_col = f'customer_id_{col}_mean_{window}_window'
                tx_avg_amount_col = f'customer_id_{col}_std_{window}_window'

                mean_ = df.set_index('tx_datetime').groupby('customer_id')[col].apply(lambda s: s.shift().rolling(window).mean())
                std_ = df.set_index('tx_datetime').groupby('customer_id')[col].apply(lambda s: s.shift().rolling(window).std())
                df[tx_window_col] = mean_.to_numpy()
                df[tx_avg_amount_col] = std_.to_numpy()
                df[tx_window_col] = df[tx_window_col].fillna(0)
                df[tx_avg_amount_col] = df[tx_avg_amount_col].fillna(0)
                AMOUNT_FEATURES[col] = AMOUNT_FEATURES[col].union([
                    tx_window_col,
                    tx_avg_amount_col
                ])

### Top percentile of transaction for customer

In [29]:
def add_customer_top_percentile_flags(
    df: pd.DataFrame,
    amount_col: str = "tx_amount_log",
    percentiles = (95, 99),
    windows = (None, "30D", "7D")
) -> pd.DataFrame:
    df = df.sort_values(['customer_id','tx_datetime']).reset_index(drop=True)
    df['tx_datetime'] = pd.to_datetime(df['tx_datetime'], errors='coerce')

    if None in windows:
        for p in percentiles:
            thr = (df.groupby('customer_id', group_keys=False)[amount_col]
                     .apply(lambda s: s.shift().expanding().quantile(p/100.0)))
            col_thr = f'{amount_col}_thr_hist_p{p}'
            df[col_thr] = thr.fillna(0).to_numpy()
            df[f'top_p_hist_{p}'] = (df[amount_col] >= df[col_thr]).astype('int8')

    for w in windows:
        if w is None:
            continue
        for p in percentiles:
            thr = (df.groupby('customer_id', group_keys=False)
                     .apply(lambda g: g.set_index('tx_datetime')[amount_col]
                                        .rolling(w, closed='left')
                                        .quantile(p/100.0))
                  ).fillna(0).to_numpy()
            col_thr = f'{amount_col}_thr_{w}_p{p}'
            df[col_thr] = thr
            df[f'top_p_{w}_p{p}'] = (df[amount_col] >= df[col_thr]).astype('int8')

    flag_cols = [c for c in df.columns if c.startswith('top_p_')]
    df[flag_cols] = df[flag_cols].fillna(0)

    return df

In [30]:
X_train = add_customer_top_percentile_flags(X_train, amount_col='tx_amount_log', percentiles=(95,99), windows=WINDOWS + [None])
X_val   = add_customer_top_percentile_flags(X_val,   amount_col='tx_amount_log', percentiles=(95,99), windows=WINDOWS + [None])
X_test  = add_customer_top_percentile_flags(X_test,  amount_col='tx_amount_log', percentiles=(95,99), windows=WINDOWS + [None])

TOP_PERCENTILE_FEATURES += [c for c in X_train.columns if c.startswith('top_p_')]

  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambda g: g.set_index('tx_datetime')[amount_col]
  .apply(lambd

### Calculate `tx_amount` features for customer and globally

In [31]:
for df in [X_train, X_val, X_test]:
    for col in AMOUNT_FEATURES.keys():
        df[f'{col}_mean_customer'] = df.groupby('customer_id')[col].transform('mean').fillna(0)
        df[f'{col}_std_customer'] = df.groupby('customer_id')[col].transform('std').fillna(0)
        df[f'{col}_median_customer'] = df.groupby('customer_id')[col].transform('median').fillna(0)

        df[f'{col}_zscore_customer'] = (df[col] - df[f'{col}_mean_customer']) / (df[f'{col}_std_customer'] + eps)
        df[f'{col}_ration_to_mean_customer'] = df[col] / (df[f'{col}_mean_customer'] + eps)

        df[f'{col}_mean_global'] = df[col].mean()
        df[f'{col}_std_global'] = df[col].std()
        df[f'{col}_median_global'] = df[col].median()

        df[f'{col}_zscore_global'] = (df[col] - df[f'{col}_mean_global']) / (df[f'{col}_std_global'] + eps)
        df[f'{col}_ration_to_mean_global'] = df[col] / (df[f'{col}_mean_global'] + eps)

        AMOUNT_FEATURES[col] = AMOUNT_FEATURES[col].union([
            f'{col}_mean_customer',
            f'{col}_std_customer',
            f'{col}_median_customer',
            f'{col}_zscore_customer',
            f'{col}_ration_to_mean_customer',
            f'{col}_mean_global',
            f'{col}_std_global',
            f'{col}_median_global',
            f'{col}_zscore_global',
            f'{col}_ration_to_mean_global'
        ])

#### Datetime difference-based features

In [32]:
for df in [X_train, X_val, X_test]:
    df['tx_datetime_diff_customer'] = df.groupby('customer_id')['tx_datetime'].diff().dt.total_seconds()
    df['tx_datetime_diff_customer'] = df['tx_datetime_diff_customer'].fillna(0)
    df['tx_datetime_diff_mean_customer'] = df.groupby('customer_id')['tx_datetime_diff_customer'].transform('mean')
    df['tx_datetime_diff_mean_customer'] = df['tx_datetime_diff_mean_customer'].fillna(0)
    df['tx_datetime_diff_median_customer'] = df.groupby('customer_id')['tx_datetime_diff_customer'].transform('median')
    df['tx_datetime_diff_median_customer'] = df['tx_datetime_diff_median_customer'].fillna(0)
    df['tx_datetime_diff_std_customer'] = df.groupby('customer_id')['tx_datetime_diff_customer'].transform('std')
    df['tx_datetime_diff_std_customer'] = df['tx_datetime_diff_std_customer'].fillna(0)

    df['tx_datetime_diff_global'] = df['tx_datetime'].diff().dt.total_seconds()
    df['tx_datetime_diff_global'] = df['tx_datetime_diff_global'].fillna(0)
    df['tx_datetime_diff_mean_global'] = df['tx_datetime_diff_global'].mean()
    df['tx_datetime_diff_mean_global'] = df['tx_datetime_diff_mean_global'].fillna(0)
    df['tx_datetime_diff_median_global'] = df['tx_datetime_diff_global'].median()
    df['tx_datetime_diff_median_global'] = df['tx_datetime_diff_median_global'].fillna(0)
    df['tx_datetime_diff_std_global'] = df['tx_datetime_diff_global'].std()
    df['tx_datetime_diff_std_global'] = df['tx_datetime_diff_std_global'].fillna(0)

DATE_FEATURES = DATE_FEATURES.union([
    'tx_datetime_diff_customer',
    'tx_datetime_diff_mean_customer',
    'tx_datetime_diff_median_customer',
    'tx_datetime_diff_std_customer',
    'tx_datetime_diff_global',
    'tx_datetime_diff_mean_global',
    'tx_datetime_diff_median_global',
    'tx_datetime_diff_std_global'
])

## Final features list

Prepare final features list and dataframes for model training

In [33]:
AMOUNT_FEATURES['tx_amount_log']

{'customer_exp_mean',
 'customer_exp_std',
 'customer_id_tx_amount_log_mean_14d_window',
 'customer_id_tx_amount_log_mean_21d_window',
 'customer_id_tx_amount_log_mean_30d_window',
 'customer_id_tx_amount_log_mean_7d_window',
 'customer_id_tx_amount_log_std_14d_window',
 'customer_id_tx_amount_log_std_21d_window',
 'customer_id_tx_amount_log_std_30d_window',
 'customer_id_tx_amount_log_std_7d_window',
 'ratio_to_customer_mean',
 'tx_amount_log_count_14d',
 'tx_amount_log_count_21d',
 'tx_amount_log_count_30d',
 'tx_amount_log_count_7d',
 'tx_amount_log_mean_14d',
 'tx_amount_log_mean_21d',
 'tx_amount_log_mean_30d',
 'tx_amount_log_mean_7d',
 'tx_amount_log_mean_customer',
 'tx_amount_log_mean_global',
 'tx_amount_log_median_14d',
 'tx_amount_log_median_21d',
 'tx_amount_log_median_30d',
 'tx_amount_log_median_7d',
 'tx_amount_log_median_customer',
 'tx_amount_log_median_global',
 'tx_amount_log_ration_to_mean_14d',
 'tx_amount_log_ration_to_mean_21d',
 'tx_amount_log_ration_to_mean_30

In [34]:
SECTOR_FEATURES

{'is_new_sector_30d',
 'secs_since_prev_tx',
 'sector_risk_14d_delay7d',
 'sector_risk_21d_delay7d',
 'sector_risk_30d_delay7d',
 'sector_risk_7d_delay7d',
 'sector_switch_flag',
 'sector_switches_14d',
 'sector_switches_21d',
 'sector_switches_30d',
 'sector_switches_7d',
 'sector_tx_14d_delay7d',
 'sector_tx_21d_delay7d',
 'sector_tx_30d_delay7d',
 'sector_tx_7d_delay7d',
 'unique_sectors_14d',
 'unique_sectors_21d',
 'unique_sectors_30d',
 'unique_sectors_7d'}

In [35]:
DATE_FEATURES

{'day_of_week',
 'hour',
 'is_month_end',
 'is_month_start',
 'is_weekend',
 'month',
 'tx_datetime_diff_customer',
 'tx_datetime_diff_global',
 'tx_datetime_diff_mean_customer',
 'tx_datetime_diff_mean_global',
 'tx_datetime_diff_median_customer',
 'tx_datetime_diff_median_global',
 'tx_datetime_diff_std_customer',
 'tx_datetime_diff_std_global'}

In [36]:
RISK_FEATURES

{'customer_sector_risk_30d_delay7d',
 'customer_sector_risk_7d_delay7d',
 'sector_risk_30d_delay7d',
 'sector_risk_7d_delay7d'}

In [37]:
ROLLING_FEATURES

{'interval_burstiness_14d',
 'interval_burstiness_21d',
 'interval_burstiness_30d',
 'interval_burstiness_7d',
 'interval_mean_14d',
 'interval_mean_21d',
 'interval_mean_30d',
 'interval_mean_7d',
 'interval_std_14d',
 'interval_std_21d',
 'interval_std_30d',
 'interval_std_7d',
 'interval_zscore_14d',
 'interval_zscore_21d',
 'interval_zscore_30d',
 'interval_zscore_7d',
 'tx_count_14d',
 'tx_count_1d',
 'tx_count_1h',
 'tx_count_21d',
 'tx_count_30d',
 'tx_count_7d',
 'tx_count_sector_14d',
 'tx_count_sector_1d',
 'tx_count_sector_1h',
 'tx_count_sector_21d',
 'tx_count_sector_30d',
 'tx_count_sector_7d'}

In [38]:
TOP_PERCENTILE_FEATURES

['top_p_hist_95',
 'top_p_hist_99',
 'top_p_7d_p95',
 'top_p_7d_p99',
 'top_p_14d_p95',
 'top_p_14d_p99',
 'top_p_21d_p95',
 'top_p_21d_p99',
 'top_p_30d_p95',
 'top_p_30d_p99']

In [39]:
FEATURES = list(
    AMOUNT_FEATURES['tx_amount_log'] | 
    SECTOR_FEATURES | 
    DATE_FEATURES | 
    RISK_FEATURES | 
    ROLLING_FEATURES | 
    set(TOP_PERCENTILE_FEATURES)
)
FEATURES

['tx_amount_log_zscore_30d',
 'tx_amount_log_ration_to_mean_7d',
 'top_p_hist_99',
 'tx_amount_log_mean_global',
 'tx_count_sector_1d',
 'tx_amount_log_zscore_21d',
 'tx_amount_log_sum_7d',
 'tx_amount_log_ration_to_mean_21d',
 'tx_count_30d',
 'tx_amount_log_sum_21d',
 'is_month_end',
 'top_p_21d_p99',
 'tx_amount_log_count_21d',
 'sector_tx_21d_delay7d',
 'tx_count_sector_7d',
 'tx_count_14d',
 'day_of_week',
 'tx_amount_log_mean_14d',
 'tx_count_1h',
 'unique_sectors_21d',
 'tx_amount_log_std_customer',
 'tx_amount_log_sum_30d',
 'ratio_to_customer_mean',
 'customer_id_tx_amount_log_mean_30d_window',
 'tx_amount_log_ration_to_mean_customer',
 'tx_datetime_diff_mean_customer',
 'tx_datetime_diff_std_global',
 'interval_zscore_21d',
 'top_p_7d_p95',
 'tx_amount_log_count_7d',
 'tx_amount_log_median_30d',
 'unique_sectors_14d',
 'interval_mean_30d',
 'top_p_7d_p99',
 'tx_datetime_diff_std_customer',
 'sector_switches_30d',
 'top_p_14d_p99',
 'tx_amount_log_ration_to_mean_global',
 'int

Check if there is no `NaN` values in columns

In [40]:
null_columns = X_train.columns[X_train.isna().any()].to_list()
null_columns

[]

In [41]:
len(X_val[FEATURES].columns) == len(X_train[FEATURES].columns) == len(X_test[FEATURES].columns)

True

## Correlations

Since number of create features is pretty huge, let us find out, what of them have high correlation

In [47]:
corr = X_train[FEATURES].corr(method='pearson')

In [None]:
corr_abs = corr.abs()
high_corr = corr_abs.unstack().sort_values(ascending=False)
high_corr = high_corr[(high_corr < 1) & (high_corr > 0.97)]

In [58]:
high_corr

interval_zscore_21d                  interval_zscore_30d                    1.000000
interval_zscore_30d                  interval_zscore_21d                    1.000000
tx_amount_log_ration_to_mean_global  tx_amount_log_zscore_global            1.000000
tx_amount_log_zscore_global          tx_amount_log_ration_to_mean_global    1.000000
tx_amount_log_zscore_21d             tx_amount_log_zscore_30d               1.000000
                                                                              ...   
interval_zscore_14d                  interval_zscore_21d                    0.974495
sector_tx_14d_delay7d                sector_tx_21d_delay7d                  0.974152
sector_tx_21d_delay7d                sector_tx_14d_delay7d                  0.974152
                                     sector_tx_30d_delay7d                  0.973696
sector_tx_30d_delay7d                sector_tx_21d_delay7d                  0.973696
Length: 288, dtype: float64

In [98]:
import networkx as nx

pairs = [tuple(sorted(k)) for k in high_corr.keys()]
G = nx.Graph()
G.add_edges_from(pairs)

groups = list(nx.connected_components(G))
print(f"Found {len(groups)} correlated groups")

Found 17 correlated groups


In [None]:
columns_to_delete = []
for gr in groups:
    columns_to_delete.extend(sorted(gr)[1:])

columns_to_delete

['interval_zscore_21d',
 'interval_zscore_30d',
 'tx_amount_log_median_customer',
 'tx_amount_log_ration_to_mean_global',
 'tx_amount_log_zscore_global',
 'sector_switch_flag',
 'tx_amount_log_ration_to_mean_14d',
 'tx_amount_log_ration_to_mean_21d',
 'tx_amount_log_ration_to_mean_30d',
 'tx_amount_log_zscore_14d',
 'tx_amount_log_zscore_21d',
 'tx_amount_log_zscore_30d',
 'tx_datetime_diff_global',
 'tx_datetime_diff_customer',
 'tx_count_30d',
 'tx_count_sector_30d',
 'tx_count_21d',
 'tx_count_sector_21d',
 'tx_count_14d',
 'tx_count_sector_14d',
 'tx_count_7d',
 'tx_count_sector_7d',
 'tx_count_sector_1d',
 'customer_id_tx_amount_log_mean_14d_window',
 'customer_id_tx_amount_log_mean_21d_window',
 'customer_id_tx_amount_log_mean_30d_window',
 'customer_id_tx_amount_log_mean_7d_window',
 'tx_amount_log_mean_14d',
 'tx_amount_log_mean_21d',
 'tx_amount_log_mean_30d',
 'tx_amount_log_mean_7d',
 'tx_amount_log_median_14d',
 'tx_amount_log_median_21d',
 'tx_amount_log_median_30d',
 'tx_

In [96]:
print(f"Overall features count: {len(set(FEATURES))}")
print(f"Columns to delete (due to high correlation) count: {len(set(columns_to_delete))}")

Overall features count: 79
Columns to delete (due to high correlation) count: 44


In [97]:
FEATURES = set(FEATURES) - set(columns_to_delete)

In [100]:
len(FEATURES)

79

Number of features decreased drastically (approximately, one third)

## Save dataframes

In [42]:
filenames = {
    'x_train_features.parquet.gzip': X_train,
    'x_val_features.parquet.gzip': X_val,
    'x_test_features.parquet.gzip': X_test,
}

for fn, df in filenames.items():
    df.to_parquet(INTERIM_DATA / fn, compression='gzip')

In [95]:
with open(INTERIM_DATA / "features_list.txt", "w") as f:
    for feature in FEATURES:
        f.write(feature + '\n')