In [1]:
import pandas as pd
import datetime
import sklearn
import xgboost as xgboost
from sklearn.metrics import roc_auc_score

import warnings
# Ignore every warning category from this point on. Warnings raised by the
# imports above are emitted before this filter is installed, so they still show.
# NOTE(review): a blanket 'ignore' also hides pandas/sklearn deprecation
# warnings in later cells; consider restricting it to specific categories.
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
def scaleData(train,test,features):
    """Standardize the given feature columns using training-set statistics.

    A StandardScaler is fitted on ``train[features]`` only and then applied to
    both frames, so nothing from ``test`` influences the scaling parameters.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column named in ``features``.
    features : list of str
        Names of the columns to standardize.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)`` as new frames whose feature columns hold the scaled
        values. The frames passed in are left unmodified.
    """
    # Imported explicitly: `import sklearn` on its own does not load the
    # `preprocessing` submodule, so `sklearn.preprocessing` would only resolve
    # if some other import happened to pull it in first.
    from sklearn.preprocessing import StandardScaler

    # Work on copies so the caller's frames are not overwritten in place.
    train = train.copy()
    test = test.copy()

    scaler = StandardScaler()
    scaler.fit(train[features])
    train[features] = scaler.transform(train[features])
    test[features] = scaler.transform(test[features])

    return (train,test)

In [3]:
def get_train_test_set(transactions_df,
                       start_date_training,
                       delta_train=7,delta_delay=7,delta_test=7):
    """Split transactions into a training set and a delayed test set.

    The training set holds the rows whose TX_DATETIME lies in
    [start_date_training, start_date_training + delta_train days).
    The test set is built day by day from TX_TIME_DAYS: test day ``i``
    (``i`` in ``range(delta_test)``) is the day numbered
    ``first training day + delta_train + delta_delay + i``.

    Customers with a fraudulent transaction in the training set are excluded
    from every test day. In addition, before test day ``i`` is filtered, the
    customers with a fraud on day ``first training day + delta_train + i - 1``
    are added to that exclusion pool, and the pool keeps growing across days.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Must provide the columns TX_DATETIME, TX_TIME_DAYS, TX_FRAUD,
        CUSTOMER_ID and TRANSACTION_ID.
    start_date_training : datetime.datetime
        First instant of the training period.
    delta_train, delta_delay, delta_test : int
        Lengths, in days, of the training, delay and test periods.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, each sorted by ascending TRANSACTION_ID.
        ``test_df`` is an empty frame with the same columns as
        ``transactions_df`` when ``delta_test`` yields no test day.
    """
    # Get the training set data
    end_date_training = start_date_training + datetime.timedelta(days=delta_train)
    train_df = transactions_df[(transactions_df.TX_DATETIME >= start_date_training) &
                               (transactions_df.TX_DATETIME < end_date_training)]

    # Note: Cards known to be compromised after the delay period are removed from the test set
    # That is, for each test day, all frauds known at (test_day-delay_period) are removed

    # First, get known defrauded customers from the training set
    known_defrauded_customers = set(train_df[train_df.TX_FRAUD == 1].CUSTOMER_ID)

    # Get the relative starting day of training set (easier than TX_DATETIME to collect test data)
    start_tx_time_days_training = train_df.TX_TIME_DAYS.min()

    # Looked up once; it is compared against two different day numbers per test day
    tx_time_days = transactions_df.TX_TIME_DAYS

    # Then, for each day of the test set
    test_df_days = []
    for day in range(delta_test):

        # Get test data for that day
        test_df_day = transactions_df[tx_time_days == start_tx_time_days_training +
                                                      delta_train + delta_delay +
                                                      day]

        # Compromised cards from that test day, minus the delay period, are added to the pool of known defrauded customers
        test_df_day_delay_period = transactions_df[tx_time_days == start_tx_time_days_training +
                                                                   delta_train +
                                                                   day - 1]

        new_defrauded_customers = set(test_df_day_delay_period[test_df_day_delay_period.TX_FRAUD == 1].CUSTOMER_ID)
        known_defrauded_customers = known_defrauded_customers.union(new_defrauded_customers)

        test_df_day = test_df_day[~test_df_day.CUSTOMER_ID.isin(known_defrauded_customers)]

        test_df_days.append(test_df_day)

    if test_df_days:
        test_df = pd.concat(test_df_days)
    else:
        # pd.concat raises ValueError on an empty list (e.g. delta_test=0);
        # return an empty frame that keeps the input's columns instead.
        test_df = transactions_df.iloc[0:0]

    # Sort data sets by ascending order of transaction ID
    train_df = train_df.sort_values('TRANSACTION_ID')
    test_df = test_df.sort_values('TRANSACTION_ID')

    return (train_df, test_df)

In [4]:
# Load the transactions; TX_DATETIME is parsed into datetimes at read time.
# NOTE(review): the preview of this frame shows 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, which usually means the CSV was saved with its index (twice) --
# confirm whether they should be dropped or read via index_col.
transactions_df = pd.read_csv('simulated-data-raw/trs.csv', parse_dates=['TX_DATETIME'])

In [5]:
# Preview the first rows (head() defaults to five) to check the loaded columns.
transactions_df.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO,...,CUSTOMER_ID_NB_TX_7DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,CUSTOMER_ID_NB_TX_30DAY_WINDOW,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW,TERMINAL_ID_NB_TX_1DAY_WINDOW,TERMINAL_ID_RISK_1DAY_WINDOW,TERMINAL_ID_NB_TX_7DAY_WINDOW,TERMINAL_ID_RISK_7DAY_WINDOW,TERMINAL_ID_NB_TX_30DAY_WINDOW,TERMINAL_ID_RISK_30DAY_WINDOW
0,0,0,2021-07-01 00:00:31,596,3156,1143.26,31,0,0,0,...,1.0,1143.26,1.0,1143.26,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,2021-07-01 00:02:10,4961,3412,1630.23,130,0,0,0,...,1.0,1630.23,1.0,1630.23,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2,2021-07-01 00:07:56,2,1365,2920.0,476,0,0,0,...,1.0,2920.0,1.0,2920.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,3,2021-07-01 00:09:29,4128,8737,1289.88,569,0,0,0,...,1.0,1289.88,1.0,1289.88,0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,2021-07-01 00:10:34,927,9906,1019.77,634,0,0,0,...,1.0,1019.77,1.0,1019.77,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# First instant of the training period: midnight on 25 November 2021 (naive datetime).
start_date_training = datetime.datetime.strptime(
    "2021-11-25",
    "%Y-%m-%d",
)

In [7]:
# One week of training data, a one-week delay, then one week of test data.
train_df, test_df = get_train_test_set(
    transactions_df,
    start_date_training,
    delta_train=7,
    delta_delay=7,
    delta_test=7,
)

In [8]:
# Label column the classifier is trained to predict.
output_feature = "TX_FRAUD"

# Columns fed to the classifier, grouped by what they describe.
input_features = [
    # Transaction-level features
    'TX_AMOUNT',
    'TX_DURING_WEEKEND',
    'TX_DURING_NIGHT',
    # Customer features over 1-, 7- and 30-day windows
    'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
    # Terminal features over 1-, 7- and 30-day windows
    'TERMINAL_ID_NB_TX_1DAY_WINDOW',
    'TERMINAL_ID_RISK_1DAY_WINDOW',
    'TERMINAL_ID_NB_TX_7DAY_WINDOW',
    'TERMINAL_ID_RISK_7DAY_WINDOW',
    'TERMINAL_ID_NB_TX_30DAY_WINDOW',
    'TERMINAL_ID_RISK_30DAY_WINDOW',
]

In [9]:
# Standardize the input features; the scaler is fitted on the training set only.
train_df, test_df = scaleData(train_df, test_df, input_features)

In [10]:
# XGBoost classifier with default hyper-parameters; random_state=0 fixes the seed.
# NOTE(review): n_jobs=-1 presumably means "use all available threads" -- confirm
# against the installed xgboost version.
classifier = xgboost.XGBClassifier(random_state=0,n_jobs=-1)

In [11]:
# Fit on the training set only: selected input features against the TX_FRAUD label.
classifier.fit(train_df[input_features], train_df[output_feature])



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=-1,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [12]:
# Keep column 1 of predict_proba for the test set.
# NOTE(review): presumably the probability of TX_FRAUD == 1, assuming the
# classifier's classes are ordered [0, 1] -- verify via classifier.classes_.
predictions_test=classifier.predict_proba(test_df[input_features])[:,1]

In [13]:
# Same scores for the training set, i.e. on the data the model was fitted on.
# NOTE(review): column 1 is presumably the probability of TX_FRAUD == 1 --
# verify via classifier.classes_.
predictions_train=classifier.predict_proba(train_df[input_features])[:,1]

In [14]:
# Store the training scores as a new column next to the true labels.
train_df['TX_FRAUD_PREDICTED']=predictions_train

In [15]:
# Store the test scores as a new column next to the true labels.
test_df['TX_FRAUD_PREDICTED']=predictions_test

In [16]:
# Display the training-set scores (NumPy prints a truncated view of the array).
predictions_train

array([9.9566442e-01, 5.5838205e-05, 2.3420146e-04, ..., 5.1771603e-03,
       3.1502231e-03, 4.2049885e-03], dtype=float32)

In [17]:
# Display the test-set scores (NumPy prints a truncated view of the array).
predictions_test

array([0.00157972, 0.0001757 , 0.0038097 , ..., 0.0013182 , 0.00098514,
       0.00103186], dtype=float32)

In [18]:
# ROC AUC on the training set. The model was fitted on these rows, so this is an
# optimistic figure and is only useful for comparison with the test-set score.
roc_auc_score(train_df['TX_FRAUD'], train_df['TX_FRAUD_PREDICTED'])

0.9998657187080929

In [19]:
# ROC AUC on the delayed test set, which the model did not see during fitting.
roc_auc_score(test_df['TX_FRAUD'], test_df['TX_FRAUD_PREDICTED'])

0.8692832197592001