In [1]:
import pandas as pd

raw_df = pd.read_csv('data/fraudTrain.csv',index_col=0)

print(raw_df.shape)
raw_df.head()

(1203221, 22)


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376000.0,36.011293,-82.048315,0.0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376000.0,49.159047,-118.186462,0.0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154.0,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376000.0,43.150704,-112.154481,0.0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376000.0,47.034331,-112.561071,0.0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376000.0,38.674999,-78.632459,0.0


In [2]:
# data types
cat_cols = ['category','gender','state']
num_cols = ['amt','lat','long','city_pop',]
txt_cols = ['merchant','first','last','street','city','zip','job']
time_col = 'trans_date_trans_time'
date_col = 'dob'
id_cols = ['cc_num','trans_num']

# data cleaning

def clean_data(df):
    # drop duplicates
    df = df.copy()
    df.drop_duplicates(inplace=True)
    # drop columns
    df.drop(columns=id_cols, inplace=True)
    # convert dob column to datetime
    df[date_col] = df[date_col].apply(pd.to_datetime, format="%Y-%m-%d")
    # convert to age
    df['age'] = df[date_col].apply(lambda x: 2021 - x.year)
    # drop dob column
    df.drop(columns=date_col, inplace=True)
    # convert time columns to datetime
    df[time_col] = df[time_col].apply(pd.to_datetime, format="%Y-%m-%d %H:%M:%S")
    # convert to weekday
    df['weekday'] = df['trans_date_trans_time'].apply(lambda x: x.weekday())
    df['weekday'] = df['weekday'].astype('category')
    # convert to hour
    df['hour'] = df['trans_date_trans_time'].apply(lambda x: x.hour)
    df['hour'] = df['hour'].astype('int')
    # drop time column
    df.drop(columns=time_col, inplace=True)
    # convert categorical columns to category type
    df[cat_cols] = df[cat_cols].astype('category')
    # convert numerical columns to float
    df[num_cols] = df[num_cols].astype('float')
    # convert text columns to string
    df[txt_cols] = df[txt_cols].astype('str')
    return df

cleaned_df = clean_data(raw_df)
cleaned_df.head(5)

Unnamed: 0,merchant,category,amt,first,last,gender,street,city,state,zip,...,long,city_pop,job,unix_time,merch_lat,merch_long,is_fraud,age,weekday,hour
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654.0,...,-81.1781,3495.0,"Psychologist, counselling",1325376000.0,36.011293,-82.048315,0.0,33.0,1,0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160.0,...,-118.2105,149.0,Special educational needs teacher,1325376000.0,49.159047,-118.186462,0.0,43.0,1,0
2,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252.0,...,-112.262,4154.0,Nature conservation officer,1325376000.0,43.150704,-112.154481,0.0,59.0,1,0
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632.0,...,-112.1138,1939.0,Patent attorney,1325376000.0,47.034331,-112.561071,0.0,54.0,1,0
4,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433.0,...,-79.4629,99.0,Dance movement psychotherapist,1325376000.0,38.674999,-78.632459,0.0,35.0,1,0


In [3]:
structured_df = cleaned_df.drop(columns=txt_cols)
structured_df

Unnamed: 0,category,amt,gender,state,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,weekday,hour
0,misc_net,4.97,F,NC,36.0788,-81.1781,3495.0,1.325376e+09,36.011293,-82.048315,0.0,33.0,1,0
1,grocery_pos,107.23,F,WA,48.8878,-118.2105,149.0,1.325376e+09,49.159047,-118.186462,0.0,43.0,1,0
2,entertainment,220.11,M,ID,42.1808,-112.2620,4154.0,1.325376e+09,43.150704,-112.154481,0.0,59.0,1,0
3,gas_transport,45.00,M,MT,46.2306,-112.1138,1939.0,1.325376e+09,47.034331,-112.561071,0.0,54.0,1,0
4,misc_pos,41.96,M,VA,38.4207,-79.4629,99.0,1.325376e+09,38.674999,-78.632459,0.0,35.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1203216,misc_net,2.21,M,AL,33.9778,-86.5598,3996.0,1.368854e+09,33.783273,-85.765112,0.0,44.0,0,5
1203217,grocery_pos,88.34,M,MO,40.4931,-91.8912,519.0,1.368854e+09,40.153142,-92.806918,0.0,55.0,0,5
1203218,misc_pos,305.49,M,IL,41.1730,-89.2187,532.0,1.368854e+09,40.640759,-89.445451,0.0,65.0,0,5
1203219,grocery_pos,201.16,F,CA,33.6773,-118.0051,190249.0,1.368854e+09,33.889826,-117.933831,0.0,45.0,0,5


In [4]:
# train val test split
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(structured_df, test_size=0.2, random_state=123)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=123)
# drop duplicates and nans
train_df.dropna(inplace=True)
train_df.drop_duplicates(inplace=True)
test_df.dropna(inplace=True)
test_df.drop_duplicates(inplace=True)
val_df.dropna(inplace=True)
val_df.drop_duplicates(inplace=True)

print(train_df.shape)
train_df.head(5)

(962575, 14)


Unnamed: 0,category,amt,gender,state,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,weekday,hour
733360,home,85.15,F,NY,43.0498,-78.851,45100.0,1352492000.0,43.883519,-79.671577,0.0,28.0,5,20
796196,health_fitness,74.78,M,ND,48.3396,-102.24,229.0,1354492000.0,48.462268,-102.964408,0.0,38.0,0,23
550120,kids_pets,7.66,M,SC,32.8357,-79.8217,20478.0,1345748000.0,32.875318,-79.524554,0.0,24.0,4,18
648316,gas_transport,50.76,M,NH,42.8678,-71.042,4603.0,1349249000.0,43.242446,-70.669395,0.0,69.0,3,7
101961,personal_care,82.32,M,MS,32.3739,-90.1293,233060.0,1330545000.0,31.703997,-90.360769,0.0,49.0,3,19


In [5]:
# fit xgboost model
# y is is_fraud column
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import numpy as np

fraud_counts = train_df['is_fraud'].value_counts()
weights =  np.linspace(0.8, 2, 16)

def fit_xgboost_model(scale_pos_weight):
    xgb_model = xgb.XGBClassifier(
        n_estimators=100, max_depth=5, learning_rate=0.1,
        n_jobs=-1, random_state=42, enable_categorical=True,
        tree_method="gpu_hist",
        scale_pos_weight=scale_pos_weight)
    xgb_model.fit(train_df.drop(columns='is_fraud'), train_df['is_fraud'])
    return xgb_model

# choose weight with best auc score
best_sofar = 0
for scale_pos_weight in weights:
    xgb_model = fit_xgboost_model(scale_pos_weight)
    y_pred = xgb_model.predict_proba(val_df.drop(columns='is_fraud'))[:, 1]
    auc_score = roc_auc_score(val_df['is_fraud'], y_pred)
    print(f"scale_pos_weight: {scale_pos_weight:.4f}, auc_score: {auc_score:.4f}")
    if auc_score > best_sofar:
        best_sofar = auc_score
        best_weight = scale_pos_weight

print(f"best weight: {best_weight:.4f}, best auc score: {best_sofar:.4f}")

scale_pos_weight: 0.8000, auc_score: 0.9972
scale_pos_weight: 0.8800, auc_score: 0.9974
scale_pos_weight: 0.9600, auc_score: 0.9974
scale_pos_weight: 1.0400, auc_score: 0.9975
scale_pos_weight: 1.1200, auc_score: 0.9976
scale_pos_weight: 1.2000, auc_score: 0.9977
scale_pos_weight: 1.2800, auc_score: 0.9974
scale_pos_weight: 1.3600, auc_score: 0.9979
scale_pos_weight: 1.4400, auc_score: 0.9976
scale_pos_weight: 1.5200, auc_score: 0.9977
scale_pos_weight: 1.6000, auc_score: 0.9976
scale_pos_weight: 1.6800, auc_score: 0.9978
scale_pos_weight: 1.7600, auc_score: 0.9977
scale_pos_weight: 1.8400, auc_score: 0.9977
scale_pos_weight: 1.9200, auc_score: 0.9977
scale_pos_weight: 2.0000, auc_score: 0.9977
best weight: 1.3600, best auc score: 0.9979


In [6]:
# fit model with best weight
xgb_model = fit_xgboost_model(best_weight)

In [7]:
# evaluate model
from sklearn.metrics import classification_report

def evaluate_model(model, df):
    y_true = df['is_fraud']
    y_pred = model.predict(df.drop(columns='is_fraud'))
    print(classification_report(y_true, y_pred,digits=4))

print('Train')
evaluate_model(xgb_model, train_df)
print('Test')
evaluate_model(xgb_model, test_df)

Train
              precision    recall  f1-score   support

         0.0     0.9988    0.9998    0.9993    957017
         1.0     0.9534    0.7994    0.8696      5558

    accuracy                         0.9986    962575
   macro avg     0.9761    0.8996    0.9345    962575
weighted avg     0.9986    0.9986    0.9986    962575

Test
              precision    recall  f1-score   support

         0.0     0.9987    0.9997    0.9992    119635
         1.0     0.9397    0.7703    0.8466       688

    accuracy                         0.9984    120323
   macro avg     0.9692    0.8850    0.9229    120323
weighted avg     0.9983    0.9984    0.9983    120323



In [8]:
# artificially induce prior drift by sampling only 2% of non-fraudulent transactions from test set
drift_df = pd.concat([
    test_df.query('is_fraud == 1'),
    test_df.query('is_fraud == 0').sample(frac=.02),
])
drift_df

Unnamed: 0,category,amt,gender,state,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud,age,weekday,hour
957689,shopping_pos,659.61,F,AR,36.0244,-90.9288,7155.0,1.358644e+09,35.496723,-91.256960,1.0,44.0,0,1
762863,shopping_pos,722.07,F,FL,26.4722,-81.8122,224256.0,1.353640e+09,27.009774,-81.021017,1.0,36.0,5,3
978982,shopping_pos,1013.79,F,SC,33.9349,-80.8449,13717.0,1.359758e+09,34.800103,-80.802527,1.0,72.0,5,22
215213,misc_net,771.14,M,MN,44.9673,-93.2828,1022298.0,1.334892e+09,44.905161,-92.423825,1.0,45.0,5,3
715618,grocery_pos,349.06,M,MI,44.6001,-84.2931,864.0,1.351829e+09,45.186246,-83.479088,1.0,79.0,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989285,gas_transport,80.99,F,OH,39.6251,-82.7552,4512.0,1.360297e+09,39.277762,-82.739335,0.0,29.0,5,4
655560,kids_pets,49.82,M,MT,45.2857,-108.9082,1446.0,1.349525e+09,46.001940,-108.438889,0.0,43.0,6,12
658148,grocery_pos,87.08,F,NY,43.0397,-77.6871,10256.0,1.349592e+09,43.990500,-77.213881,0.0,53.0,0,6
450279,entertainment,144.02,M,WI,43.9446,-88.0911,5196.0,1.342699e+09,43.546864,-87.193367,0.0,43.0,4,12


In [9]:
category_map_old = dict(zip(
    list(range(len(train_df.columns))),
    train_df.columns.tolist()
))
category_map = category_map_old.copy()
for k,v in category_map_old.items():
    if v not in cat_cols:
        del category_map[k]
    else:
        category_map[k] = None
category_map

{0: None, 2: None, 3: None}

In [10]:
# drift detection using alibi_detect
# apply Chi-Squared and Kolmogorov-Smirnov tests to categorical and numerical features respectively
from alibi_detect.cd import TabularDrift

def detect_drift(df, test_data_name:str, include_target:bool, ref_df=train_df):
    if not include_target:
        df = df.drop(columns='is_fraud')
        ref_df = ref_df.drop(columns='is_fraud')
    
    # initialize drift detector, using conservative bonferroni correction
    cd = TabularDrift(
        ref_df.to_numpy(), p_val=.05,
        categories_per_feature=category_map,
        correction='bonferroni')
    print(f'Detecting drift on {test_data_name}')
    res = cd.predict(df.to_numpy(), drift_type='batch', return_p_val=True, return_distance=True)
    if res['data']['is_drift']:
        print('Drift detected')
    else:
        print('No drift detected')
    print()


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [11]:
# detect drift and evaluate on test set
detect_drift(test_df, 'test set', False)
evaluate_model(xgb_model, test_df)

# detect drift and evaluate on drift set
detect_drift(drift_df, 'drift set', False)
evaluate_model(xgb_model, drift_df)

Detecting drift on test set
No drift detected

              precision    recall  f1-score   support

         0.0     0.9987    0.9997    0.9992    119635
         1.0     0.9397    0.7703    0.8466       688

    accuracy                         0.9984    120323
   macro avg     0.9692    0.8850    0.9229    120323
weighted avg     0.9983    0.9984    0.9983    120323

Detecting drift on drift set
Drift detected

              precision    recall  f1-score   support

         0.0     0.9381    1.0000    0.9680      2393
         1.0     1.0000    0.7703    0.8703       688

    accuracy                         0.9487      3081
   macro avg     0.9690    0.8852    0.9192      3081
weighted avg     0.9519    0.9487    0.9462      3081

