In [325]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from lib.sampling import subsampling
import numpy as np
from sklearn.model_selection import GridSearchCV
from lib.mutation_util import date
from sklearn import metrics
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Preparation (balance the classes by subsampling before classification)

In [231]:
# Training window. `end_date` is interpolated into the names of the feature
# CSV files that later cells read.
start_date = "2021-01-01"
end_date = "2021-11-30"

In [232]:
# Load the engineered feature table whose file name carries `end_date`.
df_features = pd.read_csv(f"../feature-engineering/final_features_{end_date}.csv")

# Subsample the non-fraudulent transaction records so the two classes are balanced.
df_fraudulent = df_features[df_features['has_fraudulent_dispute'] == True]
df_non_fraudulent = df_features[df_features['has_fraudulent_dispute'] == False]
# `subsampling` is a project helper (lib.sampling); presumably it returns
# len(df_fraudulent) labels drawn from the given index — TODO confirm, and
# confirm whether the draw is seeded (reproducibility depends on it).
subsample_index= subsampling(df_non_fraudulent.index, len(df_fraudulent))
df_non_fraudulent_subsample  = df_non_fraudulent.loc[subsample_index,:]
# Stack the subsampled negatives on top of all positives (rows are not shuffled).
df_sample = pd.concat([df_non_fraudulent_subsample,df_fraudulent], axis=0)
df_sample.shape

(19438, 56)

In [233]:
# Split the label from the predictors; identifier, date and label columns are
# removed from the feature matrix.
y = df_sample["has_fraudulent_dispute"]
X = df_sample.drop(
    columns=["date", "psp_reference", "has_fraudulent_dispute", "is_refused_by_adyen"]
)

In [234]:
y.value_counts()

False    9719
True     9719
Name: has_fraudulent_dispute, dtype: int64

## Training (Tuning)

In [235]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19438 entries, 86282 to 127004
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   is_credit                19438 non-null  bool   
 1   no_ip                    19438 non-null  bool   
 2   no_email                 19438 non-null  bool   
 3   same_country             19438 non-null  bool   
 4   merchant_Merchant B      19438 non-null  int64  
 5   merchant_Merchant C      19438 non-null  int64  
 6   merchant_Merchant D      19438 non-null  int64  
 7   merchant_Merchant E      19438 non-null  int64  
 8   card_scheme_MasterCard   19438 non-null  int64  
 9   card_scheme_Other        19438 non-null  int64  
 10  card_scheme_Visa         19438 non-null  int64  
 11  ip_country_GR            19438 non-null  int64  
 12  ip_country_IT            19438 non-null  int64  
 13  ip_country_NL            19438 non-null  int64  
 14  ip_country_ZW    

In [291]:
# Baseline: forest trained on the single `is_credit` feature.
X_subset = X.loc[:, ["is_credit"]]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# Column 1 of predict_proba is the probability of the second class in
# rf0.classes_; the AUC below is measured on the training rows themselves.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.6465171313921185
AUC Score (Train): 0.6465171313921185


In [237]:
# All features except the woe and graph ones: the contiguous column range
# from `is_credit` through `zip_code_ZB` (label slices include both ends).
X_subset = X.loc[:, "is_credit":"zip_code_ZB"]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.7346434818396954
AUC Score (Train): 0.9444194923973386


In [238]:
# `is_credit` plus the IP-address woe feature only.
X_subset = X.loc[:, ["is_credit", "ip_address_woe"]]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.8329046198168536
AUC Score (Train): 0.9315381681998387


In [279]:
# `is_credit` plus all three woe features (ip, email, card).
X_subset = X.loc[:, ["is_credit", "ip_address_woe", "email_address_woe", "card_number_woe"]]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.965325650787118
AUC Score (Train): 0.9965906040402033


In [240]:
# Graph features only: the contiguous column range from `ip_node_degree`
# through `card_page_rank` (label slices include both ends).
X_subset = X.loc[:, "ip_node_degree":"card_page_rank"]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.8620228418561581
AUC Score (Train): 0.9532777573109237


In [241]:
# Graph features followed by `is_credit` (this column order is kept in X_subset).
X_subset = pd.concat(
    [X.loc[:, "ip_node_degree":"card_page_rank"], X.loc[:, ["is_credit"]]],
    axis=1,
)
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.8781767671571149
AUC Score (Train): 0.9667844906742094


In [315]:
# Graph features, then `is_credit`, then the woe features.
# NOTE(review): `rf0` and `X_subset` are reassigned by every experiment cell;
# whichever cell ran last determines what later cells see.
X_subset = pd.concat(
    [
        X.loc[:, "ip_node_degree":"card_page_rank"],
        X.loc[:, ["is_credit"]],
        X.loc[:, "ip_address_woe":"card_number_woe"],
    ],
    axis=1,
)
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.9651198682992077
AUC Score (Train): 0.9971551084496896


## Testing

In [243]:
# Hold-out window: the calendar month immediately after the training `end_date`.
test_start_date = "2021-12-01"
test_end_date = "2021-12-31"

In [244]:
# Load the raw transaction file and index it by timestamp so the test month
# can be selected with a date-range slice.
df_ori = pd.read_csv("../adyen-dataset.csv")
# `date` is a project helper (lib.mutation_util) applied row-wise; presumably it
# builds a parseable date string from each row — TODO confirm.
df_ori["date"] = pd.to_datetime(df_ori.apply(date, axis=1))
# Reassign instead of mutating in place; df_ori still ends up indexed by date.
df_ori = df_ori.set_index('date')
df_ts = df_ori.sort_index()
# Label slicing on the sorted date index includes both end points.
df_test_ts = df_ts.loc[test_start_date:test_end_date]

In [245]:
# Keep only the raw columns needed to derive the test features, plus the label.
df_test_ts = df_test_ts.loc[
    :, ["is_credit", "ip_address", "email_address", "card_number", "has_fraudulent_dispute"]
]
# Discard the date index in favour of a fresh 0..n-1 range index.
df_test = df_test_ts.reset_index(drop=True)
df_test

Unnamed: 0,is_credit,ip_address,email_address,card_number,has_fraudulent_dispute
0,False,cqHg4ONJtOBwqNNSjawwVg,vpCrxKV4Rb4hHYZ_xUQrnA,92TKnNIoPTXgok35txpjyg,False
1,True,I4zf2g3Z4BdSVbOtXDmw7g,YKy04vP5R13SJl5ydrfr3w,iWwHGPT5vcubXqewt6QmFQ,False
2,True,M3g-untYinyAQiAIXKGNSw,f-37afIGJJkYJHQXM_HKMg,HDPyxyvo2Lqebm-qusJqOQ,False
3,True,KT8qTQ6tiACsTKJsyxvPIg,36VSxpSABRjj43cfP6lwjw,gbsYb_MOBHH6FXa_a7c7og,False
4,True,wpnDR446pjLAUoNYueB-yg,5MO1Y0cW3sRJ56IbZCwOIQ,s0sj9IsbO7VMzyIsRBBZOA,False
...,...,...,...,...,...
11646,True,NUZxGz6wWtW1enn19ghi4g,D5nXKSWwpD40xABXiW5drg,2XeZsqgafyXVZrQNZpC7Kw,False
11647,False,-0zsia1FtEIuWYt71pLrKw,,leow_iHIe06J6n5YLGOgyQ,False
11648,True,sDicZTCFj6lP8AiV6WFC7g,NJPPoAQt0eoYvDtBIf5tHQ,Gd03xMeOIMabaEfpvqR1wA,False
11649,True,hAKt2EU3vmjrGOT6ueNsdw,3zRwZ-rq1GTbTuPpE9mwXw,h_1om9k6GCO7nv5NeoDgfQ,False


In [246]:
# Graph-feature lookup table from the feature-engineering folder; the file name
# carries the training `end_date`. The helper below reads its "Node", "DEGREE",
# "EIGENVECTOR_CENTRALITY" and "PAGE_RANK" columns.
df_graph_features_lookup_table = pd.read_csv(f"../feature-engineering/graph_features_lookup_table_{end_date}.csv")

In [247]:
def append_graph_features(node_type_name, df, graph_feature_lookup_table,
                          default_degree=2, default_eigen_centrality=0.33,
                          default_page_rank=0.57):
    """Return a copy of ``df`` with three graph-feature columns appended.

    For every row, the value in column ``node_type_name`` is looked up in the
    ``"Node"`` column of ``graph_feature_lookup_table``. The first matching
    lookup row supplies ``DEGREE``, ``EIGENVECTOR_CENTRALITY`` and
    ``PAGE_RANK``. Rows whose node is missing (NaN/None) or absent from the
    lookup table receive the default values instead.

    The new columns are named ``<prefix>_node_degree``,
    ``<prefix>_eigen_centrality`` and ``<prefix>_page_rank``, where
    ``<prefix>`` is the part of ``node_type_name`` before the first
    underscore (e.g. ``"ip_address"`` -> ``"ip"``).

    Parameters
    ----------
    node_type_name : str
        Column of ``df`` holding the node identifier.
    df : pandas.DataFrame
        Frame to enrich; it is not modified.
    graph_feature_lookup_table : pandas.DataFrame
        Must contain the columns ``Node``, ``DEGREE``,
        ``EIGENVECTOR_CENTRALITY`` and ``PAGE_RANK``.
    default_degree, default_eigen_centrality, default_page_rank
        Fallback values for nodes not found in the lookup table. The defaults
        keep the values originally hard-coded here.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the three feature columns appended.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    prefix = node_type_name.split('_')[0]
    fallback = (default_degree, default_eigen_centrality, default_page_rank)

    # Build the node -> features mapping once, rather than boolean-filtering the
    # whole lookup table three times for every row of ``df``.
    features_by_node = {}
    lookup_columns = graph_feature_lookup_table[
        ["Node", "DEGREE", "EIGENVECTOR_CENTRALITY", "PAGE_RANK"]
    ]
    for node, degree, eigen, rank in lookup_columns.itertuples(index=False, name=None):
        # A missing node never compares equal to anything, so it can never match.
        if pd.isna(node):
            continue
        # setdefault keeps the first occurrence, like ``.iloc[0]`` on the filtered rows.
        features_by_node.setdefault(node, (degree, eigen, rank))

    def append_graph_features_by_row(row):
        degree, eigen, rank = features_by_node.get(row[node_type_name], fallback)
        row[f"{prefix}_node_degree"] = degree
        row[f"{prefix}_eigen_centrality"] = eigen
        row[f"{prefix}_page_rank"] = rank
        return row

    return df.apply(append_graph_features_by_row, axis=1)

In [248]:
# Append the degree / eigen-centrality / page-rank columns for each node type, in this order.
for node_column in ("ip_address", "email_address", "card_number"):
    df_test = append_graph_features(node_column, df_test, df_graph_features_lookup_table)

In [249]:
# Woe feature table from the feature-engineering folder; the file name carries
# the training `end_date`. The helper below reads a `<col>` / `<col>_woe`
# column pair from it for each entity column.
df_woe_features = pd.read_csv(f"../feature-engineering/woe_features_{end_date}.csv")

In [250]:
def append_woe_features(col_name, df, woe_features, default_woe=0.0):
    """Return a copy of ``df`` with a ``<col_name>_woe`` column appended.

    For every row, the value in column ``col_name`` is looked up in the column
    of the same name in ``woe_features``. The first matching row supplies the
    ``<col_name>_woe`` value. Rows whose key is missing (NaN/None) or absent
    from ``woe_features`` receive ``default_woe``.

    Parameters
    ----------
    col_name : str
        Key column, present in both ``df`` and ``woe_features``.
    df : pandas.DataFrame
        Frame to enrich; it is not modified.
    woe_features : pandas.DataFrame
        Must contain the columns ``col_name`` and ``f"{col_name}_woe"``.
    default_woe : float, optional
        Fallback for unseen keys. The default keeps the value originally
        hard-coded here (0.0).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the woe column appended.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    woe_column = f"{col_name}_woe"

    # Build the key -> woe mapping once, rather than boolean-filtering the whole
    # table for every row of ``df``.
    woe_by_key = {}
    for key, woe in zip(woe_features[col_name], woe_features[woe_column]):
        # A missing key never compares equal to anything, so it can never match.
        if pd.isna(key):
            continue
        # setdefault keeps the first occurrence, like ``.iloc[0]`` on the filtered rows.
        woe_by_key.setdefault(key, woe)

    def append_woe_features_by_row(row):
        row[woe_column] = woe_by_key.get(row[col_name], default_woe)
        return row

    return df.apply(append_woe_features_by_row, axis=1)

In [251]:
# Append one `<column>_woe` feature per entity column, in this order.
for woe_source_column in ("ip_address", "email_address", "card_number"):
    df_test = append_woe_features(woe_source_column, df_test, df_woe_features)

## Predictions (is_credit, woe, graph)

In [383]:
# Persist the enriched test frame (row index omitted) to the working directory.
df_test.to_csv("test_dataset_december.csv", index=False)

In [359]:
# Label vector and feature matrix for the hold-out month. The label slice spans
# every appended column from `ip_node_degree` through `card_number_woe`.
y_test = df_test["has_fraudulent_dispute"]
X_test = pd.concat(
    [df_test[["is_credit"]], df_test.loc[:, "ip_node_degree":"card_number_woe"]],
    axis=1,
)

In [360]:
# Put the test columns in the same order as the training matrix.
# NOTE(review): `X_subset` is reassigned by several cells, so this relies on the
# most recently executed one having built the feature set the model was fit on.
X_test = X_test[X_subset.columns]
X_test

Unnamed: 0,ip_node_degree,ip_eigen_centrality,ip_page_rank,email_node_degree,email_eigen_centrality,email_page_rank,card_node_degree,card_eigen_centrality,card_page_rank,is_credit,ip_address_woe,email_address_woe,card_number_woe
0,2,3.184233e-03,0.000004,2,3.184233e-03,0.000004,3,3.198447e-03,0.000007,False,0.000000,1.392115,1.104433
1,2,-1.655327e-19,0.000007,2,-8.414725e-21,0.000007,2,1.576474e-19,0.000007,True,0.000000,0.000000,0.000000
2,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,True,0.000000,0.000000,0.000000
3,3,3.198642e-03,0.000005,5,3.212985e-03,0.000010,4,3.198770e-03,0.000007,True,1.392115,0.881290,0.881290
4,2,3.300000e-01,0.570000,2,3.184169e-03,0.000004,2,3.300000e-01,0.570000,True,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11646,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,True,0.000000,0.000000,0.000000
11647,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,False,0.000000,0.000000,0.000000
11648,2,8.075300e-20,0.000005,6,1.684996e-19,0.000016,4,-1.582400e-19,0.000011,True,0.000000,0.881290,1.104433
11649,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,2,3.300000e-01,0.570000,True,0.000000,0.000000,0.000000


In [376]:
# NOTE(review): `rf0` is reassigned by every experiment cell in the training
# section, so this scores whichever forest was fitted last in the kernel
# session — presumably the is_credit + graph + woe model; confirm before
# trusting the metrics that follow.
y_predict = rf0.predict(X_test)

In [377]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score, accuracy_score

In [378]:
# Rows are the true classes and columns the predicted classes, in sorted label
# order (False, True).
confusion_matrix(y_test, y_predict)

array([[10346,   308],
       [  451,   546]])

In [379]:
accuracy_score(y_test, y_predict)

0.9348553772208394

In [380]:
precision_score(y_test, y_predict)

0.639344262295082

In [381]:
recall_score(y_test, y_predict)

0.5476429287863591

In [382]:
f1_score(y_test, y_predict)

0.5899513776337115

## Tuning (not really useful)

In [None]:
# Feature matrix for tuning: graph features, then `is_credit`, then the woe features.
X_subset = pd.concat(
    [
        X.loc[:, "ip_node_degree":"card_page_rank"],
        X.loc[:, ["is_credit"]],
        X.loc[:, "ip_address_woe":"card_number_woe"],
    ],
    axis=1,
)

In [352]:
# Search step 1: number of trees, scored by 5-fold cross-validated ROC AUC.
param_test1 = {'n_estimators': range(10, 151, 10)}
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(random_state=10),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(X_subset, y)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([0.12378974, 0.19673738, 0.29205623, 0.42115102, 0.47942514,
         0.58582735, 0.70450187, 0.80118055, 0.88996224, 0.98919044,
         1.07789898, 1.17051721, 1.3112318 , 1.40421591, 1.4641624 ]),
  'std_fit_time': array([0.03867791, 0.00195145, 0.00378195, 0.06570024, 0.00501395,
         0.0080467 , 0.01330481, 0.01787398, 0.01472627, 0.02065134,
         0.01220449, 0.0179001 , 0.04361432, 0.0223939 , 0.01868218]),
  'mean_score_time': array([0.0084094 , 0.01020551, 0.01375451, 0.02915473, 0.02069988,
         0.02513857, 0.02910414, 0.03275661, 0.03674717, 0.03989892,
         0.04394736, 0.04837079, 0.05362921, 0.05739017, 0.0596478 ]),
  'std_score_time': array([0.0030775 , 0.00041452, 0.0001785 , 0.02299957, 0.00021715,
         0.0005639 , 0.00101597, 0.00084113, 0.00185126, 0.00038807,
         0.00142556, 0.00182118, 0.00357857, 0.00399383, 0.00194913]),
  'param_n_estimators': masked_array(data=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 

In [353]:
# Search step 2: tree depth and minimum split size, with n_estimators fixed at 30.
param_test2 = {
    'max_depth': range(3, 14, 2),
    'min_samples_split': range(50, 201, 20),
}
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=30, oob_score=True, random_state=10),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X_subset, y)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([0.16879406, 0.16404853, 0.16362352, 0.16366458, 0.16348042,
         0.16593161, 0.16338406, 0.16368375, 0.20606718, 0.20646753,
         0.21183357, 0.21314287, 0.20728927, 0.20637684, 0.20593209,
         0.20497589, 0.23962893, 0.23948202, 0.24233418, 0.24226007,
         0.24154611, 0.23784742, 0.23677883, 0.23629456, 0.2664885 ,
         0.26362209, 0.2625392 , 0.26205082, 0.26060171, 0.25924153,
         0.25764403, 0.25687222, 0.28216896, 0.28098378, 0.28396125,
         0.31028681, 0.27274718, 0.26891832, 0.26883111, 0.26608062,
         0.2907372 , 0.28888106, 0.28472695, 0.28860178, 0.28236542,
         0.27905354, 0.28301101, 0.27848196]),
  'std_fit_time': array([0.00508852, 0.00064684, 0.00027487, 0.00063139, 0.00028855,
         0.00514847, 0.00045908, 0.00030422, 0.0011309 , 0.00080787,
         0.00204518, 0.00163866, 0.00065204, 0.00052651, 0.00086094,
         0.00102962, 0.00100223, 0.00149451, 0.00211918, 0.00101606,
         0.00323926, 0.

In [354]:
# Search step 3: split and leaf sizes, with n_estimators=30 and max_depth=7 fixed.
param_test3 = {
    'min_samples_split': range(80, 150, 20),
    'min_samples_leaf': range(10, 60, 10),
}
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=30, max_depth=7, oob_score=True, random_state=10
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    cv=5,
)
gsearch3.fit(X_subset, y)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([0.24265223, 0.23990798, 0.2404212 , 0.2410212 , 0.24095545,
         0.24153118, 0.24131575, 0.2398077 , 0.24231324, 0.24241018,
         0.23773222, 0.2397378 , 0.24274039, 0.23850627, 0.23404059,
         0.2330893 , 0.24053025, 0.24314027, 0.24116521, 0.2366312 ]),
  'std_fit_time': array([0.00306061, 0.00128373, 0.00570162, 0.00268747, 0.00195683,
         0.00237842, 0.00240192, 0.0037528 , 0.00266088, 0.00287505,
         0.0033096 , 0.00188151, 0.00264979, 0.00242088, 0.00053005,
         0.00164323, 0.00444959, 0.00122904, 0.00101099, 0.00319665]),
  'mean_score_time': array([0.01094007, 0.01090541, 0.01066694, 0.01083016, 0.01097441,
         0.01105995, 0.01120739, 0.01088634, 0.01107702, 0.01190329,
         0.01089859, 0.01136689, 0.01112556, 0.01119714, 0.01090717,
         0.01092596, 0.01103725, 0.01107583, 0.01184969, 0.0111588 ]),
  'std_score_time': array([0.0002237 , 0.0001523 , 0.00016484, 0.00039312, 0.00019274,
         0.00036151, 0.0002

In [355]:
# Search step 4: features considered per split, with the other settings fixed.
param_test4 = {'max_features': range(3, 11, 2)}
gsearch4 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=30,
        max_depth=7,
        min_samples_split=100,
        min_samples_leaf=30,
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X_subset, y)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

({'mean_fit_time': array([0.24294357, 0.31898136, 0.39854727, 0.48486314]),
  'std_fit_time': array([0.00534745, 0.00245454, 0.00340895, 0.00498369]),
  'mean_score_time': array([0.0110496 , 0.01072111, 0.01066084, 0.01042175]),
  'std_score_time': array([0.00014124, 0.00016907, 0.0002374 , 0.00010479]),
  'param_max_features': masked_array(data=[3, 5, 7, 9],
               mask=[False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'max_features': 3},
   {'max_features': 5},
   {'max_features': 7},
   {'max_features': 9}],
  'split0_test_score': array([0.97377893, 0.97553078, 0.97567156, 0.9757606 ]),
  'split1_test_score': array([0.98133264, 0.98182323, 0.98118141, 0.9825947 ]),
  'split2_test_score': array([0.97954453, 0.98018462, 0.97952865, 0.97961717]),
  'split3_test_score': array([0.98042567, 0.98154833, 0.98109363, 0.98070749]),
  'split4_test_score': array([0.97699534, 0.9789914 , 0.97816169, 0.97811615]),
  'mean_test_score': array(

In [358]:
# Refit one forest with the hyper-parameters chosen in this tuning section and
# report its out-of-bag score and training-set AUC.
rf2 = RandomForestClassifier(
    n_estimators=30,
    max_depth=7,
    min_samples_split=100,
    min_samples_leaf=30,
    max_features=5,
    oob_score=True,
    random_state=10,
).fit(X_subset, y)
print(rf2.oob_score_)
# AUC on the training rows themselves, so expect it to be optimistic.
y_predprob = rf2.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

0.9690811811914806
AUC Score (Train): 0.986339607313699
