In [230]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from lib.sampling import subsampling
import numpy as np
from sklearn.model_selection import GridSearchCV
from lib.mutation_util import date
from sklearn import metrics
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Preparation (balance the classes before classification)

In [231]:
# Date bounds, presumably of the training window (TODO confirm). end_date is
# embedded in the names of the feature CSV files read in later cells;
# start_date is not referenced in the cells visible here.
start_date = "2021-01-01"
end_date = "2021-11-30"

In [232]:
# Load the engineered feature table; its file name carries end_date.
df_features = pd.read_csv(f"../feature-engineering/final_features_{end_date}.csv")

# Balance the classes: keep every fraudulent row and pick as many
# non-fraudulent rows as there are fraudulent ones.
# NOTE(review): no seed is passed to subsampling here - confirm whether the
# draw is reproducible across kernel restarts.
df_fraudulent = df_features[df_features['has_fraudulent_dispute'].eq(True)]
df_non_fraudulent = df_features[df_features['has_fraudulent_dispute'].eq(False)]
subsample_index = subsampling(df_non_fraudulent.index, len(df_fraudulent))
df_non_fraudulent_subsample = df_non_fraudulent.loc[subsample_index, :]
df_sample = pd.concat([df_non_fraudulent_subsample, df_fraudulent])
df_sample.shape

(19438, 56)

In [233]:
# Target: the fraudulent-dispute flag.
y = df_sample["has_fraudulent_dispute"]
# Features: every remaining column except date, psp_reference and is_refused_by_adyen.
X = df_sample.drop(
    columns=["date", "psp_reference", "has_fraudulent_dispute", "is_refused_by_adyen"]
)

In [234]:
# Sanity check: show how many rows each class has after subsampling.
y.value_counts()

False    9719
True     9719
Name: has_fraudulent_dispute, dtype: int64

## Training (Tuning)

In [235]:
# List the feature columns, their non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19438 entries, 86282 to 127004
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   is_credit                19438 non-null  bool   
 1   no_ip                    19438 non-null  bool   
 2   no_email                 19438 non-null  bool   
 3   same_country             19438 non-null  bool   
 4   merchant_Merchant B      19438 non-null  int64  
 5   merchant_Merchant C      19438 non-null  int64  
 6   merchant_Merchant D      19438 non-null  int64  
 7   merchant_Merchant E      19438 non-null  int64  
 8   card_scheme_MasterCard   19438 non-null  int64  
 9   card_scheme_Other        19438 non-null  int64  
 10  card_scheme_Visa         19438 non-null  int64  
 11  ip_country_GR            19438 non-null  int64  
 12  ip_country_IT            19438 non-null  int64  
 13  ip_country_NL            19438 non-null  int64  
 14  ip_country_ZW    

In [291]:
# Baseline: a forest that sees nothing but the is_credit flag.
X_subset = X[["is_credit"]]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
# Column 1 of predict_proba is the second class in rf0.classes_.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y_true=y, y_score=y_predprob)}")

oob_score_: 0.6465171313921185
AUC Score (Train): 0.6465171313921185


In [237]:
# All features without the woe and graph features: the contiguous column
# range from is_credit to zip_code_ZB (label slices include both ends).
X_subset = X.loc[:, "is_credit":"zip_code_ZB"]
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y, y_predprob)}")

oob_score_: 0.7346434818396954
AUC Score (Train): 0.9444194923973386


In [238]:
# is_credit plus the ip_address_woe feature.
X_subset = X[["is_credit", "ip_address_woe"]]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y_true=y, y_score=y_predprob)}")

oob_score_: 0.8329046198168536
AUC Score (Train): 0.9315381681998387


In [279]:
# is_credit plus all three woe features (ip, email, card).
X_subset = X[["is_credit", "ip_address_woe", "email_address_woe", "card_number_woe"]]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y_true=y, y_score=y_predprob)}")

oob_score_: 0.965325650787118
AUC Score (Train): 0.9965906040402033


In [240]:
# Graph features only: the contiguous column range from ip_node_degree to
# card_page_rank (label slices include both ends).
X_subset = X.loc[:, "ip_node_degree":"card_page_rank"]
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y_true=y, y_score=y_predprob)}")

oob_score_: 0.8620228418561581
AUC Score (Train): 0.9532777573109237


In [241]:
# Graph features followed by the is_credit flag.
X_subset = pd.concat(
    [X.loc[:, "ip_node_degree":"card_page_rank"], X[["is_credit"]]],
    axis=1,
)
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y_true=y, y_score=y_predprob)}")

oob_score_: 0.8781767671571149
AUC Score (Train): 0.9667844906742094


In [266]:
# Graph features, then is_credit, then the woe features - in that column order.
X_subset = pd.concat(
    [
        X.loc[:, "ip_node_degree":"card_page_rank"],
        X[["is_credit"]],
        X.loc[:, "ip_address_woe":"card_number_woe"],
    ],
    axis=1,
)
rf0 = RandomForestClassifier(oob_score=True, random_state=10).fit(X_subset, y)
print(f"oob_score_: {rf0.oob_score_}")
# The AUC below is measured on the same rows the forest was fitted on.
y_predprob = rf0.predict_proba(X_subset)[:, 1]
print(f"AUC Score (Train): {metrics.roc_auc_score(y_true=y, y_score=y_predprob)}")
X_subset.info()

oob_score_: 0.9651198682992077
AUC Score (Train): 0.9971551084496896
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19438 entries, 86282 to 127004
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ip_node_degree          19438 non-null  float64
 1   ip_eigen_centrality     19438 non-null  float64
 2   ip_page_rank            19438 non-null  float64
 3   email_node_degree       19438 non-null  float64
 4   email_eigen_centrality  19438 non-null  float64
 5   email_page_rank         19438 non-null  float64
 6   card_node_degree        19438 non-null  float64
 7   card_eigen_centrality   19438 non-null  float64
 8   card_page_rank          19438 non-null  float64
 9   is_credit               19438 non-null  bool   
 10  ip_address_woe          19438 non-null  float64
 11  email_address_woe       19438 non-null  float64
 12  card_number_woe         19438 non-null  float64
dtypes: bool(1), float

## Testing

In [243]:
# Bounds of the test period; used below to slice the date-indexed transactions.
test_start_date = "2021-12-01"
test_end_date = "2021-12-31"

In [244]:
# Read the raw transactions and derive a datetime column with the project's
# date helper, applied row by row.
df_ori = pd.read_csv("../adyen-dataset.csv")
df_ori["date"] = df_ori.apply(date, axis=1)
df_ori["date"] = pd.to_datetime(df_ori["date"])
# Index by date and sort so the label-based date slice below is well defined.
df_ori = df_ori.set_index("date")
df_ts = df_ori.sort_index()
df_test_ts = df_ts.loc[test_start_date:test_end_date]

In [245]:
# Keep only the columns needed to rebuild the model inputs and the label,
# then replace the date index with a plain positional index.
df_test_ts = df_test_ts.loc[
    :, ["is_credit", "ip_address", "email_address", "card_number", "has_fraudulent_dispute"]
]
df_test = df_test_ts.reset_index(drop=True)
df_test

Unnamed: 0,is_credit,ip_address,email_address,card_number,has_fraudulent_dispute
0,False,cqHg4ONJtOBwqNNSjawwVg,vpCrxKV4Rb4hHYZ_xUQrnA,92TKnNIoPTXgok35txpjyg,False
1,True,I4zf2g3Z4BdSVbOtXDmw7g,YKy04vP5R13SJl5ydrfr3w,iWwHGPT5vcubXqewt6QmFQ,False
2,True,M3g-untYinyAQiAIXKGNSw,f-37afIGJJkYJHQXM_HKMg,HDPyxyvo2Lqebm-qusJqOQ,False
3,True,KT8qTQ6tiACsTKJsyxvPIg,36VSxpSABRjj43cfP6lwjw,gbsYb_MOBHH6FXa_a7c7og,False
4,True,wpnDR446pjLAUoNYueB-yg,5MO1Y0cW3sRJ56IbZCwOIQ,s0sj9IsbO7VMzyIsRBBZOA,False
...,...,...,...,...,...
11646,True,NUZxGz6wWtW1enn19ghi4g,D5nXKSWwpD40xABXiW5drg,2XeZsqgafyXVZrQNZpC7Kw,False
11647,False,-0zsia1FtEIuWYt71pLrKw,,leow_iHIe06J6n5YLGOgyQ,False
11648,True,sDicZTCFj6lP8AiV6WFC7g,NJPPoAQt0eoYvDtBIf5tHQ,Gd03xMeOIMabaEfpvqR1wA,False
11649,True,hAKt2EU3vmjrGOT6ueNsdw,3zRwZ-rq1GTbTuPpE9mwXw,h_1om9k6GCO7nv5NeoDgfQ,False


In [246]:
# Graph-feature lookup table read from the feature-engineering directory; the
# file name carries end_date. The cells below read its Node, DEGREE,
# EIGENVECTOR_CENTRALITY and PAGE_RANK columns.
df_graph_features_lookup_table = pd.read_csv(f"../feature-engineering/graph_features_lookup_table_{end_date}.csv")

In [247]:
def append_graph_features(node_type_name, df, graph_feature_lookup_table,
                          default_node_degree=2, default_eigen_centrality=0.33,
                          default_page_rank=0.57):
    """Return a copy of ``df`` with three graph-feature columns added.

    The values in ``df[node_type_name]`` are matched against the ``Node``
    column of the lookup table. The new columns are named
    ``<prefix>_node_degree``, ``<prefix>_eigen_centrality`` and
    ``<prefix>_page_rank``, where ``<prefix>`` is the part of
    ``node_type_name`` before the first underscore (``"ip_address"`` gives
    ``"ip"``).

    Parameters
    ----------
    node_type_name : str
        Column of ``df`` that holds the node keys.
    df : pandas.DataFrame
        Rows to enrich. It is not modified.
    graph_feature_lookup_table : pandas.DataFrame
        Must contain the columns ``Node``, ``DEGREE``,
        ``EIGENVECTOR_CENTRALITY`` and ``PAGE_RANK``. When a node appears
        more than once, its first row is used.
    default_node_degree, default_eigen_centrality, default_page_rank
        Values used for keys that are missing from the lookup table
        (including NaN keys). The defaults are the constants that used to be
        hard-coded here.
        NOTE(review): the origin of 2 / 0.33 / 0.57 is not documented -
        confirm they match what the feature-engineering step assumes for
        unseen nodes.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the three feature columns, appended in the order above.

    Raises
    ------
    KeyError
        If ``node_type_name`` or one of the lookup columns is missing.
    """
    prefix = node_type_name.split('_')[0]

    # Build the lookup once instead of scanning the whole table for every row.
    # keep="first" selects the first matching row per node; NaN nodes are
    # dropped because a NaN key must never count as a match.
    lookup = (
        graph_feature_lookup_table
        .dropna(subset=["Node"])
        .drop_duplicates(subset="Node", keep="first")
        .set_index("Node")
    )

    node_keys = df[node_type_name]
    # Track membership explicitly so a NaN stored for a known node is kept
    # as-is rather than being replaced by the default.
    is_known = node_keys.isin(lookup.index)

    feature_specs = (
        ("DEGREE", "node_degree", default_node_degree),
        ("EIGENVECTOR_CENTRALITY", "eigen_centrality", default_eigen_centrality),
        ("PAGE_RANK", "page_rank", default_page_rank),
    )
    result = df.copy()
    for source_column, suffix, default in feature_specs:
        looked_up = node_keys.map(lookup[source_column])
        result[f"{prefix}_{suffix}"] = looked_up.where(is_known, default)
    return result

In [248]:
# For each identifier column, add <prefix>_node_degree, <prefix>_eigen_centrality
# and <prefix>_page_rank, where <prefix> is the text before the first
# underscore (ip, email, card).
df_test = append_graph_features("ip_address",df_test, df_graph_features_lookup_table)
df_test = append_graph_features("email_address",df_test, df_graph_features_lookup_table)
df_test = append_graph_features("card_number",df_test, df_graph_features_lookup_table)

In [249]:
# WOE (presumably weight-of-evidence - TODO confirm) table read from the
# feature-engineering directory; the file name carries end_date. The cells
# below look up each <col> / <col>_woe column pair in it.
df_woe_features = pd.read_csv(f"../feature-engineering/woe_features_{end_date}.csv")

In [250]:
def append_woe_features(col_name, df, woe_features, default_woe=0.0):
    """Return a copy of ``df`` with a ``<col_name>_woe`` column added.

    Each value of ``df[col_name]`` is matched against ``woe_features[col_name]``
    and receives the value stored in ``woe_features[f"{col_name}_woe"]``.

    Parameters
    ----------
    col_name : str
        Key column, present in both ``df`` and ``woe_features``.
    df : pandas.DataFrame
        Rows to enrich. It is not modified.
    woe_features : pandas.DataFrame
        Must contain ``col_name`` and ``f"{col_name}_woe"``. When a key
        appears more than once, its first row is used.
    default_woe : float, default 0.0
        Value used for keys that are missing from ``woe_features``
        (including NaN keys).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the new column.

    Raises
    ------
    KeyError
        If ``col_name`` or its ``_woe`` column is missing.
    """
    woe_col = f"{col_name}_woe"

    # Build the key -> value lookup once instead of filtering the whole table
    # for every row. keep="first" selects the first matching row per key; NaN
    # keys are dropped because a NaN key must never count as a match.
    lookup = (
        woe_features[[col_name, woe_col]]
        .dropna(subset=[col_name])
        .drop_duplicates(subset=col_name, keep="first")
        .set_index(col_name)[woe_col]
    )

    keys = df[col_name]
    # Track membership explicitly so a NaN stored for a known key is kept
    # as-is rather than being replaced by the default.
    is_known = keys.isin(lookup.index)

    result = df.copy()
    result[woe_col] = keys.map(lookup).where(is_known, default_woe)
    return result

In [251]:
# Add ip_address_woe, email_address_woe and card_number_woe; values that are
# not found in the lookup table get 0.0.
df_test = append_woe_features("ip_address",df_test,df_woe_features)
df_test = append_woe_features("email_address",df_test,df_woe_features)
df_test = append_woe_features("card_number",df_test,df_woe_features)

In [259]:
# Label for the test period.
y_test = df_test["has_fraudulent_dispute"]
# Candidate features: is_credit plus every column from ip_node_degree through
# card_number_woe (label slices include both ends).
X_test = pd.concat(
    [df_test[["is_credit"]], df_test.loc[:, "ip_node_degree":"card_number_woe"]],
    axis=1,
)

In [292]:
# Give the test frame the same columns, in the same order, as X_subset - the
# frame the most recently executed training cell fitted rf0 on.
# Select from df_test rather than narrowing X_test in place: every X_test
# column comes from df_test, and this keeps the cell re-runnable after a
# different (wider) feature subset has been fitted. The in-place version
# raised KeyError in that case.
X_test = df_test[X_subset.columns]
X_test

Unnamed: 0,is_credit
0,False
1,True
2,True
3,True
4,True
...,...
11646,True
11647,False
11648,True
11649,True


In [293]:
# NOTE: rf0 is reassigned by every training cell above, so this scores whichever
# model was fitted last in the current kernel session.
y_predict = rf0.predict(X_test)

In [300]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score, accuracy_score

In [295]:
# scikit-learn convention: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_predict)

array([[3057, 7597],
       [   0,  997]])

In [301]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, y_predict)

0.34795296541069437

In [296]:
# Precision for the positive class: of the rows predicted positive, the share that truly are.
precision_score(y_test, y_predict)

0.11601117058412846

In [297]:
# Recall for the positive class: of the truly positive rows, the share that were predicted positive.
recall_score(y_test, y_predict)

1.0

In [298]:
# F1 for the positive class: harmonic mean of the precision and recall above.
f1_score(y_test, y_predict)

0.20790324262329268