In [99]:
import pandas as pd
from lib.sampling import subsampling
from xgboost.sklearn import XGBClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV

# Data Preparation (need balanced sample before classification)

In [2]:
start_date = "2021-01-01"
end_date = "2021-11-30"

In [3]:
df_features = pd.read_csv(f"../feature-engineering/final_features_{end_date}.csv")

# Subsample non_fraudulent transactions records so we have balanced dataset
df_fraudulent = df_features[df_features['has_fraudulent_dispute'] == True]
df_non_fraudulent = df_features[df_features['has_fraudulent_dispute'] == False]
subsample_index = subsampling(df_non_fraudulent.index, len(df_fraudulent))
df_non_fraudulent_subsample = df_non_fraudulent.loc[subsample_index, :]
df_sample = pd.concat([df_non_fraudulent_subsample, df_fraudulent], axis=0)
df_sample.shape

(19438, 56)

In [21]:
X_train = df_sample.drop(["date", "psp_reference", "has_fraudulent_dispute", "is_refused_by_adyen"], axis=1)
y_train = df_sample["has_fraudulent_dispute"]
X_train_subset = pd.concat([X_train.loc(axis=1)["ip_node_degree":"card_page_rank"], X_train.loc(axis=1)[["is_credit"]],
                            X_train.loc(axis=1)["ip_address_woe":"card_number_woe"]], axis=1)

In [10]:
y_train.value_counts()

False    9719
True     9719
Name: has_fraudulent_dispute, dtype: int64

In [26]:
df_test = pd.read_csv("test_dataset_december.csv")
X_test = pd.concat([df_test[["is_credit"]], df_test.loc(axis=1)["ip_node_degree":"card_number_woe"]], axis=1)
y_test = df_test["has_fraudulent_dispute"]
X_test = X_test[X_train_subset.columns]

# Training

In [100]:
# is_credit + graph + woe
xgb0 = XGBClassifier(max_depth=5, learning_rate=0.5, verbosity=1, objective='binary:logistic', random_state=1,
                     eval_metric="error")
xgb0.fit(X_train_subset, y_train, eval_set=[(X_test, y_test)])

[0]	validation_0-error:0.06506
[1]	validation_0-error:0.06506
[2]	validation_0-error:0.06506
[3]	validation_0-error:0.06506
[4]	validation_0-error:0.06506
[5]	validation_0-error:0.06506
[6]	validation_0-error:0.06506
[7]	validation_0-error:0.06506
[8]	validation_0-error:0.06506
[9]	validation_0-error:0.06506
[10]	validation_0-error:0.06506
[11]	validation_0-error:0.06506
[12]	validation_0-error:0.06506
[13]	validation_0-error:0.06506
[14]	validation_0-error:0.06506
[15]	validation_0-error:0.06506
[16]	validation_0-error:0.06506
[17]	validation_0-error:0.06506
[18]	validation_0-error:0.06506
[19]	validation_0-error:0.06506
[20]	validation_0-error:0.06506
[21]	validation_0-error:0.06506
[22]	validation_0-error:0.06506
[23]	validation_0-error:0.06497
[24]	validation_0-error:0.06497
[25]	validation_0-error:0.06497
[26]	validation_0-error:0.06497
[27]	validation_0-error:0.06497
[28]	validation_0-error:0.06497
[29]	validation_0-error:0.06497
[30]	validation_0-error:0.06497
[31]	validation_0-

## Grid Search Tuning

In [119]:
parameters = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [50, 100, 200, 300, 500],
    'min_child_weight': [0, 2, 5, 10, 20],
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

xlf = XGBClassifier(max_depth=10,
                        learning_rate=0.01,
                        n_estimators=2000,
                        objective='binary:logistic',
                        nthread=-1,
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=0.85,
                        colsample_bytree=0.7,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        scale_pos_weight=1,
                        seed=1440,
                        missing=None)
# 有了gridsearch我们便不需要fit函数
gsearch = GridSearchCV(xlf, param_grid=parameters, scoring='roc_auc', cv=3)
gsearch.fit(X_train_subset, y_train)
print("Best score: %0.3f" % gsearch.best_score_)
print("Best parameters set:")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    #极其耗费时间,电脑没执行完

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 359, in _score
    y_pred = method_caller(clf, "decision_function", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 72, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'XGBClassifier' object has no attribute 'decision_function'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.1