In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from scipy.special import expit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics

In [None]:
# Base feature table, one row per '个人编码'.
feature = pd.read_csv('finalfeatures.csv')

# Only the '总审批金额' column is taken from the cost-feature file.
amount = pd.read_csv('费用类相关特征2.csv').set_index('个人编码')[['总审批金额']]

# Replace the feature table's own '总审批金额' with the one loaded above,
# aligning both frames on the '个人编码' index; gaps are filled with 0.
feature = feature.set_index('个人编码').drop(columns=['总审批金额'])
feature = pd.concat([feature, amount], axis=1).fillna(0)

In [None]:
# Extra columns from 'bayesdata2.csv', aligned on the '个人编码' index.
bayesdata = pd.read_csv('bayesdata2.csv').set_index('个人编码')

# NOTE: this cell rebinds `feature`; running it a second time without
# re-running the cell that first builds `feature` appends the same
# columns again.
feature = pd.concat([feature, bayesdata], axis=1)

In [None]:
# cost-sensitive XGB

class CSBoost:
    """Cost-sensitive gradient boosting built on ``xgboost.train``.

    The objective is selected with ``obj``:

    * ``'ce'``         -- cross-entropy (``binary:logistic``).
    * ``'weightedce'`` -- cross-entropy with every instance weighted by its
      own misclassification cost.
    * ``'aec'``        -- custom objective minimising the average expected
      cost.

    Cost matrices are arrays of shape ``(n_samples, 2, 2)`` indexed as
    ``[instance, predicted class, actual class]``, i.e. ``[[TN, FN], [FP, TP]]``.
    """

    # Objectives understood by fit() and tune().
    _OBJECTIVES = ('ce', 'weightedce', 'aec')

    def __init__(self, obj, lambda1=0, lambda2=0, learn_rate=0.01):
        """Store the hyper-parameters and build the XGBoost parameter dict.

        Args:
            obj: one of ``'ce'``, ``'weightedce'`` or ``'aec'``.
            lambda1: L1 regularisation strength (XGBoost ``reg_alpha``).
            lambda2: L2 regularisation strength (XGBoost ``reg_lambda``).
            learn_rate: boosting learning rate (XGBoost ``learning_rate``).
        """
        self.obj = obj
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.learn_rate = learn_rate
        # alpha is l1, lambda is l2
        params = {'random_state': 42, 'tree_method': 'exact', 'verbosity': 0, 'reg_alpha': lambda1,
                  'reg_lambda': lambda2, 'learning_rate': learn_rate}
        if obj in ('ce', 'weightedce'):
            params['objective'] = 'binary:logistic'
        elif obj == 'aec':
            # Only the custom AEC metric should drive early stopping.
            params['disable_default_eval_metric'] = True

        self.params = params

    @staticmethod
    def _misclassification_costs(y, cost_matrix):
        """Return, per instance, the cost of misclassifying it (FP cost for label 0, FN cost for label 1)."""
        costs = np.zeros(len(y))
        costs[y == 0] = cost_matrix[:, 1, 0][y == 0]
        costs[y == 1] = cost_matrix[:, 0, 1][y == 1]
        return costs

    @staticmethod
    def _expected_costs(proba, y, cost_matrix):
        """Return the per-instance expected cost when class 1 is predicted with probability ``proba``."""
        return (y * (proba * cost_matrix[:, 1, 1] + (1 - proba) * cost_matrix[:, 0, 1])
                + (1 - y) * (proba * cost_matrix[:, 1, 0] + (1 - proba) * cost_matrix[:, 0, 0]))

    def fit(self, x_train, y_train, x_val, y_val, cost_matrix_train=None, cost_matrix_val=None):
        """Train a booster with early stopping on the validation set.

        Args:
            x_train, y_train: training features and 0/1 labels (array-like).
            x_val, y_val: validation features and labels used for early stopping.
            cost_matrix_train, cost_matrix_val: ``(n, 2, 2)`` cost matrices;
                required for the ``'weightedce'`` and ``'aec'`` objectives.

        Returns:
            The trained ``xgboost.Booster``.

        Raises:
            ValueError: if ``self.obj`` is not a supported objective, or a
                cost-sensitive objective is used without cost matrices.
        """
        if self.obj not in self._OBJECTIVES:
            raise ValueError('Unknown objective %r; expected one of %s' % (self.obj, self._OBJECTIVES))
        if self.obj != 'ce' and (cost_matrix_train is None or cost_matrix_val is None):
            raise ValueError("Objective %r requires cost_matrix_train and cost_matrix_val" % self.obj)

        x_train = np.array(x_train)
        y_train = np.array(y_train)
        x_val = np.array(x_val)
        y_val = np.array(y_val)
        cost_matrix_train = np.array(cost_matrix_train)
        cost_matrix_val = np.array(cost_matrix_val)

        if self.obj == 'ce':
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dval = xgb.DMatrix(x_val, label=y_val)

            booster = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=500, early_stopping_rounds=50,
                                evals=[(dval, 'eval')], verbose_eval=False)

        elif self.obj == 'weightedce':
            dtrain = xgb.DMatrix(x_train, label=y_train,
                                 weight=self._misclassification_costs(y_train, cost_matrix_train))
            dval = xgb.DMatrix(x_val, label=y_val,
                               weight=self._misclassification_costs(y_val, cost_matrix_val))

            booster = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=500, early_stopping_rounds=50,
                                evals=[(dval, 'eval')], verbose_eval=False)

        else:  # 'aec'
            dtrain = xgb.DMatrix(x_train, label=y_train)
            dval = xgb.DMatrix(x_val, label=y_val)

            # Per-instance difference "cost if predicted 1" - "cost if predicted 0".
            # Computed once from the numpy labels because the callbacks below
            # receive a DMatrix, not the label array.
            train_constant = (y_train * (cost_matrix_train[:, 1, 1] - cost_matrix_train[:, 0, 1])
                              + (1 - y_train) * (cost_matrix_train[:, 1, 0] - cost_matrix_train[:, 0, 0]))

            def aec_train(raw_scores, _dtrain):
                """Gradient and Hessian of the expected cost w.r.t. the raw scores."""
                scores = expit(raw_scores)
                grad = scores * (1 - scores) * train_constant
                # The absolute value keeps the Hessian non-negative.
                hess = np.abs((1 - 2 * scores) * grad)
                return grad, hess

            def aec_val(raw_scores, _dval):
                """Average expected cost on the validation set (lower is better)."""
                # The second argument is a DMatrix, so its truthiness says nothing
                # about the labels; use the numpy validation labels instead.
                scores = expit(raw_scores)
                return 'AEC', float(self._expected_costs(scores, y_val, cost_matrix_val).mean())

            # NOTE(review): newer XGBoost releases name this argument
            # `custom_metric`; `feval` is kept as in the original notebook --
            # confirm against the installed version.
            booster = xgb.train(params=self.params, dtrain=dtrain, obj=aec_train, feval=aec_val,
                                num_boost_round=500, early_stopping_rounds=50, evals=[(dval, 'eval')],
                                verbose_eval=False)

        return booster

    def _validation_loss(self, scores, y_val, cost_matrix_val):
        """Return the unregularised validation loss matching ``self.obj``.

        ``scores`` are the booster's predictions: probabilities for the
        cross-entropy objectives, raw scores for ``'aec'``.
        """
        if self.obj == 'aec':
            return self._expected_costs(expit(scores), y_val, cost_matrix_val).mean()
        if self.obj in ('ce', 'weightedce'):
            eps = 1e-9  # small value to avoid log(0)
            ce = - (y_val * np.log(scores + eps) + (1 - y_val) * np.log(1 - scores + eps))
            if self.obj == 'weightedce':
                ce = self._misclassification_costs(y_val, cost_matrix_val) * ce
            return ce.mean()
        raise ValueError('Unknown objective %r; expected one of %s' % (self.obj, self._OBJECTIVES))

    def _grid_search(self, param_name, candidates, report_fmt,
                     x_train, y_train, cost_matrix_train, x_val, y_val, cost_matrix_val):
        """Train one model per candidate value and return the candidate with the lowest validation loss.

        ``param_name`` is the ``CSBoost`` constructor keyword being varied; all
        other hyper-parameters keep their constructor defaults.
        """
        if len(candidates) == 0:
            raise ValueError('No candidate values supplied for %r' % param_name)

        # Predict on the same array representation that fit() trains on.
        x_val_arr = np.asarray(x_val)
        y_val_arr = np.asarray(y_val)
        cost_val_arr = np.asarray(cost_matrix_val)

        losses = []
        for candidate in candidates:
            booster = CSBoost(obj=self.obj, **{param_name: candidate}).fit(
                x_train, y_train, x_val, y_val, cost_matrix_train, cost_matrix_val)
            scores = booster.inplace_predict(x_val_arr)

            # Evaluate loss (without regularization term!)
            loss = self._validation_loss(scores, y_val_arr, cost_val_arr)
            print(report_fmt % (candidate, loss))
            losses.append(loss)
        return candidates[np.argmin(losses)]

    def tune(self, l1, lambda1_list, l2, lambda2_list, learn_rate, learn_ratelist, x_train, y_train, cost_matrix_train, x_val, y_val, cost_matrix_val):
        """Grid-search one hyper-parameter and store the best value in ``self.params``.

        Exactly one search runs, chosen by the first truthy flag among ``l1``
        (search ``lambda1_list``), ``l2`` (search ``lambda2_list``) and
        ``learn_rate`` (search ``learn_ratelist``). When no flag is set, only
        the ``lambda1`` / ``lambda2`` / ``learn_rate`` attributes are reset.
        """
        data = (x_train, y_train, cost_matrix_train, x_val, y_val, cost_matrix_val)
        if l1:
            self.params['reg_lambda'] = 0
            self.lambda2 = 0
            lambda1_opt = self._grid_search('lambda1', lambda1_list,
                                            '\t\tLambda l1 = %.5f;\tLoss = %.5f', *data)
            print('\tOptimal lambda = %.5f' % lambda1_opt)
            self.params['reg_alpha'] = lambda1_opt
            self.lambda1 = lambda1_opt
        elif l2:
            self.params['reg_alpha'] = 0
            self.lambda1 = 0
            lambda2_opt = self._grid_search('lambda2', lambda2_list,
                                            '\t\tLambda l2 = %.5f;\tLoss = %.5f', *data)
            print('\tOptimal lambda = %.5f' % lambda2_opt)
            # The L2 penalty is XGBoost's reg_lambda (reg_alpha is L1).
            self.params['reg_lambda'] = lambda2_opt
            self.lambda2 = lambda2_opt
        elif learn_rate:
            learn_rate_opt = self._grid_search('learn_rate', learn_ratelist,
                                               '\t\tlearning_rate = %.5f;\tLoss = %.5f', *data)
            print('\tOptimal learn_rate = %.5f' % learn_rate_opt)
            # XGBoost reads 'learning_rate'; any other key would be ignored.
            self.params['learning_rate'] = learn_rate_opt
            self.learn_rate = learn_rate_opt
        else:
            # NOTE(review): as in the original, this resets only the attributes
            # and leaves self.params (what fit() uses) untouched.
            self.lambda1 = 0
            self.lambda2 = 0
            self.learn_rate = 0.15

In [None]:
# Grid search for hyperparameters

# Hold out 20% of the rows as a validation set; '欺诈状态' is the target column.
x_train, x_val, y_train, y_val = train_test_split(
    feature.drop(columns=['欺诈状态']),
    feature['欺诈状态'],
    test_size=0.2,
    shuffle=True,
    random_state=333,
)

# Per-instance cost matrices, laid out as [[TN, FN], [FP, TP]].
# TN stays at 0.0, FN is the instance's '总审批金额', FP and TP are fixed.
amounts_train = x_train['总审批金额'].values
amounts_val = x_val['总审批金额'].values

cost_matrix_train = np.zeros((len(x_train), 2, 2))
cost_matrix_train[:, 0, 1] = amounts_train
cost_matrix_train[:, 1, :] = (20000, 15000)

cost_matrix_val = np.zeros((len(x_val), 2, 2))
cost_matrix_val[:, 0, 1] = amounts_val
cost_matrix_val[:, 1, :] = (20000, 15000)

# Search the learning rate only (the l1 / l2 flags are off).
csboost = CSBoost(obj='aec')
csboost.tune(
    l1=False,
    lambda1_list=[0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    l2=False,
    lambda2_list=[0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    learn_rate=True,
    learn_ratelist=[0.01, 0.02, 0.05, 0.1, 0.15],
    x_train=x_train,
    y_train=y_train,
    cost_matrix_train=cost_matrix_train,
    x_val=x_val,
    y_val=y_val,
    cost_matrix_val=cost_matrix_val,
)
# Values left commented out in the notebook (presumably from earlier runs -- TODO confirm):
# lambda1 = 0.01
# lambda2 = 0.00001
# learn_rate=0.1
#csboost=csboost.fit(x_train, y_train, x_val, y_val, cost_matrix_train, cost_matrix_val)

In [None]:
def cost_with_algorithm(cost_matrix, labels, predictions):
    """Total cost incurred when acting on ``predictions``.

    ``cost_matrix`` has shape (n, 2, 2) and is indexed as
    [instance, predicted class, actual class]; each instance contributes the
    entry selected by its own prediction and label.
    """
    total = 0
    # Visit the cells in the order TN, FN, FP, TP.
    for predicted, actual in ((0, 0), (0, 1), (1, 0), (1, 1)):
        in_cell = np.logical_and(predictions == predicted, labels == actual)
        total = total + cost_matrix[:, predicted, actual][in_cell].sum()

    return total

def cost_without_algorithm(cost_matrix, labels):
    """Cost of the cheaper trivial policy: label every instance 0, or every instance 1.

    The costs of the resulting correct decisions (TN / TP) are included.
    """
    is_negative = labels == 0
    is_positive = labels == 1

    # Everything predicted as class 0: negatives are TN, positives are FN.
    all_negative_cost = cost_matrix[:, 0, 0][is_negative].sum() + cost_matrix[:, 0, 1][is_positive].sum()
    # Everything predicted as class 1: negatives are FP, positives are TP.
    all_positive_cost = cost_matrix[:, 1, 0][is_negative].sum() + cost_matrix[:, 1, 1][is_positive].sum()

    return min(all_negative_cost, all_positive_cost)

def savings(cost_matrix, labels, predictions):
    """Fraction of the no-model baseline cost that the predictions save.

    Computed as ``1 - cost_with_algorithm / cost_without_algorithm``.
    """
    baseline_cost = cost_without_algorithm(cost_matrix, labels)
    model_cost = cost_with_algorithm(cost_matrix, labels, predictions)

    return 1 - model_cost / baseline_cost

In [None]:
# Prepare the data
train = feature
y = train['欺诈状态']
X = train.drop(columns=['欺诈状态'])

# Fixed entries of the cost matrix ([[TN, FN], [FP, TP]]); the FN cost is
# the per-instance '总审批金额' value.
FP_COST = 20000
TP_COST = 15000

# Hyper-parameters applied to every fold. They are passed to CSBoost below;
# previously they were assigned but never used, so the constructor defaults
# were trained instead.
lambda1 = 0.1
lambda2 = 0.001
learn_rate = 0.15

# Create a 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=1000)

# One metrics record per fold; the DataFrame is built once after the loop.
fold_records = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):

    # Split the dataset
    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Construct the cost matrix
    amounts_train = x_train['总审批金额'].values
    amounts_val = x_val['总审批金额'].values

    cost_matrix_train = np.zeros((len(x_train), 2, 2))
    cost_matrix_train[:, 0, 0] = 0
    cost_matrix_train[:, 0, 1] = amounts_train
    cost_matrix_train[:, 1, 0] = FP_COST
    cost_matrix_train[:, 1, 1] = TP_COST

    cost_matrix_val = np.zeros((len(x_val), 2, 2))
    cost_matrix_val[:, 0, 0] = 0
    cost_matrix_val[:, 0, 1] = amounts_val
    cost_matrix_val[:, 1, 0] = FP_COST
    cost_matrix_val[:, 1, 1] = TP_COST

    # Train the model (fit returns the trained booster)
    csboost = CSBoost(obj='aec', lambda1=lambda1, lambda2=lambda2, learn_rate=learn_rate)
    csboost = csboost.fit(x_train, y_train, x_val, y_val, cost_matrix_train, cost_matrix_val)

    # Obtain the predictions. fit() trains on a plain array, so predict on
    # the same representation; the 'aec' booster outputs raw scores, hence expit.
    pre_val = expit(csboost.inplace_predict(np.asarray(x_val)))

    # Instance-dependent decision threshold:
    # (FP - TN) / (FP - TN + FN - TP)
    threshold_instance = (cost_matrix_val[:, 1, 0] - cost_matrix_val[:, 0, 0]) / (
        cost_matrix_val[:, 1, 0] - cost_matrix_val[:, 0, 0]
        + cost_matrix_val[:, 0, 1] - cost_matrix_val[:, 1, 1])
    pred = (pre_val > threshold_instance).astype(int)

    # Get the performance metrics
    fold_records.append({
        'Fold': fold,
        'ACC': round(metrics.accuracy_score(y_val, pred), 4),
        'Recall': round(metrics.recall_score(y_val, pred), 4),
        'F1-score': round(metrics.f1_score(y_val, pred), 4),
        'Precision': round(metrics.precision_score(y_val, pred), 4),
        'Savings': round(savings(cost_matrix_val, y_val, pred), 4)
    })

# Column order matches what the original concat-in-a-loop produced.
results_df = pd.DataFrame(
    fold_records,
    columns=['Fold', 'Recall', 'F1-score', 'Precision', 'Savings', 'ACC'],
)
