In [46]:
import pandas as pd
import numpy as np
import random

#Models and pre-processing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from imblearn.over_sampling import SMOTE
import time

# Custom methods
import CustomDecisionTree as cdt
import importlib
importlib.reload(CustomDecisionTree)

<module 'CustomDecisionTree' from 'C:\\Users\\tadas\\Desktop\\Programming studies\\7. decisiontree from scratch\\Custom-decision-tree\\CustomDecisionTree.py'>

# Testing the custom decision trees

In this example, we will experiment with the two custom decision trees using the dataset from Kaggle from the link below:

Link: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [3]:
df1 = pd.read_csv("creditcard.csv")

In [4]:
df1.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


We will compare the "Class" collumn with our predictions. If it's a false negative we deduce 20% of the corresponding "Amount" from our results, if it's a true negative we add 5% of the corresponding "Amount", and if the model predicts as positive we reject the transaction.

As the dataset is highly imbalanced (over 500/1 ratio), we will test the model with undersampled and oversampled datasets. We will also use predict_proba to find the best cut threshold on the decision tree to find the most profitable cut.

# Preparing the data

Because we don't have the exact information under the V1 - V28 columns, we will treat the dataset as the final prepared dataset and just apply different models.

The information on "Time" columns brings no relevant information so we will drop it for our tests.

In [5]:
X = df1.drop(['Time','Class'], axis = 1)
y = df1.Class

Now we will make the oversampled and undersampled dataset.

# Useful functions:

Some of the functions we will use for our tests.

- get_profit: Returns the profits you get from the approved transactions.
- get_profit_range: Max profit - min profit
- get_profit_percentage: Percentage of the max profit you get from the approved transactions.
- undersample_labels: Undersample to match the requested proportions.
- find_best_threshold: gets the probability list and find the cut threshold with the best profit outcome.
- classify_over_threshold: Classify the prediction cutting over the specific threshold.

In [6]:
def get_profit(transaction_values, true_y, predicted_y, profit_rate, loss_rate):

    if not isinstance(predicted_y, (np.ndarray, np.generic)):
        predicted_y = np.array(predicted_y)
    
    if not isinstance(transaction_values, (np.ndarray, np.generic)):
        transaction_values = transaction_values.to_numpy()
    
    if not isinstance(true_y, (np.ndarray, np.generic)):
        true_y = true_y.to_numpy()
    profits = transaction_values[(true_y==0) & (predicted_y==0)].sum()*profit_rate

    losses = transaction_values[(true_y==1) & (predicted_y==0)].sum()*loss_rate
    
    return (profits-losses)

In [7]:
def get_profit_range(transaction_values, true_y, profit_rate, loss_rate):
    max_profit = get_profit(transaction_values, true_y, true_y, profit_rate, loss_rate)
    min_profit = get_profit(transaction_values, true_y, 1-true_y, profit_rate, loss_rate)
    return (max_profit-min_profit)

In [8]:
# Function to get the profit percentage compared to the max-profit

def get_profit_percentage(transaction_values, true_y, predicted_y, profit_rate, loss_rate):
    max_profit = get_profit(transaction_values, true_y, true_y, profit_rate, loss_rate)
    real_profit = get_profit(transaction_values, true_y, predicted_y, profit_rate, loss_rate)
    return real_profit/max_profit

In [9]:
def undersample_labels(X,y, random_state=None, second_label_rate = 0.5):
    size_0 = len(y[y==0])
    size_1 = len(y[y==1])
    total_size = size_0+size_1
    second_label_real_rate = size_1/total_size
    if second_label_real_rate < second_label_rate:
        size_0 = int(size_1/second_label_rate - size_1)
    else:
        size_1 = int(size_0/(1-second_label_rate) - size_0)
        
    X_resample_0 = X.loc[y[y==0].index].sample(n = size_0, random_state = random_state)
    X_resample_1 = X.loc[y[y==1].index].sample(n = size_1, random_state = random_state)
    y_resample_0 = y.loc[X_resample_0.index]
    y_resample_1 = y.loc[X_resample_1.index]
    X_resample = pd.concat([X_resample_0, X_resample_1])
    y_resample = pd.concat([y_resample_0, y_resample_1])
    return X_resample, y_resample

In [36]:
def find_best_threshold(transaction_values, true_y, predictions_y, profit_rate, loss_rate):
    # predictions_y is a list of predictions percentages
    # first division in 100 parts
    profits = []
    max_profit = get_profit(transaction_values, true_y, true_y, profit_rate, loss_rate)
    for i in range(101):
        rate = (1/100)*i
        y_pred = predictions_y.copy()
        y_pred [y_pred > rate] = 1
        y_pred [y_pred <= rate] = 0
        profit = get_profit(transaction_values, true_y, y_pred, profit_rate, loss_rate)
        profits.append([rate,profit, profit/max_profit])
    
    np_profits_ = np.array(profits)
    top_percentage = np_profits_[np.where(np_profits_[:,1] == np_profits_[:,1].max())][:,0].mean()
    
    for i in range(5):
        interval = (1/(100*(2**i)))
        cut = (0.25/(2**i))
        if top_percentage >= 1-cut:
            top_percentage = 1-cut+ interval/2
        elif top_percentage <= cut:
            top_percentage = 1-cut
        min_threshold =  (top_percentage - cut) 
        for j in range (51):
            rate = min_threshold+interval*j
            y_pred = predictions_y.copy()
            y_pred [y_pred > rate] = 1
            y_pred [y_pred <= rate] = 0
            profit = get_profit(transaction_values, true_y, y_pred, profit_rate, loss_rate)
            profits.append([rate,profit, profit/max_profit])
        np_profits_ = np.array(profits)
        top_percentage = np_profits_[np.where(np_profits_[:,1] == np_profits_[:,1].max())][:,0].mean()
    return sorted(profits)

In [37]:
def classify_over_threshold(y_probabilities, threshold):
    y_pred = y_probabilities.copy()
    y_pred [y_pred > threshold] = 1
    y_pred [y_pred <= threshold] = 0
    return y_pred

# Preparing the samples:

In [14]:
profit_rate = 0.05
loss_rate = 0.2

random_state = 123456
oversample = SMOTE(random_state=random_state)
profits_list = X.Amount

# Creating the train/test sets
X_train, X_test, y_train, y_test = train_test_split(X , y , test_size=0.2 ,random_state=random_state, stratify=y)

# Creating oversamples with SMOTE for testing
oversample = SMOTE(random_state=random_state)
X_over, y_over = oversample.fit_resample(X_train , y_train)

# Creating undersampling of 99.5/0.5
# After some testing, undersampling more than 99.5/0.5 makes the sampling too small, making all the models underperform.
X_train_downsample_995005, y_train_downsample_995005 = undersample_labels(X_train,y_train,random_state=random_state, second_label_rate = 0.005)

profit_accepting_test = get_profit_percentage(X_test.Amount, y_test,np.zeros(len(y_test)), profit_rate, loss_rate)
max_profit_test = get_profit(X_test.Amount, y_test,y_test, profit_rate, loss_rate)

# Making a list of the samples
training_samples = []
training_samples.append(["Stratified", "99.83/0.17", X_train, y_train])
training_samples.append(["Undersampled", "99.5/0.5",X_train_downsample_995005, y_train_downsample_995005])
training_samples.append(["Oversampled", "50/50",X_over, y_over])

# Training the models

In [19]:
results_df = pd.DataFrame(columns = [ 'Model', 'Labels proportion', ' New Profit %' , 
                                     'Improvement %', 'Time spent (s)' ])

In [52]:
# Training the decision trees

DT_model = DecisionTreeClassifier(max_depth= 8, random_state=random_state)

for i in range(len(training_samples)):
    start = time.perf_counter()
    DT_model.fit(training_samples[i][2],training_samples[i][3])
    y_prob_train = DT_model.predict_proba(X_train)[:,1]
    y_prob = DT_model.predict_proba(X_test)[:,1]
    profits = find_best_threshold(X_train.Amount, y_train, y_prob_train, profit_rate,loss_rate)
    np_profits = np.array(profits)
    top_percentage = np_profits[np.where(np_profits[:,1] == np_profits[:,1].max())][:,0].mean()
    y_prediction = classify_over_threshold(y_prob, top_percentage)
    
    end = time.perf_counter()
    time_training = (end-start)
    
    #getting the profits
    profits = get_profit_percentage(X_test.Amount, y_test, y_prediction, profit_rate, loss_rate)
    results_df.loc[i] = [ f"Decision Tree {training_samples[i][0]}", training_samples[i][1],
                         profits*100, (profits-profit_accepting_test)*100, time_training]

In [59]:
REG_model = DecisionTreeRegressor(max_depth=8, random_state=random_state)

for i in range(len(training_samples)):
    start = time.perf_counter()
    REG_model.fit(training_samples[i][2],training_samples[i][3])
    y_prob_train = REG_model.predict(X_train)
    y_prob = REG_model.predict(X_test)
    profits = find_best_threshold(X_train.Amount, y_train, y_prob_train, profit_rate,loss_rate)
    np_profits = np.array(profits)
    top_percentage = np_profits[np.where(np_profits[:,1] == np_profits[:,1].max())][:,0].mean()
    y_prediction = classify_over_threshold(y_prob, top_percentage)
    
    end = time.perf_counter()
    time_training = (end-start)
    
    #getting the profits
    profits = get_profit_percentage(X_test.Amount, y_test, y_prediction, profit_rate, loss_rate)
    results_df.loc[i+3] = [ f"Regression Tree {training_samples[i][0]}", training_samples[i][1],
                         profits*100, (profits-profit_accepting_test)*100, time_training]

In [54]:
# Training the profit decision trees

PDT_model = cdt.ProfitDecisionTreeClassifier(profit_rate, loss_rate, min_samples_split=20, max_depth=8)

for i in range(len(training_samples)):
    start = time.perf_counter()
    PDT_model.fit(training_samples[i][2],training_samples[i][3],training_samples[i][2].Amount)
    y_prediction = PDT_model.predict(X_test)
    end = time.perf_counter()
    time_training = (end-start)
    
    #getting the profits
    profits = get_profit_percentage(X_test.Amount, y_test, y_prediction, profit_rate, loss_rate)
    results_df.loc[i+6] = [ f"Profit decision Tree {training_samples[i][0]}", training_samples[i][1],
                         profits*100, (profits-profit_accepting_test)*100, time_training]

In [55]:
# Training the profit decision trees

ADT_model = cdt.AdaptiveDecisionTreeClassifier(profit_rate, loss_rate, min_samples_split=20, max_depth=8)

for i in range(len(training_samples)):
    start = time.perf_counter()
    ADT_model.fit(training_samples[i][2],training_samples[i][3],training_samples[i][2].Amount)
    y_prediction = ADT_model.predict(X_test)
    end = time.perf_counter()
    time_training = (end-start)
    
    #getting the profits
    profits = get_profit_percentage(X_test.Amount, y_test, y_prediction, profit_rate, loss_rate)
    results_df.loc[i+9] = [ f"Adaptive decision Tree {training_samples[i][0]}", training_samples[i][1],
                         profits*100, (profits-profit_accepting_test)*100, time_training]

In [60]:
results_df

Unnamed: 0,Model,Labels proportion,New Profit %,Improvement %,Time spent (s)
0,Decision Tree Stratified,99.83/0.17,99.457015,0.587133,9.385774
1,Decision Tree Undersampled,99.5/0.5,99.622724,0.752842,3.828014
2,Decision Tree Oversampled,50/50,99.560355,0.690473,17.463892
3,Regression Tree Stratified,99.83/0.17,99.457015,0.587133,8.095889
4,Regression Tree Undersampled,99.5/0.5,99.622724,0.752842,3.162646
5,Regression Tree Oversampled,50/50,99.560355,0.690473,15.05085
6,Profit decision Tree Stratified,99.83/0.17,99.350504,0.480622,18.84623
7,Profit decision Tree Undersampled,99.5/0.5,99.394249,0.524367,6.501151
8,Profit decision Tree Oversampled,50/50,95.984568,-2.885314,69.739973
9,Adaptive decision Tree Stratified,99.83/0.17,99.595055,0.725173,12.916883


Out of curiosity, below's the results obtained from the random_seed 0 to 49.

|    |   DT nor |   DT u 99.5/0.5 |   DT o 50/50 |   PDT nor |   PDT u 99.5/0.5 |   PDT o 50/50 |   ADT nor |   ADT u 99.5/0.5 |   ADT o 50/50 |
|---:|---------:|----------------:|-------------:|----------:|-----------------:|--------------:|----------:|-----------------:|--------------:|
|  0 |  99.7301 |         99.1881 |      99.6991 |   99.748  |          99.7426 |       89.1181 |   99.7467 |          99.7411 |        95.227 |
|  1 |  99.64   |         99.2741 |      99.6548 |   99.625  |          99.6179 |       94.2343 |   99.7011 |          99.7175 |        96.737 |
|  2 |  99.6682 |         99.6345 |      99.6106 |   99.6687 |          99.6712 |       94.6856 |   99.7    |          99.7006 |        95.689 |
|  3 |  99.652  |         99.6018 |      99.669  |   99.5422 |          99.7709 |       95.0178 |   99.7716 |          99.4986 |        96.511 |
|  4 |  99.6367 |         99.4597 |      99.5893 |   99.4588 |          99.4657 |       96.1503 |   99.4606 |          99.5803 |        95.314 |
|  5 |  99.791  |         99.7742 |      99.7653 |   99.6694 |          99.6958 |       94.4567 |   99.6484 |          99.5883 |        89.073 |
|  6 |  99.6482 |         99.6165 |      99.4745 |   99.5804 |          99.655  |       93.9905 |   99.6572 |          99.6112 |        97.518 |
|  7 |  99.7356 |         99.5822 |      99.6995 |   99.7407 |          99.7629 |       94.4608 |   99.7713 |          99.7306 |        93.443 |
|  8 |  99.6543 |         99.6867 |      99.6002 |   99.6483 |          99.672  |       90.7408 |   99.6736 |          99.6674 |        97.51  |
|  9 |  99.6297 |         99.6345 |      99.7554 |   99.6943 |          99.642  |       95.6586 |   99.6109 |          99.7479 |        95.964 |
| 10 |  99.6192 |         99.7219 |      99.6944 |   99.4608 |          99.7192 |       96.2874 |   99.7129 |          99.6676 |        95.482 |
| 11 |  99.797  |         99.8037 |      99.8406 |   99.7898 |          99.4642 |       94.4405 |   99.7456 |          99.8469 |        92.837 |
| 12 |  99.7183 |         99.4518 |      99.7827 |   99.5768 |          99.8184 |       97.1549 |   99.5879 |          99.6518 |        96.97  |
| 13 |  99.4897 |         99.5844 |      99.5691 |   99.5227 |          99.2448 |       93.031  |   99.5707 |          99.5694 |        93.04  |
| 14 |  99.7173 |         99.6578 |      99.57   |   99.729  |          99.7605 |       96.8611 |   99.7284 |          99.7794 |        97.056 |
| 15 |  99.6356 |         99.4956 |      99.6826 |   99.3488 |          99.4709 |       95.0921 |   99.5678 |          99.7105 |        96.762 |
| 16 |  99.7673 |         99.6223 |      99.466  |   99.7389 |          99.7598 |       97.0387 |   99.7695 |          99.768  |        97.898 |
| 17 |  99.8604 |         99.8257 |      99.8032 |   99.8204 |          99.8716 |       96.0393 |   99.8716 |          99.8501 |        95.328 |
| 18 |  99.6937 |         99.6017 |      99.6702 |   99.4115 |          99.5572 |       89.2005 |   99.7124 |          99.0342 |        97.008 |
| 19 |  99.6535 |         99.5686 |      99.5902 |   99.5195 |          99.5138 |       95.3896 |   99.5186 |          99.6408 |        96.724 |
| 20 |  99.1955 |         99.623  |      99.634  |   99.6779 |          99.6808 |       95.1217 |   99.6232 |          99.1118 |        97.077 |
| 21 |  99.744  |         99.5717 |      99.6026 |   99.7254 |          99.7033 |       86.5212 |   99.7305 |          99.7302 |        90.305 |
| 22 |  99.8137 |         99.7774 |      99.7731 |   99.6501 |          99.7839 |       94.0149 |   99.6793 |          99.6903 |        96.752 |
| 23 |  99.7474 |         99.5769 |      99.6773 |   99.6095 |          99.6494 |       97.1694 |   99.7321 |          99.6116 |        97.47  |
| 24 |  99.5852 |         99.4448 |      99.4644 |   99.4774 |          99.5863 |       83.9322 |   99.6189 |          99.6115 |        92.869 |
| 25 |  99.6152 |         99.5507 |      99.7118 |   99.6755 |          99.7777 |       86.3807 |   99.6959 |          99.7768 |        91.98  |
| 26 |  99.0925 |         99.0727 |      99.5354 |   99.5872 |          99.6651 |       95.133  |   99.6361 |          99.6727 |        95.828 |
| 27 |  99.7002 |         99.7004 |      99.6788 |   99.6622 |          99.7493 |       89.8888 |   99.6937 |          99.7296 |        92.995 |
| 28 |  99.8685 |         99.7925 |      99.8253 |   99.8137 |          99.8928 |       95.4846 |   99.8755 |          99.8692 |        96.87  |
| 29 |  99.7285 |         99.5864 |      99.6917 |   99.3389 |          99.2569 |       95.2842 |   99.5483 |          99.7565 |        96.336 |
| 30 |  99.5353 |         99.7104 |      99.6669 |   99.7495 |          99.7834 |       98.6264 |   99.6219 |          99.7955 |        98.541 |
| 31 |  99.5834 |         99.6997 |      99.6285 |   99.782  |          99.8017 |       97.3249 |   99.6227 |          99.8017 |        97.156 |
| 32 |  99.4609 |         99.6017 |      99.4933 |   99.4001 |          99.3717 |       95.9972 |   99.5894 |          99.6715 |        97.628 |
| 33 |  99.6261 |         99.6309 |      99.6532 |   99.6585 |          99.6565 |       97.4296 |   99.6642 |          99.1666 |        97.86  |
| 34 |  99.3233 |         99.294  |      99.278  |   99.2599 |          99.3238 |       93.3997 |   99.329  |          99.3649 |        92.914 |
| 35 |  99.7396 |         99.2553 |      99.726  |   99.7479 |          99.2578 |       97.5265 |   99.7735 |          99.2568 |        96.57  |
| 36 |  99.1833 |         99.0303 |      99.6467 |   99.3282 |          99.6622 |       93.0276 |   99.5616 |          99.1865 |        93.302 |
| 37 |  99.7242 |         99.7245 |      99.5939 |   99.4955 |          99.5683 |       90.9791 |   99.5209 |          99.7921 |        95.402 |
| 38 |  99.6719 |         99.7646 |      99.5782 |   99.6973 |          99.7191 |       89.3365 |   99.7527 |          99.7019 |        93.149 |
| 39 |  99.5918 |         99.5574 |      99.4886 |   99.4476 |          99.6078 |       96.3238 |   99.3284 |          99.6658 |        97.129 |
| 40 |  99.5918 |         99.4214 |      99.5385 |   99.4915 |          99.6082 |       96.288  |   99.6413 |          99.6156 |        98.291 |
| 41 |  99.7384 |         99.7458 |      99.6595 |   99.763  |          99.7661 |       86.8692 |   99.7791 |          99.7966 |        94.906 |
| 42 |  99.7101 |         99.7944 |      99.6827 |   99.6442 |          99.7759 |       96.9736 |   99.6292 |          99.6638 |        97.449 |
| 43 |  99.8562 |         99.7774 |      99.7721 |   99.8384 |          99.8626 |       94.3976 |   99.8446 |          99.8492 |        95.683 |
| 44 |  99.8544 |         99.8453 |      99.7593 |   99.877  |          99.807  |       95.5378 |   99.7261 |          99.7537 |        97.496 |
| 45 |  99.1996 |         99.2562 |      99.7155 |   99.7426 |          99.2243 |       90.7599 |   99.7712 |          99.2283 |        92.167 |
| 46 |  99.1275 |         99.1401 |      99.5643 |   99.5739 |          99.6265 |       98.5778 |   99.6468 |          99.617  |        98     |
| 47 |  99.6442 |         99.5428 |      99.6888 |   99.689  |          99.7204 |       96.6418 |   99.7362 |          99.759  |        97.121 |
| 48 |  99.5042 |         99.344  |      99.5811 |   99.4288 |          99.6263 |       98.0893 |   99.4895 |          99.6399 |        96.18  |
| 49 |  99.3346 |         99.4414 |      99.3583 |   99.4138 |          99.407  |       94.4605 |   99.3215 |          99.4754 |        96.9   |

|      Model     | Avg profit |
|:---------------|--------:|
| DT nor         | 99.6166 |
| DT u 99.5/0.5  | 99.5612 |
| DT o 50/50     | 99.6371 |
| PDT nor        | 99.6108 |
| PDT u 99.5/0.5 | 99.636  |
| PDT o 50/50    | 94.1309 |
| ADT nor        | 99.6542 |
| ADT u 99.5/0.5 | 99.6293 |
| ADT o 50/50    | 95.6883 |

On average, the Adaptive decision tree with the original training sample had a sligtly better performance on the final objective.

Because this is a simple decision tree, interpreting the trained model is trivial:

In [80]:
ADT_model.fit(X_test, y_test, X.Amount)
ADT_model.print_tree()

V17 < -2.8235791751326533  --- Profit gain = 0.011195132665124785
├───left:V3 < -0.8507511263669767  --- Profit gain = 0.15033756305255136
│   ├───left:V15 < 1.0052969096306186  --- Profit gain = 0.08456978130369858
│   │   ├───left:1
│   │   └───right:1
│   └───right:0
└───right:V14 < -4.391611307593247  --- Profit gain = 0.002078282086022192
    ├───left:V9 < 0.0493671175035062  --- Profit gain = 0.3854412389193441
    │   ├───left:V7 < 1.019359359831022  --- Profit gain = 0.8997416855020989
    │   │   ├───left:1
    │   │   └───right:0
    │   └───right:0
    └───right:V4 < 1.3043288948700182  --- Profit gain = 0.00044359498945829596
        ├───left:0
        └───right:0
