# Packages

In [1]:
### Import Packages ###
import os
import numpy as np
import math as math
import pandas as pd
from scipy import stats
import random as random
from sklearn.metrics import f1_score

### Append Path ###
# Make the parent of the current working directory importable so that the
# project-local `utils` package can be found. This relies on the kernel's
# working directory being the folder that contains this notebook.
import sys
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)
sys.path.append(parent_dir)

### Add TreeFarms ###
# The `treeFarms` checkout under the parent directory is also put on sys.path;
# this must happen before the `treefarms` import below.
treefarms_dir = os.path.join(parent_dir, 'treeFarms')
sys.path.append(treefarms_dir)

### IMPORT FUNCTIONS ###
# NOTE(review): the star import hides which helpers come from utils.Auxiliary;
# presumably it supplies LoadData (used when loading the data) -- confirm, and
# consider importing the needed names explicitly.
from utils.Auxiliary import *
from treefarms.model.treefarms import TREEFARMS

### GET DIRECTORY ###
# Output location under the working directory. It is not referenced by the
# cells visible in this part of the notebook.
SaveDirectory = os.path.join(cwd, "Results/OptimalThreshold")

# Input

In [2]:
# Experiment inputs: everything a reader might tune lives in this cell.
Seed = 0                     # seeds `random` and NumPy before the train/test split
DataFile = "Bar7"            # dataset identifier handed to LoadData
TestProportion = 0.25        # fraction of rows held out as the test set
regularization = 0.01        # value of the TREEFARMS "regularization" config key
rashomon_bound_adder = 0.03  # TREEFARMS "rashomon_bound_adder"; also the top of the threshold grid

# Set Up

In [3]:
# Load Data #
from sklearn.model_selection import train_test_split

df = LoadData(DataFile)

# Seed the global generators for any later code that draws from them.
random.seed(Seed)
np.random.seed(Seed)

# Train Test Split #
# The seed is passed explicitly so the split does not depend on the state of
# NumPy's global generator: re-running this cell, or running anything that
# draws random numbers beforehand, can no longer change which rows land in the
# test set. Features are every column except "Y"; "Y" is the label.
X_Train, X_Test, y_Train, y_Test = train_test_split(
    df.loc[:, df.columns != "Y"],
    df["Y"],
    test_size=TestProportion,
    random_state=Seed,
)

In [4]:

# Rebuild full train/test frames from the split pieces: copy the feature
# frames (original column names are kept) and put the label back as the
# first column, named "Y".
df_Train, df_Test = X_Train.copy(), X_Test.copy()
df_Train.insert(loc=0, column='Y', value=y_Train)
df_Test.insert(loc=0, column='Y', value=y_Test)


In [5]:

### TRAIN TREEFARMS ###
# Assemble the TREEFARMS configuration from the notebook inputs.
config = {}
config["regularization"] = regularization
config["rashomon_bound_adder"] = rashomon_bound_adder

# Fit on the training features (every column except "Y") against the label,
# then record how many trees the fitted model reports.
TreeFarmsModel = TREEFARMS(config)
_train_feature_mask = df_Train.columns != "Y"
TreeFarmsModel.fit(df_Train.loc[:, _train_feature_mask], df_Train["Y"])
TreeCount = TreeFarmsModel.get_tree_count()


null
treefarms reported successful executionFinding Optimal Objective...
{
  "false": {
    "false": {
      "false": {
        "complexity": 0.009999999776482582,
        "loss": 0.2022315263748169,
        "name": "Y",
        "prediction": 0
      },
      "feature": 9,
      "name": "Bar1",
      "reference": 1,
      "relation": "==",
      "true": {
        "complexity": 0.009999999776482582,
        "loss": 0.0195258017629385,
        "name": "Y",
        "prediction": 1
      },
      "type": "integral"
    },
    "feature": 8,
    "name": "Bar0",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.06694560497999191,
      "name": "Y",
      "prediction": 1
    },
    "type": "integral"
  },
  "feature": 10,
  "model_objective": 0.3349790871143341,
  "name": "Bar2",
  "reference": 1,
  "relation": "==",
  "true": {
    "complexity": 0.009999999776482582,
    "loss": 0.0062761506997048855,
    "name": "Y",
    "predic

In [7]:
# Training accuracy of every tree in the fitted TREEFARMS model, computed as
# 1 - error() on the training data. (A manual alternative is to call predict()
# per tree and take the row-wise mean agreement with df_Train["Y"].)
if TreeCount == 0:
    raise ValueError("TreeFarmsModel contains no trees; cannot compute training accuracy.")

# The feature frame is the same for every tree, so slice it once.
X_Train_Features = df_Train.loc[:, df_Train.columns != "Y"]

# TreeFarms #
TrainingAccuracy = [
    1 - TreeFarmsModel[i].error(X_Train_Features, df_Train["Y"])
    for i in range(TreeCount)
]

# Threshold Values #
# EpsilonVec[i] is how far tree i falls below the best training accuracy.
# TrainingAccuracy stays a list; the array copy makes the subtraction explicit
# instead of relying on NumPy coercing the list on the fly.
TrainingAccuracyArray = np.asarray(TrainingAccuracy)
EpsilonVec = TrainingAccuracyArray.max() - TrainingAccuracyArray

# Grid from 0 up to rashomon_bound_adder in steps of ThresholdStep. It is built
# from integer multiples of the step so that the number of grid points does not
# depend on floating-point rounding of the stop value, which is a known hazard
# of np.arange with a non-integer step.
ThresholdStep = 0.000001
ThresholdValues = np.arange(int(round(rashomon_bound_adder / ThresholdStep)) + 1) * ThresholdStep


In [8]:
# Display the distinct (sorted) training-accuracy values found across the trees.
np.unique(TrainingAccuracy)

array([0.67573222, 0.67642957, 0.67712692, 0.67921897, 0.68131102,
       0.68340307, 0.68410042, 0.68479777, 0.68549512, 0.68619247,
       0.68688982, 0.68758717, 0.68828452, 0.68898187, 0.68967922,
       0.69037657, 0.69107392, 0.69177127, 0.69246862, 0.69316597,
       0.69386332, 0.69456067, 0.69525802, 0.69595537, 0.69665272,
       0.69735007, 0.69804742, 0.69874477, 0.69944212, 0.70013947,
       0.70083682, 0.70153417, 0.70223152, 0.70292887, 0.70362622,
       0.70432357, 0.70502092, 0.70571827, 0.70641562, 0.70711297,
       0.70781032, 0.70850767, 0.70920502, 0.70990237, 0.71059972,
       0.71129707, 0.71199442, 0.71269177, 0.71338912, 0.71408647,
       0.71478382, 0.71548117, 0.71617852, 0.71687587, 0.71757322,
       0.71827057, 0.71896792, 0.71966527, 0.72036262, 0.72105997,
       0.72175732, 0.72245467, 0.72384937, 0.72454672, 0.72524407,
       0.72594142, 0.72663877, 0.72733612, 0.72873082])

In [9]:
### TEST ACCURACY ###
# Per-threshold result accumulators (one entry is appended per threshold).
ModelIndicesVec, Epsilon_F1Score, Epsilon_ClassAccuracy = [], [], []

# State carried between consecutive thresholds: the previously selected model
# indices and the metrics computed for them.
ModelIndicesOld = []
F1ScoreOld = ClassAccuracyOld = None


# Find Threshold 

In [10]:
# Sweep the threshold grid. For each threshold, the ensemble consists of every
# tree whose training accuracy is within `Threshold` of the best tree; the
# ensemble predicts by majority vote (mode) on the test set.

# Reset the accumulators here so that re-running this cell cannot append a
# second copy of the results onto lists left over from a previous run.
ModelIndicesVec = []
Epsilon_F1Score = []
Epsilon_ClassAccuracy = []
ModelIndicesOld = []
F1ScoreOld = None
ClassAccuracyOld = None

# Loop-invariant test features, plus a cache so each tree predicts on the test
# set only once even though it appears in many successive ensembles.
X_Test_Features = df_Test.loc[:, df_Test.columns != "Y"]
TestPredictionCache = {}

for Threshold in ThresholdValues:

    # Filter Models Based on Threshold
    ModelIndices = np.where(EpsilonVec <= Threshold)[0].tolist()

    # Only recalculate F1 and Class Accuracy if the selected models changed
    if ModelIndices == ModelIndicesOld:
        # Use stored values
        F1Score = F1ScoreOld
        ClassAccuracy = ClassAccuracyOld
    else:
        # Test Set Predictions (one row per selected tree, one column per test row)
        for i in ModelIndices:
            if i not in TestPredictionCache:
                TestPredictionCache[i] = TreeFarmsModel[i].predict(X_Test_Features)
        Test_Predictions = pd.DataFrame(
            np.array([TestPredictionCache[i] for i in ModelIndices])
        )
        Test_Predictions.columns = df_Test.index.astype(str)

        # Compute Ensemble Prediction (Mode)
        mode_result = stats.mode(Test_Predictions, axis=0, keepdims=True)
        EnsemblePrediction = pd.Series(mode_result.mode.flatten())
        EnsemblePrediction.index = df_Test.index

        # Compute Metrics
        # NOTE(review): with average='micro' on single-label targets, F1 is
        # numerically equal to accuracy, so Epsilon_F1Score mirrors
        # Epsilon_ClassAccuracy. Confirm whether 'binary' or 'macro' was intended.
        F1Score = float(f1_score(df_Test["Y"], EnsemblePrediction, average='micro'))
        ClassAccuracy = float(np.mean(EnsemblePrediction == df_Test["Y"]))

        # Store Old ModelIndices
        ModelIndicesOld = ModelIndices.copy()
        F1ScoreOld = F1Score
        ClassAccuracyOld = ClassAccuracy

    # Append Metrics
    ModelIndicesVec.append(ModelIndices)
    Epsilon_F1Score.append(F1Score)
    Epsilon_ClassAccuracy.append(ClassAccuracy)

### OUTPUT ###
SimulationResults = {
    "ModelIndicesVec" : ModelIndicesVec,
    "ThresholdValues" : ThresholdValues,
    "Epsilon_F1Score" : Epsilon_F1Score,
    "Epsilon_ClassAccuracy" : Epsilon_ClassAccuracy}

In [12]:
# Display the distinct (sorted) ensemble F1 values recorded across the threshold grid.
np.unique(Epsilon_F1Score)

array([0.67640919, 0.67849687, 0.68058455, 0.68267223, 0.69311065,
       0.69519833, 0.70563674])