# Packages

In [1]:
### Import Packages ###
# Standard library
import argparse
import math as math
import os
import random as random
import sys

# Third-party
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import f1_score

### Append Path ###
# Must run before the project imports below so that `utils` and `treeFarms` resolve.
sys.path.append('..')

### IMPORT FUNCTIONS ###
from utils.Auxiliary import *
from utils.Main import TrainTestCandidateSplit
from treeFarms.treefarms.model.treefarms import TREEFARMS

### GET DIRECTORY ###
cwd = os.getcwd()
SaveDirectory = os.path.join(cwd, "Results/OptimalThreshold")


# Input

In [24]:
### SET UP ###

# Input #
DataFile =  "BankNote"
rashomon_bound_adder = .08
regularization = 0.01
TestProportion = 0.2
CandidateProportion = 0.8
Seed = 1


# Set Up

In [25]:

# Load Data #
df = LoadData(DataFile)
random.seed(Seed)
np.random.seed(Seed)

# Train Test Candidate Split #
from utils.Main import TrainTestCandidateSplit
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(df, TestProportion, CandidateProportion)

### TRAIN TREEFARMS ###
# TreeFarms #
config = {"regularization": regularization, "rashomon_bound_adder": rashomon_bound_adder}
TreeFarmsModel = TREEFARMS(config)
TreeFarmsModel.fit(df_Train.loc[:, df_Train.columns != "Y"], df_Train["Y"])
TreeCount = TreeFarmsModel.get_tree_count()


null
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 665949
Finding Optimal Objective...
{
  "false": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "Y",
      "prediction": 0
    },
    "feature": 0,
    "name": "variance_leq_-3",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "Y",
      "prediction": 1
    },
    "type": "integral"
  },
  "feature": 5,
  "model_objective": 0.03999999910593033,
  "name": "skewness_leq_5",
  "reference": 1,
  "relation": "==",
  "true": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "Y",
      "prediction": 0
    },
    "feature": 3,
    "name": "variance_leq_1",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "Y",
      "prediction": 1
    },
    "typ

In [None]:

# Duplicate and Unique #
PredictionArray_Duplicate = pd.DataFrame(np.array([TreeFarmsModel[i].predict(df_Train.loc[:, df_Train.columns != "Y"]) for i in range(TreeCount)]))
PredictionArray_Unique = pd.DataFrame(PredictionArray_Duplicate).drop_duplicates(keep='first', ignore_index=False)
TrueValues = df_Train["Y"].to_numpy()
PredictionArray = PredictionArray_Unique

### TRAINING ACCURACY ###
# Training Accuracy #
TreeClassificationAccuracy = PredictionArray.eq(TrueValues, axis=1).mean(axis=1)
BestAccuracy = float(np.max(TreeClassificationAccuracy))

# Threshold Values #
EpsilonVec = BestAccuracy - TreeClassificationAccuracy
MinEpsilon = float(np.min(EpsilonVec))
MaxEpsilon = float(np.max(EpsilonVec))
# ThresholdValues = np.linspace(MinEpsilon, MaxEpsilon, 10000)
ThresholdValues = np.arange(MinEpsilon, MaxEpsilon + 0.000001, 0.000001)

### TEST ACCURACY ###
# Set Up #
ModelIndicesVec = []
Epsilon_F1Score = []
Epsilon_ClassAccuracy = []
ModelIndicesOld = []  # Initialize as empty list
F1ScoreOld = None
ClassAccuracyOld = None


# Find Threshold 

In [4]:
for Threshold in ThresholdValues:

    # Filter Models Based on Threshold
    ModelIndices = EpsilonVec[EpsilonVec <= Threshold].index.tolist()

    # Only recalculate F1 and Class Accuracy if new models were added
    if ModelIndices == ModelIndicesOld:
        # Use stored values
        F1Score = F1ScoreOld
        ClassAccuracy = ClassAccuracyOld
    else: 
        # Test Set Predictions
        Test_Predictions = pd.DataFrame(
            np.array([TreeFarmsModel[i].predict(df_Test.loc[:, df_Test.columns != "Y"]) for i in ModelIndices])
        )
        Test_Predictions.columns = df_Test.index.astype(str)

        # Compute Ensemble Prediction (Mode)
        mode_result = stats.mode(Test_Predictions, axis=0, keepdims=True)
        EnsemblePrediction = pd.Series(mode_result.mode.flatten())
        EnsemblePrediction.index = df_Test.index

        # Compute Metrics
        F1Score = float(f1_score(df_Test["Y"], EnsemblePrediction, average='micro'))
        ClassAccuracy = float(np.mean(EnsemblePrediction == df_Test["Y"]))

        # Store Old ModelIndices
        ModelIndicesOld = ModelIndices.copy()
        F1ScoreOld = F1Score
        ClassAccuracyOld = ClassAccuracy

    # Append Metrics
    ModelIndicesVec.append(ModelIndices)
    Epsilon_F1Score.append(F1Score)
    Epsilon_ClassAccuracy.append(ClassAccuracy)

### OUTPUT ###
SimulationResults = {
    "ModelIndicesVec" : ModelIndicesVec,
    "ThresholdValues" : ThresholdValues,
    "Epsilon_F1Score" : Epsilon_F1Score,
    "Epsilon_ClassAccuracy" : Epsilon_ClassAccuracy}