In [1]:
### Import packages ###
import math as math
import os
import random as random
import time

import numpy as np
import pandas as pd

### Import functions ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *
# Explicit import kept last so it takes precedence over the star imports above.
# NOTE(review): the notebook needed this in addition to `from utils.Main import *`;
# the reason is not visible here (possibly an `__all__` in utils.Main) - confirm.
from utils.Main import TrainTestCandidateSplit

# Inputs

In [None]:
### Get Directory ###
# Paths are resolved relative to the parent of the current working directory,
# so the directory the kernel is started in matters for the cells below.
# `os` is imported explicitly in the import cell; it no longer has to leak in
# through one of the `from utils.* import *` statements.
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir))

### DataType ###
# Dataset name; it also selects which "ParameterVector<name>.csv" file is read.
DataFileInput = "BostonHousing"

### Simulation Type ###
# i = 0 # Random forests
# i = 1 # REAL (Duplicates)
# i = 2 # UNREAL (Unique)

In [3]:
### Parameter Vector ###
# Location of the per-dataset parameter table:
#   <ParentDirectory>/Data/ParameterVectors/ParameterVector<DataFileInput>.csv
ParameterVectorPath = os.path.join(
    ParentDirectory,
    "Data",
    "ParameterVectors",
    f"ParameterVector{DataFileInput}.csv",
)
ParameterVector = pd.read_csv(ParameterVectorPath)

### Simulation Config (template: build it from row i of ParameterVector) ###
# SimulationConfigInput = {"DataFileInput": ParameterVector.iloc[i]["Data"],
#                         "Seed": int(ParameterVector.iloc[i]["Seed"]),
#                         "TestProportion": float(ParameterVector.iloc[i]["TestProportion"]),
#                         "CandidateProportion": float(ParameterVector.iloc[i]["CandidateProportion"]),
#                         "SelectorType": str(ParameterVector.iloc[i]["SelectorType"]), 
#                         "ModelType": str(ParameterVector.iloc[i]["ModelType"]), 
#                         # "TopCModels": float(ParameterVector.iloc[i]["TopCModels"]), 
#                         "UniqueErrorsInput": int(ParameterVector.iloc[i]["UniqueErrorsInput"]),
#                         "n_estimators":int(ParameterVector.iloc[i]["n_estimators"]),
#                         "regularization":float(ParameterVector.iloc[i]["regularization"]),
#                         "rashomon_bound_adder":float(ParameterVector.iloc[i]["rashomon_bound_adder"]),
#                         "Type":ParameterVector.iloc[i]["Type"]
#                         }

# Hard-coded configuration for a single run (used instead of the commented-out
# template that reads row i of ParameterVector).
SimulationConfigInput = dict(
    DataFileInput="BostonHousing",           # dataset name handed to LoadData
    Seed=1,                                  # seeds both `random` and `np.random`
    TestProportion=0.2,                      # passed to TrainTestCandidateSplit
    CandidateProportion=0.8,                 # passed to TrainTestCandidateSplit
    SelectorType="TreeEnsembleQBCFunction",  # selector function, looked up by name
    ModelType="TreeFarmsFunction",           # model function, looked up by name
    UniqueErrorsInput=0,
    n_estimators=100,
    regularization=0.01,
    rashomon_bound_adder=0.01,
    Type="Classification",                   # forwarded to TestErrorFunction
)

# Set Up

In [None]:
### Set Up ###
# Start the wall-clock timer (read again when the results are assembled) and
# seed both random number generators from the configured seed.
StartTime = time.time()
random.seed(SimulationConfigInput["Seed"])
np.random.seed(SimulationConfigInput["Seed"])

### Generate Data ###
df = LoadData(SimulationConfigInput["DataFileInput"])

### Train Test Candidate Split ###
# TrainTestCandidateSplit is imported explicitly in the import cell at the top
# of the notebook rather than in the middle of this cell.
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(df, SimulationConfigInput["TestProportion"], SimulationConfigInput["CandidateProportion"])

### Update SimulationConfig Arguments ###
# The three splits travel with the configuration so that the model and selector
# functions can receive them through FilterArguments.
SimulationConfigInput["df_Train"] = df_Train
SimulationConfigInput["df_Test"] = df_Test                                     # Held-out set passed to TestErrorFunction
SimulationConfigInput["df_Candidate"] = df_Candidate


# Learning Procedure

In [8]:
### Input ###
# Work on a shallow copy: the loop rebinds "df_Train" / "df_Candidate" (and adds
# "Model" / "AllErrors"), and with a plain alias those writes would land in
# SimulationConfigInput itself, so re-running this cell would start from an
# already exhausted candidate pool instead of the original split.
SimulationConfigInputUpdated = SimulationConfigInput.copy()

### Set Up ###
ErrorVec = []                                                                                    # One (best) test error per iteration
AllErrorVec = []                                                                                 # Per-iteration error collections (TREEFARMS models only)
SelectedObservationHistory = []                                                                  # Queried candidate index per iteration
RashomonCommitteeDict = {"AllModelsInRashomonSet": [], "UniqueModelsInRashomonSet": []}

### Resolve Model and Selector Functions ###
# Both are looked up by name in the notebook namespace. The names do not change
# inside the loop, so resolve them once and fail early with a clear message.
ModelType = globals().get(SimulationConfigInputUpdated["ModelType"], None)
SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"], None)
if ModelType is None:
    raise NameError("ModelType '" + str(SimulationConfigInputUpdated["ModelType"]) + "' is not defined in the notebook namespace")
if SelectorType is None:
    raise NameError("SelectorType '" + str(SimulationConfigInputUpdated["SelectorType"]) + "' is not defined in the notebook namespace")

# One iteration per initial candidate: each pass moves exactly one observation
# from the candidate set to the training set.
for i in range(len(SimulationConfigInputUpdated["df_Candidate"])):

    ### Prediction Model ###
    print("Iteration: " + str(i))
    ModelArgsFiltered = FilterArguments(ModelType, SimulationConfigInputUpdated)
    Model = ModelType(**ModelArgsFiltered)
    SimulationConfigInputUpdated['Model'] = Model

    ### Current Error ###
    TestErrorVal = TestErrorFunction(Model, SimulationConfigInputUpdated["df_Test"], SimulationConfigInputUpdated["Type"])
    if('TREEFARMS' in str(type(Model))):                                                         # If Rashomon
        # NOTE(review): TestErrorVal is treated as a collection of errors here
        # (min / set are applied to it) - presumably one per tree; confirm.
        AllErrors = TestErrorVal                                                                 # All errors of Rashomon
        AllErrorVec.append(AllErrors)
        CurrentError = min(AllErrors)                                                            # Extract the best one
        RashomonCommitteeDict["AllModelsInRashomonSet"].append(Model.get_tree_count())           # Store number of trees
        RashomonCommitteeDict["UniqueModelsInRashomonSet"].append(len(set(AllErrors)))           # Number of distinct error values
    else:
        CurrentError = TestErrorVal                                                              # One output for non-Rashomon
        AllErrors = [None]
    SimulationConfigInputUpdated["AllErrors"] = AllErrors                                        # Use AllErrors in RashomonQBC
    ErrorVec.append(CurrentError)

    print("CURRENT ERROR: ")
    print(CurrentError)
    print("---")

    ### Sampling Procedure ###
    SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)
    QueryObservationIndex = SelectorType(**SelectorArgsFiltered)
    # Label-based lookup, consistent with the label-based .drop() below.
    # NOTE(review): assumes the selector returns an index label, not a position - confirm.
    QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[[QueryObservationIndex]]
    SelectedObservationHistory.append(QueryObservationIndex)

    ### Update Train and Candidate Sets ###
    SimulationConfigInputUpdated["df_Train"] = pd.concat([SimulationConfigInputUpdated["df_Train"], QueryObservation])
    SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(QueryObservationIndex)


Iteration: 2
null
Finding Optimal Objective...
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 481
{
  "false": {
    "false": {
      "false": {
        "complexity": 0.009999999776482582,
        "loss": 0.02500000037252903,
        "name": "Y",
        "prediction": "False"
      },
      "feature": 12,
      "name": "LSTAT_1",
      "reference": 1.0,
      "relation": "==",
      "true": {
        "complexity": 0.009999999776482582,
        "loss": 0.03750000149011612,
        "name": "Y",
        "prediction": "True"
      },
      "type": "rational"
    },
    "feature": 4,
    "name": "NOX_2",
    "reference": 1.0,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.012500000186264515,
      "name": "Y",
      "prediction": "False"
    },
    "type": "rational"
  },
  "feature": 11,
  "model_objective": 0.125,
  "name": "PTRATIO_3",
  "reference": 1.0,
  "relation": "==",
  "true": {
    "

In [None]:

# NOTE(review): this cell repeats the tail of one loop iteration from the
# Learning Procedure cell (report error, query one observation, update the
# sets). It looks like a manual-stepping / debugging leftover - confirm whether
# it should stay in the notebook.
print("CURRENT ERROR: ")
print(CurrentError)
print("---")

### Sampling Procedure ###
# Once the learning loop has run to completion the candidate set is empty, so
# only query when there is still something to select from.
if len(SimulationConfigInputUpdated["df_Candidate"]) == 0:
    print("Candidate set is empty - no observation queried.")
else:
    SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"], None)
    SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)
    QueryObservationIndex = SelectorType(**SelectorArgsFiltered)
    # Label-based lookup, consistent with the label-based .drop() below.
    QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[[QueryObservationIndex]]
    SelectedObservationHistory.append(QueryObservationIndex)

    ### Update Train and Candidate Sets ###
    SimulationConfigInputUpdated["df_Train"] = pd.concat([SimulationConfigInputUpdated["df_Train"], QueryObservation])
    SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(QueryObservationIndex)


# Save

In [11]:
### Return Simulation Parameters ###
# Scalar settings of this run, stringified for storage alongside the results.
# Every entry, including "Type", is read from SimulationConfigInput so the
# record cannot drift from the configuration that was actually used.
SimulationParameters = {
    Key: str(SimulationConfigInput[Key])
    for Key in (
        "DataFileInput",
        "Seed",
        "TestProportion",
        "CandidateProportion",
        "SelectorType",
        "ModelType",
        "UniqueErrorsInput",
        "n_estimators",
        "regularization",
        "rashomon_bound_adder",
        "Type",
    )
}

In [12]:
### Return Time ###
# Seconds elapsed since StartTime was recorded in the Set Up cell.
ElapsedTime = time.time() - StartTime

### Return Dictionary ###
# Bundle the per-iteration errors, the queried observation ids, the run
# parameters and the elapsed time into a single result object.
SimulationResults = dict(
    ErrorVec=pd.DataFrame(ErrorVec, columns=["Error"]),
    SelectionHistory=pd.DataFrame(SelectedObservationHistory, columns=["ObservationID"]),
    SimulationParameters=SimulationParameters,
    ElapsedTime=ElapsedTime,
)