In [1]:
### Import packages ###
import math as math
import os
import random as random
import time

import numpy as np
import pandas as pd

### Import functions ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *

# Inputs

In [2]:
### Directories ###
# The project root is taken to be the parent of the notebook's working directory.
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir))

### Data Set ###
# Name of the data set; also used to build the parameter-vector file name.
DataFileInput = "BreastCancer"

### Simulation Type (currently disabled) ###
# SimV selects a row of the parameter-vector CSV when the CSV-driven config is used:
# SimV = 0 # Random forests
# SimV = 1 # REAL (Duplicates)
# SimV = 2 # UNREAL (Unique)

In [3]:
### Parameter Vector ###
# Read <ParentDirectory>/Data/ParameterVectors/ParameterVector<DataFileInput>.csv.
# NOTE: the table is loaded but not consumed while the hard-coded config below is active.
ParameterVector = pd.read_csv(
    os.path.join(
        ParentDirectory,
        "Data",
        "ParameterVectors",
        "".join(["ParameterVector", DataFileInput, ".csv"]),
    )
)

### Simulation Config (CSV-driven alternative, disabled) ###
# Re-enable together with a SimV row selector to take every setting from ParameterVector.
# SimulationConfigInput = {"DataFileInput": ParameterVector.iloc[SimV]["Data"],
#                         "Seed": int(ParameterVector.iloc[SimV]["Seed"]),
#                         "TestProportion": float(ParameterVector.iloc[SimV]["TestProportion"]),
#                         "CandidateProportion": float(ParameterVector.iloc[SimV]["CandidateProportion"]),
#                         "SelectorType": str(ParameterVector.iloc[SimV]["SelectorType"]),
#                         "ModelType": str(ParameterVector.iloc[SimV]["ModelType"]),
#                         # "TopCModels": float(ParameterVector.iloc[SimV]["TopCModels"]),
#                         "UniqueErrorsInput": int(ParameterVector.iloc[SimV]["UniqueErrorsInput"]),
#                         "n_estimators": int(ParameterVector.iloc[SimV]["n_estimators"]),
#                         "regularization": float(ParameterVector.iloc[SimV]["regularization"]),
#                         "rashomon_bound_adder": float(ParameterVector.iloc[SimV]["rashomon_bound_adder"]),
#                         "Type": ParameterVector.iloc[SimV]["Type"]
#                         }

### Simulation Config (hard-coded, active) ###
SimulationConfigInput = {
    "DataFileInput": DataFileInput,
    "Seed": 1,
    "TestProportion": 0.2,
    "CandidateProportion": 0.8,
    "SelectorType": "TreeEnsembleQBCFunction",   # name of the selector function, resolved at run time
    "ModelType": "TreeFarmsFunction",            # name of the model function, resolved at run time
    "UniqueErrorsInput": 0,
    "n_estimators": 100,
    "regularization": 0.01,
    "rashomon_bound_adder": 0.01,
    "Type": "Classification",
}

# Set Up

In [4]:
### Timer and Seeds ###
StartTime = time.time()                                   # start of the timed run
np.random.seed(SimulationConfigInput["Seed"])             # seed NumPy's global RNG
random.seed(SimulationConfigInput["Seed"])                # seed Python's global RNG

### Load Data ###
df = LoadData(SimulationConfigInput["DataFileInput"])

### Split into Train / Test / Candidate ###
# NOTE: explicit import kept on purpose; the wildcard import of utils.Main
# apparently does not expose this function (open question from the original author).
from utils.Main import TrainTestCandidateSplit
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(
    df,
    SimulationConfigInput["TestProportion"],
    SimulationConfigInput["CandidateProportion"],
)

### Attach the Splits to the Config ###
# NOTE (original author): change df_Test if there is a dedicated test set.
SimulationConfigInput.update(
    {
        "df_Train": df_Train,
        "df_Test": df_Test,
        "df_Candidate": df_Candidate,
    }
)


# Learning Procedure

In [5]:
### Input ###
# Work on a shallow copy: the learning loop reassigns the "df_Train", "df_Candidate"
# and "Model" keys every iteration. With a plain alias those writes would also land in
# SimulationConfigInput, so re-running this cell would not restore the initial split.
SimulationConfigInputUpdated = SimulationConfigInput.copy()

### Set Up ###
ErrorVec = []                                                          # test error recorded at each iteration
AllErrorVec = []                                                       # NOTE(review): not filled anywhere in the visible cells
SelectedObservationHistory = []                                        # index of the observation queried at each iteration
RashomonCommitteeDict = {"AllTreeCount": [], "UniqueTreeCount": []}    # per-iteration tree counts (TREEFARMS models only)

In [None]:
### Resolve Model and Selector Functions ###
# The config stores the *names* of the model and selector functions; they are looked up in
# the notebook's global namespace (presumably filled by the `from utils.* import *` lines).
# Both lookups are loop-invariant, so do them once and fail fast with a clear message
# instead of hitting an opaque error on `None` inside the loop.
ModelType = globals().get(SimulationConfigInputUpdated["ModelType"], None)
SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"], None)
if not callable(ModelType):
    raise NameError("ModelType '" + str(SimulationConfigInputUpdated["ModelType"]) + "' is not a callable in the notebook namespace.")
if not callable(SelectorType):
    raise NameError("SelectorType '" + str(SimulationConfigInputUpdated["SelectorType"]) + "' is not a callable in the notebook namespace.")

# One iteration per candidate observation: fit, score on the test set, query one
# observation, then move it from the candidate set to the training set.
for i in range(len(SimulationConfigInputUpdated["df_Candidate"])):
    ### Prediction Model ###
    print("Iteration: " + str(i))
    ModelArgsFiltered = FilterArguments(ModelType, SimulationConfigInputUpdated)
    Model = ModelType(**ModelArgsFiltered)
    SimulationConfigInputUpdated['Model'] = Model                      # made available to the selector via the config
    IsRashomon = 'TREEFARMS' in str(type(Model))                       # Rashomon (TREEFARMS) models report extra outputs

    ### Test Error ###
    # Task type now comes from the config instead of a hard-coded "Classification".
    TestErrorOutput = TestErrorFunction(InputModel = Model,
                                        df_Test = SimulationConfigInputUpdated["df_Test"],
                                        Type = SimulationConfigInputUpdated["Type"])
    if IsRashomon:
        # NOTE (open question from the original author): should the ensemble error follow
        # "UniqueErrorsInput" (1 -> "Error_Unique", 0 -> "Error_Duplicate")? Until decided,
        # the duplicate-based error is always used.
        CurrentError = TestErrorOutput["Error_Duplicate"]
    else:
        CurrentError = TestErrorOutput["ErrorVal"]                     # One output for non-Rashomon
    ErrorVec.append(CurrentError)

    ### Sampling Procedure ###
    SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)
    SelectorFuncOutput = SelectorType(**SelectorArgsFiltered)
    QueryObservationIndex = SelectorFuncOutput["IndexRecommendation"]
    # Label-based lookup (.loc), consistent with the label-based drop() below.
    # Assumes the selector returns an index *label*, not a position -- TODO confirm.
    QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[[QueryObservationIndex]]
    SelectedObservationHistory.append(QueryObservationIndex)

    ### Update Train and Candidate Sets ###
    SimulationConfigInputUpdated["df_Train"] = pd.concat([SimulationConfigInputUpdated["df_Train"], QueryObservation])
    SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(QueryObservationIndex)

    ### Store Number of (Unique) Trees ###
    if IsRashomon:
        RashomonCommitteeDict["AllTreeCount"].append(SelectorFuncOutput["AllTreeCount"])          # Store number of trees
        RashomonCommitteeDict["UniqueTreeCount"].append(SelectorFuncOutput["UniqueTreeCount"])    # Store number of unique/duplicate trees


Iteration: 0
null
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 141
Finding Optimal Objective...
{
  "false": {
    "false": {
      "false": {
        "complexity": 0.009999999776482582,
        "loss": 0.018018018454313278,
        "name": "Y",
        "prediction": 1
      },
      "feature": 8,
      "name": "NormalNucleoli_1",
      "reference": 1,
      "relation": "==",
      "true": {
        "complexity": 0.009999999776482582,
        "loss": 0.018018018454313278,
        "name": "Y",
        "prediction": 0
      },
      "type": "integral"
    },
    "feature": 5,
    "name": "SingleEpithelialCellSize_2",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.009009009227156639,
      "name": "Y",
      "prediction": 0
    },
    "type": "integral"
  },
  "feature": 7,
  "model_objective": 0.09405405819416046,
  "name": "BareNuclei_10",
  "reference": 1,
  "relation"

# Save

In [None]:
### Return Simulation Parameters ###
# String snapshot of the scalar settings used for this run. Every entry, including
# "Type", is read from SimulationConfigInput so the record cannot drift from the
# configuration that actually ran.
SimulationParameters = {"DataFileInput" : str(SimulationConfigInput["DataFileInput"]),
                            "Seed" : str(SimulationConfigInput["Seed"]),
                            "TestProportion" : str(SimulationConfigInput["TestProportion"]),
                            "CandidateProportion" : str(SimulationConfigInput["CandidateProportion"]),
                            "SelectorType" :  str(SimulationConfigInput["SelectorType"]),
                            "ModelType" :  str(SimulationConfigInput["ModelType"]),
                            'UniqueErrorsInput': str(SimulationConfigInput["UniqueErrorsInput"]),
                            'n_estimators': str(SimulationConfigInput["n_estimators"]),
                            'regularization': str(SimulationConfigInput["regularization"]),
                            'rashomon_bound_adder': str(SimulationConfigInput["rashomon_bound_adder"]),
                            'Type': str(SimulationConfigInput["Type"]),
                            }

In [None]:
### Return Time ###
# Seconds elapsed since StartTime was recorded in the set-up cell.
ElapsedTime = time.time() - StartTime

### Return Dictionary ###
# Bundle everything produced by the run. "TreeCount" is an additive key: it carries the
# per-iteration tree counts gathered in RashomonCommitteeDict during the loop, which were
# otherwise collected and then discarded (both lists stay empty for non-TREEFARMS models).
SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
                            "SelectionHistory" : pd.DataFrame(SelectedObservationHistory, columns = ["ObservationID"]),
                            "TreeCount" : RashomonCommitteeDict,
                            "SimulationParameters" : SimulationParameters,
                            "ElapsedTime" : ElapsedTime}