In [1]:
### Import Packages ###
# Standard library
import os
import ast
import time
import warnings
import math as math
import random as random

# Third party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

### Local Packages ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *

In [2]:
### Get Directory ###
# Working directory of the kernel and the folder one level above it.
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir))

### DataType ###
# Data set name; it is used below to build the parameter-vector file name
# and is passed to LoadData when the data is not simulated.
DataFileInput = "BostonHousingBinned"

# One Iteration

## Inputs

In [3]:
# TopCModels = 50
# Seed = 1
# TestProportion = 0.2
# CandidateProportion = 0.8
# SelectorType = TreeEnsembleQBCFunction
# ModelType = RandomForestClassificationFunction
# # ModelType = TreeFarmsFunction
# DataArgs = {}
# SelectorArgs = {"TopCModels": 10}
# ModelArgs = {"TopCModels": TopCModels, 
#              "config" : {"regularization": 0.01, 
#              "rashomon_bound_multiplier": 0.05}, 
#              "Seed": Seed,
#              "n_estimators": TopCModels,
#              "Type":"Classification"}


In [4]:
### Parameter Vector ###
# Row k of the parameter-vector CSV supplies the seed, the selector name and
# the string-encoded argument dictionaries for this single iteration.
k=3
ParameterVector = pd.read_csv(os.path.join(ParentDirectory, "Data", "ParameterVectors", "ParameterVector" + DataFileInput + ".csv"))
ParameterRow = ParameterVector.iloc[k]                                      # Read the row once, reuse below
Seed = int(ParameterRow["Seed"])
TestProportion = 0.2
CandidateProportion = 0.8

# Resolve the selector function by name from the notebook namespace.
# Fail fast with a clear message instead of carrying a silent None forward.
SelectorType = globals().get(ParameterRow["SelectorType"], None)
if SelectorType is None:
    raise NameError("SelectorType '" + str(ParameterRow["SelectorType"]) + "' is not defined in the notebook namespace.")

# The CSV's ModelType is intentionally overridden with TreeFarmsFunction here.
# ModelType = globals().get(ParameterVector.iloc[k]["ModelType"], None)
ModelType = TreeFarmsFunction

# Each argument column is decoded with two literal_eval passes; for the
# selector/model columns all square brackets are stripped first.
# NOTE(review): presumably the CSV stores a quoted string of a dict whose
# values are single-element lists -- confirm against the CSV.
DataArgs = ast.literal_eval(ast.literal_eval(ParameterRow["DataArgs"]))
SelectorArgs = ast.literal_eval(ast.literal_eval(ParameterRow["SelectorArgs"].replace("[","").replace("]","")))
ModelArgs = ast.literal_eval(ast.literal_eval(ParameterRow["ModelArgs"].replace("[","").replace("]","")))


# Set Up

In [5]:
### Run Time ###
# Wall-clock start of the run.
StartTime = time.time()

### Set Up ###
# Seed NumPy's and Python's global random generators with the run's seed.
np.random.seed(Seed)
random.seed(Seed)
# Two fresh, independent lists: the error trace and the queried observations.
ErrorVec, SelectedObservationHistory = [], []

In [6]:
### Generate Data ###
# Any data set name other than "Simulate" is loaded through LoadData;
# "Simulate" draws a data set from DataGeneratingProcess using DataArgs.
if DataFileInput != "Simulate":
    df = LoadData(DataFileInput)
else:
    from utils.Main import DataGeneratingProcess                             ### NOTE: Why is this not imported from utils.Main import *
    df = DataGeneratingProcess(**DataArgs)

In [7]:
### Train Test Candidate Split
# Split df into a training frame, a test frame and a candidate frame,
# driven by TestProportion and CandidateProportion.
from utils.Main import TrainTestCandidateSplit                           ### NOTE: Why is this not imported from utils.Main import *
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(df, TestProportion, CandidateProportion)

In [8]:
### Selector Arguments ###
# Seed the selector's inputs in one step. "Model" is a placeholder that the
# prediction-model cell replaces with the trained model.
# NOTE(review): "df_Train" is filled with df_Test here, while the update cell
# later overwrites it with df_Train -- confirm this is intended.
SelectorArgs.update({
    "df_Train": df_Test,                                               # NOTE: Change to df_Test if there is a test set
    "df_Candidate": df_Candidate,
    "Model": "",
})
# SelectorArgsFiltered = FilterArguments(SelectorType, SelectorArgs)

In [9]:
### Model Arguments ###
# The model is trained on the current training frame; this entry is refreshed
# after each query in the update cell.
ModelArgs['df_Train'] = df_Train
# ModelArgsFiltered = FilterArguments(ModelType, ModelArgs)

# Learning Procedure

In [10]:
### Set Up ###
# Reset the error trace and the query history (same initialisation as in the
# earlier set-up cell) so this section can be re-run on its own.
ErrorVec, SelectedObservationHistory = [], []

In [11]:
# Iteration counter for this single, manually stepped iteration; it is only
# used in the progress message of the prediction-model cell.
i=0

In [12]:
### Prediction Model ###
print(f"Iteration: {i}")
# Pass ModelArgs through FilterArguments for ModelType, then build the model.
ModelArgsFiltered = FilterArguments(ModelType, ModelArgs)
Model = ModelType(**ModelArgsFiltered)
# Hand the model to the selector, but only when its arguments have a "Model" slot.
if "Model" in SelectorArgs:
    SelectorArgs['Model'] = Model                                           # NOTE: THIS IS NOT DYNAMIC


Iteration: 0
null
Finding Optimal Objective...
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 83
{
  "false": {
    "complexity": 0.009999999776482582,
    "loss": 0.07500000298023224,
    "name": "Y",
    "prediction": "False"
  },
  "feature": 9,
  "model_objective": 0.1550000011920929,
  "name": "PTRATIO_1",
  "reference": 1.0,
  "relation": "==",
  "true": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.0,
      "name": "Y",
      "prediction": "False"
    },
    "feature": 8,
    "name": "RM_3",
    "reference": 1.0,
    "relation": "==",
    "true": {
      "complexity": 0.009999999776482582,
      "loss": 0.05000000074505806,
      "name": "Y",
      "prediction": "True"
    },
    "type": "rational"
  },
  "type": "rational"
}
{
  "false": {
    "complexity": 0.009999999776482582,
    "loss": 0.03750000149011612,
    "name": "Y",
    "prediction": "False"
  },
  "feature": 8,
  "model_objective": 0.155

In [13]:
# ### Current Error ###
# Evaluate the model on the test frame.
# NOTE(review): len() below requires TestErrorFunction to return a sized
# object even for a single model -- confirm against its implementation.
TestErrorVal = TestErrorFunction(Model, df_Test, ModelArgs["Type"])        # NOTE: Change to df_Test if there is a test set
if len(TestErrorVal) <= 1:
    # One output for non-Rashomon: keep it as is, no per-model errors
    CurrentError, AllErrors = TestErrorVal, [None]
else:
    # Rashomon gives one error per model; record the best one
    AllErrors = TestErrorVal
    CurrentError = float(np.min(TestErrorVal))
SelectorArgs["AllErrors"] = AllErrors                                       # Use AllErrors in RashomonQBC
ErrorVec.append(CurrentError)


---

In [14]:
# Number of lowest-error models to keep, taken from the model arguments.
TopCModels = ModelArgs["TopCModels"]

In [15]:
### Ignore warning (taken care of) ###
# Silence all NumPy floating-point error reports and every UserWarning for the
# rest of the session. Statement order is kept so the cell echoes nothing.
np.seterr(all = 'ignore') 
warnings.filterwarnings("ignore", category=UserWarning)


In [16]:
# Inspect the full AllErrors list set by the current-error cell.
AllErrors

[0.17647058823529416,
 0.17647058823529416,
 0.17647058823529416,
 0.17647058823529416,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.13725490196078427,
 0.17647058823529416,
 0.13725490196078427,
 0.17647058823529416,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.13725490196078427,
 0.13725490196078427,
 0.17647058823529416,
 0.17647058823529416,
 0.13725490196078427,
 0.17647058823529416,
 0.16666666666666663,
 0.16666666666666663,
 0.13725490196078427,
 0.17647058823529416,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.16666666666666663,
 0.12745098039215685,
 0.12745098039215685,
 0.12745098039215685,
 0.12745098039215685,
 0.12745098039215685,
 0.1274509

In [None]:
### Predicted Values ###
# Never ask for more models than there are error entries.
TopCModels = min(TopCModels, len(AllErrors))

In [23]:
# Positions of the TopCModels smallest entries of AllErrors, ordered from
# lowest to highest error.
LowestErrorIndices = np.argsort(AllErrors)[:TopCModels]


In [24]:
# Inspect the selected model indices.
LowestErrorIndices

array([78, 75, 54, 53, 82, 72, 70, 67, 65, 81, 45, 44, 43, 42, 40, 41, 16,
       18, 79, 76, 24, 25, 32, 28, 80, 77, 71, 66, 48, 49, 30,  6, 23,  5,
        4, 22,  7,  8, 50, 15, 21, 20, 60, 10,  9, 12, 31, 34, 35, 36, 11,
       37, 38, 39, 14, 13, 63,  3, 19, 17,  1,  2,  0, 27, 33, 26, 29, 59,
       62, 52, 51, 74, 73, 56, 69, 68, 57, 58, 55, 47, 46, 64, 61])

In [32]:
# Smallest error in AllErrors; compare with the recomputed error of the top-ranked model.
np.min(AllErrors)

np.float64(0.10784313725490191)

In [31]:
### Sanity check: recompute the test error of the lowest-error model ###
# The index is taken from LowestErrorIndices rather than hard-coded from an
# earlier output, so the check still targets the best model after a re-run
# with another seed or data set. It is cast to a plain int, matching the
# integer literal used before.
BestModelIndex = int(LowestErrorIndices[0])
# Misclassification rate on the test frame, predicting from every column except "Y".
np.mean(Model[BestModelIndex].predict(df_Test.loc[:, df_Test.columns != "Y"]) != df_Test["Y"])

np.float64(0.10784313725490197)

In [None]:

### Candidate predictions of the selected models ###
# Predict from the feature columns only. The label column "Y" is excluded, as
# in the test-set check, so the label is never fed to the models. The mask is
# a no-op when df_Candidate has no "Y" column.
X_Candidate = df_Candidate.loc[:, df_Candidate.columns != "Y"]
# One prediction vector per selected model, in ascending-error order.
PredictedValues = [Model[ModelIndex].predict(X_Candidate) for ModelIndex in LowestErrorIndices]

---

In [16]:
### Sampling Procedure ###
# Pass SelectorArgs through FilterArguments for SelectorType, then let the
# selector pick the observation to query.
SelectorArgsFiltered = FilterArguments(SelectorType, SelectorArgs)
QueryObservationIndex = SelectorType(**SelectorArgsFiltered)
# Label-based .loc matches the label-based df_Candidate.drop(QueryObservationIndex)
# in the update cell. This presumes the selector returns an index label rather
# than a position -- TODO confirm against the selector implementation.
QueryObservation = df_Candidate.loc[[QueryObservationIndex]] # or should this be iloc

In [17]:
### Update Train and Candidate Sets ###
# Move the queried observation from the candidate frame to the training frame.
df_Train = pd.concat((df_Train, QueryObservation))
df_Candidate = df_Candidate.drop(index=QueryObservationIndex)

### Update SelectorArgs and ModelArgs ###                                     # NOTE: THIS IS NOT DYNAMIC
# Refresh only the entries that are already present in each dictionary.
if "df_Train" in ModelArgs:
    ModelArgs['df_Train'] = df_Train
if "df_Train" in SelectorArgs:
    SelectorArgs['df_Train'] = df_Train
if "df_Candidate" in SelectorArgs:
    SelectorArgs['df_Candidate'] = df_Candidate

# Save

In [20]:
# ### Return Simulation Parameters ###
# SimulationParameters = {"DataFileInput" : str(DataFileInput),
#                         "Seed" : str(Seed),
#                         "TestProportion" : str(TestProportion),
#                         "CandidateProportion" : str(CandidateProportion),
#                         "SelectorType" : str(SelectorType),
#                         "ModelType" : str(ModelType),
#                         "DataArgs" : str(DataArgs),
#                         # "SelectorArgs" : str(SelectorArgs),
#                         "ModelArgs" : str(FilterArguments(ModelType, ModelArgs).pop('df_Train', None))
#                         }

In [21]:
# ### Return Time ###
# ElapsedTime = time.time() - StartTime

# ### Return Dictionary ###
# SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
#                             "SelectionHistory" : pd.DataFrame(SelectedObservationHistory, columns = ["ObservationID"]),
#                             "SimulationParameters" : SimulationParameters,
#                             "ElapsedTime" : ElapsedTime}