In [1]:
### Import packages ###
import os
import time
import numpy as np
import math as math
import pandas as pd
import random as random
from scipy.spatial.distance import cdist
from sklearn.cluster import AgglomerativeClustering

### Append Path ###
import sys
sys.path.append('..')

### Import functions ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *

# Inputs

In [2]:
### Resolve Directories ###
# ParentDirectory is the absolute path two levels above the current working directory.
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))

### Dataset Name ###
# Options: BankNote  Bar7  BreastCancer  CarEvaluation  COMPAS  FICO  Haberman  Iris  MONK1  MONK3
DataFileInput = "BankNote"

In [3]:
### Parameter Vector (disabled template) ###
# The commented block below builds one configuration per row of a parameter-vector CSV.
# It is kept for reference only; this notebook uses the hand-written configuration further down.
# ParameterVector = pd.read_csv(os.path.join(ParentDirectory, "Data", "ParameterVectors", "ParameterVector" + DataFileInput + ".csv"))

# ### Parameter Vector ###
# for SimV in range(0,len(ParameterVector)):
#     SimulationConfigInput = {"DataFileInput": ParameterVector.iloc[SimV]["Data"],
#                             "Seed": int(ParameterVector.iloc[SimV]["Seed"]),
#                             "TestProportion": float(ParameterVector.iloc[SimV]["TestProportion"]),
#                             "CandidateProportion": float(ParameterVector.iloc[SimV]["CandidateProportion"]),
#                             "SelectorType": str(ParameterVector.iloc[SimV]["SelectorType"]),
#                             "ModelType": str(ParameterVector.iloc[SimV]["ModelType"]),
#                             "UniqueErrorsInput": int(ParameterVector.iloc[SimV]["UniqueErrorsInput"]),
#                             "n_estimators":int(ParameterVector.iloc[SimV]["n_estimators"]),
#                             "regularization":float(ParameterVector.iloc[SimV]["regularization"]),
#                             "RashomonThresholdType":ParameterVector.iloc[SimV]["RashomonThresholdType"],
#                             "RashomonThreshold":float(ParameterVector.iloc[SimV]["RashomonThreshold"]),
#                             "Type":ParameterVector.iloc[SimV]["Type"]
#                             }

### Manual Simulation Configuration ###
# Keys are inserted in the same order as the template above, followed by the two batch settings.
SimulationConfigInput = dict(
    DataFileInput=DataFileInput,
    Seed=1,
    TestProportion=0.2,
    CandidateProportion=0.8,
    SelectorType='BatchQBCDiversityFunction',   # Options: BatchQBCDiversityFunction, PassiveLearning
    ModelType='TreeFarmsFunction',              # Options: RandomForestClassificationFunction, TreeFarmsFunction
    UniqueErrorsInput=0,
    n_estimators=100,
    regularization=0.01,
    RashomonThresholdType="Adder",
    RashomonThreshold=0.015,
    Type='Classification',
    DiversityWeight=0.4,
    BatchSize=10,
)



In [4]:
### Set Up ###
StartTime = time.time()                                   # Wall-clock start; ElapsedTime is measured from here
np.random.seed(SimulationConfigInput["Seed"])
random.seed(SimulationConfigInput["Seed"])

### Load Data ###
df = LoadData(SimulationConfigInput["DataFileInput"])

### Train Test Candidate Split ###
from utils.Main import TrainTestCandidateSplit            # NOTE: imported explicitly; unclear why `from utils.Main import *` does not expose it
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(
    df,
    SimulationConfigInput["TestProportion"],
    SimulationConfigInput["CandidateProportion"],
)

### Batch Active Learning Metrics ###
# Set Up: feature matrices hold every column except the response "Y" #
X_Train = df_Train.loc[:, ~df_Train.columns.isin(["Y"])]
X_Candidate = df_Candidate.loc[:, ~df_Candidate.columns.isin(["Y"])]

# Clustering: average-linkage agglomerative clustering of the candidate features into 5 clusters #
cluster = AgglomerativeClustering(n_clusters=5, linkage="average")
ClusterLabels = cluster.fit_predict(X_Candidate)
df_Candidate["ClusterLabels"] = ClusterLabels

# Diversity Metric: Euclidean distance from each candidate row to its closest training row #
d_nmX = cdist(X_Candidate, X_Train, metric="euclidean")   # Rows: candidates, columns: training rows
d_nX = np.min(d_nmX, axis=1)
df_Candidate["d_nX"] = d_nX

### Update SimulationConfig Arguments ###
SimulationConfigInput.update(df_Train=df_Train, df_Test=df_Test, df_Candidate=df_Candidate)

# Learning Procedure

In [5]:
### Input ###
# Work on a shallow copy of the configuration rather than an alias. The learning-procedure cells
# reassign the "df_Train", "df_Candidate" and "Model" entries; with a copy those reassignments no
# longer overwrite SimulationConfigInput, so re-running this cell restores the starting state.
SimulationConfigInputUpdated = SimulationConfigInput.copy()

In [6]:
### Set Up ###
# Accumulators filled by the learning-procedure cells: test errors, queried indices, tree counts.
ErrorVec, SelectedObservationHistory = [], []
TreeCount = dict(AllTreeCount=[], UniqueTreeCount=[])

In [7]:
i = 1   # Iteration counter reported by the prediction-model cell

In [8]:
### Prediction Model ###
print("Iteration: " + str(i))

# Resolve the model-fitting function from the notebook namespace by its configured name.
ModelType = globals().get(SimulationConfigInputUpdated["ModelType"], None)
if not callable(ModelType):
    # Fail here with a clear message; otherwise a misspelled name surfaces as an opaque error below.
    raise NameError("ModelType '" + str(SimulationConfigInputUpdated["ModelType"]) + "' is not a callable defined in this notebook's namespace.")

# FilterArguments presumably keeps only the configuration entries that ModelType accepts - TODO confirm.
ModelArgsFiltered = FilterArguments(ModelType, SimulationConfigInputUpdated)
Model = ModelType(**ModelArgsFiltered)

# Store the fitted model in the configuration so later steps can receive it as an argument.
SimulationConfigInputUpdated['Model'] = Model

Iteration: 1
null
Finding Optimal Objective...
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 296
{
  "false": {
    "complexity": 0.009999999776482582,
    "loss": 0.013698630034923553,
    "name": "Y",
    "prediction": 0
  },
  "feature": 3,
  "model_objective": 0.09022830426692963,
  "name": "variance_leq_1",
  "reference": 1,
  "relation": "==",
  "true": {
    "false": {
      "false": {
        "complexity": 0.009999999776482582,
        "loss": 0.004566209856420755,
        "name": "Y",
        "prediction": 0
      },
      "feature": 5,
      "name": "skewness_leq_5",
      "reference": 1,
      "relation": "==",
      "true": {
        "complexity": 0.009999999776482582,
        "loss": 0.031963467597961426,
        "name": "Y",
        "prediction": 1
      },
      "type": "integral"
    },
    "feature": 0,
    "name": "variance_leq_-3",
    "reference": 1,
    "relation": "==",
    "true": {
      "complexity": 0.00999999

In [9]:
# Number of trees held by the fitted model, as reported by its get_tree_count() method.
Model.get_tree_count()

296

In [10]:
### Per-Tree Error on df_Candidate ###
# The feature/response slices do not change between trees, so they are built once.
# NOTE(review): df_Candidate also carries the "ClusterLabels" and "d_nX" helper columns, so they are
# passed to error() as features here, exactly as before - confirm the trees ignore them.
CandidateFeatures = df_Candidate.loc[:, df_Candidate.columns != "Y"]
CandidateResponse = df_Candidate["Y"]

# A dedicated loop variable keeps the iteration counter `i` from being overwritten.
for TreeIndex in range(Model.get_tree_count()):
    print(Model[TreeIndex].error(CandidateFeatures, CandidateResponse))

0.07744874715261962
0.07744874715261962
0.07744874715261962
0.07744874715261962
0.07744874715261962
0.07630979498861046
0.07630979498861046
0.06036446469248291
0.06036446469248291
0.07630979498861046
0.07630979498861046
0.07630979498861046
0.07630979498861046
0.07630979498861046
0.07630979498861046
0.06036446469248291
0.06036446469248291
0.046697038724373585
0.046697038724373585
0.0694760820045558
0.0694760820045558
0.06150341685649208
0.06150341685649208
0.046697038724373585
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.046697038724373585
0.0694760820045558
0.059225512528473856
0.059225512528473856
0.06150341685649208
0.06150341685649208
0.059225512528473856
0.059225512528473856

In [11]:
### Test Error ###
TestErrorOutput = TestErrorFunction(
    InputModel=Model,
    df_Test=SimulationConfigInputUpdated["df_Test"],
    Type=SimulationConfigInputUpdated["Type"],
)
# Rashomon (TREEFARMS) models are read from "Error_Duplicate"; every other model from "ErrorVal".
CurrentError = TestErrorOutput["Error_Duplicate" if 'TREEFARMS' in str(type(Model)) else "ErrorVal"]
ErrorVec.append(CurrentError)


In [12]:
### Sampling Procedure ###
# Resolve the selector function from the notebook namespace by its configured name.
SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"], None)
if not callable(SelectorType):
    # Fail here with a clear message; otherwise a misspelled name surfaces as an opaque error below.
    raise NameError("SelectorType '" + str(SimulationConfigInputUpdated["SelectorType"]) + "' is not a callable defined in this notebook's namespace.")

# FilterArguments presumably keeps only the configuration entries that SelectorType accepts - TODO confirm.
SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)
SelectorFuncOutput = SelectorType(**SelectorArgsFiltered)

# Look up the recommended row(s) in the candidate set and record what was queried.
QueryObservationIndex = SelectorFuncOutput["IndexRecommendation"]
QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[QueryObservationIndex]
SelectedObservationHistory.append(QueryObservationIndex)


df_Candidate obs: 878


In [13]:
### Update Train and Candidate Sets ###
# Append the queried rows to the training set, then discard the two candidate-only helper columns.
SimulationConfigInputUpdated["df_Train"] = pd.concat(
    [SimulationConfigInputUpdated["df_Train"], QueryObservation]
).drop(['ClusterLabels', 'd_nX'], axis=1)
# Remove the queried rows from the candidate set.
SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(index=QueryObservationIndex)

In [14]:
### Store Number of (Unique) Trees ###
# Only Rashomon (TREEFARMS) models have tree counts to record: first all trees, then unique/duplicate trees.
if 'TREEFARMS' in str(type(Model)):
    for CountKey in ("AllTreeCount", "UniqueTreeCount"):
        TreeCount[CountKey].append(SelectorFuncOutput[CountKey])


---
---

In [32]:
# Error of the first tree on the candidate set currently stored in SimulationConfigInputUpdated.
Model[0].error(
    SimulationConfigInputUpdated["df_Candidate"].loc[:, SimulationConfigInputUpdated["df_Candidate"].columns != "Y"],
    SimulationConfigInputUpdated["df_Candidate"]["Y"],
)

0.07603686635944695

In [None]:
# Display the configured Rashomon threshold for comparison with the error ranges computed nearby.
SimulationConfigInputUpdated["RashomonThreshold"]

0.015

In [53]:
# Range (max - min) of the per-tree errors in Error1Vec.
# Error1Vec is filled by the per-tree comparison loop in this notebook; run that loop first.
np.ptp(Error1Vec)

np.float64(0.07403189066059224)

In [54]:
# Range (max - min) of the per-tree errors in Error2Vec.
# Error2Vec is filled by the per-tree comparison loop in this notebook; run that loop first.
np.ptp(Error2Vec)

np.float64(0.06566820276497698)

In [None]:
### Per-Tree Error: df_Candidate vs. the Candidate Set in SimulationConfigInputUpdated ###
Error1Vec = []        # Error of each tree on df_Candidate
Error2Vec = []        # Error of each tree on SimulationConfigInputUpdated["df_Candidate"]
ErrorDiffVec = []     # Error2 - Error1 for each tree

# Both candidate sets are fixed for the whole loop, so slice features/response once.
# NOTE(review): both frames still carry the "ClusterLabels" and "d_nX" helper columns, which are
# therefore passed to error() as features, exactly as before - confirm the trees ignore them.
CandidateFeatures1 = df_Candidate.loc[:, df_Candidate.columns != "Y"]
CandidateResponse1 = df_Candidate["Y"]
CandidateFeatures2 = SimulationConfigInputUpdated["df_Candidate"].loc[:, SimulationConfigInputUpdated["df_Candidate"].columns != "Y"]
CandidateResponse2 = SimulationConfigInputUpdated["df_Candidate"]["Y"]

# A dedicated loop variable keeps the iteration counter `i` from being overwritten.
for TreeIndex in range(Model.get_tree_count()):
    Error1 = Model[TreeIndex].error(CandidateFeatures1, CandidateResponse1)
    Error2 = Model[TreeIndex].error(CandidateFeatures2, CandidateResponse2)
    ErrorDiff = Error2 - Error1
    Error1Vec.append(Error1)
    Error2Vec.append(Error2)
    ErrorDiffVec.append(ErrorDiff)
    print(Error1, Error2, ErrorDiff)

0.07744874715261962 0.07603686635944695 -0.0014118807931726707
0.07744874715261962 0.07603686635944695 -0.0014118807931726707
0.07744874715261962 0.07603686635944695 -0.0014118807931726707
0.07744874715261962 0.07603686635944695 -0.0014118807931726707
0.07744874715261962 0.07603686635944695 -0.0014118807931726707
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.06036446469248291 0.06105990783410142 0.0006954431416185125
0.06036446469248291 0.06105990783410142 0.0006954431416185125
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.07630979498861046 0.07488479262672809 -0.001425002361882366
0.06036446469248291 0.06105990783410142 0.0006954431416185125
0.0

---
---

# Save

In [None]:
### Return Simulation Parameters ###
# Record the run's configuration as strings, in a fixed key order.
SimulationParameters = {
    ParameterName: str(SimulationConfigInput[ParameterName])
    for ParameterName in (
        "DataFileInput",
        "Seed",
        "TestProportion",
        "CandidateProportion",
        "SelectorType",
        "ModelType",
        "UniqueErrorsInput",
        "n_estimators",
        "regularization",
        "RashomonThreshold",
    )
}
# Read "Type" from the configuration instead of hard-coding it, so other task types are recorded
# correctly. The previous literal is kept as the fallback when the key is absent.
SimulationParameters["Type"] = str(SimulationConfigInput.get("Type", "Classification"))
# NOTE(review): "RashomonThresholdType", "DiversityWeight" and "BatchSize" are part of the
# configuration but are not recorded here - confirm whether they should be.

In [None]:
### Return Time ###
ElapsedTime = time.time() - StartTime

### Return Dictionary ###
# Each entry of SelectedObservationHistory is whatever the selector returned for one iteration.
# When that is a batch of several indices, passing the nested list with a single column name makes
# the DataFrame constructor fail on a column-count mismatch. Batches are therefore stored as one
# list per row (one row per iteration); scalar entries are kept unchanged.
SelectionHistoryRows = [
    list(Selection) if pd.api.types.is_list_like(Selection) else Selection
    for Selection in SelectedObservationHistory
]
# NOTE(review): TreeCount is collected during the learning procedure but is not part of the results.
SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
                            "SelectionHistory" : pd.DataFrame({"ObservationID": SelectionHistoryRows}),
                            "SimulationParameters" : SimulationParameters,
                            "ElapsedTime" : ElapsedTime}