In [1]:
### Import packages ###
import time
import numpy as np
import math as math
import pandas as pd
import random as random
from sklearn.cluster import AgglomerativeClustering

### Append Path ###
import sys
sys.path.append('..')

### Import functions ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *


# Inputs

In [2]:
### Get Directory ###
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, "..",".."))

### DataType ###
DataFileInput = "BankNote"  # BankNote  Bar7  BreastCancer  CarEvaluation  COMPAS  FICO  Haberman  Iris  MONK1  MONK3

In [3]:
### Parameter Vector ###
# ParameterVector = pd.read_csv(os.path.join(ParentDirectory, "Data", "ParameterVectors", "ParameterVector" + DataFileInput + ".csv"))

# ### Parameter Vector ###
# for SimV in range(0,len(ParameterVector)):
#     SimulationConfigInput = {"DataFileInput": ParameterVector.iloc[SimV]["Data"],
#                             "Seed": int(ParameterVector.iloc[SimV]["Seed"]),
#                             "TestProportion": float(ParameterVector.iloc[SimV]["TestProportion"]),
#                             "CandidateProportion": float(ParameterVector.iloc[SimV]["CandidateProportion"]),
#                             "SelectorType": str(ParameterVector.iloc[SimV]["SelectorType"]), 
#                             "ModelType": str(ParameterVector.iloc[SimV]["ModelType"]), 
#                             "UniqueErrorsInput": int(ParameterVector.iloc[SimV]["UniqueErrorsInput"]),
#                             "n_estimators":int(ParameterVector.iloc[SimV]["n_estimators"]),
#                             "regularization":float(ParameterVector.iloc[SimV]["regularization"]),
#                             "RashomonThresholdType":ParameterVector.iloc[SimV]["RashomonThresholdType"],
#                             "RashomonThreshold":float(ParameterVector.iloc[SimV]["RashomonThreshold"]),
#                             "Type":ParameterVector.iloc[SimV]["Type"]
#                             }

SimulationConfigInput = {'DataFileInput': DataFileInput,
    'Seed': 1,
    'TestProportion': 0.2,
    'CandidateProportion': 0.8,
    'SelectorType': 'BaldSelectorFunction', # BatchQBCDiversityFunction, PassiveLearning, BaldSelectorFunction
    'ModelType': 'BayesianNeuralNetworkFunction', # RandomForestClassificationFunction, TreeFarmsFunction, BayesianNeuralNetworkFunction
    'UniqueErrorsInput': 0,
    'n_estimators': 100,
    'regularization': 0.01,
    'RashomonThresholdType': "Adder",
    'RashomonThreshold': 0.015,
    'Type': 'Classification',
    "DiversityWeight" : 0.33,
    "DensityWeight" : 0.33,
    "BatchSize" : 10}


In [4]:
### Set Up ###
StartTime = time.time()
random.seed(SimulationConfigInput["Seed"])
np.random.seed(SimulationConfigInput["Seed"])

### Load Data ###
df = LoadData(SimulationConfigInput["DataFileInput"])

### Train Test Candidate Split ###
from utils.Main import TrainTestCandidateSplit                           ### NOTE: Why is this not imported from utils.Main import *
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(df, SimulationConfigInput["TestProportion"], SimulationConfigInput["CandidateProportion"])

### Add Batch Active Learning Metrics ###
df_Candidate = DiversityMetricsFunction(df_Candidate, df_Train, k=10)
SimulationConfigInput['auxiliary_data_cols'] = ['DiversityScores', 'DensityScores']

### Update SimulationConfig Arguments ###
SimulationConfigInput['df_Train'] = df_Train
SimulationConfigInput["df_Test"] = df_Test
SimulationConfigInput["df_Candidate"] = df_Candidate

# Learning Procedure

In [5]:
### Input ###
SimulationConfigInputUpdated = SimulationConfigInput

In [6]:
### Set Up ###
ErrorVec = []
SelectedObservationHistory = []
TreeCount = {"AllTreeCount": [], "UniqueTreeCount": []}

In [7]:
### Set Up ###
i=0


In [8]:
### Set Up Prediction Model ###
print("Iteration: " + str(i))
ModelType = globals().get(SimulationConfigInputUpdated["ModelType"], None)
ModelArgsFiltered = FilterArguments(ModelType, SimulationConfigInputUpdated)

X_train_df, y_train_series = get_features_and_target(
    df=SimulationConfigInputUpdated["df_Train"],
    target_column_name="Y",
    auxiliary_columns=SimulationConfigInputUpdated.get('auxiliary_data_cols', [])
)

Iteration: 0


In [9]:
### Train Prediction Model ###
if 'Seed' in ModelArgsFiltered:
    del ModelArgsFiltered['Seed']
Model = ModelType(X_train_df = X_train_df, 
                    y_train_series = y_train_series,
                    Seed=SimulationConfigInputUpdated["Seed"], 
                    **ModelArgsFiltered)
SimulationConfigInputUpdated['Model'] = Model

In [10]:
### Test Error ###
TestErrorOutput = TestErrorFunction(InputModel=Model,
                                    df_Test=SimulationConfigInputUpdated["df_Test"],
                                    Type=SimulationConfigInputUpdated["Type"],
                                    auxiliary_columns=SimulationConfigInputUpdated.get('auxiliary_data_cols', [])) # Pass aux cols
if('TREEFARMS' in str(type(Model))):                                                       # If Rashomon
    CurrentError = TestErrorOutput["Error_Duplicate"]
else: 
    CurrentError = TestErrorOutput["ErrorVal"]                                               # One output for non-Rashomon
ErrorVec.append(CurrentError)

In [11]:
### Sampling Procedure ###
SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"], None)
SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)
SelectorArgsFiltered['auxiliary_columns'] = SimulationConfigInputUpdated.get('auxiliary_data_cols', [])
SelectorFuncOutput = SelectorType(**SelectorArgsFiltered)


Entropy:   0%|          | 0/878 [00:00<?, ?it/s]

Conditional Entropy:   0%|          | 0/878 [00:00<?, ?it/s]

In [17]:
SelectorFuncOutput

CandidateBatch(scores=[125.44367995996261, 56.58643338959707, 53.51328277216227, 46.75540920021087, 27.327823616620442, 23.142658227601682, 19.329929359939975, 15.568346012088051, 11.943033220480551, 10.861574174472858], indices=[1300, 683, 1030, 743, 704, 1195, 15, 1252, 600, 574])

In [16]:
SelectorFuncOutput.indices

[1300, 683, 1030, 743, 704, 1195, 15, 1252, 600, 574]

In [15]:

QueryObservationIndex = SelectorFuncOutput["IndexRecommendation"]
QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[QueryObservationIndex]
SelectedObservationHistory.append(QueryObservationIndex)

TypeError: 'CandidateBatch' object is not subscriptable

In [19]:
### Update Train and Candidate Sets ###
SimulationConfigInputUpdated["df_Train"] = pd.concat([SimulationConfigInputUpdated["df_Train"], QueryObservation]).drop(columns=['DiversityScores', 'DensityScores'])
SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(QueryObservationIndex) 

In [20]:
### Store Number of (Unique) Trees ###
if('TREEFARMS' in str(type(Model))):
    TreeCount["AllTreeCount"].append(SelectorFuncOutput["AllTreeCount"])          # Store number of trees
    TreeCount["UniqueTreeCount"].append(SelectorFuncOutput["UniqueTreeCount"])    # Store number of unique/duplicate trees


# Save

In [14]:
# ### Return Simulation Parameters ###
# SimulationParameters = {"DataFileInput" : str(SimulationConfigInput["DataFileInput"]),
#                             "Seed" : str(SimulationConfigInput["Seed"]),
#                             "TestProportion" : str(SimulationConfigInput["TestProportion"]),
#                             "CandidateProportion" : str(SimulationConfigInput["CandidateProportion"]),
#                             "SelectorType" :  str(SimulationConfigInput["SelectorType"]),
#                             "ModelType" :  str(SimulationConfigInput["ModelType"]),
#                             'UniqueErrorsInput': str(SimulationConfigInput["UniqueErrorsInput"]),
#                             'n_estimators': str(SimulationConfigInput["n_estimators"]),
#                             'regularization': str(SimulationConfigInput["regularization"]),
#                             'RashomonThreshold': str(SimulationConfigInput["RashomonThreshold"]),
#                             'Type': 'Classification',
#                             }

In [15]:
# ### Return Time ###
# ElapsedTime = time.time() - StartTime

# ### Return Dictionary ###
# SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
#                             "SelectionHistory" : pd.DataFrame(SelectedObservationHistory, columns = ["ObservationID"]),
#                             "SimulationParameters" : SimulationParameters,
#                             "ElapsedTime" : ElapsedTime}