In [1]:
### Import packages ###
import math as math
import os
import random as random
import time

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.cluster import AgglomerativeClustering

### Append Path ###
import sys
sys.path.append('..')

### Import functions ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *
# Explicit import placed after the wildcard imports so that this binding is the
# one the notebook uses for the train/test/candidate split.
from utils.Main import TrainTestCandidateSplit

# Inputs

In [2]:
### Resolve the directory two levels above the working directory ###
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir, os.pardir))

### Dataset to run ###
# Other dataset names noted by the author:
# BankNote  Bar7  BreastCancer  CarEvaluation  COMPAS  FICO  Haberman  Iris  MONK1  MONK3
DataFileInput = "mpg"

In [3]:
### Parameter Vector ###
# ParameterVector = pd.read_csv(os.path.join(ParentDirectory, "Data", "ParameterVectors", "ParameterVector" + DataFileInput + ".csv"))

# ### Parameter Vector ###
# for SimV in range(0,len(ParameterVector)):
#     SimulationConfigInput = {"DataFileInput": ParameterVector.iloc[SimV]["Data"],
#                             "Seed": int(ParameterVector.iloc[SimV]["Seed"]),
#                             "TestProportion": float(ParameterVector.iloc[SimV]["TestProportion"]),
#                             "CandidateProportion": float(ParameterVector.iloc[SimV]["CandidateProportion"]),
#                             "SelectorType": str(ParameterVector.iloc[SimV]["SelectorType"]), 
#                             "ModelType": str(ParameterVector.iloc[SimV]["ModelType"]), 
#                             "UniqueErrorsInput": int(ParameterVector.iloc[SimV]["UniqueErrorsInput"]),
#                             "n_estimators":int(ParameterVector.iloc[SimV]["n_estimators"]),
#                             "regularization":float(ParameterVector.iloc[SimV]["regularization"]),
#                             "RashomonThresholdType":ParameterVector.iloc[SimV]["RashomonThresholdType"],
#                             "RashomonThreshold":float(ParameterVector.iloc[SimV]["RashomonThreshold"]),
#                             "Type":ParameterVector.iloc[SimV]["Type"]
#                             }

# Single hand-written simulation configuration (used instead of the
# parameter-vector CSV that is commented out above).
SimulationConfigInput = {
    # Data and split
    "DataFileInput": DataFileInput,
    "Seed": 1,
    "TestProportion": 0.2,
    "CandidateProportion": 0.8,
    # Names of the selector / model functions; they are looked up by name later.
    "SelectorType": "iGSFunction",            # also noted: BatchQBCDiversityFunction, PassiveLearning
    "ModelType": "RidgeRegressionFunction",   # also noted: RandomForestClassificationFunction, TreeFarmsFunction
    # Model and selector settings
    "UniqueErrorsInput": 0,
    "n_estimators": 100,
    "regularization": 0.01,
    "RashomonThresholdType": "Adder",
    "RashomonThreshold": 0.0,
    "Type": "Regression",
    # Batch active-learning settings
    "DiversityWeight": 0,
    "DensityWeight": 0,
    "BatchSize": 1,
}


In [4]:
### Set Up ###
StartTime = time.time()
# Seed both the stdlib and NumPy generators from the configured seed.
random.seed(SimulationConfigInput["Seed"])
np.random.seed(SimulationConfigInput["Seed"])

### Load Data ###
df = LoadData(SimulationConfigInput["DataFileInput"])

### Train Test Candidate Split ###
# TrainTestCandidateSplit is imported explicitly with the other dependencies in
# the import cell, so no mid-notebook import is needed here.
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(
    df,
    SimulationConfigInput["TestProportion"],
    SimulationConfigInput["CandidateProportion"],
)

### Add Batch Active Learning Diversity Metrics ###
# Later cells read "DiversityScores" and "DensityScores" columns from the
# candidate frame; presumably this call is what adds them (confirm in utils).
df_Candidate = DiversityMetricsFunction(df_Candidate, df_Train, k=10)

### Update SimulationConfig Arguments ###
SimulationConfigInput['df_Train'] = df_Train
SimulationConfigInput["df_Test"] = df_Test
SimulationConfigInput["df_Candidate"] = df_Candidate


# Learning Procedure

In [5]:
### Input ###
# Work on a shallow copy: later cells add a 'Model' entry and replace the
# 'df_Train' / 'df_Candidate' entries on this dict. With a plain assignment both
# names point at the same dict, so those updates would also change
# SimulationConfigInput and re-running this cell would not reset anything.
SimulationConfigInputUpdated = SimulationConfigInput.copy()

In [6]:
### Result accumulators for the learning procedure ###
ErrorVec, SelectedObservationHistory = [], []
# One independent list per tree-count series.
TreeCount = {count_name: [] for count_name in ("AllTreeCount", "UniqueTreeCount")}

In [7]:
# Iteration counter; the cells below walk through a single iteration by hand.
i = 0

In [8]:
### Prediction Model ###
print(f"Iteration: {i}")
# The model function is resolved by name from the notebook namespace.
ModelType = globals().get(SimulationConfigInputUpdated["ModelType"], None)
# Fail here with a clear message when the configured name is misspelled or not
# imported, instead of failing later on a None object.
if not callable(ModelType):
    raise ValueError("Unknown ModelType " + repr(SimulationConfigInputUpdated["ModelType"])
                     + ": no callable with that name is available in the notebook namespace.")
# Presumably FilterArguments keeps only the config entries that ModelType
# accepts as parameters (confirm in utils).
ModelArgsFiltered = FilterArguments(ModelType, SimulationConfigInputUpdated)
Model = ModelType(**ModelArgsFiltered)
# Stored on the config so later cells can read the model from it.
SimulationConfigInputUpdated['Model'] = Model

Iteration: 0


In [9]:
### Test Error ###
TestErrorOutput = TestErrorFunction(
    InputModel=Model,
    df_Test=SimulationConfigInputUpdated["df_Test"],
    Type=SimulationConfigInputUpdated["Type"],
)
# TREEFARMS (Rashomon) models are read from "Error_Duplicate"; every other
# model has a single output under "ErrorVal".
CurrentError = TestErrorOutput["Error_Duplicate" if 'TREEFARMS' in str(type(Model)) else "ErrorVal"]
ErrorVec.append(CurrentError)


In [10]:
### Sampling Procedure ###
# The selector function is resolved by name from the notebook namespace.
SelectorType = globals().get(SimulationConfigInputUpdated["SelectorType"], None)
# Fail here with a clear message when the configured name is misspelled or not
# imported, instead of failing later on a None object.
if not callable(SelectorType):
    raise ValueError("Unknown SelectorType " + repr(SimulationConfigInputUpdated["SelectorType"])
                     + ": no callable with that name is available in the notebook namespace.")
# Presumably FilterArguments keeps only the config entries that SelectorType
# accepts as parameters (confirm in utils).
SelectorArgsFiltered = FilterArguments(SelectorType, SimulationConfigInputUpdated)


---

In [11]:
# Unpack the selector arguments into plain names for the step-by-step cells below.
# NOTE(review): the cells between the '---' markers appear to replay the iGS
# selector by hand; confirm against utils.Selector.
df_Train, df_Candidate, Model = (
    SelectorArgsFiltered[arg_name] for arg_name in ("df_Train", "df_Candidate", "Model")
)
distance = "euclidean"

In [None]:

### Candidate feature matrix ###
# Remove the response and the two batch-metric columns from the candidate frame.
columns_to_remove = ["Y", "DiversityScores", "DensityScores"]
X_Candidate = df_Candidate.drop(columns_to_remove, axis=1)


In [13]:

### GSx ###
# cdist pairs columns by position, so select the training features by name in
# exactly the candidate matrix's column order. A different column order in
# df_Train would otherwise give wrong distances without any error.
d_nmX = cdist(X_Candidate, df_Train[X_Candidate.columns], metric=distance)
# For each candidate row, the distance to its nearest training row.
d_nX = d_nmX.min(axis=1)


In [14]:

### GSy ###
## Model predictions for every candidate row ##
Predictions = Model.predict(X_Candidate)


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [16]:
# Distance from each candidate's prediction to every observed training response,
# reduced to the nearest one per candidate.
d_nmY = cdist(
    Predictions.reshape(-1, 1),
    df_Train["Y"].values.reshape(-1, 1),
    metric=distance,
)
d_nY = np.min(d_nmY, axis=1)

### iGS ###
# Combine the input-space and output-space distances and take the largest.
d_nXY = d_nX * d_nY
MaxRowNumber = np.argmax(d_nXY)
# Translate the row position into the candidate frame's index label.
IndexRecommendation = df_Candidate.index[MaxRowNumber]

### Output ###
Output = {"IndexRecommendation": [float(IndexRecommendation)]}

---

In [None]:
### Query the selector and record its recommendation ###
SelectorFuncOutput = SelectorType(**SelectorArgsFiltered)
QueryObservationIndex = SelectorFuncOutput["IndexRecommendation"]
# Pull the recommended row(s) out of the current candidate set.
QueryObservation = SimulationConfigInputUpdated["df_Candidate"].loc[QueryObservationIndex, :]
SelectedObservationHistory += [QueryObservationIndex]

In [19]:
### Update Train and Candidate Sets ###
# Append the queried observation to the training set, then remove the two
# batch-metric columns from the combined frame.
SimulationConfigInputUpdated["df_Train"] = pd.concat(
    [SimulationConfigInputUpdated["df_Train"], QueryObservation]
).drop(['DiversityScores', 'DensityScores'], axis=1)
# Remove the queried observation from the candidate set.
SimulationConfigInputUpdated["df_Candidate"] = SimulationConfigInputUpdated["df_Candidate"].drop(
    index=QueryObservationIndex
)

In [20]:
### Store Number of (Unique) Trees ###
# Only TREEFARMS models are handled here; other model types skip this step.
if 'TREEFARMS' in str(type(Model)):
    # Total tree count first, then the unique/duplicate tree count.
    for count_name in ("AllTreeCount", "UniqueTreeCount"):
        TreeCount[count_name].append(SelectorFuncOutput[count_name])


# Save

In [14]:
# ### Return Simulation Parameters ###
# SimulationParameters = {"DataFileInput" : str(SimulationConfigInput["DataFileInput"]),
#                             "Seed" : str(SimulationConfigInput["Seed"]),
#                             "TestProportion" : str(SimulationConfigInput["TestProportion"]),
#                             "CandidateProportion" : str(SimulationConfigInput["CandidateProportion"]),
#                             "SelectorType" :  str(SimulationConfigInput["SelectorType"]),
#                             "ModelType" :  str(SimulationConfigInput["ModelType"]),
#                             'UniqueErrorsInput': str(SimulationConfigInput["UniqueErrorsInput"]),
#                             'n_estimators': str(SimulationConfigInput["n_estimators"]),
#                             'regularization': str(SimulationConfigInput["regularization"]),
#                             'RashomonThreshold': str(SimulationConfigInput["RashomonThreshold"]),
#                             'Type': str(SimulationConfigInput["Type"]),
#                             }

In [15]:
# ### Return Time ###
# ElapsedTime = time.time() - StartTime

# ### Return Dictionary ###
# SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
#                             "SelectionHistory" : pd.DataFrame(SelectedObservationHistory, columns = ["ObservationID"]),
#                             "SimulationParameters" : SimulationParameters,
#                             "ElapsedTime" : ElapsedTime}