In [1]:
### Import Packages ###
import ast
import math as math
import os
import random as random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

### Local Packages ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *

In [2]:
### DataType ###
# Dataset name: passed to LoadData and used to pick the matching parameter-vector CSV.
DataFileInput = "COMPAS"

### Get Directory ###
# The "Data" folder is looked up under the parent of the kernel's working directory.
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, os.pardir))

# One Iteration

## Inputs

In [3]:
# myscript_Seed0_DataCOMPAS_TP0.7_CP0.8_STRashomonQBC_MTTreeFarms_1643306.err
# myscript_Seed0_DataCOMPAS_TP0.7_CP0.8_STPassiveLearning_MTTreeFarms_1643305.err

In [4]:
# TopCModels = 50
# Seed = 1
# TestProportion = 0.99
# CandidateProportion = 0.7
# SelectorType = RashomonQBCFunction
# ModelType = TreeFarmsFunction
# DataArgs = {}
# SelectorArgs = {"TopCModels": 10}
# ModelArgs = {"TopCModels": TopCModels, "config" : {"regularization": 0.01, "rashomon_bound_multiplier": 0.05}, "Type":"Classification"}


In [5]:
### Parameter Vector ###
# `k` picks which row of the parameter grid this notebook replays.
k = 2
ParameterVectorFile = "ParameterVector" + DataFileInput + ".csv"
ParameterVector = pd.read_csv(os.path.join(ParentDirectory, "Data", "ParameterVectors", ParameterVectorFile))
ParameterRow = ParameterVector.iloc[k]

# Scalar settings for this run.
Seed = int(ParameterRow["Seed"])
TestProportion = ParameterRow["TestProportion"]
CandidateProportion = ParameterRow["CandidateProportion"]

# Selector and model are stored by name and resolved against the notebook namespace.
# NOTE: a name that is not defined in the namespace resolves to None rather than raising here.
SelectorType = globals().get(ParameterRow["SelectorType"], None)
ModelType = globals().get(ParameterRow["ModelType"], None)

# Argument columns are parsed with two rounds of literal_eval; for the selector and
# model arguments every square bracket is stripped from the text first.
# NOTE(review): presumably the CSV cell holds a quoted string whose content is itself
# a dict literal — confirm against the file that writes the parameter vector.
DataArgs = ast.literal_eval(ast.literal_eval(ParameterRow["DataArgs"]))
SelectorArgsText = ParameterRow["SelectorArgs"].replace("[","").replace("]","")
SelectorArgs = ast.literal_eval(ast.literal_eval(SelectorArgsText))
ModelArgsText = ParameterRow["ModelArgs"].replace("[","").replace("]","")
ModelArgs = ast.literal_eval(ast.literal_eval(ModelArgsText))


# Set Up

In [6]:
### Run Time ###
# Wall-clock start; the save section subtracts this to get the elapsed time.
StartTime = time.time()

### Set Up ###
# Seed the stdlib generator first, then NumPy's global generator, with the same seed.
for SeedFunction in (random.seed, np.random.seed):
    SeedFunction(Seed)

# Bookkeeping containers: one error entry per iteration, one entry per queried observation.
ErrorVec, SelectedObservationHistory = [], []

In [7]:
### Generate Data ###
# Named datasets go through LoadData; the "Simulate" option calls DataGeneratingProcess with DataArgs.
if DataFileInput != "Simulate":
    df = LoadData(DataFileInput)
else:
    # NOTE: open question from the author — why is this not already provided by `from utils.Main import *`?
    from utils.Main import DataGeneratingProcess
    df = DataGeneratingProcess(**DataArgs)

In [8]:
### Train Test Candidate Split ###
# NOTE: open question from the author — why is this not already provided by `from utils.Main import *`?
from utils.Main import TrainTestCandidateSplit

# The splitter returns three frames, in this order: train, test, candidate.
SplitFrames = TrainTestCandidateSplit(df, TestProportion, CandidateProportion)
df_Train, df_Test, df_Candidate = SplitFrames

In [9]:
### Selector Arguments ###
# NOTE(review): the selector's "df_Train" entry is seeded with df_Test here, while the
# "Update SelectorArgs and ModelArgs" cell later overwrites it with df_Train — confirm
# which of the two the selector is meant to receive.
SelectorArgs.update({
    "df_Train": df_Test,             # NOTE: Change to df_Test if there is a test set
    "df_Candidate": df_Candidate,
    "Model": "",                     # placeholder; overwritten with the fitted model in the Prediction Model cell
})
# SelectorArgsFiltered = FilterArguments(SelectorType, SelectorArgs)

In [10]:
### Model Arguments ###
# Hand the current training frame to the model under the "df_Train" key.
ModelArgs.update(df_Train=df_Train)
# ModelArgsFiltered = FilterArguments(ModelType, ModelArgs)

# Learning Procedure

In [35]:
### Set Up ###
# Reset the loop bookkeeping: iteration counter, per-iteration errors, queried observations.
i = 1
ErrorVec, SelectedObservationHistory = [], []

In [36]:
### Prediction Model ###
print(f"Iteration: {i!s}")

# Build the keyword arguments for the model constructor, then construct the model.
ModelArgsFiltered = FilterArguments(ModelType, ModelArgs)
Model = ModelType(**ModelArgsFiltered)

# Give the selector access to the freshly built model, but only if it has a "Model" slot.
# NOTE: THIS IS NOT DYNAMIC
if "Model" in SelectorArgs:
    SelectorArgs["Model"] = Model

Iteration: 1
null
treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 627
Finding Optimal Objective...
{
  "false": {
    "false": {
      "complexity": 0.009999999776482582,
      "loss": 0.12289156764745712,
      "name": "Y",
      "prediction": 0
    },
    "feature": 3,
    "name": "age:<26",
    "reference": 1,
    "relation": "==",
    "true": {
      "false": {
        "complexity": 0.009999999776482582,
        "loss": 0.04819277301430702,
        "name": "Y",
        "prediction": 1
      },
      "feature": 8,
      "name": "priors:=0",
      "reference": 1,
      "relation": "==",
      "true": {
        "complexity": 0.009999999776482582,
        "loss": 0.04819277301430702,
        "name": "Y",
        "prediction": 0
      },
      "type": "integral"
    },
    "type": "integral"
  },
  "feature": 11,
  "model_objective": 0.3508433699607849,
  "name": "priors:>3",
  "reference": 1,
  "relation": "==",
  "true": {
    "complexi

In [37]:
### Current Error ###
# NOTE: Change to df_Test if there is a test set
TestErrorVal = TestErrorFunction(Model, df_Test, ModelArgs["Type"])

# NOTE(review): len() needs a sized return value (list/array/Series); a bare float would
# raise TypeError here — confirm what TestErrorFunction returns for non-Rashomon models.
if len(TestErrorVal) <= 1:
    # One output for non-Rashomon
    CurrentError = TestErrorVal
else:
    # Rashomon gives all errors of Rashomon
    AllErrors = TestErrorVal
    # Extract the best one
    CurrentError = float(np.min(AllErrors))
    # Use AllErrors in RashomonQBC
    SelectorArgs["AllErrors"] = AllErrors

ErrorVec.append(CurrentError)

In [38]:
### Sampling Procedure ###
# Build the keyword arguments that are unpacked into the SelectorType call in the next cell.
# FilterArguments is not defined in this notebook (presumably it comes from the utils wildcard imports).
SelectorArgsFiltered = FilterArguments(SelectorType, SelectorArgs)

In [39]:
# Run the selector; its return value is used below as a row label of df_Candidate.
QueryObservationIndex = SelectorType(**SelectorArgsFiltered)

In [40]:
# Show which candidate the selector picked.
print(f"QueryObservationIndex: {QueryObservationIndex!s}")

QueryObservationIndex: 3999


In [41]:
# Label-based lookup: the same QueryObservationIndex is later passed to df_Candidate.drop(...)
# and tested against .index, both of which work on labels, so .loc is the consistent choice.
# The double brackets keep the result a one-row DataFrame so it can be concatenated onto df_Train.
# NOTE(review): assumes the selector returns an index label rather than a position — confirm.
QueryObservation = df_Candidate.loc[[QueryObservationIndex]] # .loc (label-based), not .iloc

In [42]:
# Display the selected candidate row.
QueryObservation

Unnamed: 0,Y,sex:Female,age:<21,age:<23,age:<26,age:<46,juvenile-felonies:=0,juvenile-misdemeanors:=0,juvenile-crimes:=0,priors:=0,priors:=1,priors:2-3,priors:>3
3999,1,0,1,1,1,1,1,0,0,0,0,0,1


In [43]:
### Update Train and Candidate Sets ###
# Remove the queried observation from the candidate pool first: drop raises KeyError for a
# label that is not present, so a stale index (or an accidental re-run of this cell) fails
# here before df_Train or the selection history are modified.
df_Candidate = df_Candidate.drop(QueryObservationIndex)

# Add the queried observation to the training set.
df_Train = pd.concat([df_Train, QueryObservation])

# Record the query. Without this the "SelectionHistory" frame built in the Save section
# is always empty, because nothing else appends to SelectedObservationHistory.
SelectedObservationHistory.append(QueryObservationIndex)

In [44]:
# Sanity check: the queried label should no longer be in the candidate pool (expected: False).
QueryObservationIndex in df_Candidate.index

False

In [45]:
# Sanity check: the queried label should now be part of the training set (expected: True).
QueryObservationIndex in df_Train.index

True

In [46]:
### Update SelectorArgs and ModelArgs ###
# NOTE: THIS IS NOT DYNAMIC
# Point the argument dictionaries at the updated frames; keys that are absent are left absent.
if "df_Train" in ModelArgs:
    ModelArgs["df_Train"] = df_Train

for ArgName, ArgValue in (("df_Train", df_Train), ("df_Candidate", df_Candidate)):
    if ArgName in SelectorArgs:
        SelectorArgs[ArgName] = ArgValue

# Save

In [None]:
### Return Simulation Parameters ###
# Report the model arguments without the training frame. dict.pop returns the removed
# value, so stringifying the result of .pop('df_Train', None) would store the training
# DataFrame itself instead of the remaining arguments.
ModelArgsReported = {ArgName: ArgValue
                     for ArgName, ArgValue in FilterArguments(ModelType, ModelArgs).items()
                     if ArgName != "df_Train"}

# Everything is stored as a string so the parameters can be saved alongside the results.
SimulationParameters = {"DataFileInput" : str(DataFileInput),
                        "Seed" : str(Seed),
                        "TestProportion" : str(TestProportion),
                        "CandidateProportion" : str(CandidateProportion),
                        "SelectorType" : str(SelectorType),
                        "ModelType" : str(ModelType),
                        "DataArgs" : str(DataArgs),
                        # "SelectorArgs" : str(SelectorArgs),
                        "ModelArgs" : str(ModelArgsReported)
                        }

In [None]:
### Return Time ###
# Seconds elapsed since StartTime was taken in the Set Up section.
ElapsedTime = time.time() - StartTime

### Return Dictionary ###
# One row per recorded error / queried observation.
ErrorFrame = pd.DataFrame(ErrorVec, columns=["Error"])
SelectionFrame = pd.DataFrame(SelectedObservationHistory, columns=["ObservationID"])

SimulationResults = {
    "ErrorVec": ErrorFrame,
    "SelectionHistory": SelectionFrame,
    "SimulationParameters": SimulationParameters,
    "ElapsedTime": ElapsedTime,
}