In [1]:
### Import Packages ###
import os
import ast
import numpy as np
import math as math
import pandas as pd
import random as random
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist

### Local Packages ###
from utils.Main import *
from utils.Selector import *
from utils.Auxiliary import *
from utils.Prediction import *

In [2]:
### Get Directory ###
cwd = os.getcwd()
ParentDirectory = os.path.abspath(os.path.join(cwd, ".."))

### DataType ###
DataTypeInput = "COMPAS"

# One Iteration

## Inputs

In [3]:
i=1
DataFileInput = "COMPAS"
TopCModels = 100
Seed = 1
TestProportion = 0.2
CandidateProportion = 0.8
SelectorType = "RashomonQBCFunction"
ModelType = TreeFarmsFunction
DataArgs = {}
ModelArgs = {"TopCModels": TopCModels, "config" : {"regularization": 0.01, "rashomon_bound_multiplier": 0.03}, "Type":"Classification"}


## One Iteration Function

### Set Up

In [4]:
### Run Time ###
StartTime = time.time()

### Set Up ###
random.seed(Seed)
np.random.seed(Seed)
ErrorVec = []
SelectedObservationHistory = []

### Load Data

In [5]:
from utils.Main import DataGeneratingProcess                             ### NOTE: Why is this not imported from utils.Main import *
### Generate Data ###
if(DataFileInput == "Simulate"):
    df = DataGeneratingProcess(**DataArgs)
else:
    df = LoadData(DataFileInput)

### Train Test Split

In [6]:
### Train Test Candidate Split
from utils.Main import TrainTestCandidateSplit                           ### NOTE: Why is this not imported from utils.Main import *
df_Train, df_Test, df_Candidate = TrainTestCandidateSplit(df, TestProportion, CandidateProportion)

### Argument Processing

### Learning Process

In [7]:
### Set Up ###
ErrorVec = []
SelectedObservationHistory = []


In [8]:
i=1

In [9]:
# SelectorArgs = {"Model" : Model, "df_Candidate" : df_Candidate, "df_Train" : df_Train, "TopCModels" : TopCModels},

# ### Selector Arguments ###
# SelectorArgs["df_Train"] = df_Test
# SelectorArgs["df_Candidate"] = df_Candidate
# SelectorArgs["Model"] = ModelType
# SelectorArgsFiltered = FilterArguments(SelectorType, SelectorArgs)


In [10]:
### Model Arguments ###
ModelArgs['df_Train'] = df_Train
ModelArgsFiltered = FilterArguments(ModelType, ModelArgs)

In [11]:
# Summary: Initializes and fits a treefarms model.
# Input:
#   df_Train: The training data.
#   TopCModels: TopCModels top models
#   config:
# Output:
# treeFarmsModel: A treefarms model.

### Libraries ###
from treeFarms.treefarms import TREEFARMS

# import sys
# sys.path.append('/Users/simondn/Documents/RashomonActiveLearning/Code')
# from treeFarms.treefarms.model.threshold_guess import compute_thresholds
# from utils.Prediction.TreeFARMS import <function_or_class_name>

### Function ###
def TreeFarmsFunction(df_Train, config, TopCModels):
   ### Train TreeFarms Model ###
    TreeFarmsModel = TREEFARMS("config")
    TreeFarmsModel.fit(df_Train.loc[:, df_Train.columns != "Y"], df_Train["Y"])
    
    ### Return ###
    return TreeFarmsModel


# NOTE: Is there a way to prune the tree such that only the top models are given back? Look into this


In [None]:
TreeFarmsModel = TREEFARMS("config")
TreeFarmsModel.fit(df_Train.loc[:, df_Train.columns != "Y"], df_Train["Y"])

: 

In [None]:
TreeFarmsFunction(df_Train = df_Train, config = ModelArgs["config"], TopCModels= ModelArgs["TopCModels"])

: 

In [None]:
TreeFarmsFunction(**ModelArgsFiltered)

: 

In [None]:
### Prediction Model ###
Model = ModelType(**ModelArgsFiltered)
if "Model" in SelectorArgs.keys(): SelectorArgs['Model'] = Model            # NOTE: THIS IS NOT DYNAMIC

In [14]:
### Current Error ###
CurrentError = TestErrorFunction(Model, df_Test, ModelArgs)
ErrorVec.append(CurrentError)

In [None]:

# RashomonQBCSelector #

def RashomonQBCSelector(TreeFarmsModel, df_Candidate, df_Train, SelectorArgs):

    # ### GSx ### Incorporate? Good for tie breakers.
    # d_nmX = cdist(df_Candidate.loc[:,df_Candidate.columns!= "Y"], df_Train.loc[:,df_Train.columns!= "Y"], metric = distance)
    # d_nX = d_nmX.min(axis=1)

    ### Extract Errors ###
    AllErrors = [TreeFarmsModel[i].score(df_Candidate.loc[:, df_Candidate.columns != "Y"], df_Candidate["Y"]) for i in range(TreeFarmsModel.get_tree_count())]
    HighestAccuracyIndices = np.argsort(AllErrors)[::-1][0:SelectorArgs["TopCModels"]]

    ### Prediction ###
    PredictedValues = [TreeFarmsModel[i].predict(df_Candidate) for i in HighestAccuracyIndices]
    RashomonMean = np.array(PredictedValues).mean(axis =0)

    ### Uncertainty Metric ###
    df_Candidate["UncertaintyMetric"] = np.sort(abs(RashomonMean - 0.5))
    IndexRecommendation = df_Candidate.sort_values(by = "UncertaintyMetric", ascending = True).index[0]

    return(IndexRecommendation)



In [18]:
RashomonQBCSelector(Model, df_Candidate, df_Train, SelectorArgs)

np.int64(4055)

In [None]:

### Sampling Procedure ###
QueryObservationIndex = SelectorType(**SelectorArgs)
QueryObservation = df_Candidate.loc[[QueryObservationIndex]] # or should this be iloc
SelectedObservationHistory.append(QueryObservationIndex)

### Update Train and Candidate Sets ###
df_Train = pd.concat([df_Train, QueryObservation])
df_Candidate = df_Candidate.drop(QueryObservationIndex)

### Update SelectorArgs and ModelArgs ###                                     # NOTE: THIS IS NOT DYNAMIC
if "df_Train" in ModelArgs.keys(): ModelArgs['df_Train'] = df_Train
if "df_Train" in SelectorArgs.keys(): SelectorArgs['df_Train'] = df_Train
if "df_Candidate" in SelectorArgs.keys(): SelectorArgs['df_Candidate'] = df_Candidate      

### Return

In [None]:

SimulationParameters = {"DataFileInput" : str(DataFileInput),
                        "Seed" : str(Seed),
                        "TestProportion" : str(TestProportion),
                        "CandidateProportion" : str(CandidateProportion),
                        "SelectorType" : str(SelectorType),
                        "ModelType" : str(ModelType),
                        "DataArgs" : str(DataArgs),
                        # "SelectorArgs" : str(SelectorArgs),
                        "ModelArgs" : str(ModelArgsFiltered.pop('df_Train', None))
                        }

ElapsedTime = time.time() - StartTime

### Return Dictionary ###
SimulationResults = {"ErrorVec" : pd.DataFrame(ErrorVec, columns =["Error"]),
                            "SelectionHistory" : pd.DataFrame(SelectedObservationHistory, columns = ["ObservationID"]),
                            "SimulationParameters" : SimulationParameters,
                            "ElapsedTime" : ElapsedTime}