In [1]:
### Import packages ###
import itertools
import pandas as pd

# Input

In [2]:
# --- Data set to simulate and the abbreviation used when building job names --- #
Data = "BreastCancer"
JobNameAbbrev = "BC"

# --- Input directory (not referenced again in the visible cells) --- #
dir_path = "/Users/simondn/Documents/RashomonActiveLearning/Code/raw"

# --- Simulation grid: each key maps to the list of values swept for that parameter --- #
ParameterDictionary = {
    "Data": [Data],
    "Seed": list(range(99)),  # seeds 0 through 98 inclusive
    "TestProportion": [0.2],
    "CandidateProportion": [0.8],
    "SelectorType": ["TreeEnsembleQBCFunction"],
    "ModelType": ["TreeFarmsFunction"],
    "UniqueErrorsInput": [0, 1],
    "n_estimators": [100],
    "regularization": [0.01],
    "rashomon_bound_adder": [0.005, 0.007, 0.009, 0.011, 0.013, 0.017, 0.019, 0.021, 0.023],
    "Type": ["Classification"],
}

# --- Expand the grid: one row per element of the Cartesian product of all value lists --- #
# The lazy product iterator is handed to from_records unchanged so that the
# resulting frame is built exactly the way the records constructor builds it.
ParameterCombinations = itertools.product(*ParameterDictionary.values())
ParameterVector = pd.DataFrame.from_records(ParameterCombinations, columns=ParameterDictionary.keys())

# Include/exclude Random Forest Simulations

In [3]:
# ### Include Random Forest ###
# RandomForestParameterDictionary = {"Data":[Data],
#                        "Seed":list(range(0,100)),
#                        "TestProportion":[0.2],
#                        "CandidateProportion":[0.8],
#                        "SelectorType":["TreeEnsembleQBCFunction"],
#                        "ModelType":["RandomForestClassificationFunction"],
#                        "UniqueErrorsInput": [0],
#                        "n_estimators": [100], 
#                        "regularization": [0.00],
#                        "rashomon_bound_adder": [0],
#                        "Type": ["Classification"]}
# RandomForestParameterVector = pd.DataFrame.from_records(itertools.product(*RandomForestParameterDictionary.values()), columns=RandomForestParameterDictionary.keys())

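# # NOTE: Comment out this chunk to exclude the random forest simulations. ###
# # NOTE(review): this chunk sweeps seeds range(0,100) while the TreeFarms grid above uses range(0,99) - confirm which is intended before enabling.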
# ParameterVector = pd.concat([ParameterVector, RandomForestParameterVector]) # NOTE: Comment out to not include random forest baseline
# ParameterVector = ParameterVector.sort_values("Seed")
# ParameterVector.index = range(0, ParameterVector.shape[0])

# Job and Output Name

In [4]:
# String views of the columns that make up the job name #
SeedText = ParameterVector["Seed"].astype(str)
ModelTypeText = ParameterVector["ModelType"].astype(str)
UniqueErrorsText = ParameterVector["UniqueErrorsInput"].astype(str)
BoundAdderText = ParameterVector["rashomon_bound_adder"].astype(str)

# Long-form job name: <Seed><Abbrev>_MT<ModelType>_UEI<UniqueErrorsInput>_<rashomon_bound_adder> #
LongJobName = SeedText + JobNameAbbrev + "_MT" + ModelTypeText + "_UEI" + UniqueErrorsText + "_" + BoundAdderText

# Shorten the model/unique-errors segment to a three-letter tag. #
# The substitutions are applied in the order listed; a name matching none of
# the patterns is left in its long form.
JobNameAliases = {
    r"_MTTreeFarmsFunction_UEI0_": "_DPL",
    r"_MTTreeFarmsFunction_UEI1_": "_UNQ",
    r"_MTRandomForestClassificationFunction_UEI0_": "_RF",
}
ShortJobName = LongJobName
for AliasPattern, AliasTag in JobNameAliases.items():
    ShortJobName = ShortJobName.str.replace(AliasPattern, AliasTag, regex=True)
ParameterVector["JobName"] = ShortJobName

# Output path: <Data>/<ModelType>/Raw/<JobName>.pkl, with every literal #
# "Function" substring stripped from the assembled path.
ParameterVector["Output"] = (
    ParameterVector["Data"].astype(str)
    + "/"
    + ParameterVector["ModelType"].astype(str)
    + "/Raw/"
    + ParameterVector["JobName"]
    + ".pkl"
).str.replace("Function", "", regex=False)

# Save Parameter Vector

In [5]:
# Show the parameter grid as the cell's output (bare last-line expression).
ParameterVector

Unnamed: 0,Data,Seed,TestProportion,CandidateProportion,SelectorType,ModelType,UniqueErrorsInput,n_estimators,regularization,rashomon_bound_adder,Type,JobName,Output
0,BreastCancer,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,0.005,Classification,0BC_DPL0.005,BreastCancer/TreeFarms/Raw/0BC_DPL0.005.pkl
1,BreastCancer,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,0.007,Classification,0BC_DPL0.007,BreastCancer/TreeFarms/Raw/0BC_DPL0.007.pkl
2,BreastCancer,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,0.009,Classification,0BC_DPL0.009,BreastCancer/TreeFarms/Raw/0BC_DPL0.009.pkl
3,BreastCancer,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,0.011,Classification,0BC_DPL0.011,BreastCancer/TreeFarms/Raw/0BC_DPL0.011.pkl
4,BreastCancer,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,0.013,Classification,0BC_DPL0.013,BreastCancer/TreeFarms/Raw/0BC_DPL0.013.pkl
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1777,BreastCancer,98,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,0.013,Classification,98BC_UNQ0.013,BreastCancer/TreeFarms/Raw/98BC_UNQ0.013.pkl
1778,BreastCancer,98,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,0.017,Classification,98BC_UNQ0.017,BreastCancer/TreeFarms/Raw/98BC_UNQ0.017.pkl
1779,BreastCancer,98,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,0.019,Classification,98BC_UNQ0.019,BreastCancer/TreeFarms/Raw/98BC_UNQ0.019.pkl
1780,BreastCancer,98,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,0.021,Classification,98BC_UNQ0.021,BreastCancer/TreeFarms/Raw/98BC_UNQ0.021.pkl


In [9]:
# Append the four hand-specified extra runs (seeds 15 and 68, duplicate and
# unique variants) to the parameter grid.
#
# NOTE: Duplicate15, Unique15, Duplicate68 and Unique68 are defined in a cell
# that appears further down in this notebook; that cell has to be run before
# this one, otherwise this cell raises NameError on a fresh kernel.
#
# The rows are added with a single concat instead of repeated
# `.loc[len(ParameterVector) + k] = ...` assignments: len(ParameterVector)
# grows after every assignment, so the old form produced the row labels
# n, n+2, n+4, n+6 (gaps in the index) rather than n..n+3.
ExtraRuns = pd.DataFrame(
    [Duplicate15, Unique15, Duplicate68, Unique68],
    columns=ParameterVector.columns,  # align the dict keys to the existing columns
)
ParameterVector = pd.concat([ParameterVector, ExtraRuns], ignore_index=True)

# Dropping exact duplicate rows keeps this cell idempotent: re-running it does
# not add the same four runs a second time.
ParameterVector = ParameterVector.drop_duplicates(ignore_index=True)

In [11]:
# Write the parameter grid to "ParameterVector<Data>.csv" in the
# ParameterVectors folder; the row index is not written to the file.
output_path = "".join((
    "/Users/simondn/Documents/RashomonActiveLearning/Data/ParameterVectors/ParameterVector",
    str(Data),
    ".csv",
))
ParameterVector.to_csv(path_or_buf=output_path, index=False)

In [7]:
def _ExtraTreeFarmsRun(Seed, UniqueErrorsInput):
    """Build one hand-specified BreastCancer / TreeFarms parameter row.

    Seed: seed of the run; it also prefixes the job name.
    UniqueErrorsInput: 0 gives the "DPL" job-name tag, 1 gives the "UNQ" tag.

    Returns a dict whose keys match the ParameterVector columns, with
    rashomon_bound_adder fixed at 0.025.
    """
    Tag = "UNQ" if UniqueErrorsInput == 1 else "DPL"
    Name = str(Seed) + "BC_" + Tag + "0.025"
    return {
        "Data": "BreastCancer",
        "Seed": Seed,
        "TestProportion": 0.2,
        "CandidateProportion": 0.8,
        "SelectorType": "TreeEnsembleQBCFunction",
        "ModelType": "TreeFarmsFunction",
        "UniqueErrorsInput": UniqueErrorsInput,
        "n_estimators": 100,
        "regularization": 0.01,
        "rashomon_bound_adder": 0.025,
        "Type": "Classification",
        "JobName": Name,
        "Output": "BreastCancer/TreeFarms/Raw/" + Name + ".pkl",
    }


### 15 ###
Duplicate15 = _ExtraTreeFarmsRun(15, 0)
Unique15 = _ExtraTreeFarmsRun(15, 1)

### 68 ###
Duplicate68 = _ExtraTreeFarmsRun(68, 0)
Unique68 = _ExtraTreeFarmsRun(68, 1)
