In [1]:
### Import packages ###
import itertools
import pandas as pd

# Input

In [2]:
AbbreviationDictionary = {"BankNote": "BN",
                          "Bar7": "B7",
                          "BreastCancer": "BC",
                          "CarEvaluation": "CE",
                          "COMPAS": "CP",
                          "FICO": "FI",
                          "Haberman": "HM",
                          "Iris": "IS",
                          "MONK1": "M1",
                          "MONK3":"M3"}

In [3]:
# Input Data Set #
Data = "BankNote" # BankNote  Bar7  BreastCancer  CarEvaluation  COMPAS  FICO  Haberman  Iris  MONK1  MONK3 
JobNameAbbrev = AbbreviationDictionary[Data]

# Input directory #
dir_path = "/Users/simondn/Documents/RashomonActiveLearning/Code/raw"

# Input Parameters #
ParameterDictionary = {"Data":[Data],
                       "Seed":list(range(0,3)),
                      #  "Seed":list([]),                       
                       "TestProportion":[0.2],
                       "CandidateProportion":[0.8],
                       "SelectorType":["TreeEnsembleQBCFunction"],
                       "ModelType":["TreeFarmsFunction"],
                       "UniqueErrorsInput": [0,1],
                       "n_estimators": [100], 
                       "regularization": [0.01],
                       "RashomonThresholdType": ["Adder"],                                         # ["Adder", "Multiplier"]
                       "RashomonThreshold": [0.01],
                       "Type": ["Classification"],
                       "Partition": ["compute"],                                                        # [short, medium, long, largemem, compute, cpu-g2-mem2x]
                       "Time": ["00:59:00"],                                                            # [00:59:00, 11:59:00, 6-23:59:00]
                       "Memory": ["30000M"]}                                                                # [100M, 30000M, 100000M]

# Create Parameter Vector #
ParameterVector = pd.DataFrame.from_records(itertools.product(*ParameterDictionary.values()), columns=ParameterDictionary.keys())

# Include/exclude Passive Learning

In [4]:
### Include Random Forest ###
RandomForestParameterDictionary = {"Data":[Data],
                                    "Seed":[0,1,2],
                       "TestProportion":[0.2],
                       "CandidateProportion":[0.8],
                       "SelectorType":["PassiveLearning"],
                       "ModelType":["RandomForestClassificationFunction"],
                       "UniqueErrorsInput": [1],
                       "n_estimators": [100], 
                       "regularization": [0.01],
                       "RashomonThresholdType": ["Adder"],                                                    # ["Adder", "Multiplier"]
                       "RashomonThreshold": [0],
                       "Type": ["Classification"],
                       "Partition": ["compute"],                                                        # [short, medium, long, largemem, or compute]
                       "Time": ["00:59:00"],                                                            # [00:59:00, 11:59:00, 6-23:59:00]
                       "Memory": [1000]}                                                                # [1000, 30000, 100000]

RandomForestParameterVector = pd.DataFrame.from_records(itertools.product(*RandomForestParameterDictionary.values()), columns=RandomForestParameterDictionary.keys())

# NOTE: Comment out chunk to not include random forest simulations. ###
ParameterVector = pd.concat([ParameterVector, RandomForestParameterVector]) # NOTE: Comment out to not include random forest baseline
ParameterVector = ParameterVector.sort_values("Seed")
ParameterVector.index = range(0, ParameterVector.shape[0])

# Include/exclude Random Forest Simulations

In [5]:
### Include Random Forest ###
RandomForestParameterDictionary = {"Data":[Data],
                                    "Seed":[0,1,2],
                       "TestProportion":[0.2],
                       "CandidateProportion":[0.8],
                       "SelectorType":["TreeEnsembleQBCFunction"],
                       "ModelType":["RandomForestClassificationFunction"],
                       "UniqueErrorsInput": [0],
                       "n_estimators": [100], 
                       "regularization": [0.01],
                       "RashomonThresholdType": ["Adder"],                                                    # ["Adder", "Multiplier"]
                       "RashomonThreshold": [0],
                       "Type": ["Classification"],
                       "Partition": ["compute"],                                                        # [short, medium, long, largemem, or compute]
                       "Time": ["00:59:00"],                                                            # [00:59:00, 11:59:00, 6-23:59:00]
                       "Memory": [1000]}                                                                # [1000, 30000, 100000]

RandomForestParameterVector = pd.DataFrame.from_records(itertools.product(*RandomForestParameterDictionary.values()), columns=RandomForestParameterDictionary.keys())

# NOTE: Comment out chunk to not include random forest simulations. ###
ParameterVector = pd.concat([ParameterVector, RandomForestParameterVector]) # NOTE: Comment out to not include random forest baseline
ParameterVector = ParameterVector.sort_values("Seed")
ParameterVector.index = range(0, ParameterVector.shape[0])

# Job and Output Name

In [6]:
### Remove Dictionary ###
del ParameterDictionary

# Generate JobName #
ParameterVector["JobName"] = (
    ParameterVector["Seed"].astype(str) +
    JobNameAbbrev + 
    "_MT" + ParameterVector["ModelType"].astype(str) +
    "_UEI" + ParameterVector["UniqueErrorsInput"].astype(str) +
    "_" + ParameterVector["RashomonThresholdType"].astype(str) + 
    ParameterVector["RashomonThreshold"].astype(str))

# Replace Job Name #
ParameterVector["JobName"] = (
    ParameterVector["JobName"]
    .str.replace(r"_MTTreeFarmsFunction_UEI0_", "_D", regex=True)
    .str.replace(r"_MTTreeFarmsFunction_UEI1_", "_U", regex=True)
    .str.replace(r"Adder", "A", regex=True)
    .str.replace(r"Multiplier", "M", regex=True)
    .str.replace(r"_MTRandomForestClassificationFunction_UEI0_", "_RF", regex=True)
    .str.replace(r"_MTRandomForestClassificationFunction_UEI1_", "_PL", regex=True)
    )

# Output Name #
ParameterVector["Output"] = ParameterVector["Data"].astype(str) + "/" + ParameterVector["ModelType"].astype(str) + "/Raw/" + ParameterVector["JobName"] + ".pkl"
ParameterVector["Output"] = ParameterVector["Output"].str.replace("Function", "", regex=False)

In [7]:
ParameterVector

Unnamed: 0,Data,Seed,TestProportion,CandidateProportion,SelectorType,ModelType,UniqueErrorsInput,n_estimators,regularization,RashomonThresholdType,RashomonThreshold,Type,Partition,Time,Memory,JobName,Output
0,BankNote,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,0BN_DA0.01,BankNote/TreeFarms/Raw/0BN_DA0.01.pkl
1,BankNote,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,0BN_UA0.01,BankNote/TreeFarms/Raw/0BN_UA0.01.pkl
2,BankNote,0,0.2,0.8,PassiveLearning,RandomForestClassificationFunction,1,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,0BN_PLA0.0,BankNote/RandomForestClassification/Raw/0BN_PL...
3,BankNote,0,0.2,0.8,TreeEnsembleQBCFunction,RandomForestClassificationFunction,0,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,0BN_RFA0.0,BankNote/RandomForestClassification/Raw/0BN_RF...
4,BankNote,1,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,1BN_DA0.01,BankNote/TreeFarms/Raw/1BN_DA0.01.pkl
5,BankNote,1,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,1BN_UA0.01,BankNote/TreeFarms/Raw/1BN_UA0.01.pkl
6,BankNote,1,0.2,0.8,PassiveLearning,RandomForestClassificationFunction,1,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,1BN_PLA0.0,BankNote/RandomForestClassification/Raw/1BN_PL...
7,BankNote,1,0.2,0.8,TreeEnsembleQBCFunction,RandomForestClassificationFunction,0,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,1BN_RFA0.0,BankNote/RandomForestClassification/Raw/1BN_RF...
8,BankNote,2,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,2BN_DA0.01,BankNote/TreeFarms/Raw/2BN_DA0.01.pkl
9,BankNote,2,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,2BN_UA0.01,BankNote/TreeFarms/Raw/2BN_UA0.01.pkl


---

# Only run these ones

In [8]:
# # ### Simulations that failed ###
# FilterText = ['92HM_UAO.11', '36HM_UA0.11', '67HM_UA0.11', '31HM_UA0.11', '18HM_UAO.11', '49HM_UA0.11', '53HM_UA0.11', '53HM_DA0.11', '48HM_UA0.11', '60HM_DA0.11', '8HM_UA0.11', '3HM_DA0.11', '43HM_UA0.11', '36HM_DA0.11', '48HM_DA0.11', '18HM_DA0.11', '46HM_UA0.11', '66HM_UA0.11', '66HM_UA0.11', '50HM_UA0.11', '66HM_DA0.11', '13HM_UA0.11', '30HM_UA0.11', '3HM_UAO.11', '30HM_DA0.11', '43HM_DA0.11', '46HM_DA0.11', '92HM_UA0.11', '19HM_UA0.11', '94HM_DA0.11', '94HM_UA0.11', '31HM_DA0.11', '56HM_DA0.11', '65HM_UA0.11', '92HM_DA0.11', '69HM_DA0.11', '49HM_DA0.11', '46HM_DA0.11', '19HM_UA0.11', '94HM_DA0.11', '94HM_UA0.11', '31HM_DAO.11', '56HM_DA0.11', '65HM_UA0.11', '92HM_DA0.11', '69HM_DAO.11', '49HM_DA0.11', '79HM_UA0.11', '14HM_DA0.11', '17HM_DA0.11', '32HM_DA0.11', '13HM_DAO.11', '99HM_DA0.11', '56HM_UA0.11']
# ### Rerun these simulations ###
# ParameterVector = ParameterVector[ParameterVector["JobName"].isin(FilterText)]
# ParameterVector.index = range(0,len(ParameterVector))
# ParameterVector

---

In [9]:
ParameterVector

Unnamed: 0,Data,Seed,TestProportion,CandidateProportion,SelectorType,ModelType,UniqueErrorsInput,n_estimators,regularization,RashomonThresholdType,RashomonThreshold,Type,Partition,Time,Memory,JobName,Output
0,BankNote,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,0BN_DA0.01,BankNote/TreeFarms/Raw/0BN_DA0.01.pkl
1,BankNote,0,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,0BN_UA0.01,BankNote/TreeFarms/Raw/0BN_UA0.01.pkl
2,BankNote,0,0.2,0.8,PassiveLearning,RandomForestClassificationFunction,1,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,0BN_PLA0.0,BankNote/RandomForestClassification/Raw/0BN_PL...
3,BankNote,0,0.2,0.8,TreeEnsembleQBCFunction,RandomForestClassificationFunction,0,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,0BN_RFA0.0,BankNote/RandomForestClassification/Raw/0BN_RF...
4,BankNote,1,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,1BN_DA0.01,BankNote/TreeFarms/Raw/1BN_DA0.01.pkl
5,BankNote,1,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,1BN_UA0.01,BankNote/TreeFarms/Raw/1BN_UA0.01.pkl
6,BankNote,1,0.2,0.8,PassiveLearning,RandomForestClassificationFunction,1,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,1BN_PLA0.0,BankNote/RandomForestClassification/Raw/1BN_PL...
7,BankNote,1,0.2,0.8,TreeEnsembleQBCFunction,RandomForestClassificationFunction,0,100,0.01,Adder,0.0,Classification,compute,00:59:00,1000,1BN_RFA0.0,BankNote/RandomForestClassification/Raw/1BN_RF...
8,BankNote,2,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,0,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,2BN_DA0.01,BankNote/TreeFarms/Raw/2BN_DA0.01.pkl
9,BankNote,2,0.2,0.8,TreeEnsembleQBCFunction,TreeFarmsFunction,1,100,0.01,Adder,0.01,Classification,compute,00:59:00,30000M,2BN_UA0.01,BankNote/TreeFarms/Raw/2BN_UA0.01.pkl


# Save Parameter Vector

In [10]:
### Resave ###
output_path = "/Users/simondn/Documents/RashomonActiveLearning/Data/ParameterVectors/ParameterVector" + str(Data) + ".csv"
ParameterVector.to_csv(output_path, index=False)