In [1]:
# Summary: A python script to extract the error and time for the active learning simulation. It accesses each of the .pkl result files
#          from the simulations, and places each error (time) into a row in the ErrorMatrix.csv (TimeMatrix.csv) file.
# Input: 
#   DataType: A string that indicates either "Simulate" for the simulation or the name of the DataFrame in the Data folder.
#   ModelType: Predictive model. Examples can be LinearRegression or RandomForestRegresso.
#   Categories: The last identifying portion of the results file.
#               For the DUREAL, UNREAL, and RandomForests methods, the respective inputs are
#               {"MTTreeFarms_UEI0_NE100_Reg0.01_RBA0.01.pkl", 
#                "MTTreeFarms_UEI1_NE100_Reg0.01_RBA0.01.pkl",
#                "MTRandomForestClassification_UEI0_NE100_Reg0.01_RBA0.01.pkl"}
# Output: Outputs the matrices ErrorMatrix and TimeMatrix into the ProcessedResults folder.

### Import libraries ###
import os
import pickle
import argparse
import numpy as np
import pandas as pd


In [2]:

### Extract Error and Time Function ###
def ExtractInformation(files):

    ### Set Up ###
    ErrorVec = []
    TimeVec = []
    SelectionHistoryVec = []
    AllTreeCountVec = []
    UniqueTreeCountVec = []
    for file in files:
        try:
            with open(file, "rb") as f:
                data = pickle.load(f)
                ErrorVec.append(data["ErrorVec"])
                TimeVec.append(data["ElapsedTime"])
                SelectionHistoryVec.append(data["SelectionHistory"])
                AllTreeCountVec.append(data["TreeCount"]["AllTreeCount"])
                UniqueTreeCountVec.append(data["TreeCount"]["UniqueTreeCount"])
        except Exception as e:
            print(f"Error loading file {file}: {e}")
    return np.array(ErrorVec), np.array(TimeVec), list(SelectionHistoryVec)[0], np.array(AllTreeCountVec), np.array(UniqueTreeCountVec)


In [3]:
import argparse
import sys

# Store the original sys.argv
original_argv = sys.argv

# Create a mock command line argument list
sys.argv = ['ProcessSimulationResults.py', '--DataType', 'Iris', '--ModelType', 'TreeFarms', '--Categories', 'PLA0']

# Create the parser
parser = argparse.ArgumentParser(description="Aggregate simulation results.")
parser.add_argument("--DataType", type=str, required=True, help="Type of data.")
parser.add_argument("--ModelType", type=str, required=True, help="Prediction model type.")
parser.add_argument("--Categories", type=str, required=True, help="Single category string.")

# Parse the mock arguments
args = parser.parse_args()

# Print the parsed arguments
print(f"DataType: {args.DataType}")
print(f"ModelType: {args.ModelType}")
print(f"Categories: {args.Categories}")

# Restore the original sys.argv
sys.argv = original_argv

DataType: Iris
ModelType: TreeFarms
Categories: PLA0


In [4]:
### Set Up ###
cwd = os.getcwd()
ResultsDirectory = os.path.join(cwd, "Results", args.DataType, args.ModelType)
OutputDirectory = os.path.join(ResultsDirectory, "ProcessedResults")
RawDirectory = "/Users/simondn/Documents/RashomonActiveLearning/Results/Iris/RandomForestClassification/Raw/"
Category = args.Categories

### Extract File Names ###
CategoryFileNames = []
for filename in os.listdir("/Users/simondn/Documents/RashomonActiveLearning/Results/Iris/RandomForestClassification/Raw"):
    if filename.endswith(".pkl") and Category in filename:
        CategoryFileNames.append(os.path.join(RawDirectory, filename))


In [5]:
CategoryFileNames

['/Users/simondn/Documents/RashomonActiveLearning/Results/Iris/RandomForestClassification/Raw/1IS_PLA0_D0B5.pkl',
 '/Users/simondn/Documents/RashomonActiveLearning/Results/Iris/RandomForestClassification/Raw/0IS_PLA0_D0B5.pkl',
 '/Users/simondn/Documents/RashomonActiveLearning/Results/Iris/RandomForestClassification/Raw/2IS_PLA0_D0B5.pkl']

In [6]:

### Extract Data ###
ErrorVec, TimeVec, SelectionHistoryVec, AllTreeCountVec, UniqueTreeCountVec = ExtractInformation(CategoryFileNames)


In [16]:
ErrorMatrix = pd.DataFrame(ErrorVec.squeeze())
TimeMatrix = pd.DataFrame(TimeVec.squeeze())
# SelectionHistoryVec = pd.DataFrame(SelectionHistoryVec.squeeze())
AllTreeCountVec = pd.DataFrame(AllTreeCountVec.squeeze())
UniqueTreeCountVec = pd.DataFrame(UniqueTreeCountVec.squeeze())

### Save ###
ErrorMatrix.to_csv(os.path.join(OutputDirectory, "ErrorVec", f"{Category.replace('.pkl', '')}_ErrorMatrix.csv"), index=False)
TimeMatrix.to_csv(os.path.join(OutputDirectory, "ElapsedTime", f"{Category.replace('.pkl', '')}_TimeMatrix.csv"), index=False)
AllTreeCountVec.to_csv(os.path.join(OutputDirectory, "TreeCount", f"{Category.replace('.pkl', '')}_AllTreeCount.csv"), index=False)
UniqueTreeCountVec.to_csv(os.path.join(OutputDirectory, "TreeCount", f"{Category.replace('.pkl', '')}_UniqueTreeCount.csv"), index=False)
SelectionHistoryVec.to_csv(os.path.join(OutputDirectory, "SelectionHistory", f"{Category.replace('.pkl', '')}_SelectionHistory.csv"), index=False)
print(f"Saved {Category} files!")

OSError: Cannot save file into a non-existent directory: '/Users/simondn/Documents/RashomonActiveLearning/Code/AuxiliaryNotebooks/Results/Iris/TreeFarms/ProcessedResults/ErrorVec'