In [1]:
### Import Modules ###
import os
import json
import pickle
import argparse
import numpy as np
import pandas as pd


In [2]:
# Define a function to extract error and time vectors
def ExtractErrorAndTime(files):
    """Load the "ErrorVec" and "ElapsedTime" entries from a list of pickle files.

    Parameters
    ----------
    files : iterable of str
        Paths to pickle files. Each file is expected to unpickle to an object
        that supports lookup by the keys "ErrorVec" and "ElapsedTime".

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The collected error vectors and elapsed-time values, each holding one
        entry per successfully loaded file, in the order given by ``files``.

    Notes
    -----
    A file that cannot be opened or unpickled, or that lacks either key, is
    reported with ``print`` and skipped entirely, so the two returned arrays
    always stay aligned entry-for-entry.
    """
    error_vecs = []
    time_vecs = []
    for file in files:
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at result files from a trusted source.
            with open(file, "rb") as f:
                data = pickle.load(f)
            # Read both entries before appending either one, so a file that
            # is missing "ElapsedTime" cannot leave error_vecs one longer
            # than time_vecs.
            error_vec = data["ErrorVec"]
            elapsed_time = data["ElapsedTime"]
        except Exception as e:
            print(f"Error loading file {file}: {e}")
            continue
        error_vecs.append(error_vec)
        time_vecs.append(elapsed_time)
    return np.array(error_vecs), np.array(time_vecs)


In [3]:
# Stand-in for the args object that argparse would otherwise provide
class Args:
    """Plain container exposing DataType, ModelType and Categories as attributes."""

    def __init__(self, DataType, ModelType, Categories):
        # Store each constructor argument under an attribute of the same name.
        self.DataType, self.ModelType, self.Categories = DataType, ModelType, Categories

# Manually setting up args
args = Args(
    DataType="BostonHousing",
    ModelType="TreeFarms",
    Categories='["MTTreeFarms_UEI1_NE100_Reg0.01_RBA0.01.pkl"]'
)

# Parsing arguments
data_type = args.DataType
model_type = args.ModelType
# Categories is a JSON-encoded list of strings. json.loads parses it as data
# only, whereas eval() would execute whatever expression the string contains.
categories = json.loads(args.Categories)

# Display arguments (for testing)
print("DataType:", data_type)
print("ModelType:", model_type)
print("Categories:", categories)

# Add the main script logic below using `data_type`, `model_type`, and `categories`.


DataType: BostonHousing
ModelType: TreeFarms
Categories: ['MTTreeFarms_UEI1_NE100_Reg0.01_RBA0.01.pkl']


In [5]:
# Inspect the raw Categories value stored on args (a JSON-formatted string, not yet a list)
args.Categories

'["MTTreeFarms_UEI1_NE100_Reg0.01_RBA0.01.pkl"]'

In [6]:

### Set Up ###
# All paths are rooted at <cwd>/Results/<DataType>/<ModelType>.
cwd = os.getcwd()
Categories = json.loads(args.Categories)  # JSON string -> Python list
ResultsDirectory = os.path.join(cwd, "Results", args.DataType, args.ModelType)
# Raw inputs and processed outputs are sibling sub-folders of ResultsDirectory.
RawDirectory, OutputDirectory = (
    os.path.join(ResultsDirectory, subfolder)
    for subfolder in ("Raw", "ProcessedResults")
)


In [9]:

# Group files by category.
# Each ".pkl" file in RawDirectory is assigned to the first category whose
# name is a suffix of the filename; files matching no category are ignored.
category_files = {category: [] for category in Categories}
# List RawDirectory itself rather than a machine-specific absolute path, so
# the names scanned here are the same ones joined onto RawDirectory below.
# sorted() gives a reproducible per-category file order, because os.listdir
# returns entries in arbitrary order.
for filename in sorted(os.listdir(RawDirectory)):
    if not filename.endswith(".pkl"):
        continue
    for category in Categories:
        if filename.endswith(category):
            category_files[category].append(os.path.join(RawDirectory, filename))
            break


In [11]:
# Debug check: `filename` is the loop variable of the grouping cell, so this
# presumably shows the last directory entry that loop visited.
filename

' .foo.png'

In [10]:
# Display the mapping from each category to the list of file paths grouped under it
category_files

{'MTTreeFarms_UEI1_NE100_Reg0.01_RBA0.01.pkl': []}

In [None]:

# Process files for each category.
# ErrorMatrices / TimeMatrices map a category name (without its trailing
# ".pkl") to the squeezed array returned by ExtractErrorAndTime for that
# category's files. Categories with no files are skipped with a warning.
ErrorMatrices = {}
TimeMatrices = {}

for category, files in category_files.items():
    if not files:
        print(f"Warning: No files found for category {category}. Skipping.")
        continue
    print(f"Processing category: {category} with {len(files)} files")
    error_vecs, time_vecs = ExtractErrorAndTime(files)

    # Drop only a trailing ".pkl"; str.replace would also remove the text if
    # it happened to appear elsewhere in the category name.
    key = category[:-len(".pkl")] if category.endswith(".pkl") else category

    # Squeeze out length-1 dimensions (no transpose is applied).
    # NOTE(review): squeeze() also collapses the leading axis when only one
    # file was loaded, so the CSV layout for a single-file category may
    # differ from the multi-file case; confirm this is intended.
    ErrorMatrices[key] = error_vecs.squeeze()
    TimeMatrices[key] = time_vecs.squeeze()

# Ensure the output directory exists
os.makedirs(OutputDirectory, exist_ok=True)

# Save every error matrix, then every time matrix, as "<key>_<suffix>.csv"
for matrices, suffix in ((ErrorMatrices, "ErrorMatrix"), (TimeMatrices, "TimeMatrix")):
    for key, matrix in matrices.items():
        df = pd.DataFrame(matrix)  # Convert to DataFrame
        df.to_csv(os.path.join(OutputDirectory, f"{key}_{suffix}.csv"), index=False)

