In [2]:
import os
import pickle
import pandas as pd

In [4]:
# Directory the pickled recordings are read from, and directory for generated tables.
data_folder, output_folder = "data_collection", "tables"

# Two empty lists; nothing in this cell fills them
# (presumably meant for greedy / MCMC rows - TODO confirm they are still needed).
greedy_data, mcmc_data = [], []

# Make sure the table directory is present; an already existing one is fine.
os.makedirs(output_folder, exist_ok=True)

In [23]:
# Build one summary row per recording (final accuracy, final loss, total
# iterations) and write all rows to tables/results_summary.csv.

# Path to the data collection directory
data_dir = "data_collection"  # Update if necessary

# Output location; create its directory before any work is done
output_file = "tables/results_summary.csv"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# List to store results
results = []

# File to exclude
exclude_file = "mcmc_gcn_cora_constant_binary_results.pkl"

# Process each pickle file. os.listdir() returns entries in arbitrary order, so
# the listing is sorted to keep the CSV row order stable between runs.
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".pkl") or file_name == exclude_file:
        continue

    file_path = os.path.join(data_dir, file_name)
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(file_path, "rb") as file:
        recording = pickle.load(file)

    # Check if is_reserved exists, skip file if missing
    if not hasattr(recording, "is_reserved"):
        print(f"Skipping {file_name} due to missing 'is_reserved' attribute.")
        continue

    # Check if accuracies and losses are non-empty and properly formatted
    if not recording.accuracies or not isinstance(recording.accuracies, dict):
        print(f"Skipping {file_name} due to invalid or empty accuracies.")
        continue

    if not recording.losses or not isinstance(recording.losses, dict):
        print(f"Skipping {file_name} due to invalid or empty losses.")
        continue

    # Final value = last entry of the non-empty list with the largest key.
    # Every list is checked for emptiness before it is indexed, so no
    # IndexError handling is needed; None means that all lists were empty.
    final_accuracy = next(
        (
            recording.accuracies[key][-1]
            for key in sorted(recording.accuracies, reverse=True)
            if recording.accuracies[key]
        ),
        None,
    )
    final_loss = next(
        (
            recording.losses[key][-1]
            for key in sorted(recording.losses, reverse=True)
            if recording.losses[key]
        ),
        None,
    )

    total_iterations = sum(recording.iterations.values()) if recording.iterations else 0

    # Append a row to the results
    results.append({
        "File Name": file_name,
        "Model": recording.model.name,
        "Dataset": recording.dataset.name,
        "Accept Fn": recording.accept.name,
        "Select Fn": recording.select.name,
        "Reverse Type": recording.is_reserved.name,  # .name is read, so an Enum-like value is expected
        "Final Accuracy": final_accuracy if final_accuracy is not None else "N/A",
        "Final Loss": final_loss if final_loss is not None else "N/A",
        "Total Iterations": total_iterations,
    })

# Convert results list to DataFrame
df = pd.DataFrame(results)

# Save to CSV file
df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


Results saved to tables/results_summary.csv


In [21]:
# Diagnostic cell: list every pickle in data_folder whose loaded recording
# has no 'is_reserved' attribute.

missing_is_reserved = []

for entry in os.listdir(data_folder):
    if not entry.endswith(".pkl"):
        continue  # only pickle files are inspected

    with open(os.path.join(data_folder, entry), "rb") as handle:
        loaded = pickle.load(handle)

    if hasattr(loaded, "is_reserved"):
        continue  # attribute present, nothing to report
    missing_is_reserved.append(entry)

# Report the outcome
if not missing_is_reserved:
    print("All files contain the 'is_reserved' attribute.")
else:
    report_lines = ["Files missing the 'is_reserved' attribute:"]
    report_lines.extend(f"- {name}" for name in missing_is_reserved)
    print("\n".join(report_lines))

Files missing the 'is_reserved' attribute:
- mcmc_gcn_cora_constant_binary_results.pkl


In [26]:
import os
import pickle
import pandas as pd

# Summarise every recording (initial/final accuracy and loss) and write the rows
# to two CSV files: one for files whose name contains "greedy", one for "mcmc".

# Paths
data_dir = "data_collection"  # Folder where all .pkl files are stored
output_dir = "tables"  # Folder where results are saved
os.makedirs(output_dir, exist_ok=True)  # Ensure the directory exists

# File to exclude
exclude_file = "mcmc_gcn_cora_constant_binary_results.pkl"

# Lists to store results separately
greedy_results = []
mcmc_results = []


def _edge_value(series_by_key, last):
    """Return the first (or, with ``last=True``, the last) recorded value.

    Keys are visited in ascending order (descending when ``last`` is true) and
    the first non-empty list found supplies its first (or last) element.
    Returns None when every list is empty.
    """
    for key in sorted(series_by_key, reverse=last):
        values = series_by_key[key]
        if values:
            return values[-1] if last else values[0]
    return None


# Process each pickle file. os.listdir() returns entries in arbitrary order, so
# the listing is sorted to keep the CSV row order stable between runs.
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".pkl") or file_name == exclude_file:
        continue

    file_path = os.path.join(data_dir, file_name)
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(file_path, "rb") as file:
        recording = pickle.load(file)

    # Check if is_reserved exists, skip file if missing
    if not hasattr(recording, "is_reserved"):
        print(f"Skipping {file_name} due to missing 'is_reserved' attribute.")
        continue

    # Check if accuracies and losses are non-empty and properly formatted
    if not recording.accuracies or not isinstance(recording.accuracies, dict):
        print(f"Skipping {file_name} due to invalid or empty accuracies.")
        continue

    if not recording.losses or not isinstance(recording.losses, dict):
        print(f"Skipping {file_name} due to invalid or empty losses.")
        continue

    # Extract initial and final accuracy (None when nothing was recorded)
    initial_accuracy = _edge_value(recording.accuracies, last=False)
    final_accuracy = _edge_value(recording.accuracies, last=True)

    # Extract initial and final loss, rounded to 5 decimal places
    initial_loss = _edge_value(recording.losses, last=False)
    if initial_loss is not None:
        initial_loss = round(initial_loss, 5)
    final_loss = _edge_value(recording.losses, last=True)
    if final_loss is not None:
        final_loss = round(final_loss, 5)

    # Prepare result row
    result_row = {
        "File Name": file_name,
        "Model": recording.model.name,
        "Dataset": recording.dataset.name,
        "Accept Fn": recording.accept.name,
        "Select Fn": recording.select.name,
        "Reverse Type": recording.is_reserved.name,  # .name is read, so an Enum-like value is expected
        "Initial Accuracy": initial_accuracy if initial_accuracy is not None else "N/A",
        "Final Accuracy": final_accuracy if final_accuracy is not None else "N/A",
        "Initial Loss": initial_loss if initial_loss is not None else "N/A",
        "Final Loss": final_loss if final_loss is not None else "N/A",
    }

    # Categorize into Greedy or MCMC based on the file name
    lowered_name = file_name.lower()
    if "greedy" in lowered_name:
        greedy_results.append(result_row)
    elif "mcmc" in lowered_name:
        mcmc_results.append(result_row)
    else:
        # Report instead of silently dropping the row
        print(f"Skipping {file_name}: file name contains neither 'greedy' nor 'mcmc'.")

pd.set_option("display.max_colwidth", 20)  # Truncate long text
pd.set_option("display.width", 100)  # Reduce overall table width

# Convert lists to DataFrames
df_greedy = pd.DataFrame(greedy_results)
df_mcmc = pd.DataFrame(mcmc_results)

# Save to CSV files
greedy_output_file = os.path.join(output_dir, "greedy_results.csv")
mcmc_output_file = os.path.join(output_dir, "mcmc_results.csv")

df_greedy.to_csv(greedy_output_file, index=False)
df_mcmc.to_csv(mcmc_output_file, index=False)

print(f"Greedy results saved to {greedy_output_file}")
print(f"MCMC results saved to {mcmc_output_file}")


Greedy results saved to tables/greedy_results.csv
MCMC results saved to tables/mcmc_results.csv


In [6]:
import os
import pickle
import pandas as pd

# Collect initial/final accuracy and loss for every (recording, split) pair and
# write a wide table of final accuracies to tables/formatted_results.csv.

# Define path to directory
data_path = "data_collection"  # Update this if needed
output_dir = "tables"

# Ensure the output directory exists so this cell does not rely on an earlier one
os.makedirs(output_dir, exist_ok=True)

# Storage for results
results = []

# Process each pickle file. The listing is sorted because os.listdir() returns
# entries in arbitrary order and the pivot below keeps the "first" value per cell.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(".pkl"):
        continue  # Skip non-pickle files

    file_path = os.path.join(data_path, file_name)

    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(file_path, "rb") as file:
        recording = pickle.load(file)

    # Ensure is_reserved attribute exists
    if not hasattr(recording, "is_reserved"):
        continue  # Skip files without this attribute

    # Extract splits
    for split, accuracy_list in recording.accuracies.items():
        if len(accuracy_list) == 0:
            continue  # Skip empty splits

        # A split without losses, or with an empty loss list, yields None
        # instead of raising IndexError.
        loss_list = recording.losses[split] if split in recording.losses else []
        has_loss = len(loss_list) > 0
        initial_loss = loss_list[0] if has_loss else None
        final_loss = loss_list[-1] if has_loss else None

        # Append results in hierarchical format (all metrics rounded to 5 decimals)
        results.append([
            recording.dataset.name,  # Dataset
            recording.model.name,    # Model
            recording.accept.name,   # Accept Function
            recording.select.name,   # Select Function
            split,                   # Split values (0.0, 0.5, 0.7, 0.9)
            recording.is_reserved.name,  # Reverse Type
            round(accuracy_list[0], 5),   # Initial Accuracy
            round(accuracy_list[-1], 5),  # Final Accuracy
            round(initial_loss, 5) if initial_loss is not None else None,
            round(final_loss, 5) if final_loss is not None else None,
        ])

# Convert to DataFrame
df = pd.DataFrame(results, columns=[
    "Dataset", "Model", "Accept Fn", "Select Fn", "Split",
    "Reverse Type", "Initial Accuracy", "Final Accuracy", "Initial Loss", "Final Loss"
])

# Pivot Table to Match the Required Layout
# NOTE(review): "Reverse Type" is neither an index nor a column key, so rows that
# differ only in it share one cell and aggfunc="first" keeps the first non-null
# value in row order - confirm that this is intended.
pivot_df = df.pivot_table(
    index=["Dataset", "Model"],
    columns=["Accept Fn", "Select Fn", "Split"],
    values=["Final Accuracy"],
    aggfunc="first"
)

# Reset index to make it CSV-friendly
pivot_df = pivot_df.reset_index()

# Save as CSV
csv_path = os.path.join(output_dir, "formatted_results.csv")
pivot_df.to_csv(csv_path, index=False)

print(f"CSV file saved at: {csv_path}")
 

CSV file saved at: tables/formatted_results.csv


In [7]:
import os
import pickle
import pandas as pd

# Collect initial/final accuracy and loss for every (recording, split) pair and
# write a wide table of final accuracies to tables/formatted_results_2.csv.

# Define path to directory
data_path = "data_collection"  # Update this if needed
output_dir = "tables"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Storage for results
results = []

# Process each pickle file. The listing is sorted because os.listdir() returns
# entries in arbitrary order and the pivot below keeps the "first" value per cell.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(".pkl"):
        continue  # Skip non-pickle files

    file_path = os.path.join(data_path, file_name)

    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(file_path, "rb") as file:
        recording = pickle.load(file)

    # Ensure is_reserved attribute exists
    if not hasattr(recording, "is_reserved"):
        continue  # Skip files without this attribute

    # Extract splits
    for split, accuracy_list in recording.accuracies.items():
        if len(accuracy_list) == 0:
            continue  # Skip empty splits

        # A split without losses, or with an empty loss list, yields None
        # instead of raising IndexError.
        loss_list = recording.losses[split] if split in recording.losses else []
        has_loss = len(loss_list) > 0
        initial_loss = loss_list[0] if has_loss else None
        final_loss = loss_list[-1] if has_loss else None

        # Append results in hierarchical format (all metrics rounded to 5 decimals)
        results.append([
            recording.dataset.name,  # Dataset
            recording.model.name,    # Model
            recording.accept.name,   # Accept Function
            recording.select.name,   # Select Function
            split,                   # Split values (0.0, 0.5, 0.7, 0.9)
            recording.is_reserved.name,  # Reverse Type
            round(accuracy_list[0], 5),   # Initial Accuracy
            round(accuracy_list[-1], 5),  # Final Accuracy
            round(initial_loss, 5) if initial_loss is not None else None,
            round(final_loss, 5) if final_loss is not None else None,
        ])

# Convert to DataFrame
df = pd.DataFrame(results, columns=[
    "Dataset", "Model", "Accept Fn", "Select Fn", "Split",
    "Reverse Type", "Initial Accuracy", "Final Accuracy", "Initial Loss", "Final Loss"
])

# Pivot Table for Correct Formatting
# NOTE(review): "Reverse Type" is neither an index nor a column key, so rows that
# differ only in it share one cell and aggfunc="first" keeps the first non-null
# value in row order - confirm that this is intended.
pivot_df = df.pivot_table(
    index=["Dataset", "Model"],  # Rows: Dataset (CORA, CITESEER) -> Model (GCN, GAT, GSAGE)
    columns=["Accept Fn", "Select Fn", "Split"],  # Hierarchical columns
    values="Final Accuracy",
    aggfunc="first"
)

# Rebuild the column index with explicit level names
pivot_df.columns = pd.MultiIndex.from_tuples(pivot_df.columns, names=["Accept Fn", "Select Fn", "Split"])

# Reset index but keep MultiIndex columns
pivot_df = pivot_df.reset_index()

# Save CSV with Proper MultiIndex Formatting
csv_path = os.path.join(output_dir, "formatted_results_2.csv")
pivot_df.to_csv(csv_path, index=False, encoding="utf-8")

print(f"CSV file saved at: {csv_path}")


CSV file saved at: tables/formatted_results_2.csv


In [1]:
import os
import pickle
import pandas as pd

# For every recording and split, compute the relative change between the first
# and the last recorded loss, and collect the rows separately for files whose
# name contains "mcmc" and files whose name contains "greedy".

# Define path to directory
data_path = "data_collection"  # Update this if needed
output_dir = "tables"

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Storage for results
mcmc_results = []
greedy_results = []

# Process each pickle file. The listing is sorted because os.listdir() returns
# entries in arbitrary order and the row order decides which value a later
# "first" aggregation keeps.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.endswith(".pkl"):
        continue  # Skip non-pickle files

    file_path = os.path.join(data_path, file_name)

    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(file_path, "rb") as file:
        recording = pickle.load(file)

    # Ensure is_reserved attribute exists
    if not hasattr(recording, "is_reserved"):
        continue  # Skip files without this attribute

    # Determine if the file belongs to MCMC or Greedy
    lowered_name = file_name.lower()
    if "mcmc" in lowered_name:
        results_list = mcmc_results
    elif "greedy" in lowered_name:
        results_list = greedy_results
    else:
        continue  # Skip files that don't belong to either category

    # Extract splits
    for split, accuracy_list in recording.accuracies.items():
        if len(accuracy_list) == 0:
            continue  # Skip splits without recorded accuracies

        # A split without losses, or with an empty loss list, has no loss change.
        loss_list = recording.losses[split] if split in recording.losses else []

        if len(loss_list) > 0 and loss_list[0] != 0:
            initial_loss = loss_list[0]
            final_loss = loss_list[-1]
            # Use the unrounded losses and round only the final ratio, so no
            # precision is lost before the division.
            loss_change = round((final_loss - initial_loss) / initial_loss, 5)
        else:
            loss_change = None  # Undefined: no losses, or initial loss is zero

        # Append results in hierarchical format
        results_list.append([
            recording.dataset.name,  # Dataset
            recording.model.name,    # Model
            recording.accept.name,   # Accept Function
            recording.select.name,   # Select Function
            split,                   # Split values (0.0, 0.5, 0.7, 0.9)
            recording.is_reserved.name,  # Reverse Type
            loss_change
        ])


def save_formatted_table(results, filename, out_dir=None):
    """Pivot per-split loss-change rows into a wide table and save it as CSV.

    Args:
        results: List of rows, each laid out as ``[dataset, model, accept_fn,
            select_fn, split, reverse_type, loss_change]``.
        filename: Name of the CSV file to create.
        out_dir: Directory to write into. When omitted, the notebook-level
            ``output_dir`` variable is used. The directory is created if it
            does not exist yet.

    Returns:
        None. If ``results`` is empty, a notice is printed and no file is written.
    """
    if not results:
        print(f"No data for {filename}, skipping...")
        return

    # Fall back to the notebook-level setting only when no directory was passed.
    target_dir = output_dir if out_dir is None else out_dir
    # Create the directory here so the function does not rely on an earlier cell.
    os.makedirs(target_dir, exist_ok=True)

    df = pd.DataFrame(results, columns=[
        "Dataset", "Model", "Accept Fn", "Select Fn", "Split",
        "Reverse Type", "Loss Change"
    ])

    # NOTE(review): "Reverse Type" is neither an index nor a column key, so rows
    # that differ only in it share one cell and aggfunc="first" keeps the first
    # non-null value in row order - confirm that this is intended.
    pivot_df = df.pivot_table(
        index=["Dataset", "Model"],  # Rows: Dataset (CORA, CITESEER) -> Model (GCN, GAT, GSAGE)
        columns=["Accept Fn", "Select Fn", "Split"],  # Hierarchical columns
        values="Loss Change",
        aggfunc="first"
    )

    # Rebuild the column index with explicit level names, then turn the row
    # index into ordinary columns for the CSV export.
    pivot_df.columns = pd.MultiIndex.from_tuples(pivot_df.columns, names=["Accept Fn", "Select Fn", "Split"])
    pivot_df = pivot_df.reset_index()

    csv_path = os.path.join(target_dir, filename)
    pivot_df.to_csv(csv_path, index=False, encoding="utf-8")

    print(f"CSV file saved at: {csv_path}")


# Write the MCMC table first, then the greedy one.
for table_rows, csv_name in (
    (mcmc_results, "mcmc_formatted_results.csv"),
    (greedy_results, "greedy_formatted_results.csv"),
):
    save_formatted_table(table_rows, csv_name)


CSV file saved at: tables/mcmc_formatted_results.csv
CSV file saved at: tables/greedy_formatted_results.csv
