In [1]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
from sklearn.metrics import ConfusionMatrixDisplay
import gzip
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, auc, precision_recall_curve
from scipy.stats import pearsonr, spearmanr
import sys

In [2]:
def load_experiment_data(instance_path):
    """Load the result files of one experiment directory into DataFrames.

    Every regular file directly inside ``instance_path`` is classified by its
    file name; the first matching rule wins:

    * contains ``'analysis'`` and ends with ``.json.gz`` -> ``pathways``
    * contains ``'results'`` and ends with ``.json.gz``  -> ``statistics``
      (a JSON object becomes a one-row frame, a JSON array one row per item)
    * contains ``'train'`` / ``'test'`` / ``'val'`` (checked in that order)
      and ends with ``.csv.gz`` -> ``train_stats`` / ``test_stats`` /
      ``val_stats``

    Files are visited in sorted name order, so when several files match the
    same rule the alphabetically last one is kept on every machine.

    Parameters
    ----------
    instance_path : str
        Directory holding the gzip-compressed outputs of a single run.

    Returns
    -------
    tuple
        ``(pathways, statistics, train_stats, test_stats, val_stats)``.
        An entry is ``None`` when no matching file was found or loading it
        failed; failures are reported with ``print`` and do not raise.
    """
    dfs = {
        "pathways": None,
        "statistics": None,
        "train_stats": None,
        "test_stats": None,
        "val_stats": None
    }

    if not os.path.isdir(instance_path):
        print(f"Error: Directory not found at {instance_path}")
        return tuple(dfs.values())

    # (file name token, result slot); the order keeps the train -> test -> val
    # precedence for names that contain more than one token.
    csv_slots = (('train', 'train_stats'), ('test', 'test_stats'), ('val', 'val_stats'))

    # os.listdir() returns names in arbitrary order; sorting makes the outcome
    # reproducible when more than one file matches the same slot.
    for filename in sorted(os.listdir(instance_path)):
        file_path = os.path.join(instance_path, filename)

        # Sub-directories and other non-regular entries are ignored.
        if not os.path.isfile(file_path):
            continue

        try:
            if filename.endswith('.json.gz'):
                if 'analysis' in filename:
                    print(f"Attempting to load {filename} (analysis JSON)...")
                    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
                        data = json.load(f)
                    dfs['pathways'] = pd.DataFrame(data)
                    print(f"Successfully created DataFrame for {filename}")
                elif 'results' in filename:
                    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
                        data = json.load(f)
                    if isinstance(data, dict):
                        # Wrap the object so it becomes a single row.
                        dfs['statistics'] = pd.DataFrame([data])
                    elif isinstance(data, list):
                        dfs['statistics'] = pd.DataFrame(data)
                    else:
                        print(f"Warning: statistics file {filename} has an unexpected main data type: {type(data)}. Could not convert to DataFrame.")
            elif filename.endswith('.csv.gz'):
                for token, slot in csv_slots:
                    if token in filename:
                        dfs[slot] = pd.read_csv(file_path, compression='gzip')
                        break
        except Exception as e:
            # Best effort: one unreadable file must not prevent loading the others.
            print(f"Error processing file {filename}: {e}")
    return dfs['pathways'], dfs['statistics'], dfs['train_stats'], dfs['test_stats'], dfs['val_stats']

In [3]:
def analyze_statistics(statistics_df):
    """Summarise the train/validation/test metrics stored in a statistics frame.

    Only the first row of ``statistics_df`` is read. Its ``'train_metrics'``,
    ``'test_metrics'`` and ``'val_metrics'`` entries are unpacked as mappings
    of metric name -> value.

    Prints a fixed-width summary table plus the metric names found in the
    training entry.

    Returns
    -------
    tuple
        ``(metrics_list, metrics_df)``: a list of ``{'set', 'metrics'}`` dicts
        ordered train, validation, test, and a DataFrame with one row per set
        (``'Set'`` column followed by one column per metric).
    """
    first_row = statistics_df.iloc[0]

    # Read the three entries in the same order as before (train, test, val).
    by_column = {
        column: first_row[column]
        for column in ('train_metrics', 'test_metrics', 'val_metrics')
    }

    # (lower-case label, table label, source column) in presentation order.
    layout = (
        ('train', 'Train', 'train_metrics'),
        ('validation', 'Validation', 'val_metrics'),
        ('test', 'Test', 'test_metrics'),
    )

    metrics_list = []
    table_rows = []
    for lower_label, table_label, column in layout:
        split_metrics = by_column[column]
        metrics_list.append({'set': lower_label, 'metrics': split_metrics})
        table_rows.append({'Set': table_label, **split_metrics})
    metrics_df = pd.DataFrame(table_rows)

    rule = "=" * 80
    print("Metrics Summary Table:")
    print(rule)
    print(metrics_df.to_string(index=False, float_format='%.4f'))
    print(rule)

    available = list(by_column['train_metrics'].keys())
    print(f"\nAvailable metrics: {available}")

    return metrics_list, metrics_df

In [4]:
def _iter_json_array_items(text_stream, read_size=1 << 20):
    """Yield ``(item, n_chars)`` for each element of a top-level JSON array.

    The stream is read in blocks and every element is decoded with
    ``json.JSONDecoder.raw_decode``, so string escapes and nested
    objects/arrays are handled by the real JSON parser. ``n_chars`` is the
    length of the element's JSON text.

    Raises ``ValueError`` (``json.JSONDecodeError`` is a subclass) when the
    text is not a well-formed array, including when it ends before ``]``.
    """
    decoder = json.JSONDecoder()
    buffer = ""
    pos = 0
    eof = False
    block = read_size
    # "open": expect '['; "first": value or ']'; "value": value; "sep": ',' or ']'
    state = "open"

    while True:
        # Skip insignificant whitespace between tokens.
        while pos < len(buffer) and buffer[pos] in " \t\r\n":
            pos += 1

        if pos >= len(buffer):
            if eof:
                raise ValueError("unexpected end of input inside JSON array")
            buffer = text_stream.read(block)
            pos = 0
            eof = buffer == ""
            continue

        char = buffer[pos]
        if state == "open":
            if char != "[":
                raise ValueError(f"expected '[' at start of JSON array, found {char!r}")
            pos += 1
            state = "first"
        elif state == "sep":
            if char == "]":
                return
            if char != ",":
                raise ValueError(f"expected ',' or ']' between array items, found {char!r}")
            pos += 1
            state = "value"
        elif state == "first" and char == "]":
            return  # empty array
        else:
            try:
                item, end = decoder.raw_decode(buffer, pos)
                # A value touching the end of the buffer, or followed by
                # something that is not a delimiter, may be a number cut in
                # half by the block boundary ("12" | "3", "1" | ".5").
                complete = eof or (end < len(buffer) and buffer[end] in ",] \t\r\n")
            except json.JSONDecodeError:
                if eof:
                    raise
                complete = False

            if not complete:
                more = text_stream.read(block)
                if more:
                    buffer = buffer[pos:] + more
                    pos = 0
                    # Grow the read size so one huge element is not re-parsed
                    # a linear number of times.
                    block *= 2
                else:
                    eof = True
                continue

            yield item, end - pos
            pos = end
            block = read_size
            state = "sep"


def _load_analysis_json_chunked(file_path, filename, chunk_size):
    """Read one gzip-compressed analysis JSON file into a DataFrame.

    A file whose first character is ``[`` is streamed element by element and
    converted to DataFrames in pieces of roughly ``chunk_size * 1000``
    characters of JSON text; anything else is loaded with ``json.load``.
    Returns ``None`` when nothing could be loaded.
    """
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        # Peek at the first character to tell an array from a single object.
        first_char = f.read(1)
        f.seek(0)

        if first_char != '[':
            try:
                return pd.DataFrame(json.load(f))
            except MemoryError:
                print(f"File {filename} too large to load into memory, skipping...")
                return None

        char_budget = chunk_size * 1000
        chunks = []
        records = []
        pending_chars = 0
        total_records = 0
        failed = False

        items = _iter_json_array_items(f)
        while True:
            try:
                item, n_chars = next(items)
            except StopIteration:
                break
            except ValueError as exc:
                # Best effort, as before, but no longer silent.
                print(f"Warning: {filename} is truncated or malformed ({exc}); "
                      f"keeping the {total_records} records parsed before the error.")
                failed = True
                break

            records.append(item)
            total_records += 1
            pending_chars += n_chars
            if pending_chars > char_budget:
                chunks.append(pd.DataFrame(records))
                records = []
                pending_chars = 0

        # A well-formed empty array still yields an (empty) DataFrame.
        if records or (not chunks and not failed):
            chunks.append(pd.DataFrame(records))

    if not chunks:
        return None

    frame = pd.concat(chunks, ignore_index=True)
    print(f"Successfully created DataFrame with {len(frame)} rows")
    return frame


def load_experiment_data_chunked(instance_path, chunk_size=10000):
    """Load one experiment directory, streaming the large analysis JSON.

    Files are classified by name exactly as in ``load_experiment_data``
    (``analysis``/``results`` ``.json.gz`` and ``train``/``test``/``val``
    ``.csv.gz``), but a top-level JSON *array* in the analysis file is decoded
    one element at a time and turned into DataFrames chunk by chunk instead of
    being materialised as one Python list first.

    Parameters
    ----------
    instance_path : str
        Directory holding the gzip-compressed outputs of a single run.
    chunk_size : int, default 10000
        Approximate size of each intermediate chunk, in thousands of
        characters of JSON text (a chunk is closed once its elements exceed
        ``chunk_size * 1000`` characters).

    Returns
    -------
    tuple
        ``(pathways, statistics, train_stats, test_stats, val_stats)``.
        An entry is ``None`` when no matching file was found or loading it
        failed; failures are reported with ``print`` and do not raise.
    """
    dfs = {
        "pathways": None,
        "statistics": None,
        "train_stats": None,
        "test_stats": None,
        "val_stats": None
    }

    if not os.path.isdir(instance_path):
        print(f"Error: Directory not found at {instance_path}")
        return tuple(dfs.values())

    # Sorted so the result does not depend on the arbitrary os.listdir() order.
    for filename in sorted(os.listdir(instance_path)):
        file_path = os.path.join(instance_path, filename)

        if not os.path.isfile(file_path):
            continue

        try:
            if 'analysis' in filename and filename.endswith('.json.gz'):
                print(f"Attempting to load {filename} (analysis JSON) in chunks...")
                pathways = _load_analysis_json_chunked(file_path, filename, chunk_size)
                if pathways is not None:
                    dfs['pathways'] = pathways
            elif 'results' in filename and filename.endswith('.json.gz'):
                with gzip.open(file_path, 'rt', encoding='utf-8') as f:
                    data = json.load(f)
                if isinstance(data, dict):
                    dfs['statistics'] = pd.DataFrame([data])
                elif isinstance(data, list):
                    dfs['statistics'] = pd.DataFrame(data)
                else:
                    print(f"Warning: statistics file {filename} has an unexpected main data type: {type(data)}. Could not convert to DataFrame.")
            elif 'train' in filename and filename.endswith('.csv.gz'):
                dfs['train_stats'] = pd.read_csv(file_path, compression='gzip')
            elif 'test' in filename and filename.endswith('.csv.gz'):
                dfs['test_stats'] = pd.read_csv(file_path, compression='gzip')
            elif 'val' in filename and filename.endswith('.csv.gz'):
                dfs['val_stats'] = pd.read_csv(file_path, compression='gzip')
        except Exception as e:
            # Best effort: one unreadable file must not prevent loading the others.
            print(f"Error processing file {filename}: {e}")
    return dfs['pathways'], dfs['statistics'], dfs['train_stats'], dfs['test_stats'], dfs['val_stats']

# pathinit_path="/labs/Aguiar/SSPA_BRAY/BRay/Results/pathway_initiated"
# runs_accuracy = [] 

# # Loop through each run folder in the pathinit_path directory
# for folder in os.listdir(pathinit_path):
#     folder_path = os.path.join(pathinit_path, folder)
#     if os.path.isdir(folder_path):
#         # Process files within the folder using the chunked function
#         _, statistics_df, _, _, _ = load_experiment_data_chunked(folder_path)
#         if statistics_df is not None:
#             # Analyze the statistics dataframe to get metric summaries
#             _, metrics_df = analyze_statistics(statistics_df)
#             # Check if accuracy is one of the available metrics
#             if 'accuracy' in metrics_df.columns:
#                 for _, row in metrics_df.iterrows():
#                     runs_accuracy.append({
#                         'Run': folder,
#                         'Set': row['Set'],
#                         'Accuracy': row['accuracy']
#                     })
#             else:
#                 print(f"Run {folder} does not report an accuracy metric.")
#         else:
#             print(f"Run {folder} did not have statistics data available.")

# # Create a combined dataframe with the accuracy for all runs
# df_accuracy = pd.DataFrame(runs_accuracy)
# print("\nAggregated Accuracy Table:")
# print(df_accuracy.to_string(index=False))

In [5]:
# # Sort df_accuracy by validation set accuracy in descending order
# df_accuracy_sorted = df_accuracy[df_accuracy['Set'] == 'Validation'].sort_values('Accuracy', ascending=False)

# print("Validation Accuracy Rankings (Highest to Lowest):")
# print("=" * 60)
# print(df_accuracy_sorted.to_string(index=False))

# # Find the run with highest validation accuracy
# best_run = df_accuracy_sorted.iloc[0]['Run']
# best_val_accuracy = df_accuracy_sorted.iloc[0]['Accuracy']

# print(f"\nBest performing run: {best_run}")
# print(f"Highest validation accuracy: {best_val_accuracy:.4f}")

# # Show the accuracy on every set (train/validation/test) for the best run
# best_run_metrics = df_accuracy[df_accuracy['Run'] == best_run]
# print(f"\nComplete metrics for best run ({best_run}):")
# print(best_run_metrics.to_string(index=False))

In [6]:
# Results directory of the pathway-initiated run that the following cell loads.
best_exp = "/labs/Aguiar/SSPA_BRAY/BRay/Results/pathway_initiated/20250612_122135"

In [7]:
# Load every result file of the selected run; an entry is None when the
# corresponding file is missing or could not be read.
(
    best_pathways,
    best_statistics,
    best_train_stats,
    best_test_stats,
    best_val_stats,
) = load_experiment_data(best_exp)

: 

: 

: 

In [None]:
# # Use a memory-efficient approach to examine the file structure
# analysis_file = "/labs/Aguiar/SSPA_BRAY/BRay/Results/pathway_initiated/20250522_150830/ajm_cyto_cyto_initialized_pw1272_gene_program_analysis.json.gz"

# # First, let's check the file size
# file_size = os.path.getsize(analysis_file)
# print(f"File size: {file_size / (1024**3):.2f} GB")

# # Read just the beginning to understand the structure
# with gzip.open(analysis_file, 'rt', encoding='utf-8') as f:
#     # Read first few characters to see if it's a list or dict
#     first_chars = f.read(100)
#     print(f"First 100 characters: {first_chars}")
    
#     # Reset file pointer
#     f.seek(0)
    
#     # Try to read line by line to find structure
#     line_count = 0
#     for line in f:
#         if line_count == 0:
#             print(f"First line: {line[:200]}...")
#         line_count += 1
#         if line_count >= 5:  # Only read first few lines
#             break
    
#     print(f"Read {line_count} lines")

In [None]:
import json

# The file is very large (0.35 GB) and has a nested structure, so this cell
# only samples it instead of converting it to a DataFrame.
#
# `analysis_file` is assigned here so the cell runs on a fresh kernel; the only
# other definition lives in a commented-out cell.
analysis_file = "/labs/Aguiar/SSPA_BRAY/BRay/Results/pathway_initiated/20250522_150830/ajm_cyto_cyto_initialized_pw1272_gene_program_analysis.json.gz"

if not os.path.isfile(analysis_file):
    print(f"Analysis file not found: {analysis_file}")
else:
    with gzip.open(analysis_file, 'rt', encoding='utf-8') as f:
        # Show the first 1000 characters to get a feel for the layout.
        sample = f.read(1000)
        print("Structure sample:")
        print(sample)

        # Rewind and try to parse the first balanced chunk of the document.
        f.seek(0)
        try:
            # Collect lines until the braces balance again (and at least
            # 10000 characters were read).
            # NOTE(review): braces inside string values are counted too, and
            # for a document that is one top-level object (or one single
            # line) this reads the whole file into memory.
            pieces = []
            total_chars = 0
            brace_count = 0
            for line in f:
                pieces.append(line)
                total_chars += len(line)
                brace_count += line.count('{') - line.count('}')
                if total_chars > 10000 and brace_count == 0:  # Stop at first complete object
                    break

            # Drop a trailing comma left over when the chunk ended between
            # two sibling values.
            content = "".join(pieces).strip()
            if content.endswith(','):
                content = content[:-1]

            partial_data = json.loads(content)
            if isinstance(partial_data, dict):
                print(f"\nTop-level keys: {list(partial_data.keys())}")
            else:
                # Parsed fine, but there are no keys to list.
                print(f"\nTop-level value is a {type(partial_data).__name__} "
                      f"with {len(partial_data) if hasattr(partial_data, '__len__') else 1} element(s), not a dict")

        except Exception as e:
            print(f"Could not parse as JSON: {e}")
            print("This file is likely too large and complex to convert to a single DataFrame")