In [2]:
import torch
import pandas as pd
import numpy as np
import glob
import os
from ogb.nodeproppred import PygNodePropPredDataset

# Configuration
# Everything a reader might want to tune lives in this cell.

# Dataset whose prediction files are processed (change to 'arxiv' if needed).
dataset_name = "products"

# Methods to process.
methods = ["plain", "linear", "mlp"]

# Directory containing prediction files.
predictions_dir = "predictions"

# CSV file with node_id, prediction, true_label columns.
labels_csv = "gcn_predictions_2.csv"


In [3]:
# Load true labels from CSV file
print(f"Loading true labels from CSV: {labels_csv}")
labels_df = pd.read_csv(labels_csv)

# Accept alternative names for the ground-truth column ('truth' is tried
# before 'label') and normalise to 'true_label'.
if 'true_label' not in labels_df.columns:
    if 'truth' in labels_df.columns:
        labels_df = labels_df.rename(columns={'truth': 'true_label'})
    elif 'label' in labels_df.columns:
        labels_df = labels_df.rename(columns={'label': 'true_label'})
    else:
        raise ValueError(f"CSV must have 'true_label' column (or 'truth'/'label'). Found: {labels_df.columns.tolist()}")

# Verify expected columns (fail early with a clear message rather than a
# KeyError from the sort below).
expected_cols = ['node_id', 'true_label']
missing_cols = [col for col in expected_cols if col not in labels_df.columns]
if missing_cols:
    raise ValueError(f"CSV is missing required column(s) {missing_cols}. Found: {labels_df.columns.tolist()}")

# An empty file would otherwise fail later inside max() with an unhelpful error.
if labels_df.empty:
    raise ValueError(f"No rows found in {labels_csv}")

# Missing values would silently turn into NaN labels / a NaN class count.
if labels_df[expected_cols].isna().any().any():
    raise ValueError(f"{labels_csv} contains missing values in {expected_cols}")

# Sort by node_id to ensure correct ordering
labels_df = labels_df.sort_values('node_id').reset_index(drop=True)

# The labels are used positionally downstream (label i is paired with
# node_id i), so the sorted ids must be exactly 0..N-1 with no gaps or
# duplicates; otherwise every row after the first gap would be misaligned.
node_ids = labels_df['node_id'].to_numpy()
if not np.array_equal(node_ids, np.arange(len(node_ids))):
    raise ValueError(
        f"node_id values must be exactly 0..{len(node_ids) - 1} with no gaps or duplicates; "
        f"found range {node_ids.min()} to {node_ids.max()} over {len(node_ids)} rows"
    )

# Extract true labels as numpy array
true_labels = labels_df['true_label'].values
num_nodes = len(true_labels)
# Class count is inferred from the largest label id (labels start at 0).
num_classes = int(true_labels.max() + 1)

print(f"Loaded {num_nodes} nodes with {num_classes} classes")
print(f"Node ID range: {labels_df['node_id'].min()} to {labels_df['node_id'].max()}")
print(f"True label range: {true_labels.min()} to {true_labels.max()}")


Loading true labels from CSV: gcn_predictions_2.csv
Loaded 2449029 nodes with 47 classes
Node ID range: 0 to 2449028
True label range: 0 to 46


In [4]:
def process_predictions_file(pred_file, true_labels, method, run_num):
    """
    Process a single prediction file and extract top 3 predictions.

    The row count is taken from the loaded tensor itself, so the function
    does not depend on any notebook-level global.

    Args:
        pred_file: Path to .pt prediction file. The file is expected to hold
            a 2-D tensor of per-class scores, one row per node.
        true_labels: Array of true labels for all nodes; element i is the
            label of node i, so its length must equal the number of rows in
            the prediction tensor.
        method: Method name. Currently unused inside the function; kept so
            existing callers keep working.
        run_num: Run number. Currently unused inside the function; kept so
            existing callers keep working.

    Returns:
        DataFrame with node_id, prediction_1, prediction_2, prediction_3, truth

    Raises:
        TypeError: If the file does not contain a tensor.
        ValueError: If the tensor is not 2-D, or its row count differs from
            len(true_labels).
    """
    # NOTE(security): torch.load unpickles the file; only load prediction
    # files from a trusted source.
    predictions = torch.load(pred_file, map_location='cpu')

    if not torch.is_tensor(predictions):
        raise TypeError(
            f"{pred_file} does not contain a tensor (got {type(predictions).__name__})"
        )
    if predictions.dim() != 2:
        raise ValueError(
            f"{pred_file}: expected a 2-D [num_nodes, num_classes] tensor, "
            f"got shape {tuple(predictions.shape)}"
        )

    n_rows = predictions.shape[0]
    if len(true_labels) != n_rows:
        raise ValueError(
            f"{pred_file}: {n_rows} prediction rows but {len(true_labels)} true labels"
        )

    # Only the class indices of the three highest scores are needed; the
    # scores themselves are discarded.
    _, top3_indices = torch.topk(predictions, k=3, dim=1)
    top3_indices_np = top3_indices.numpy()  # Shape: [n_rows, 3]

    return pd.DataFrame({
        'node_id': np.arange(n_rows),
        'prediction_1': top3_indices_np[:, 0],
        'prediction_2': top3_indices_np[:, 1],
        'prediction_3': top3_indices_np[:, 2],
        'truth': true_labels
    })


In [5]:
# Process all prediction files
def _parse_run_number(path):
    """Return the run number encoded in a prediction filename, or None.

    "run0.pt" -> 0. Any file whose stem is not an integer once 'run' is
    removed (e.g. "best_model.pt") yields None instead of raising.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return int(stem.replace('run', ''))
    except ValueError:
        return None


all_dataframes = []

for method in methods:
    method_dir = os.path.join(predictions_dir, f'{dataset_name}_{method}')

    if not os.path.isdir(method_dir):
        print(f"Warning: Directory {method_dir} not found, skipping {method}")
        continue

    # Find all .pt files in the directory and parse each run number once.
    # Files that do not follow the run<N>.pt naming are skipped rather than
    # aborting the whole cell.
    numbered_files = []
    skipped_files = []
    for candidate in glob.glob(os.path.join(method_dir, '*.pt')):
        parsed = _parse_run_number(candidate)
        if parsed is None:
            skipped_files.append(candidate)
        else:
            numbered_files.append((parsed, candidate))
    numbered_files.sort()  # ascending run number
    pred_files = [path for _, path in numbered_files]

    print(f"\nProcessing {method}:")
    for skipped in skipped_files:
        print(f"  Warning: skipping {skipped} (filename is not run<N>.pt)")
    print(f"  Found {len(pred_files)} prediction file(s)")

    for run_num, pred_file in numbered_files:
        print(f"  Processing run {run_num}: {pred_file}")

        # Process the file
        df = process_predictions_file(pred_file, true_labels, method, run_num)

        # Add method and run columns for identification
        df['method'] = method
        df['run'] = run_num

        all_dataframes.append(df)

        # Save individual CSV file
        output_file = os.path.join(method_dir, f'run{run_num}_predictions.csv')
        df.to_csv(output_file, index=False)
        print(f"    Saved to {output_file}")

print(f"\nProcessed {len(all_dataframes)} prediction file(s) total")



Processing plain:
  Found 5 prediction file(s)
  Processing run 0: predictions/products_plain\run0.pt
    Saved to predictions/products_plain/run0_predictions.csv
  Processing run 1: predictions/products_plain\run1.pt
    Saved to predictions/products_plain/run1_predictions.csv
  Processing run 2: predictions/products_plain\run2.pt
    Saved to predictions/products_plain/run2_predictions.csv
  Processing run 3: predictions/products_plain\run3.pt
    Saved to predictions/products_plain/run3_predictions.csv
  Processing run 4: predictions/products_plain\run4.pt
    Saved to predictions/products_plain/run4_predictions.csv

Processing linear:
  Found 5 prediction file(s)
  Processing run 0: predictions/products_linear\run0.pt
    Saved to predictions/products_linear/run0_predictions.csv
  Processing run 1: predictions/products_linear\run1.pt
    Saved to predictions/products_linear/run1_predictions.csv
  Processing run 2: predictions/products_linear\run2.pt
    Saved to predictions/produc

In [6]:
# Combine all dataframes (optional - if you want one big file)
if all_dataframes:
    # Stack every per-run frame into one long table with a fresh 0..N-1 index.
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    # Put the identifying columns first, then the predictions, then the truth.
    column_order = [
        'node_id', 'method', 'run',
        'prediction_1', 'prediction_2', 'prediction_3',
        'truth',
    ]
    combined_df = combined_df.loc[:, column_order]

    # Write the combined table next to the per-method directories.
    combined_output = f'{predictions_dir}/{dataset_name}_all_predictions.csv'
    combined_df.to_csv(combined_output, index=False)

    # Short report: where the file went, its size, and a preview.
    print(f"\nCombined CSV saved to: {combined_output}")
    print(f"Shape: {combined_df.shape}")
    print("\nFirst few rows:")
    print(combined_df.head(10))


KeyboardInterrupt: 

In [None]:
# Optional: Create separate CSV for each method (without method/run columns)
for method in methods:
    # Every processed frame carries a constant 'method' column, so the first
    # row is enough to tell which method a frame belongs to.
    method_dfs = [
        frame for frame in all_dataframes
        if frame['method'].iloc[0] == method
    ]

    # Nothing was processed for this method: move on.
    if not method_dfs:
        continue

    # With several runs available only the first one is exported
    # (averaging across runs would be an alternative).
    method_df = method_dfs[0].copy()

    # Keep just the node id, the three predictions and the truth.
    method_df_clean = method_df[
        ['node_id', 'prediction_1', 'prediction_2', 'prediction_3', 'truth']
    ]

    output_file = f'{predictions_dir}/{dataset_name}_{method}_predictions.csv'
    method_df_clean.to_csv(output_file, index=False)
    print(f"Saved {method} predictions to: {output_file}")
