# Baseline Subset Experiment Results

This notebook analyzes the results from the baseline subset experiments,
showing how model performance scales with training set size.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot appearance used by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 11,
})

## Load Results

Set `results_dir` to your experiment results directory. Each experiment is expected to live in its own subdirectory containing a `results.json` file.

In [None]:
# Root directory holding one subdirectory per experiment run
results_dir = Path('../../results')  # Adjust path as needed

# Keep only directories, in sorted order, so the printed indices are stable
experiments = sorted(filter(Path.is_dir, results_dir.iterdir()))

# Show an index -> name menu for picking an experiment
print("Available experiments:")
for idx, exp_dir in enumerate(experiments):
    print(f"{idx}: {exp_dir.name}")

In [None]:
# Load specific experiment (adjust index)
experiment_idx = 0  # Change this to load different experiment

# Fail with an actionable message instead of a bare "list index out of range"
# when the experiment listing is empty or the index is stale.
# Negative indices (e.g. -1 for the last listed experiment) remain valid.
if not -len(experiments) <= experiment_idx < len(experiments):
    raise IndexError(
        f"experiment_idx={experiment_idx} is out of range: "
        f"{len(experiments)} experiment(s) available"
    )

experiment_dir = experiments[experiment_idx]
results_file = experiment_dir / 'results.json'

print(f"Loading: {experiment_dir.name}")

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII JSON content.
with open(results_file, 'r', encoding='utf-8') as f:
    results = json.load(f)

# Quick sanity summary of what was loaded
print(f"Dataset: {results['config']['data']['dataset_name']}")
print(f"Number of subset fractions: {len(results['subset_results'])}")

## Data Efficiency Curves

Plot how test performance (Pearson and Spearman correlation) scales with training set size.

In [None]:
# Pull the per-subset series out of the loaded results
subset_results = results['subset_results']
fractions = [entry['fraction'] for entry in subset_results]
num_samples = [entry['num_samples'] for entry in subset_results]
test_pearson = [entry['test_metrics']['pearson_r'] for entry in subset_results]
test_spearman = [entry['test_metrics']['spearman_r'] for entry in subset_results]

# One panel per metric: (series, y-axis label, title, extra line styling).
# The first panel keeps the default line color; the second is drawn in orange.
panels = [
    (test_pearson, 'Test Pearson R', 'Data Efficiency: Pearson R', {}),
    (test_spearman, 'Test Spearman R', 'Data Efficiency: Spearman R', {'color': 'orange'}),
]

# Side-by-side learning curves sharing the same x-axis quantity
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (scores, y_label, title, line_style) in zip(axes, panels):
    ax.plot(num_samples, scores, marker='o', linewidth=2, markersize=8, **line_style)
    ax.set_xlabel('Training Set Size')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
# Save the figure alongside the experiment's results before displaying it
plt.savefig(experiment_dir / 'data_efficiency_curves.png', dpi=300, bbox_inches='tight')
plt.show()