# Experiment Results Viewer

This notebook displays the results from fine-tuning experiments.

In [1]:
import pandas as pd
# Lift pandas' display limits so the wide per-class results table is shown
# in full: None removes the column cap and lets pandas auto-detect the width.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

In [2]:
# Load the experiment log. The notebook is assumed to live in notebooks/ with
# the CSV in the repository root, so the parent directory is tried first and
# the current directory second (covers running from the root).
csv_candidates = ('../experiments.csv', 'experiments.csv')
for csv_path in csv_candidates:
    try:
        df = pd.read_csv(csv_path)
        break
    except FileNotFoundError:
        # A miss on an earlier candidate is expected; only a miss on the
        # final candidate is a real error and is allowed to propagate.
        if csv_path == csv_candidates[-1]:
            raise

print(f"Total experiments: {len(df)}")
df

Total experiments: 1


Unnamed: 0,timestamp,model_path,model_name,run_name,dataset,accuracy,precision_weighted,recall_weighted,f1_weighted,tokens_per_sec,num_epochs,batch_size,learning_rate,lora_rank,lora_alpha,Algebra_precision,Algebra_recall,Algebra_f1,Algebra_support,Counting & Probability_precision,Counting & Probability_recall,Counting & Probability_f1,Counting & Probability_support,Geometry_precision,Geometry_recall,Geometry_f1,Geometry_support,Intermediate Algebra_precision,Intermediate Algebra_recall,Intermediate Algebra_f1,Intermediate Algebra_support,Number Theory_precision,Number Theory_recall,Number Theory_f1,Number Theory_support,Prealgebra_precision,Prealgebra_recall,Prealgebra_f1,Prealgebra_support,Precalculus_precision,Precalculus_recall,Precalculus_f1,Precalculus_support
0,2026-01-03 19:02:25,fine_tunings/llama3:8b_defaultLoRA_2epochs/,llama3:8b,llama3:8b_defaultLoRA_2epochs,qwedsacf/competition_math,0.8,0.854538,0.8,0.8008,20.411347,2,2,0.0002,16,16,0.793103,0.92,0.851852,25.0,0.666667,0.666667,0.666667,6.0,0.7,0.7,0.7,10.0,0.909091,0.952381,0.930233,21.0,0.777778,1.0,0.875,7.0,1.0,0.428571,0.6,21.0,0.909091,1.0,0.952381,10.0


## Summary Metrics

In [3]:
# Key metrics comparison: one row per run, highest accuracy first.
summary_cols = ['run_name', 'accuracy', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'tokens_per_sec']
# Keep only the metrics actually present in this CSV so the cell still renders
# for result files that lack some of them.
available_cols = [c for c in summary_cols if c in df.columns]
summary_df = df[available_cols]
# sort_values raises KeyError for a missing column, so rank by accuracy only
# when that column survived the availability filter above.
if 'accuracy' in available_cols:
    summary_df = summary_df.sort_values('accuracy', ascending=False)
summary_df

Unnamed: 0,run_name,accuracy,f1_weighted,precision_weighted,recall_weighted,tokens_per_sec
0,llama3:8b_defaultLoRA_2epochs,0.8,0.8008,0.854538,0.8,20.411347


## Per-Class F1 Scores

In [7]:
# Per-class F1 table: one row per run, one column per class.
# Per-class scores are stored in columns named "<class name>_f1"; the
# aggregate 'f1_weighted' column is explicitly excluded.
F1_SUFFIX = '_f1'
f1_cols = [c for c in df.columns if c.endswith(F1_SUFFIX) and c != 'f1_weighted']
print(f"Found {len(f1_cols)} per-class F1 columns.")
if f1_cols:
    # Strip only the trailing suffix: str.replace would also delete any
    # "_f1" that happens to occur inside a class name.
    class_names = {c: c[:-len(F1_SUFFIX)] for c in f1_cols}
    # rename returns a new frame, so df itself keeps its original headers.
    display_df = df[['run_name'] + f1_cols].rename(columns=class_names)
    display(display_df)

Found 7 per-class F1 columns.


Unnamed: 0,run_name,Algebra,Counting & Probability,Geometry,Intermediate Algebra,Number Theory,Prealgebra,Precalculus
0,llama3:8b_defaultLoRA_2epochs,0.851852,0.666667,0.7,0.930233,0.875,0.6,0.952381


## Training Configuration

In [8]:
# Training hyperparameters recorded for each run.
config_cols = [
    'run_name',
    'model_name',
    'num_epochs',
    'batch_size',
    'learning_rate',
    'lora_rank',
    'lora_alpha',
]
# Keep only the settings present in this CSV, preserving the order above.
available_config = [name for name in config_cols if name in df.columns]
df.loc[:, available_config]

Unnamed: 0,run_name,model_name,num_epochs,batch_size,learning_rate,lora_rank,lora_alpha
0,llama3:8b_defaultLoRA_2epochs,llama3:8b,2,2,0.0002,16,16
