In [46]:
import pandas as pd

In [47]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load the distillation results CSV into `df`, the frame every later cell plots from.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv("/home/zoorab/projects/ECGFounder/res/distill/distillation_results.csv")

# Preview the frame that was just loaded. This previously referenced `data`, a name
# no visible cell defines, so it only worked with leftover kernel state.
print(df.head())


   experiment_num            timestamp  teacher_type  \
0               1  2026-01-14 01:45:30       full_ft   
1               2  2026-01-14 02:18:16       full_ft   
2               3  2026-01-14 02:57:55       full_ft   
3               4  2026-01-14 03:51:05       full_ft   
4               5  2026-01-14 04:22:21  linear_probe   

                                        teacher_path  teacher_macro_auroc  \
0  /home/zoorab/projects/ECGFounder/res/eval/chec...             0.925928   
1  /home/zoorab/projects/ECGFounder/res/eval/chec...             0.925928   
2  /home/zoorab/projects/ECGFounder/res/eval/chec...             0.925928   
3  /home/zoorab/projects/ECGFounder/res/eval/chec...             0.925928   
4  /home/zoorab/projects/ECGFounder/res/eval/chec...             0.901197   

   teacher_micro_auroc  teacher_macro_ap  teacher_micro_ap  teacher_params  \
0             0.937912          0.593363          0.614470        30682689   
1             0.937912          0.593363    

In [51]:
# ---------- Student models ----------
# The hover box shows `performance_gap`, but the only visible code that creates
# that column sits in a later cell. Derive it here when it is absent so this
# cell also works on a top-to-bottom run; `assign` returns a new frame, so `df`
# itself is left untouched.
if 'performance_gap' in df.columns:
    scatter_df = df
else:
    scatter_df = df.assign(
        performance_gap=df['teacher_macro_auroc'] - df['best_macro_auroc']
    )

# One bubble per student run: x = size on disk, y = best macro AUROC,
# coloured by the teacher it was distilled from, bubble size = student size.
fig = px.scatter(
    scatter_df,
    x='student_size_mb',
    y='best_macro_auroc',
    color='teacher_type',
    size='student_size_mb',
    size_max=40,  # larger blobs
    # Values are d3-format strings for the hover box; True means "show unformatted".
    hover_data={
        'student_size_mb': ':.2f',
        'compression_ratio': ':.2f',
        'param_reduction_pct': ':.1f',
        'performance_retention_pct': ':.1f',
        'performance_gap': ':.3f',
        'temperature': True
    }
)

# ---------- Add line connections for each teacher type ----------
# Plotly Express hands out its discrete colours to `teacher_type` values in
# order of first appearance and wraps around when the palette is exhausted.
# Mirror that with enumerate + modulo so every line matches its markers and a
# palette-sized-plus-one teacher count no longer raises IndexError.
# NOTE(review): assumes the scatter above uses the default Plotly palette.
palette = px.colors.qualitative.Plotly
for position, teacher in enumerate(df['teacher_type'].unique()):
    # Sort by size so the line runs left to right instead of zig-zagging.
    df_teacher = df[df['teacher_type'] == teacher].sort_values('student_size_mb')
    fig.add_trace(
        go.Scatter(
            x=df_teacher['student_size_mb'],
            y=df_teacher['best_macro_auroc'],
            mode='lines',
            line=dict(width=2, color=palette[position % len(palette)]),
            showlegend=False  # don't add extra legend entries for lines
        )
    )

# ---------- Compression-ratio captions above the circles ----------
# The vertical nudge grows with the student's size relative to the largest
# student, topping out at 0.0085 on the y axis.
largest_student_mb = df['student_size_mb'].max()
y_offset = df['student_size_mb'] / largest_student_mb * 0.0085

# Text-only trace: one "<ratio>×" caption per row, kept out of the legend.
ratio_captions = go.Scatter(
    x=df['student_size_mb'],
    y=df['best_macro_auroc'] + y_offset,
    text=df['compression_ratio'].map("{:.1f}×".format),
    mode='text',
    showlegend=False,
    textfont={'size': 12, 'color': 'black'},
)
fig.add_trace(ratio_captions)

# ---------- Teacher models ----------
# Collapse the per-student rows down to one row per distinct
# (type, size, macro AUROC) combination.
teacher_columns = ['teacher_type', 'teacher_size_mb', 'teacher_macro_auroc']
teachers = df[teacher_columns].drop_duplicates()

# Hover text for the teacher stars; <extra></extra> hides the trace-name box.
teacher_hover = (
    "Teacher: %{text}<br>"
    "Size: %{x:.1f} MB<br>"
    "Macro AUROC: %{y:.3f}<extra></extra>"
)

# Black stars labelled with the teacher type, drawn on the same axes as the students.
teacher_trace = go.Scatter(
    x=teachers['teacher_size_mb'],
    y=teachers['teacher_macro_auroc'],
    mode='markers+text',
    marker={'symbol': 'star', 'size': 25, 'color': 'black'},
    name='Teacher (uncompressed)',
    hovertemplate=teacher_hover,
    text=teachers['teacher_type'],
    textposition='top center',
)
fig.add_trace(teacher_trace)

# ---------- Layout ----------
# Two-line title: headline plus a smaller legend for the visual encodings.
chart_title = (
    'Accuracy vs Model Size with Compression Ratio Labels<br>'
    '<sup>Circle size ∝ model size, text = compression ratio</sup>'
)
layout_settings = {
    'title': chart_title,
    'xaxis_title': 'Model Size (MB)',
    'yaxis_title': 'Macro AUROC',
    'legend_title': 'Teacher Model',
    'template': 'plotly_white',
}
fig.update_layout(**layout_settings)
fig.show()

In [49]:

# ---------- Compute performance gap ----------
# Lookup of macro AUROC indexed by teacher type. It is no longer needed for the
# gap itself but is still defined in case another cell reads it.
teacher_auroc = (
    df[['teacher_type', 'teacher_macro_auroc']]
    .drop_duplicates()
    .set_index('teacher_type')['teacher_macro_auroc']
)

# Gap = teacher macro AUROC − student macro AUROC (positive when the student is worse).
# Every row already carries its own teacher's score, so a column-wise subtraction
# replaces the row-by-row `apply` + label lookup. That lookup returned a Series
# (and broke the assignment) whenever one teacher type had more than one
# distinct `teacher_macro_auroc` value.
df['performance_gap'] = df['teacher_macro_auroc'] - df['best_macro_auroc']

# Sort by student size so each teacher's line runs left to right.
df_sorted = df.sort_values(by='student_size_mb')

# ---------- Line chart with blob markers ----------
fig_line = go.Figure()

# Map teacher type to colours. Keys follow first appearance in the *unsorted*
# `df` — the same ordering the scatter figure's colouring is based on — so a
# teacher keeps one colour across both figures. The modulo wraps the palette
# instead of silently dropping teachers beyond its length (which made the
# lookup below raise KeyError).
palette = px.colors.qualitative.Plotly
teacher_colors = {
    teacher_name: palette[position % len(palette)]
    for position, teacher_name in enumerate(df['teacher_type'].unique())
}

size_scale = 40  # maximum marker size; constant, so set once outside the loop

for teacher in df_sorted['teacher_type'].unique():
    df_teacher = df_sorted[df_sorted['teacher_type'] == teacher]

    # Scale marker sizes so this teacher's largest student maps to `size_scale`.
    # NOTE(review): sizes are normalised within each teacher, so they are not
    # comparable between traces — confirm that is intended.
    marker_sizes = (df_teacher['student_size_mb'] / df_teacher['student_size_mb'].max()) * size_scale

    fig_line.add_trace(
        go.Scatter(
            x=df_teacher['student_size_mb'],
            y=df_teacher['performance_gap'],
            mode='lines+markers',
            name=f'Teacher: {teacher}',
            marker=dict(
                size=marker_sizes,
                sizemode='area',
                # Area-mode scaling: 2 * max(size) / desired_max_size ** 2.
                sizeref=2. * marker_sizes.max() / (size_scale ** 2),
                color=teacher_colors[teacher],
                line=dict(width=1, color='black')  # optional border for clarity
            ),
            line=dict(width=2, color=teacher_colors[teacher]),
            # customdata columns: [0] student AUROC, [1] teacher AUROC.
            hovertemplate=(
                "Student Size: %{x:.1f} MB<br>"
                "Performance Gap: %{y:.3f}<br>"
                "Student AUROC: %{customdata[0]:.3f}<br>"
                "Teacher AUROC: %{customdata[1]:.3f}<extra></extra>"
            ),
            customdata=df_teacher[['best_macro_auroc', 'teacher_macro_auroc']]
        )
    )

# ---------- Layout ----------
# Titles and theme for the gap-vs-size figure, gathered in one mapping.
gap_layout = {
    'title': 'Student Models Performance Gap vs Model Size',
    'xaxis_title': 'Student Model Size (MB)',
    'yaxis_title': 'Performance Gap (Teacher AUROC − Student AUROC)',
    'template': 'plotly_white',
}
fig_line.update_layout(**gap_layout)

fig_line.show()