In [8]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_metrics_from_csv(csv_path, output_dir, save_name='paper_metrics_by_k.png'):
    df = pd.read_csv(csv_path)
    
    # Metrics to include
    metrics = [
        'val_loss', 'val_accuracy', 'val_ami',
        'test_loss', 'test_accuracy',
        'aic', 'bic'
    ]
    
    metric_titles = {
        'val_loss': 'Validation Loss',
        'val_accuracy': 'Validation Accuracy',
        'val_ami': 'Adjusted Mutual Info (AMI)',
        'test_loss': 'Test Loss',
        'test_accuracy': 'Test Accuracy',
        'aic': 'AIC',
        'bic': 'BIC'
    }

    grouped = df.groupby('num_cluster')
    k_values = sorted(df['num_cluster'].unique())

    # Compute mean and std
    mean_std_by_metric = {
        metric: {
            'mean': grouped[metric].mean().reindex(k_values).values,
            'std': grouped[metric].std().reindex(k_values).values
        }
        for metric in metrics
    }

    # Setup multi-panel plot
    n_metrics = len(metrics)
    n_cols = 3
    n_rows = (n_metrics + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4.5 * n_rows))
    axes = axes.flatten()

    for i, metric in enumerate(metrics):
        ax = axes[i]
        means = mean_std_by_metric[metric]['mean']
        stds = mean_std_by_metric[metric]['std']
        
        ax.errorbar(k_values, means, yerr=stds, fmt='o-', capsize=4, linewidth=2, elinewidth=2)
        ax.set_title(metric_titles.get(metric, metric), fontsize=18, fontweight='bold')
        ax.set_xlabel('Number of Clusters (k)', fontsize=16, fontweight='bold')
        ax.set_ylabel(metric_titles.get(metric, metric), fontsize=16, fontweight='bold')
        ax.tick_params(axis='both', labelsize=12)
        ax.grid(True, linestyle='--', alpha=0.6)

    # Remove unused subplots
    for j in range(len(metrics), len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Saved publication-ready plot: {save_path}")


In [10]:



plot_metrics_from_csv(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='metrics_comparison_panel.png'
)


✅ Saved publication-ready plot: C:/Users/theya/Downloads/ADNIFinal\metrics_comparison_panel.png


In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_grouped_metrics_for_paper(csv_path, output_dir, save_name='final_grouped_metrics_by_k.png'):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')

    # Set publication style
    plt.style.use('seaborn-v0_8-whitegrid')  # Use seaborn style for cleaner look
    sns.set_palette("colorblind")  # Colorblind-friendly palette
    
    # Set up figure with 2x2 layout and more spacing
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    # Professional font sizes
    TITLE_SIZE = 18
    LABEL_SIZE = 14
    TICK_SIZE = 12
    LEGEND_SIZE = 11
    LINEWIDTH = 2.5
    CAPSIZE = 4
    MARKERSIZE = 6

    # Define consistent colors for train/val/test
    colors = {'train': '#1f77b4', 'val': '#ff7f0e', 'test': '#2ca02c'}

    # Panel 1: Accuracy
    ax = axes[0]
    metrics = ['train_accuracy', 'val_accuracy', 'test_accuracy']
    for metric in metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()} Accuracy"
        ax.errorbar(k_values, means, yerr=stds, label=label,
                    fmt='o-', capsize=CAPSIZE, linewidth=LINEWIDTH, 
                    markersize=MARKERSIZE, color=colors[split])
    ax.set_title("Accuracy Across Cluster Sizes", fontsize=TITLE_SIZE, fontweight='bold', pad=20)
    ax.set_xlabel("Number of Clusters, k", fontsize=LABEL_SIZE)
    ax.set_ylabel("Accuracy (%)", fontsize=LABEL_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, frameon=True, fancybox=True, shadow=True)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Panel 2: Loss
    ax = axes[1]
    metrics = ['train_loss', 'val_loss', 'test_loss']
    for metric in metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()} Loss"
        ax.errorbar(k_values, means, yerr=stds, label=label,
                    fmt='o-', capsize=CAPSIZE, linewidth=LINEWIDTH,
                    markersize=MARKERSIZE, color=colors[split])
    ax.set_title("Loss Across Cluster Sizes", fontsize=TITLE_SIZE, fontweight='bold', pad=20)
    ax.set_xlabel("Number of Clusters, k", fontsize=LABEL_SIZE)
    ax.set_ylabel("Loss", fontsize=LABEL_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, frameon=True, fancybox=True, shadow=True)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Panel 3: AMI
    ax = axes[2]
    metrics = ['train_ami', 'val_ami']
    ami_colors = {'train': colors['train'], 'val': colors['val']}
    for metric in metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()} AMI"
        ax.errorbar(k_values, means, yerr=stds, label=label,
                    fmt='o-', capsize=CAPSIZE, linewidth=LINEWIDTH,
                    markersize=MARKERSIZE, color=ami_colors[split])
    ax.set_title("Adjusted Mutual Information", fontsize=TITLE_SIZE, fontweight='bold', pad=20)
    ax.set_xlabel("Number of Clusters, k", fontsize=LABEL_SIZE)
    ax.set_ylabel("AMI Score", fontsize=LABEL_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, frameon=True, fancybox=True, shadow=True)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Panel 4: AIC/BIC
    ax = axes[3]
    aic_bic_colors = {'aic': '#d62728', 'bic': '#9467bd'}
    for metric in ['aic', 'bic']:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, means, yerr=stds, label=metric.upper(),
                    fmt='o-', capsize=CAPSIZE, linewidth=LINEWIDTH,
                    markersize=MARKERSIZE, color=aic_bic_colors[metric])
    ax.set_title("Model Selection Criteria", fontsize=TITLE_SIZE, fontweight='bold', pad=20)
    ax.set_xlabel("Number of Clusters, k", fontsize=LABEL_SIZE)
    ax.set_ylabel("Information Criterion Score", fontsize=LABEL_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, frameon=True, fancybox=True, shadow=True)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Adjust layout with more spacing
    plt.tight_layout(pad=3.0)
    
    # Create output directory and save
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"✅ Publication-ready grouped metrics plot saved to: {save_path}")

# Call the function
plot_grouped_metrics_for_paper(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='metrics_comparison_panel1.png'
)

✅ Publication-ready grouped metrics plot saved to: C:/Users/theya/Downloads/ADNIFinal\metrics_comparison_panel1.png


In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_grouped_metrics_for_paper_refined(csv_path, output_dir, save_name='final_grouped_metrics_by_k_readable.png'):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')

    fig, axes = plt.subplots(2, 2, figsize=(22, 16))
    axes = axes.flatten()

    # Font sizes and styles
    TITLE_SIZE = 30
    LABEL_SIZE = 24
    TICK_SIZE = 20
    LEGEND_SIZE = 22
    LINEWIDTH = 3
    CAPSIZE = 4
    ALPHA = 0.9
    MARKERS = ['o', 's', 'D', '^', 'v', '*', 'P']

    # Panel 1: Accuracy (Train, Val, Test)
    ax = axes[0]
    for i, metric in enumerate(['train_accuracy', 'val_accuracy', 'test_accuracy']):
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, means, yerr=stds, label=metric.replace("_", " ").title(),
                    fmt=MARKERS[i]+'-', capsize=CAPSIZE, linewidth=LINEWIDTH, alpha=ALPHA)
    ax.set_title("Accuracy Across Cluster Sizes", fontsize=TITLE_SIZE, fontweight='bold')
    ax.set_xlabel("Number of Clusters (k)", fontsize=LABEL_SIZE)
    ax.set_ylabel("Accuracy (%)", fontsize=LABEL_SIZE)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, loc='lower left')
    ax.grid(True, linestyle='--', alpha=0.5)

    # Panel 2: Loss (Train, Val, Test)
    ax = axes[1]
    for i, metric in enumerate(['train_loss', 'val_loss', 'test_loss']):
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, means, yerr=stds, label=metric.replace("_", " ").title(),
                    fmt=MARKERS[i]+'-', capsize=CAPSIZE, linewidth=LINEWIDTH, alpha=ALPHA)
    ax.set_title("Loss Across Cluster Sizes", fontsize=TITLE_SIZE, fontweight='bold')
    ax.set_xlabel("Number of Clusters (k)", fontsize=LABEL_SIZE)
    ax.set_ylabel("Loss", fontsize=LABEL_SIZE)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, loc='upper left')
    ax.grid(True, linestyle='--', alpha=0.5)

    # Panel 3: AMI (Train, Val)
    ax = axes[2]
    for i, metric in enumerate(['train_ami', 'val_ami']):
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, means, yerr=stds, label=metric.replace("_", " ").title(),
                    fmt=MARKERS[i]+'-', capsize=CAPSIZE, linewidth=LINEWIDTH, alpha=ALPHA)
    ax.set_title("Adjusted Mutual Information (AMI)", fontsize=TITLE_SIZE, fontweight='bold')
    ax.set_xlabel("Number of Clusters (k)", fontsize=LABEL_SIZE)
    ax.set_ylabel("AMI Score", fontsize=LABEL_SIZE)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, loc='upper right')
    ax.grid(True, linestyle='--', alpha=0.5)

    # Panel 4: AIC/BIC
    ax = axes[3]
    for i, metric in enumerate(['aic', 'bic']):
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, means, yerr=stds, label=metric.upper(),
                    fmt=MARKERS[i]+'-', capsize=CAPSIZE, linewidth=LINEWIDTH, alpha=ALPHA)
    ax.set_title("Model Selection Criteria (AIC & BIC)", fontsize=TITLE_SIZE, fontweight='bold')
    ax.set_xlabel("Number of Clusters (k)", fontsize=LABEL_SIZE)
    ax.set_ylabel("Information Criterion Score", fontsize=LABEL_SIZE)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)
    ax.legend(fontsize=LEGEND_SIZE, loc='upper left')
    ax.grid(True, linestyle='--', alpha=0.5)

    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"📊 Saved enhanced 2x2 metrics plot to: {save_path}")


plot_grouped_metrics_for_paper_refined(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='metrics_comparison_panel1.png'
)

📊 Saved enhanced 2x2 metrics plot to: C:/Users/theya/Downloads/ADNIFinal\metrics_comparison_panel1.png


In [14]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_stacked_metrics_for_paper(csv_path, output_dir, save_name='stacked_metrics_by_k.png'):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')

    # Font settings
    TITLE_SIZE = 26
    LABEL_SIZE = 22
    TICK_SIZE = 18
    LEGEND_SIZE = 20
    LINEWIDTH = 3
    CAPSIZE = 4
    MARKERS = ['o', 's', '^', 'D', 'v', '*']
    ALPHA = 0.9

    # Setup 4 vertically stacked subplots
    fig, axes = plt.subplots(4, 1, figsize=(16, 20), sharex=True)
    
    # Plot 1: Accuracy
    ax = axes[0]
    for i, metric in enumerate(['train_accuracy', 'val_accuracy', 'test_accuracy']):
        mean = grouped[metric].mean().reindex(k_values).values
        std = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, mean, yerr=std, label=metric.replace('_', ' ').title(),
                    fmt=MARKERS[i] + '-', linewidth=LINEWIDTH, capsize=CAPSIZE, alpha=ALPHA)
    ax.set_ylabel("Accuracy (%)", fontsize=LABEL_SIZE)
    ax.set_title("Train / Validation / Test Accuracy Across Cluster Sizes", fontsize=TITLE_SIZE, fontweight='bold')
    ax.legend(fontsize=LEGEND_SIZE)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)

    # Plot 2: Loss
    ax = axes[1]
    for i, metric in enumerate(['train_loss', 'val_loss', 'test_loss']):
        mean = grouped[metric].mean().reindex(k_values).values
        std = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, mean, yerr=std, label=metric.replace('_', ' ').title(),
                    fmt=MARKERS[i] + '-', linewidth=LINEWIDTH, capsize=CAPSIZE, alpha=ALPHA)
    ax.set_ylabel("Loss", fontsize=LABEL_SIZE)
    ax.set_title("Train / Validation / Test Loss Across Cluster Sizes", fontsize=TITLE_SIZE, fontweight='bold')
    ax.legend(fontsize=LEGEND_SIZE)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)

    # Plot 3: AMI
    ax = axes[2]
    for i, metric in enumerate(['train_ami', 'val_ami']):
        mean = grouped[metric].mean().reindex(k_values).values
        std = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, mean, yerr=std, label=metric.replace('_', ' ').title(),
                    fmt=MARKERS[i] + '-', linewidth=LINEWIDTH, capsize=CAPSIZE, alpha=ALPHA)
    ax.set_ylabel("AMI Score", fontsize=LABEL_SIZE)
    ax.set_title("Adjusted Mutual Information (AMI)", fontsize=TITLE_SIZE, fontweight='bold')
    ax.legend(fontsize=LEGEND_SIZE)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)

    # Plot 4: AIC and BIC
    ax = axes[3]
    for i, metric in enumerate(['aic', 'bic']):
        mean = grouped[metric].mean().reindex(k_values).values
        std = grouped[metric].std().reindex(k_values).values
        ax.errorbar(k_values, mean, yerr=std, label=metric.upper(),
                    fmt=MARKERS[i] + '-', linewidth=LINEWIDTH, capsize=CAPSIZE, alpha=ALPHA)
    ax.set_ylabel("Information Criterion", fontsize=LABEL_SIZE)
    ax.set_xlabel("Number of Clusters (k)", fontsize=LABEL_SIZE)
    ax.set_title("AIC and BIC Scores", fontsize=TITLE_SIZE, fontweight='bold')
    ax.legend(fontsize=LEGEND_SIZE)
    ax.grid(True, linestyle='--', alpha=0.5)
    ax.tick_params(axis='both', labelsize=TICK_SIZE)

    plt.tight_layout(h_pad=3)
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Saved vertically stacked paper plot to: {save_path}")

plot_stacked_metrics_for_paper(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='metrics_comparison_panel1.png'
)

✅ Saved vertically stacked paper plot to: C:/Users/theya/Downloads/ADNIFinal\metrics_comparison_panel1.png


In [15]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_accuracy_loss_with_split_yaxis(csv_path, output_dir, save_name='split_accuracy_loss_by_k.png'):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')

    metrics_dict = {
        "Accuracy": {
            "metrics": ['train_accuracy', 'val_accuracy', 'test_accuracy'],
            "ylabel": "Accuracy (%)",
            "title": "Train / Validation / Test Accuracy",
        },
        "Loss": {
            "metrics": ['train_loss', 'val_loss', 'test_loss'],
            "ylabel": "Loss",
            "title": "Train / Validation / Test Loss",
        }
    }

    MARKERS = ['o', 's', '^']
    COLORS = ['tab:blue', 'tab:orange', 'tab:green']
    LABELS = ['Train', 'Validation', 'Test']
    LINEWIDTH = 2.5

    fig, axes = plt.subplots(2, 1, figsize=(14, 12), sharex=True)
    
    for idx, (key, info) in enumerate(metrics_dict.items()):
        ax = axes[idx]

        for i, metric in enumerate(info['metrics']):
            means = grouped[metric].mean().reindex(k_values).values
            stds = grouped[metric].std().reindex(k_values).values

            ax.plot(k_values, means, marker=MARKERS[i], linewidth=LINEWIDTH, color=COLORS[i],
                    label=LABELS[i])
            ax.fill_between(k_values, means - stds, means + stds, alpha=0.2, color=COLORS[i])

        ax.set_title(info['title'], fontsize=24, fontweight='bold')
        ax.set_ylabel(info['ylabel'], fontsize=20)
        ax.tick_params(axis='both', labelsize=16)
        ax.grid(True, linestyle='--', alpha=0.5)
        ax.legend(fontsize=16, loc='best')

    axes[-1].set_xlabel("Number of Clusters (k)", fontsize=20)
    axes[-1].tick_params(axis='x', labelsize=16)
    
    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"✅ Saved accuracy/loss dual plot: {save_path}")


plot_accuracy_loss_with_split_yaxis(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='metrics_comparison_panel1.png'
)


✅ Saved accuracy/loss dual plot: C:/Users/theya/Downloads/ADNIFinal\metrics_comparison_panel1.png


In [21]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_professional_accuracy_loss_improved(csv_path, output_dir, save_name='professional_accuracy_loss_improved.png'):
    df = pd.read_csv(csv_path)
    # Filter to only include integer k values (2, 3, 4, 5, 6)
    df = df[df['num_cluster'].isin([2, 3, 4, 5, 6])]
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    # Set professional style
    plt.style.use('seaborn-v0_8-paper')
    sns.set_palette("deep")
    
    # Create figure with wider dimensions
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))  # Made wider
    
    # Define darker, more visible colors
    colors = {
        'train': '#1B4F72',      # Darker blue
        'val': '#7D3C98',        # Darker purple
        'test': '#D35400'        # Darker orange
    }
    
    # Define styling parameters - made thicker and more visible
    LINEWIDTH = 4.0           # Increased from 2.8
    MARKERSIZE = 10           # Increased from 7
    CAPSIZE = 8               # Increased from 5
    CAPTHICK = 2.5            # Increased from 1.5
    
    # Plot Accuracy (Left panel)
    accuracy_metrics = ['train_accuracy', 'val_accuracy', 'test_accuracy']
    for metric in accuracy_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        ax1.errorbar(k_values, means, yerr=stds, 
                    label=label, color=colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=2, markeredgecolor='white')
    
    # Style accuracy plot with larger fonts
    ax1.set_title('Model Accuracy by Cluster Size', fontsize=24, fontweight='700', pad=20)
    ax1.set_xlabel('Number of Clusters (k)', fontsize=20)
    ax1.set_ylabel('Accuracy (%)', fontsize=20)
    ax1.legend(fontsize=18, frameon=True, fancybox=True, shadow=True, 
              edgecolor='gray', facecolor='white', framealpha=0.9)
    ax1.tick_params(axis='both', labelsize=18)  # 5x increase from ~3.6
    ax1.grid(True, linestyle='-', alpha=0.3, linewidth=1.2)
    ax1.set_axisbelow(True)
    
    # Set x-axis to show only integer values
    ax1.set_xticks(k_values)
    ax1.set_xticklabels([str(int(k)) for k in k_values])
    
    # Remove top and right spines for cleaner look
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_linewidth(2)
    ax1.spines['bottom'].set_linewidth(2)
    
    # Plot Loss (Right panel)
    loss_metrics = ['train_loss', 'val_loss', 'test_loss']
    for metric in loss_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        ax2.errorbar(k_values, means, yerr=stds, 
                    label=label, color=colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=2, markeredgecolor='white')
    
    # Style loss plot with larger fonts
    ax2.set_title('Model Loss by Cluster Size', fontsize=24, fontweight='700', pad=20)
    ax2.set_xlabel('Number of Clusters (k)', fontsize=20)
    ax2.set_ylabel('Loss', fontsize=20)
    ax2.legend(fontsize=18, frameon=True, fancybox=True, shadow=True,
              edgecolor='gray', facecolor='white', framealpha=0.9)
    ax2.tick_params(axis='both', labelsize=18)  # 5x increase
    ax2.grid(True, linestyle='-', alpha=0.3, linewidth=1.2)
    ax2.set_axisbelow(True)
    
    # Set x-axis to show only integer values
    ax2.set_xticks(k_values)
    ax2.set_xticklabels([str(int(k)) for k in k_values])
    
    # Remove top and right spines
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_linewidth(2)
    ax2.spines['bottom'].set_linewidth(2)
    
    # Adjust layout with proper spacing
    plt.tight_layout(pad=3.0)
    
    # Add subtle figure border
    fig.patch.set_edgecolor('darkgray')
    fig.patch.set_linewidth(2)
    
    # Save with high quality
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', 
                facecolor='white', edgecolor='none')
    plt.close()
    print(f"✅ Improved professional accuracy/loss plot saved: {save_path}")

def plot_professional_ami_selection_improved(csv_path, output_dir, save_name='professional_ami_selection_improved.png'):
    df = pd.read_csv(csv_path)
    # Filter to only include integer k values (2, 3, 4, 5, 6)
    df = df[df['num_cluster'].isin([2, 3, 4, 5, 6])]
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    # Set professional style
    plt.style.use('seaborn-v0_8-paper')
    
    # Create figure with wider dimensions
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))  # Made wider
    
    # Define darker colors
    ami_colors = {'train': '#1B4F72', 'val': '#7D3C98'}
    criterion_colors = {'aic': '#A93226', 'bic': '#6C3483'}
    
    # Styling parameters - made thicker and more visible
    LINEWIDTH = 4.0
    MARKERSIZE = 10
    CAPSIZE = 8
    CAPTHICK = 2.5
    
    # Plot AMI (Left panel)
    ami_metrics = ['train_ami', 'val_ami']
    for metric in ami_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        ax1.errorbar(k_values, means, yerr=stds, 
                    label=label, color=ami_colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=2, markeredgecolor='white')
    
    # Style AMI plot with larger fonts
    ax1.set_title('Adjusted Mutual Information', fontsize=24, fontweight='700', pad=20)
    ax1.set_xlabel('Number of Clusters (k)', fontsize=20)
    ax1.set_ylabel('AMI Score', fontsize=20)
    ax1.legend(fontsize=18, frameon=True, fancybox=True, shadow=True,
              edgecolor='gray', facecolor='white', framealpha=0.9)
    ax1.tick_params(axis='both', labelsize=18)  # 5x increase
    ax1.grid(True, linestyle='-', alpha=0.3, linewidth=1.2)
    ax1.set_axisbelow(True)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_linewidth(2)
    ax1.spines['bottom'].set_linewidth(2)
    
    # Set x-axis to show only integer values
    ax1.set_xticks(k_values)
    ax1.set_xticklabels([str(int(k)) for k in k_values])
    
    # Plot Model Selection Criteria (Right panel)
    for metric in ['aic', 'bic']:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        
        ax2.errorbar(k_values, means, yerr=stds, 
                    label=metric.upper(), color=criterion_colors[metric],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=2, markeredgecolor='white')
    
    # Style model selection plot with larger fonts
    ax2.set_title('Model Selection Criteria', fontsize=24, fontweight='700', pad=20)
    ax2.set_xlabel('Number of Clusters (k)', fontsize=20)
    ax2.set_ylabel('Information Criterion Score', fontsize=20)
    ax2.legend(fontsize=18, frameon=True, fancybox=True, shadow=True,
              edgecolor='gray', facecolor='white', framealpha=0.9)
    ax2.tick_params(axis='both', labelsize=18)  # 5x increase
    ax2.grid(True, linestyle='-', alpha=0.3, linewidth=1.2)
    ax2.set_axisbelow(True)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_linewidth(2)
    ax2.spines['bottom'].set_linewidth(2)
    
    # Set x-axis to show only integer values
    ax2.set_xticks(k_values)
    ax2.set_xticklabels([str(int(k)) for k in k_values])
    
    # Adjust layout
    plt.tight_layout(pad=3.0)
    
    # Add subtle figure border
    fig.patch.set_edgecolor('darkgray')
    fig.patch.set_linewidth(2)
    
    # Save with high quality
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', 
                facecolor='white', edgecolor='none')
    plt.close()
    print(f"✅ Improved professional AMI/selection plot saved: {save_path}")

# Run both functions
plot_professional_accuracy_loss_improved(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='accuracy_loss_professional_improved.png'
)

plot_professional_ami_selection_improved(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='ami_selection_professional_improved.png'
)

✅ Improved professional accuracy/loss plot saved: C:/Users/theya/Downloads/ADNIFinal\accuracy_loss_professional_improved.png
✅ Improved professional AMI/selection plot saved: C:/Users/theya/Downloads/ADNIFinal\ami_selection_professional_improved.png


In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_vertical_stacked_metrics(csv_path, output_dir, save_name='vertical_stacked_metrics.png'):
    df = pd.read_csv(csv_path)
    # Filter to only include integer k values (2, 3, 4, 5, 6)
    df = df[df['num_cluster'].isin([2, 3, 4, 5, 6])]
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    # Add padding to x-axis
    k_values_padded = [1.5] + k_values + [4.5]
    
    # Set white background for maximum contrast
    plt.style.use('classic')
    
    # Create figure with vertical layout: 3 rows for accuracy, 3 rows for loss
    fig, axes = plt.subplots(6, 1, figsize=(16, 20))
    fig.patch.set_facecolor('white')
    
    # Define colors for each split
    colors = {
        'train': '#000080',      # Navy blue
        'val': '#8B0000',        # Dark red
        'test': '#006400'        # Dark green
    }
    
    # ULTRA BOLD styling parameters
    LINEWIDTH = 6.0
    MARKERSIZE = 15
    CAPSIZE = 12
    CAPTHICK = 4.0
    MARKEREDGEWIDTH = 3
    
    # Plot Accuracy metrics (top 3 subplots)
    accuracy_metrics = ['train_accuracy', 'val_accuracy', 'test_accuracy']
    labels = ['Train', 'Validation', 'Test']
    
    for i, (metric, label) in enumerate(zip(accuracy_metrics, labels)):
        ax = axes[i]
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        
        # Add padding points with NaN values (invisible)
        means_padded = [np.nan] + list(means) + [np.nan]
        stds_padded = [0] + list(stds) + [0]
        
        ax.errorbar(k_values_padded, means_padded, yerr=stds_padded, 
                   color=colors[split], fmt='o-', linewidth=LINEWIDTH, 
                   markersize=MARKERSIZE, capsize=CAPSIZE, capthick=CAPTHICK,
                   markeredgewidth=MARKEREDGEWIDTH, markeredgecolor='white',
                   elinewidth=LINEWIDTH)
        
        # Styling
        ax.set_ylabel(f'{label}\nAccuracy (%)', fontsize=18, fontweight='bold')
        ax.tick_params(axis='both', labelsize=16, width=3, length=8)
        for label_obj in ax.get_xticklabels() + ax.get_yticklabels():
            label_obj.set_fontweight('bold')
        
        ax.grid(True, linestyle='-', alpha=0.4, linewidth=2)
        ax.set_axisbelow(True)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_linewidth(4)
        ax.spines['bottom'].set_linewidth(4)
        
        # Only show x-axis labels on bottom plot
        if i < 2:
            ax.set_xticklabels([])
        else:
            ax.set_xticks(k_values)
            ax.set_xticklabels([str(int(k)) for k in k_values], fontweight='bold')
    
    # Plot Loss metrics (bottom 3 subplots)
    loss_metrics = ['train_loss', 'val_loss', 'test_loss']
    
    for i, (metric, label) in enumerate(zip(loss_metrics, labels)):
        ax = axes[i + 3]
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        
        # Add padding points with NaN values (invisible)
        means_padded = [np.nan] + list(means) + [np.nan]
        stds_padded = [0] + list(stds) + [0]
        
        ax.errorbar(k_values_padded, means_padded, yerr=stds_padded, 
                   color=colors[split], fmt='o-', linewidth=LINEWIDTH, 
                   markersize=MARKERSIZE, capsize=CAPSIZE, capthick=CAPTHICK,
                   markeredgewidth=MARKEREDGEWIDTH, markeredgecolor='white',
                   elinewidth=LINEWIDTH)
        
        # Styling
        ax.set_ylabel(f'{label}\nLoss', fontsize=18, fontweight='bold')
        ax.tick_params(axis='both', labelsize=16, width=3, length=8)
        for label_obj in ax.get_xticklabels() + ax.get_yticklabels():
            label_obj.set_fontweight('bold')
        
        ax.grid(True, linestyle='-', alpha=0.4, linewidth=2)
        ax.set_axisbelow(True)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_linewidth(4)
        ax.spines['bottom'].set_linewidth(4)
        
        # Only show x-axis labels on bottom plot
        if i < 2:
            ax.set_xticklabels([])
        else:
            ax.set_xticks(k_values)
            ax.set_xticklabels([str(int(k)) for k in k_values], fontweight='bold')
    
    # Add main titles
    fig.text(0.5, 0.98, 'Model Performance Across Cluster Sizes', 
             ha='center', fontsize=24, fontweight='black')
    fig.text(0.5, 0.68, 'ACCURACY', ha='center', fontsize=20, fontweight='bold')
    fig.text(0.5, 0.35, 'LOSS', ha='center', fontsize=20, fontweight='bold')
    
    # Add x-axis label at bottom
    fig.text(0.5, 0.02, 'Number of Clusters (k)', ha='center', 
             fontsize=20, fontweight='bold')
    
    # Adjust layout
    plt.tight_layout(rect=[0, 0.03, 1, 0.96])
    plt.subplots_adjust(hspace=0.1)  # Minimal space between subplots
    
    # Save
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', 
                facecolor='white', edgecolor='black')
    plt.close()
    print(f"✅ Vertical stacked plot saved: {save_path}")

# Example usage
plot_vertical_stacked_metrics(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='vertical_stacked_metrics.png'
)

✅ Vertical stacked plot saved: C:/Users/theya/Downloads/ADNIFinal\vertical_stacked_metrics.png


In [25]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_improved_side_by_side_clean(csv_path, output_dir, save_name='improved_accuracy_loss_clean.png'):
    df = pd.read_csv(csv_path)
    # Filter to only include integer k values (2, 3, 4)
    df = df[df['num_cluster'].isin([2, 3, 4])]
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    # Set white background
    plt.style.use('classic')
    
    # Create figure with side-by-side layout
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))
    fig.patch.set_facecolor('white')
    
    # Define much darker, more visible colors
    colors = {
        'train': '#000080',      # Navy blue
        'val': '#8B0000',        # Dark red  
        'test': '#FF8C00'        # Dark orange (changed from green for better contrast)
    }
    
    # ULTRA BOLD styling parameters
    LINEWIDTH = 5.0           # Thick lines
    MARKERSIZE = 12           # Large markers
    CAPSIZE = 10              # Large error bar caps
    CAPTHICK = 3.0            # Thick caps
    MARKEREDGEWIDTH = 2.5     # Thick marker edges
    
    # Plot Accuracy (Left panel)
    accuracy_metrics = ['train_accuracy', 'val_accuracy', 'test_accuracy']
    for metric in accuracy_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        ax1.errorbar(k_values, means, yerr=stds, 
                    label=label, color=colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=MARKEREDGEWIDTH, markeredgecolor='white',
                    elinewidth=LINEWIDTH)
    
    # Style accuracy plot
    ax1.set_title('Model Accuracy by Cluster Size', fontsize=22, fontweight='bold', pad=20)
    ax1.set_xlabel('Number of Clusters (k)', fontsize=18, fontweight='bold')
    ax1.set_ylabel('Accuracy (%)', fontsize=18, fontweight='bold')
    ax1.legend(fontsize=16, frameon=True, fancybox=False, shadow=False,
              edgecolor='black', facecolor='white', framealpha=1.0,
              prop={'weight': 'bold'}, loc='lower left')
    ax1.tick_params(axis='both', labelsize=16, width=2, length=6)
    for label in ax1.get_xticklabels() + ax1.get_yticklabels():
        label.set_fontweight('bold')
    
    # Remove grid completely
    ax1.grid(False)
    ax1.set_axisbelow(True)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_linewidth(2)
    ax1.spines['bottom'].set_linewidth(2)
    ax1.spines['left'].set_color('black')
    ax1.spines['bottom'].set_color('black')
    
    # Set x-axis
    ax1.set_xticks(k_values)
    ax1.set_xticklabels([str(int(k)) for k in k_values], fontweight='bold')
    
    # Plot Loss (Right panel)
    loss_metrics = ['train_loss', 'val_loss', 'test_loss']
    for metric in loss_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        ax2.errorbar(k_values, means, yerr=stds, 
                    label=label, color=colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=MARKEREDGEWIDTH, markeredgecolor='white',
                    elinewidth=LINEWIDTH)
    
    # Style loss plot
    ax2.set_title('Model Loss by Cluster Size', fontsize=22, fontweight='bold', pad=20)
    ax2.set_xlabel('Number of Clusters (k)', fontsize=18, fontweight='bold')
    ax2.set_ylabel('Loss', fontsize=18, fontweight='bold')
    ax2.legend(fontsize=16, frameon=True, fancybox=False, shadow=False,
              edgecolor='black', facecolor='white', framealpha=1.0,
              prop={'weight': 'bold'})
    ax2.tick_params(axis='both', labelsize=16, width=2, length=6)
    for label in ax2.get_xticklabels() + ax2.get_yticklabels():
        label.set_fontweight('bold')
    
    # Remove grid completely
    ax2.grid(False)
    ax2.set_axisbelow(True)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_linewidth(2)
    ax2.spines['bottom'].set_linewidth(2)
    ax2.spines['left'].set_color('black')
    ax2.spines['bottom'].set_color('black')
    
    # Set x-axis
    ax2.set_xticks(k_values)
    ax2.set_xticklabels([str(int(k)) for k in k_values], fontweight='bold')
    
    # Adjust layout
    plt.tight_layout(pad=3.0)
    
    # Save
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', 
                facecolor='white', edgecolor='none')
    plt.close()
    print(f"✅ Clean improved plot saved: {save_path}")

def plot_improved_with_offset_lines(csv_path, output_dir, save_name='improved_accuracy_loss_offset.png'):
    df = pd.read_csv(csv_path)
    # Filter to only include integer k values (2, 3, 4)
    df = df[df['num_cluster'].isin([2, 3, 4])]
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    # Set white background
    plt.style.use('classic')
    
    # Create figure with side-by-side layout
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))
    fig.patch.set_facecolor('white')
    
    # Define much darker, more visible colors
    colors = {
        'train': '#000080',      # Navy blue
        'val': '#8B0000',        # Dark red  
        'test': '#FF6600'        # Bright orange
    }
    
    # ULTRA BOLD styling parameters
    LINEWIDTH = 5.0
    MARKERSIZE = 12
    CAPSIZE = 10
    CAPTHICK = 3.0
    MARKEREDGEWIDTH = 2.5
    
    # Create small horizontal offsets to separate the lines
    offsets = {'train': -0.05, 'val': 0.0, 'test': 0.05}
    
    # Plot Accuracy (Left panel) with horizontal offsets
    accuracy_metrics = ['train_accuracy', 'val_accuracy', 'test_accuracy']
    for metric in accuracy_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        # Apply small horizontal offset
        k_offset = [k + offsets[split] for k in k_values]
        
        ax1.errorbar(k_offset, means, yerr=stds, 
                    label=label, color=colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=MARKEREDGEWIDTH, markeredgecolor='white',
                    elinewidth=LINEWIDTH)
    
    # Style accuracy plot
    ax1.set_title('Model Accuracy by Cluster Size', fontsize=22, fontweight='bold', pad=20)
    ax1.set_xlabel('Number of Clusters (k)', fontsize=18, fontweight='bold')
    ax1.set_ylabel('Accuracy (%)', fontsize=18, fontweight='bold')
    ax1.legend(fontsize=16, frameon=True, fancybox=False, shadow=False,
              edgecolor='black', facecolor='white', framealpha=1.0,
              prop={'weight': 'bold'}, loc='lower left')
    ax1.tick_params(axis='both', labelsize=16, width=2, length=6)
    for label in ax1.get_xticklabels() + ax1.get_yticklabels():
        label.set_fontweight('bold')
    
    # Clean styling
    ax1.grid(False)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_linewidth(2)
    ax1.spines['bottom'].set_linewidth(2)
    
    # Set x-axis (keep original positions for ticks)
    ax1.set_xticks(k_values)
    ax1.set_xticklabels([str(int(k)) for k in k_values], fontweight='bold')
    
    # Plot Loss (Right panel) with horizontal offsets
    loss_metrics = ['train_loss', 'val_loss', 'test_loss']
    for metric in loss_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        label = f"{split.capitalize()}"
        
        # Apply small horizontal offset
        k_offset = [k + offsets[split] for k in k_values]
        
        ax2.errorbar(k_offset, means, yerr=stds, 
                    label=label, color=colors[split],
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=MARKEREDGEWIDTH, markeredgecolor='white',
                    elinewidth=LINEWIDTH)
    
    # Style loss plot
    ax2.set_title('Model Loss by Cluster Size', fontsize=22, fontweight='bold', pad=20)
    ax2.set_xlabel('Number of Clusters (k)', fontsize=18, fontweight='bold')
    ax2.set_ylabel('Loss', fontsize=18, fontweight='bold')
    ax2.legend(fontsize=16, frameon=True, fancybox=False, shadow=False,
              edgecolor='black', facecolor='white', framealpha=1.0,
              prop={'weight': 'bold'})
    ax2.tick_params(axis='both', labelsize=16, width=2, length=6)
    for label in ax2.get_xticklabels() + ax2.get_yticklabels():
        label.set_fontweight('bold')
    
    # Clean styling
    ax2.grid(False)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_linewidth(2)
    ax2.spines['bottom'].set_linewidth(2)
    
    # Set x-axis (keep original positions for ticks)
    ax2.set_xticks(k_values)
    ax2.set_xticklabels([str(int(k)) for k in k_values], fontweight='bold')
    
    # Adjust layout
    plt.tight_layout(pad=3.0)
    
    # Save
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', 
                facecolor='white', edgecolor='none')
    plt.close()
    print(f"✅ Offset lines plot saved: {save_path}")

# Create both versions
plot_improved_side_by_side_clean(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='clean_accuracy_loss.png'
)

plot_improved_with_offset_lines(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='offset_accuracy_loss.png'
)

✅ Clean improved plot saved: C:/Users/theya/Downloads/ADNIFinal\clean_accuracy_loss.png
✅ Offset lines plot saved: C:/Users/theya/Downloads/ADNIFinal\offset_accuracy_loss.png


In [None]:

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def plot_professional_accuracy_loss(csv_path, output_dir, save_name='professional_accuracy_loss.png'):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    plt.style.use('seaborn-v0_8-paper')
    sns.set_palette("deep")
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    colors = {
        'train': '#2E86AB',
        'val': '#A23B72',
        'test': '#F18F01'
    }
    
    LINEWIDTH = 2.8
    MARKERSIZE = 7
    CAPSIZE = 5
    CAPTHICK = 1.5
    
    # Accuracy
    accuracy_metrics = ['train_accuracy', 'val_accuracy', 'test_accuracy']
    for metric in accuracy_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        ax1.errorbar(k_values, means, yerr=stds, 
                     label=split.capitalize(), color=colors[split],
                     fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                     capsize=CAPSIZE, capthick=CAPTHICK,
                     markeredgewidth=1.2, markeredgecolor='white')
    
    ax1.set_title('Model Accuracy by Cluster Size', fontsize=16, fontweight='bold', pad=15)
    ax1.set_xlabel('Number of Clusters (k)', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Accuracy (%)', fontsize=14, fontweight='bold')
    ax1.set_xlim(1.8, 4.2)
    ax1.legend(fontsize=12, frameon=True, fancybox=True, edgecolor='gray', facecolor='white', framealpha=0.9)
    ax1.tick_params(axis='both', labelsize=12)
    ax1.grid(True, linestyle='-', alpha=0.2, linewidth=0.8)
    ax1.set_axisbelow(True)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_linewidth(1.2)
    ax1.spines['bottom'].set_linewidth(1.2)
    
    # Loss
    loss_metrics = ['train_loss', 'val_loss', 'test_loss']
    for metric in loss_metrics:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        ax2.errorbar(k_values, means, yerr=stds, 
                     label=split.capitalize(), color=colors[split],
                     fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                     capsize=CAPSIZE, capthick=CAPTHICK,
                     markeredgewidth=1.2, markeredgecolor='white')
    
    ax2.set_title('Model Loss by Cluster Size', fontsize=16, fontweight='bold', pad=15)
    ax2.set_xlabel('Number of Clusters (k)', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Loss', fontsize=14, fontweight='bold')
    ax2.set_xlim(1.8, 4.2)
    ax2.set_xticks(k_values)
    ax2.legend(fontsize=12, frameon=True, fancybox=True, edgecolor='gray', facecolor='white', framealpha=0.9)
    ax2.tick_params(axis='both', labelsize=12)
    ax2.grid(True, linestyle='-', alpha=0.2, linewidth=0.8)
    ax2.set_axisbelow(True)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_linewidth(1.2)
    ax2.spines['bottom'].set_linewidth(1.2)
    
    plt.tight_layout(pad=2.0)
    fig.patch.set_edgecolor('lightgray')
    fig.patch.set_linewidth(1)
    
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
    plt.close()
    print(f"✅ Professional accuracy/loss plot saved: {save_path}")


def plot_professional_ami_selection(csv_path, output_dir, save_name='professional_ami_selection.png'):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')
    
    plt.style.use('seaborn-v0_8-paper')
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    ami_colors = {'train': '#2E86AB', 'val': '#A23B72'}
    criterion_colors = {'aic': '#C73E1D', 'bic': '#8B5A83'}
    
    LINEWIDTH = 2.8
    MARKERSIZE = 7
    CAPSIZE = 5
    CAPTHICK = 1.5
    
    # AMI
    for metric in ['train_ami', 'val_ami']:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        split = metric.split('_')[0]
        ax1.errorbar(k_values, means, yerr=stds,
                     label=split.capitalize(), color=ami_colors[split],
                     fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                     capsize=CAPSIZE, capthick=CAPTHICK,
                     markeredgewidth=1.2, markeredgecolor='white')
    
    ax1.set_title('Adjusted Mutual Information', fontsize=16, fontweight='bold', pad=15)
    ax1.set_xlabel('Number of Clusters (k)', fontsize=14, fontweight='bold')
    ax1.set_ylabel('AMI Score', fontsize=14, fontweight='bold')
    ax1.set_xlim(1.8, 4.2)
    ax1.legend(fontsize=12, frameon=True, fancybox=True, edgecolor='gray', facecolor='white', framealpha=0.9)
    ax1.tick_params(axis='both', labelsize=12)
    ax1.grid(True, linestyle='-', alpha=0.2, linewidth=0.8)
    ax1.set_axisbelow(True)
    ax1.spines['top'].set_visible(False)
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_linewidth(1.2)
    ax1.spines['bottom'].set_linewidth(1.2)
    
    # AIC/BIC
    for metric in ['aic', 'bic']:
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values
        ax2.errorbar(k_values, means, yerr=stds, 
                     label=metric.upper(), color=criterion_colors[metric],
                     fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                     capsize=CAPSIZE, capthick=CAPTHICK,
                     markeredgewidth=1.2, markeredgecolor='white')
    
    ax2.set_title('Model Selection Criteria', fontsize=16, fontweight='bold', pad=15)
    ax2.set_xlabel('Number of Clusters (k)', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Information Criterion Score', fontsize=14, fontweight='bold')
    ax2.set_xlim(1.8, 4.2)
    ax2.legend(fontsize=12, frameon=True, fancybox=True, edgecolor='gray', facecolor='white', framealpha=0.9)
    ax2.tick_params(axis='both', labelsize=12)
    ax2.grid(True, linestyle='-', alpha=0.2, linewidth=0.8)
    ax2.set_axisbelow(True)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_linewidth(1.2)
    ax2.spines['bottom'].set_linewidth(1.2)
    
    plt.tight_layout(pad=2.0)
    fig.patch.set_edgecolor('lightgray')
    fig.patch.set_linewidth(1)
    
    os.makedirs(output_dir, exist_ok=True)
    save_path = os.path.join(output_dir, save_name)
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
    plt.close()
    print(f"✅ Professional AMI/selection plot saved: {save_path}")

# Run both functions
plot_professional_accuracy_loss(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='accuracy_loss_professional.png'
)

plot_professional_ami_selection(
    csv_path='combined_nested_with_test_metrics.csv',
    output_dir='C:/Users/theya/Downloads/ADNIFinal',
    save_name='ami_selection_professional.png'
)

✅ Professional accuracy/loss plot saved: C:/Users/theya/Downloads/ADNIFinal\accuracy_loss_professional.png
✅ Professional AMI/selection plot saved: C:/Users/theya/Downloads/ADNIFinal\ami_selection_professional.png


✅ Publication-ready grouped metrics plot saved to: C:/Users/theya/Downloads/ADNIFinal\metrics_comparison_panel1.png


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Global config
plt.style.use('seaborn-v0_8-paper')
sns.set_palette("deep")

LINEWIDTH = 2.8
MARKERSIZE = 7
CAPSIZE = 5
CAPTHICK = 1.5
colors = {'train': '#2E86AB', 'val': '#A23B72', 'test': '#F18F01'}
ami_colors = {'train': '#2E86AB', 'val': '#A23B72'}
criterion_colors = {'aic': '#C73E1D', 'bic': '#8B5A83'}

# Utility plotter
def plot_metric(df, k_values, grouped, metrics, ylabel, title, legend_loc, colors_dict, ylim, save_name):
    fig, ax = plt.subplots(figsize=(7, 5))
    for metric in metrics:
        split = metric.split('_')[0] if '_' in metric else metric
        label = split.upper() if metric in ['aic', 'bic'] else split.capitalize()
        means = grouped[metric].mean().reindex(k_values).values
        stds = grouped[metric].std().reindex(k_values).values


        # Clip bounds for metrics like AMI
        if metric.startswith('train_ami') or metric.startswith('val_ami'):
            upper = np.clip(means + stds, 0, 1)
            lower = np.clip(means - stds, 0, 1)
            stds = [upper[i] - means[i] if stds[i] > 0 else 0 for i in range(len(stds))]
            lower_err = [means[i] - lower[i] if stds[i] > 0 else 0 for i in range(len(stds))]
            yerr = [lower_err, stds]
        else:
            yerr = stds

            
        ax.errorbar(k_values, means, yerr=stds,
                    label=label, color=colors_dict.get(split, '#333'),
                    fmt='o-', linewidth=LINEWIDTH, markersize=MARKERSIZE,
                    capsize=CAPSIZE, capthick=CAPTHICK,
                    markeredgewidth=1.2, markeredgecolor='white')

    ax.set_xlim(1.8, 4.2)
    
    if ylim:
        ax.set_ylim(ylim)

    ax.set_xlabel('Number of Clusters (k)', fontsize=18, fontweight='bold')
    ax.set_xticks(k_values)  # Only show ticks at k=2,3,4
    

    ax.set_ylabel(ylabel, fontsize=18, fontweight='bold')
    ax.set_title(title, fontsize=20, fontweight='bold', pad=15)

    ax.legend(loc=legend_loc, fontsize=14, frameon=True, fancybox=True,
              edgecolor='gray', facecolor='white', framealpha=0.9)

    ax.tick_params(axis='both', labelsize=16)
    for tick in ax.get_xticklabels():
        tick.set_fontweight('bold')
    for tick in ax.get_yticklabels():
        tick.set_fontweight('bold')
    ax.grid(True, linestyle='-', alpha=0.5, linewidth=1.0)
    ax.set_axisbelow(True)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_linewidth(1.2)
    ax.spines['bottom'].set_linewidth(1.2)

    save_path = os.path.join('C:/Users/theya/Downloads/ADNIFinal', f'{save_name}.pdf')
    plt.savefig(save_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    print(f"✅ Saved: {save_path}")

# Main runner
def generate_all_plots(csv_path):
    df = pd.read_csv(csv_path)
    k_values = sorted(df['num_cluster'].unique())
    grouped = df.groupby('num_cluster')

    os.makedirs('C:/Users/theya/Downloads/ADNIFinal', exist_ok=True)

    # Accuracy
    plot_metric(df, k_values, grouped,
                metrics=['train_accuracy', 'val_accuracy', 'test_accuracy'],
                ylabel='Accuracy (%)',
                title='Model Accuracy by Cluster Size',
                legend_loc='lower left',
                colors_dict=colors,
                ylim=(80, 105),
                save_name='accuracy_plot')

    # Loss
    plot_metric(df, k_values, grouped,
                metrics=['train_loss', 'val_loss', 'test_loss'],
                ylabel='Loss',
                title='Model Loss by Cluster Size',
                legend_loc='upper left',
                colors_dict=colors,
                ylim=(-1, 2.5),
                save_name='loss_plot')

    # AMI
    plot_metric(df, k_values, grouped,
                metrics=['train_ami', 'val_ami'],
                ylabel='AMI Score',
                title='Adjusted Mutual Information',
                legend_loc='lower left',
                colors_dict=ami_colors,
                ylim=(0.5, 1.05),
                save_name='ami_plot')

    # AIC & BIC
    plot_metric(df, k_values, grouped,
                metrics=['aic', 'bic'],
                ylabel='Information Criterion Score',
                title='Model Fit & Complexity Metrics',
                legend_loc='upper left',
                colors_dict=criterion_colors,
                ylim=None,
                save_name='aic_bic_plot')


# Run the full pipeline
generate_all_plots(
    csv_path='C:/Users/theya/Downloads/ADNIFinal/combined_nested_with_test_metrics.csv'
)


✅ Saved: C:/Users/theya/Downloads/ADNIFinal\accuracy_plot.pdf
✅ Saved: C:/Users/theya/Downloads/ADNIFinal\loss_plot.pdf
✅ Saved: C:/Users/theya/Downloads/ADNIFinal\ami_plot.pdf
✅ Saved: C:/Users/theya/Downloads/ADNIFinal\aic_bic_plot.pdf


In [49]:
def investigate_ami_anomalies(csv_path):
    df = pd.read_csv(csv_path)
    print("\n🔍 Investigating AMI values...")
    
    for split in ['train_ami', 'val_ami']:
        print(f"\n== {split.upper()} ==")
        for k, group in df.groupby('num_cluster'):
            ami_vals = group[split].dropna()
            mean_val = ami_vals.mean()
            std_val = ami_vals.std()
            max_val = ami_vals.max()
            min_val = ami_vals.min()
            over_1 = (ami_vals > 1).sum()

            print(f"k={k}: mean={mean_val:.3f}, std={std_val:.3f}, min={min_val:.3f}, max={max_val:.3f}, >1 count={over_1}")


investigate_ami_anomalies("C:/Users/theya/Downloads/ADNIFinal/combined_nested_with_test_metrics.csv")



🔍 Investigating AMI values...

== TRAIN_AMI ==
k=2: mean=0.950, std=0.116, min=0.625, max=1.000, >1 count=0
k=3: mean=0.947, std=0.068, min=0.823, max=1.000, >1 count=0
k=4: mean=0.886, std=0.126, min=0.639, max=0.985, >1 count=0

== VAL_AMI ==
k=2: mean=0.970, std=0.094, min=0.704, max=1.000, >1 count=0
k=3: mean=0.939, std=0.101, min=0.730, max=1.000, >1 count=0
k=4: mean=0.902, std=0.133, min=0.647, max=1.000, >1 count=0
