# WESAD Label Distribution Analysis

This notebook analyzes the distribution of labels (Baseline, Stress, Amusement) across the preprocessed WESAD data folds.


In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

# Configuration
PREPROCESSED_DIR = '/fd24T/zzhao3/EDA/preprocessed_data'
SUBJECT_IDS = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]

# Label mapping
LABEL_DICT = {
    1: 'Baseline',
    2: 'Stress', 
    3: 'Amusement'
}


## Load Data and Analyze Label Distribution


In [2]:
def load_fold_data(fold_number):
    """Load data from a specific fold."""
    file_path = os.path.join(PREPROCESSED_DIR, f'fold_{fold_number}.npz')
    if not os.path.exists(file_path):
        print(f"Warning: Data file not found at {file_path}")
        return None
    
    try:
        data = np.load(file_path, allow_pickle=True)
        return {
            'X_train': data['X_train'],
            'L_train': data['L_train'],
            'X_test': data['X_test'],
            'L_test': data['L_test']
        }
    except Exception as e:
        print(f"Error loading fold {fold_number}: {e}")
        return None

# Load all folds
fold_data = {}
for subject_id in SUBJECT_IDS:
    data = load_fold_data(subject_id)
    if data is not None:
        fold_data[subject_id] = data

print(f"Successfully loaded {len(fold_data)} folds")


Successfully loaded 15 folds


In [3]:
# Analyze label distribution for each fold
fold_stats = []

for fold_id, data in fold_data.items():
    # Combine train and test labels for overall fold statistics
    all_labels = np.concatenate([data['L_train'], data['L_test']])
    
    # Count labels
    label_counts = Counter(all_labels)
    total_samples = len(all_labels)
    
    # Calculate percentages
    fold_stat = {
        'Fold': f'S{fold_id}',
        'Total_Samples': total_samples,
        'Train_Samples': len(data['L_train']),
        'Test_Samples': len(data['L_test'])
    }
    
    for label_int, label_name in LABEL_DICT.items():
        count = label_counts.get(label_int, 0)
        percentage = (count / total_samples) * 100 if total_samples > 0 else 0
        fold_stat[f'{label_name}_Count'] = count
        fold_stat[f'{label_name}_Percent'] = percentage
    
    fold_stats.append(fold_stat)

# Convert to DataFrame for easier analysis
df_stats = pd.DataFrame(fold_stats)
print("Label distribution per fold:")
display(df_stats)


Label distribution per fold:


Unnamed: 0,Fold,Total_Samples,Train_Samples,Test_Samples,Baseline_Count,Baseline_Percent,Stress_Count,Stress_Percent,Amusement_Count,Amusement_Percent
0,S2,8217,7691,526,4385,53.364975,2466,30.010953,1366,16.624072
1,S3,8217,7683,534,4385,53.364975,2466,30.010953,1366,16.624072
2,S4,8217,7681,536,4385,53.364975,2466,30.010953,1366,16.624072
3,S5,8217,7667,550,4385,53.364975,2466,30.010953,1366,16.624072
4,S6,8217,7671,546,4385,53.364975,2466,30.010953,1366,16.624072
5,S7,8217,7672,545,4385,53.364975,2466,30.010953,1366,16.624072
6,S8,8217,7670,547,4385,53.364975,2466,30.010953,1366,16.624072
7,S9,8217,7672,545,4385,53.364975,2466,30.010953,1366,16.624072
8,S10,8217,7653,564,4385,53.364975,2466,30.010953,1366,16.624072
9,S11,8217,7665,552,4385,53.364975,2466,30.010953,1366,16.624072


In [None]:
# Calculate overall statistics across all folds
all_labels_combined = []
for data in fold_data.values():
    all_labels_combined.extend(data['L_train'])
    all_labels_combined.extend(data['L_test'])

overall_counts = Counter(all_labels_combined)
total_overall = len(all_labels_combined)

print(f"\nOverall Dataset Statistics:")
print(f"Total samples across all folds: {total_overall}")
print(f"Number of folds: {len(fold_data)}")
print("\nLabel distribution:")

overall_stats = []
for label_int, label_name in LABEL_DICT.items():
    count = overall_counts.get(label_int, 0)
    percentage = (count / total_overall) * 100 if total_overall > 0 else 0
    overall_stats.append({
        'Label': label_name,
        'Count': count,
        'Percentage': f"{percentage:.1f}%"
    })
    print(f"  {label_name}: {count} samples ({percentage:.1f}%)")

df_overall = pd.DataFrame(overall_stats)
display(df_overall)


In [None]:
# 1. Overall pie chart and bar chart
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Pie chart for overall distribution
labels = [LABEL_DICT[label] for label in LABEL_DICT.keys()]
sizes = [overall_counts.get(label, 0) for label in LABEL_DICT.keys()]
colors = ['#ff9999', '#66b3ff', '#99ff99']

ax1.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors)
ax1.set_title('Overall Label Distribution\nAcross All Folds', fontsize=14, fontweight='bold')

# Bar chart for overall distribution
ax2.bar(labels, sizes, color=colors, alpha=0.7, edgecolor='black')
ax2.set_title('Overall Sample Counts\nAcross All Folds', fontsize=14, fontweight='bold')
ax2.set_ylabel('Number of Samples')
ax2.set_xlabel('Label')

# Add count labels on bars
for i, v in enumerate(sizes):
    ax2.text(i, v + max(sizes)*0.01, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()


In [None]:
# 2. Distribution per fold
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

# Stacked bar chart showing counts per fold
fold_names = df_stats['Fold'].values
baseline_counts = df_stats['Baseline_Count'].values
stress_counts = df_stats['Stress_Count'].values
amusement_counts = df_stats['Amusement_Count'].values

width = 0.6
ax1.bar(fold_names, baseline_counts, width, label='Baseline', color='#ff9999', alpha=0.8)
ax1.bar(fold_names, stress_counts, width, bottom=baseline_counts, label='Stress', color='#66b3ff', alpha=0.8)
ax1.bar(fold_names, amusement_counts, width, 
        bottom=baseline_counts + stress_counts, label='Amusement', color='#99ff99', alpha=0.8)

ax1.set_title('Sample Count Distribution per Fold', fontsize=14, fontweight='bold')
ax1.set_ylabel('Number of Samples')
ax1.set_xlabel('Fold (Test Subject)')
ax1.legend()
ax1.tick_params(axis='x', rotation=45)

# Percentage distribution per fold
baseline_pct = df_stats['Baseline_Percent'].values
stress_pct = df_stats['Stress_Percent'].values
amusement_pct = df_stats['Amusement_Percent'].values

ax2.bar(fold_names, baseline_pct, width, label='Baseline', color='#ff9999', alpha=0.8)
ax2.bar(fold_names, stress_pct, width, bottom=baseline_pct, label='Stress', color='#66b3ff', alpha=0.8)
ax2.bar(fold_names, amusement_pct, width, 
        bottom=baseline_pct + stress_pct, label='Amusement', color='#99ff99', alpha=0.8)

ax2.set_title('Percentage Distribution per Fold', fontsize=14, fontweight='bold')
ax2.set_ylabel('Percentage (%)')
ax2.set_xlabel('Fold (Test Subject)')
ax2.legend()
ax2.tick_params(axis='x', rotation=45)
ax2.set_ylim(0, 100)

plt.tight_layout()
plt.show()


In [None]:
# 3. Box plot showing distribution variability across folds
fig, ax = plt.subplots(figsize=(10, 6))

# Prepare data for box plot
box_data = [
    df_stats['Baseline_Percent'].values,
    df_stats['Stress_Percent'].values,
    df_stats['Amusement_Percent'].values
]

box_plot = ax.boxplot(box_data, labels=['Baseline', 'Stress', 'Amusement'], 
                      patch_artist=True, notch=True)

# Color the boxes
colors = ['#ff9999', '#66b3ff', '#99ff99']
for patch, color in zip(box_plot['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_title('Label Percentage Distribution Variability Across Folds', fontsize=14, fontweight='bold')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Label')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Summary statistics
print("Summary Statistics:")
print("=" * 50)

for label_name in ['Baseline', 'Stress', 'Amusement']:
    pct_col = f'{label_name}_Percent'
    percentages = df_stats[pct_col].values
    
    print(f"\n{label_name}:")
    print(f"  Mean: {percentages.mean():.1f}%")
    print(f"  Std:  {percentages.std():.1f}%")
    print(f"  Min:  {percentages.min():.1f}%")
    print(f"  Max:  {percentages.max():.1f}%")

print(f"\nDataset Balance:")
print(f"  Most common label: {max(overall_counts, key=overall_counts.get)} ({LABEL_DICT[max(overall_counts, key=overall_counts.get)]})")
print(f"  Least common label: {min(overall_counts, key=overall_counts.get)} ({LABEL_DICT[min(overall_counts, key=overall_counts.get)]})")

# Calculate imbalance ratio
max_count = max(overall_counts.values())
min_count = min(overall_counts.values())
imbalance_ratio = max_count / min_count if min_count > 0 else float('inf')
print(f"  Imbalance ratio: {imbalance_ratio:.2f}:1")


In [None]:
# Save detailed statistics to CSV
output_file = 'label_distribution_analysis.csv'
df_stats.to_csv(output_file, index=False)
print(f"Detailed statistics saved to: {output_file}")

# Save overall statistics
overall_file = 'overall_label_statistics.csv'
df_overall.to_csv(overall_file, index=False)
print(f"Overall statistics saved to: {overall_file}")
