In [3]:
import re

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# ===== Step 1: Load Data =====
# Read the CSV into a single frame.
file_path = "bucket_coil_signal_averages_allsignals_phasewise_rollingmode_with_clusters.csv"
df = pd.read_csv(file_path)

# ===== Step 2: Define Columns =====
# Columns listed here are never treated as signals; every other column in the
# file is.
exclude_cols = [
    "coil_id",
    "Phase",
    "Width_Bin",
    "Gauge_Bin",
    "Reduction_Bin",
    "Bucket_ID",
    "Bucket_Name",
    "rollingmode",
    "cluster_label",
]
# Boolean mask keeps the original column order of the file.
signal_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

# ===== Step 3: Scale features (important for KMeans-based interpretation) =====
# Fit the scaler on the observed values only. Zero-filling the *raw* signals
# before fitting would inject artificial zeros into each column's mean and
# standard deviation. StandardScaler disregards NaNs while fitting and keeps
# them in the transformed output (scikit-learn >= 0.20 -- TODO confirm the
# installed version), so the gaps are filled afterwards instead.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[signal_cols])

# In standardized space 0 is the column mean, so filling NaN with 0 here is
# mean imputation and leaves no NaN for the downstream aggregations.
df[signal_cols] = np.where(np.isnan(scaled_values), 0.0, scaled_values)

# ===== Step 4: Prepare Containers =====
# Both dicts are keyed by Bucket_ID: the full importance table, and the names
# of the 30 highest-ranked signals.
bucketwise_importance, top30_per_bucket = {}, {}


def _rank_signals_for_bucket(rows, columns):
    """Build the importance table for the rows of a single bucket.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows of one bucket; must contain ``cluster_label`` and ``columns``.
    columns : list
        Signal column names to score.

    Returns
    -------
    pandas.DataFrame
        Columns ``Signal``, ``Mean_Diff`` (largest minus smallest cluster mean)
        and ``Between_Var`` (sum over clusters of the squared deviation of the
        cluster mean from the bucket mean), sorted by ``Mean_Diff`` descending.
    """
    per_cluster = rows.groupby("cluster_label")[columns].mean()
    bucket_mean = rows[columns].mean()

    # A. range of the cluster means for every signal
    spread = per_cluster.max().sub(per_cluster.min())

    # B. squared deviations from the bucket mean, summed over clusters
    scatter = per_cluster.sub(bucket_mean).pow(2).sum()

    table = pd.DataFrame(
        {
            "Signal": columns,
            "Mean_Diff": spread.values,
            "Between_Var": scatter.values,
        }
    )
    return table.sort_values("Mean_Diff", ascending=False)


# ===== Step 5: Compute Feature Importance per Bucket =====
for bucket, bucket_df in df.groupby("Bucket_ID"):
    # Only buckets with at least two distinct clusters are scored.
    if bucket_df["cluster_label"].nunique() >= 2:
        importance_df = _rank_signals_for_bucket(bucket_df, signal_cols)
        bucketwise_importance[bucket] = importance_df
        top30_per_bucket[bucket] = importance_df["Signal"].head(30).tolist()
    else:
        print(f"Skipping Bucket {bucket} (only 1 cluster found)")

# ===== Step 6: Combine All Buckets into One DataFrame =====
# Tag every per-bucket table with its Bucket_ID before stacking them.
per_bucket_frames = [
    imp.assign(Bucket_ID=bucket)
    for bucket, imp in bucketwise_importance.items()
]

if per_bucket_frames:
    combined = pd.concat(per_bucket_frames, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list, which happens when every
    # bucket was skipped. Fall back to an empty table with the same columns so
    # the rest of the cell still runs.
    print("No bucket produced an importance table; the combined table is empty.")
    combined = pd.DataFrame(
        columns=["Signal", "Mean_Diff", "Between_Var", "Bucket_ID"]
    )

# ===== Step 7: Save to CSV =====
# Single source of truth for the output file name (used for writing and logging).
output_path = "bucketwise_feature_importance_mean_diff_and_variance.csv"
combined.to_csv(output_path, index=False)
print(f"âœ… Saved: {output_path}")

# ===== Step 8: Display Top 30 Signals per Bucket =====
# One header line, then for each bucket a blank line, its label and its list.
print("\nðŸŽ¯ Top 30 signals per bucket:")
for bucket, signals in top30_per_bucket.items():
    print(f"\nBucket {bucket}:\n{signals}")

# ===== Step 9: Common Top 10 Signals Across Buckets (Ignoring Phase) =====

def base_signal_name(signal):
    """Return ``signal`` with its phase suffix removed.

    Everything from the first ``_Phase`` marker onward is dropped. The marker
    is matched case-insensitively, so ``_phase2`` and ``_PHASE2`` are treated
    like ``_Phase2``. A name without the marker is returned unchanged.

    Parameters
    ----------
    signal : str
        Signal column name, e.g. ``"Stand 4 Top Current Feedback_Phase3"``.

    Returns
    -------
    str
        The part of ``signal`` that precedes the first ``_Phase`` marker.
    """
    # maxsplit=1: only the text before the first marker is needed.
    return re.split(r"_phase", signal, maxsplit=1, flags=re.IGNORECASE)[0]

# Convert each bucket's top signals to base names (without phase suffix)
normalized_top30 = {
    bucket: [base_signal_name(sig) for sig in signals]
    for bucket, signals in top30_per_bucket.items()
}

# Find common base signals
if len(normalized_top30) > 1:
    # Position of the first occurrence of every base name in each bucket's
    # ranked list. Several phases of one signal collapse to the same base name,
    # so the best (lowest) position is the one that counts.
    best_rank_per_bucket = [
        {name: names.index(name) for name in dict.fromkeys(names)}
        for names in normalized_top30.values()
    ]

    # Base signals that appear in the top list of every bucket.
    common_signals = set.intersection(
        *(set(ranks) for ranks in best_rank_per_bucket)
    )

    # A set has no meaningful order, so slicing list(set) would return an
    # arbitrary subset that can change between kernel sessions. Rank the common
    # signals by their summed position across buckets (lower is better), break
    # ties alphabetically, and only then keep the first 10.
    common_top10 = sorted(
        common_signals,
        key=lambda name: (
            sum(ranks[name] for ranks in best_rank_per_bucket),
            name,
        ),
    )[:10]
else:
    # With fewer than two scored buckets there is nothing to intersect.
    common_signals = set()
    common_top10 = []

print("\nðŸ”¥ Common Top 10 signals across all buckets (ignoring phase):")
print(common_top10)

Skipping Bucket 4 (only 1 cluster found)
Skipping Bucket 14 (only 1 cluster found)
Skipping Bucket 15 (only 1 cluster found)
Skipping Bucket 18 (only 1 cluster found)
âœ… Saved: bucketwise_feature_importance_mean_diff_and_variance.csv

ðŸŽ¯ Top 30 signals per bucket:

Bucket 1:
['Stand 4 Bottom Current Feedback_Phase3', 'Stand 1-3 Solution System Pressure_Phase2', 'Stand 4 - Operator Side Force_Phase2', 'Stand 4 Bottom Current Feedback_Phase1', 'Stand 4 Top Current Feedback_Phase3', 'Stand 4 - Operator Side Force_Phase3', 'Stand 4 Bottom Current Feedback_Phase2', 'Stand 4 Solution System Pressure_Phase2', 'Stand 4 - Operator Side Force_Phase1', 'Tension Reel Calculated Tension_Phase3', 'Tension Reel Calculated Tension_Phase2', 'Stand 4 Top Current Feedback_Phase1', 'Stand 3 Top Current Feedback_Phase2', 'Stand 1-3 Solution System Pressure_Phase1', 'Stand 4 Top Current Feedback_Phase2', 'Stand 3 Bottom Current Feedback_Phase2', 'Stand 1-2 Total Tension Feedback_Phase3', 'Stand 4 Thread 