In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# === CONFIG ===
# Root folder of the analysis. The default is the location this notebook was
# written against; set the STRESS_ANALYSIS_DIR environment variable to point
# the notebook at a different copy of the data without editing the code.
base_dir = os.environ.get(
    "STRESS_ANALYSIS_DIR",
    "C:/Users/lpnhu/Downloads/Stress_Testing_Analysis",
)

# Path to merged stress labels + 4-day EDA window file
label_path = os.path.join(base_dir, "eda_qc_reports", "eda_4day_merged_with_labels.csv")

# Path to folder containing per-participant EDA summary files
summary_dir = os.path.join(base_dir, "na_profiles")



# === Load merged labels ===
# "start_date" is parsed as a date column so that date arithmetic can be
# applied to it when the per-window features are extracted.
labels_df = pd.read_csv(label_path, parse_dates=["start_date"])

# Fail fast, with a readable message, when the label file lacks a column that
# the feature-extraction loop reads from every row.
required_label_cols = ["participant_id", "start_date", "stress_level"]
missing_label_cols = [col for col in required_label_cols if col not in labels_df.columns]
if missing_label_cols:
    raise KeyError(
        f"Label file {label_path} is missing required column(s): {missing_label_cols}"
    )

print(f"✅ Loaded {len(labels_df)} rows from label file")

# === Extract EDA summary stats per window ===


def _load_participant_summary(fpath):
    """Read one participant's summary workbook and add a calendar-date column."""
    summary = pd.read_excel(fpath, parse_dates=["datetime"])  # Replace with correct time col if needed
    summary["date"] = summary["datetime"].dt.date
    return summary


def _signal_stats(values):
    """Return mean, std, min, max and linear-trend slope of a NaN-free series."""
    # The slope is the first-degree polyfit coefficient against the sample
    # position (0, 1, 2, ...); it needs at least two points to be defined.
    if len(values) >= 2:
        slope = np.polyfit(np.arange(len(values)), values, 1)[0]
    else:
        slope = np.nan
    return {
        "mean": values.mean(),
        "std": values.std(),
        "min": values.min(),
        "max": values.max(),
        "slope": slope,
    }


eda_features = []

# file name -> (summary DataFrame, EDA column names). A participant usually
# appears in several label rows; caching avoids re-reading the same workbook
# for each of them. Rows are still processed in label-file order, so the
# resulting feature table keeps the same row order as before.
summary_cache = {}

for _, row in labels_df.iterrows():
    pid = row["participant_id"]
    start_date = row["start_date"]

    fname = f"participant{pid}_na_summary.xlsx"
    fpath = os.path.join(summary_dir, fname)

    if fname not in summary_cache:
        if not os.path.exists(fpath):
            print(f"⚠️ Missing file: {fname}")
            continue
        summary = _load_participant_summary(fpath)
        summary_cache[fname] = (
            summary,
            [col for col in summary.columns if "eda" in col.lower()],
        )

    df, eda_cols = summary_cache[fname]

    # A missing start date (NaT) has no calendar date and would raise on
    # .date(); skip it like the other unusable rows instead of aborting.
    if pd.isna(start_date):
        print(f"⚠️ Missing start_date for participant {pid}; skipping window")
        continue

    # Select 4-day window (start day plus the three following days, inclusive)
    end_date = start_date + pd.Timedelta(days=3)
    in_window = (df["date"] >= start_date.date()) & (df["date"] <= end_date.date())
    df_window = df[in_window]

    if df_window.empty:
        print(f"⚠️ No data for participant {pid} from {start_date} to {end_date}")
        continue

    # Calculate EDA summary features
    if not eda_cols:
        print(f"⚠️ No EDA columns found in {fname}")
        continue

    feat = {
        "participant_id": pid,
        "start_date": start_date,
        "stress_level": row["stress_level"]
    }

    for col in eda_cols:
        stats = _signal_stats(df_window[col].dropna())
        for stat_name, stat_value in stats.items():
            feat[f"{col}_{stat_name}"] = stat_value

    eda_features.append(feat)

# === Combine ===
# Turn the list of per-window feature dicts into a table: one row per window.
eda_df = pd.DataFrame(data=eda_features)
print(f"✅ Extracted features for {len(eda_df)} windows")

# === Drop rows with NaNs ===
# Keep only windows for which every column has a value.
eda_df_clean = eda_df.dropna(axis="index", how="any")
print(f"✅ Remaining after dropna: {len(eda_df_clean)}")

# === ML: Train/Test Split ===
# Stop with an explicit message when no complete feature rows are available
# (for example every participant file was missing, or every window had a NaN
# feature). Without this guard the failure surfaces later as a less direct
# error from the column drop or from the split.
if eda_df_clean.empty:
    raise ValueError(
        "No complete feature rows available for training; "
        "check the label file and the participant summary files."
    )

# Features are every column except the identifiers and the target.
X = eda_df_clean.drop(columns=["participant_id", "start_date", "stress_level"])
y = eda_df_clean["stress_level"]

# Hold out 30% of the windows; the fixed seed keeps the split reproducible.
# NOTE(review): windows from the same participant can land in both train and
# test with a plain random split — confirm whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# === Train and Evaluate ===
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

print("\n=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# === Feature Importance ===
# Label each importance score with its feature name, then rank high to low.
importances = pd.Series(
    clf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

print("\n=== Top Features ===")
print(importances.head())

# Optional: Plot
# Horizontal bars for the ten highest-ranked features.
top_features = importances.head(10)
plt.figure(figsize=(8, 5))
sns.barplot(x=top_features, y=top_features.index)
plt.title("Top 10 EDA Features Predicting Stress")
plt.xlabel("Feature Importance")
plt.tight_layout()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/lpnhu/Downloads/Stress_Testing_Analysis\\eda_qc_reports\\eda_4day_merged_with_labels.csv'