In [1]:
"""
FIXED MERGE + SPLIT NOTEBOOK - KEEP ALL SESSIONS

SAVE AS: notebooks/exploratory/08_merge_all_features_FIXED.ipynb
"""

# ========== CELL 1: Import ==========
print("Importing libraries...")
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
print("✓ Libraries imported")

# ========== CELL 2: Load Features ==========
print("\nLoading features...")

FEATURES_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\features')
DATA_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\raw\DAIC-WOZ')
PROCESSED_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\processed')
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Load all feature files
audio_df = pd.read_csv(FEATURES_DIR / 'audio_features.csv')
text_df = pd.read_csv(FEATURES_DIR / 'text_features.csv')
video_df = pd.read_csv(FEATURES_DIR / 'video_features.csv')

print(f"✓ Audio: {audio_df.shape}")
print(f"✓ Text: {text_df.shape}")
print(f"✓ Video: {video_df.shape}")

# ========== CELL 3: Load Labels ==========
print("\nLoading PHQ-8 labels...")
labels_df = pd.read_csv(DATA_DIR / 'train_split_Depression_AVEC2017.csv')

# Standardize session ID
if 'Participant_ID' in labels_df.columns:
    labels_df.rename(columns={'Participant_ID': 'session_id'}, inplace=True)
elif 'participant_id' in labels_df.columns:
    labels_df.rename(columns={'participant_id': 'session_id'}, inplace=True)

# Keep only necessary columns
if 'PHQ8_Score' in labels_df.columns:
    labels_df = labels_df[['session_id', 'PHQ8_Score']]
elif 'PHQ8_Total' in labels_df.columns:
    labels_df.rename(columns={'PHQ8_Total': 'PHQ8_Score'}, inplace=True)
    labels_df = labels_df[['session_id', 'PHQ8_Score']]

print(f"✓ Labels shape: {labels_df.shape}")

# ========== CELL 4: Merge Features (OUTER JOIN) ==========
print("\nMerging all features (outer join)...")

merged_df = audio_df.merge(text_df, on='session_id', how='outer', suffixes=('_audio', '_text'))
merged_df = merged_df.merge(video_df, on='session_id', how='outer', suffixes=('', '_video'))

# Merge with labels (inner join to keep only labeled sessions)
final_df = merged_df.merge(labels_df, on='session_id', how='inner')

print(f"✓ Final dataset: {final_df.shape[0]} sessions, {final_df.shape[1]-2} features")

# ========== CELL 5: Handle Missing Values ==========
print("\nHandling missing values...")

missing_before = final_df.isnull().sum().sum()
print(f"Missing values before: {missing_before}")

# Fill all feature NaNs with column mean
feature_cols = [c for c in final_df.columns if c not in ['session_id', 'PHQ8_Score']]
for col in feature_cols:
    final_df[col] = final_df[col].fillna(final_df[col].mean())

missing_after = final_df.isnull().sum().sum()
print(f"✓ Missing values after: {missing_after}")

# ========== CELL 6: Create Severity Classes ==========
print("\nCreating severity categories...")

def categorize_severity(score):
    if score <= 4:
        return 0  # None
    elif score <= 9:
        return 1  # Mild
    elif score <= 14:
        return 2  # Moderate
    else:
        return 3  # Severe

final_df['severity_class'] = final_df['PHQ8_Score'].apply(categorize_severity)
severity_names = {0: 'None', 1: 'Mild', 2: 'Moderate', 3: 'Severe'}
final_df['severity_name'] = final_df['severity_class'].map(severity_names)

print("✓ Severity categories created")
print(final_df['severity_name'].value_counts().sort_index())

# ========== CELL 7: Train/Val/Test Split ==========
print("\nCreating splits...")

X = final_df[feature_cols].values
y_reg = final_df['PHQ8_Score'].values
y_cls = final_df['severity_class'].values
session_ids = final_df['session_id'].values

# If dataset is small, stratify cannot be used reliably; check minimum class size
min_class_count = final_df['severity_class'].value_counts().min()
if min_class_count < 2:
    stratify = None
    print("⚠ Small dataset: stratification disabled")
else:
    stratify = y_cls

# 70/15/15 split
X_train, X_temp, y_train_reg, y_temp_reg, y_train_cls, y_temp_cls, ids_train, ids_temp = train_test_split(
    X, y_reg, y_cls, session_ids, test_size=0.3, random_state=42, stratify=stratify
)
X_val, X_test, y_val_reg, y_test_reg, y_val_cls, y_test_cls, ids_val, ids_test = train_test_split(
    X_temp, y_temp_reg, y_temp_cls, ids_temp, test_size=0.5, random_state=42,
    stratify=y_temp_cls if stratify is not None else None
)

print(f"✓ Splits created: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

# ========== CELL 8: Save Datasets ==========
print("\nSaving datasets...")

final_df.to_csv(PROCESSED_DIR / 'complete_dataset.csv', index=False)

def save_split(X_split, y_reg_split, y_cls_split, ids_split, filename):
    df = pd.DataFrame(X_split, columns=feature_cols)
    df['session_id'] = ids_split
    df['PHQ8_Score'] = y_reg_split
    df['severity_class'] = y_cls_split
    df.to_csv(PROCESSED_DIR / filename, index=False)
    print(f"✓ Saved: {filename}")

save_split(X_train, y_train_reg, y_train_cls, ids_train, 'train_data.csv')
save_split(X_val, y_val_reg, y_val_cls, ids_val, 'val_data.csv')
save_split(X_test, y_test_reg, y_test_cls, ids_test, 'test_data.csv')

# ========== CELL 9: Feature Summary ==========
audio_features = [c for c in feature_cols if any(x in c for x in ['mfcc','pitch','energy','spectral','zcr','rolloff','duration'])]
text_features = [c for c in feature_cols if 'bert' in c.lower() or any(x in c.lower() for x in ['word','positive','negative','question'])]
video_features = [c for c in feature_cols if 'AU' in c or 'gaze' in c.lower() or any(x in c for x in ['Tx','Ty','Tz','Rx','Ry','Rz'])]

print(f"\nFeatures by modality:")
print(f"  Audio: {len(audio_features)}")
print(f"  Text: {len(text_features)}")
print(f"  Video: {len(video_features)}")
print(f"  Total: {len(feature_cols)}")

# ========== CELL 10: Final Summary ==========
print("\n✅ MERGE + SPLIT COMPLETE!")
print(f"Total sessions: {len(final_df)}")
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
print(f"Features: {len(feature_cols)}")
print(f"Files saved to: {PROCESSED_DIR}")


Importing libraries...
✓ Libraries imported

Loading features...
✓ Audio: (26, 69)
✓ Text: (26, 769)
✓ Video: (26, 76)

Loading PHQ-8 labels...
✓ Labels shape: (107, 2)

Merging all features (outer join)...
✓ Final dataset: 16 sessions, 911 features

Handling missing values...
Missing values before: 0
✓ Missing values after: 0

Creating severity categories...
✓ Severity categories created
severity_name
Mild        7
Moderate    3
None        5
Severe      1
Name: count, dtype: int64

Creating splits...
⚠ Small dataset: stratification disabled
✓ Splits created: Train=11, Val=2, Test=3

Saving datasets...
✓ Saved: train_data.csv
✓ Saved: val_data.csv
✓ Saved: test_data.csv

Features by modality:
  Audio: 68
  Text: 0
  Video: 72
  Total: 911

✅ MERGE + SPLIT COMPLETE!
Total sessions: 16
Train: 11, Val: 2, Test: 3
Features: 911
Files saved to: C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\processed
