In [1]:
# ========== CELL 1: Setup ==========
"""
FIXED MERGE NOTEBOOK
Properly merges audio, text, video features with correct prefixes
"""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split

PROJECT_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project')
FEATURES_DIR = PROJECT_DIR / 'data' / 'features'
PROCESSED_DIR = PROJECT_DIR / 'data' / 'processed'
RAW_DIR = PROJECT_DIR / 'data' / 'raw' / 'DAIC-WOZ'

PROCESSED_DIR.mkdir(exist_ok=True)

print("✓ Setup complete")
print(f"Features: {FEATURES_DIR}")
print(f"Output: {PROCESSED_DIR}")


# ========== CELL 2: Load PHQ-8 Labels ==========

# Load PHQ-8 scores
train_split = pd.read_csv(RAW_DIR / 'train_split_Depression_AVEC2017.csv')
dev_split = pd.read_csv(RAW_DIR / 'dev_split_Depression_AVEC2017.csv')
test_split = pd.read_csv(RAW_DIR / 'test_split_Depression_AVEC2017.csv')

# Combine all splits
all_labels = pd.concat([train_split, dev_split, test_split], ignore_index=True)

print(f"Total sessions with labels: {len(all_labels)}")
print(f"\nLabel columns: {list(all_labels.columns)}")
print(f"\nSample labels:\n{all_labels.head()}")


# ========== CELL 3: Load Feature Files ==========

# Load audio features
audio_df = pd.read_csv(FEATURES_DIR / 'audio_features.csv')
print(f"\n✓ Audio: {len(audio_df)} rows, {len(audio_df.columns)} columns")

# Load text features  
text_df = pd.read_csv(FEATURES_DIR / 'text_features.csv')
print(f"✓ Text: {len(text_df)} rows, {len(text_df.columns)} columns")

# Load video features
video_df = pd.read_csv(FEATURES_DIR / 'video_features.csv')
print(f"✓ Video: {len(video_df)} rows, {len(video_df.columns)} columns")


# ========== CELL 4: Add Prefixes to Feature Names ==========

print("\n" + "="*60)
print("ADDING FEATURE PREFIXES")
print("="*60)

# Audio: Add 'audio_' prefix to all columns except session_id
audio_cols = [col for col in audio_df.columns if col != 'session_id']
audio_df = audio_df.rename(columns={col: f'audio_{col}' for col in audio_cols})
print(f"✓ Audio features renamed: {len(audio_cols)} columns")
print(f"  Example: {list(audio_df.columns[1:4])}")

# Text: Add 'text_' prefix to all columns except session_id
text_cols = [col for col in text_df.columns if col != 'session_id']
text_df = text_df.rename(columns={col: f'text_{col}' for col in text_cols})
print(f"\n✓ Text features renamed: {len(text_cols)} columns")
print(f"  Example: {list(text_df.columns[1:4])}")

# Video: Add 'video_' prefix to all columns except session_id
video_cols = [col for col in video_df.columns if col != 'session_id']
video_df = video_df.rename(columns={col: f'video_{col}' for col in video_cols})
print(f"\n✓ Video features renamed: {len(video_cols)} columns")
print(f"  Example: {list(video_df.columns[1:4])}")


# ========== CELL 5: Merge All Features ==========

print("\n" + "="*60)
print("MERGING FEATURES")
print("="*60)

# Start with audio
merged_df = audio_df.copy()
print(f"Starting with audio: {merged_df.shape}")

# Merge text
merged_df = merged_df.merge(text_df, on='session_id', how='inner')
print(f"After merging text: {merged_df.shape}")

# Merge video
merged_df = merged_df.merge(video_df, on='session_id', how='inner')
print(f"After merging video: {merged_df.shape}")

print(f"\n✓ Total features: {len(merged_df.columns) - 1}")  # -1 for session_id


# ========== CELL 6: Add PHQ-8 Labels ==========

print("\n" + "="*60)
print("ADDING PHQ-8 LABELS")
print("="*60)

# Merge with labels (keep only sessions we have features for)
merged_df = merged_df.merge(
    all_labels[['Participant_ID', 'PHQ8_Score']], 
    left_on='session_id', 
    right_on='Participant_ID',
    how='left'
)

# Drop the duplicate ID column
merged_df = merged_df.drop('Participant_ID', axis=1)

# Check for missing labels
missing_labels = merged_df['PHQ8_Score'].isna().sum()
if missing_labels > 0:
    print(f"⚠ Warning: {missing_labels} sessions missing PHQ-8 scores")
    print(f"Sessions without labels: {merged_df[merged_df['PHQ8_Score'].isna()]['session_id'].tolist()}")
    # Remove sessions without labels
    merged_df = merged_df.dropna(subset=['PHQ8_Score'])
    print(f"Removed sessions without labels. Remaining: {len(merged_df)}")

print(f"\n✓ Final dataset: {len(merged_df)} sessions")
print(f"✓ Total columns: {len(merged_df.columns)}")
print(f"\nPHQ-8 Score distribution:")
print(merged_df['PHQ8_Score'].describe())


# ========== CELL 7: Verify Feature Prefixes ==========

print("\n" + "="*60)
print("VERIFYING FEATURE PREFIXES")
print("="*60)

audio_features = [col for col in merged_df.columns if col.startswith('audio_')]
text_features = [col for col in merged_df.columns if col.startswith('text_')]
video_features = [col for col in merged_df.columns if col.startswith('video_')]

print(f"✓ Audio features: {len(audio_features)}")
print(f"✓ Text features: {len(text_features)}")
print(f"✓ Video features: {len(video_features)}")

if len(audio_features) == 0 or len(text_features) == 0 or len(video_features) == 0:
    print("\n❌ ERROR: Some feature types missing prefixes!")
else:
    print("\n✅ All features have correct prefixes!")


# ========== CELL 8: Create Train/Val/Test Splits ==========

print("\n" + "="*60)
print("CREATING TRAIN/VAL/TEST SPLITS")
print("="*60)

# Filter to only our sessions (300-325)
our_sessions = merged_df[merged_df['session_id'].between(300, 325)].copy()
print(f"Sessions 300-325: {len(our_sessions)}")

if len(our_sessions) < 20:
    print(f"⚠ Warning: Only {len(our_sessions)} sessions available")

# Split: 70% train, 15% val, 15% test
train_df, temp_df = train_test_split(
    our_sessions, 
    test_size=0.3, 
    random_state=42,
    shuffle=True
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    shuffle=True
)

print(f"\n✓ Train set: {len(train_df)} sessions ({len(train_df)/len(our_sessions)*100:.1f}%)")
print(f"✓ Val set: {len(val_df)} sessions ({len(val_df)/len(our_sessions)*100:.1f}%)")
print(f"✓ Test set: {len(test_df)} sessions ({len(test_df)/len(our_sessions)*100:.1f}%)")

print(f"\nTrain session IDs: {sorted(train_df['session_id'].tolist())}")
print(f"Val session IDs: {sorted(val_df['session_id'].tolist())}")
print(f"Test session IDs: {sorted(test_df['session_id'].tolist())}")


# ========== CELL 9: Save Processed Data ==========

print("\n" + "="*60)
print("SAVING PROCESSED DATA")
print("="*60)

# Save splits
train_df.to_csv(PROCESSED_DIR / 'train_data.csv', index=False)
print(f"✓ Saved: train_data.csv ({len(train_df)} rows, {len(train_df.columns)} cols)")

val_df.to_csv(PROCESSED_DIR / 'val_data.csv', index=False)
print(f"✓ Saved: val_data.csv ({len(val_df)} rows, {len(val_df.columns)} cols)")

test_df.to_csv(PROCESSED_DIR / 'test_data.csv', index=False)
print(f"✓ Saved: test_data.csv ({len(test_df)} rows, {len(test_df.columns)} cols)")

# Save full dataset
our_sessions.to_csv(PROCESSED_DIR / 'full_dataset.csv', index=False)
print(f"✓ Saved: full_dataset.csv ({len(our_sessions)} rows, {len(our_sessions.columns)} cols)")


# ========== CELL 10: Final Verification ==========

print("\n" + "="*60)
print("FINAL VERIFICATION")
print("="*60)

# Load back and verify
train_check = pd.read_csv(PROCESSED_DIR / 'train_data.csv')
val_check = pd.read_csv(PROCESSED_DIR / 'val_data.csv')
test_check = pd.read_csv(PROCESSED_DIR / 'test_data.csv')

# Count feature columns
audio_cols = [c for c in train_check.columns if c.startswith('audio_')]
text_cols = [c for c in train_check.columns if c.startswith('text_')]
video_cols = [c for c in train_check.columns if c.startswith('video_')]

print(f"✓ Train: {len(train_check)} samples")
print(f"✓ Val: {len(val_check)} samples")
print(f"✓ Test: {len(test_check)} samples")
print(f"\nFeature counts:")
print(f"  Audio: {len(audio_cols)}")
print(f"  Text: {len(text_cols)}")
print(f"  Video: {len(video_cols)}")
print(f"  Total: {len(audio_cols) + len(text_cols) + len(video_cols)}")

# Check PHQ-8 scores
print(f"\nPHQ-8 Score ranges:")
print(f"  Train: {train_check['PHQ8_Score'].min():.1f} - {train_check['PHQ8_Score'].max():.1f}")
print(f"  Val: {val_check['PHQ8_Score'].min():.1f} - {val_check['PHQ8_Score'].max():.1f}")
print(f"  Test: {test_check['PHQ8_Score'].min():.1f} - {test_check['PHQ8_Score'].max():.1f}")

# Final success check
if len(audio_cols) > 0 and len(text_cols) > 0 and len(video_cols) > 0:
    print("\n" + "="*60)
    print("✅✅✅ DATA MERGE SUCCESSFUL! ✅✅✅")
    print("="*60)
    print("\nAll features have correct prefixes!")
    print("You can now re-run all modeling notebooks (10-21)")
else:
    print("\n❌ ERROR: Feature prefixes still missing!")
    
print(f"\nFiles saved to: {PROCESSED_DIR}")
print("\nNext step: Re-run Notebook 10 (HCMA training)")

✓ Setup complete
Features: C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\features
Output: C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\processed
Total sessions with labels: 189

Label columns: ['Participant_ID', 'PHQ8_Binary', 'PHQ8_Score', 'Gender', 'PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired', 'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving', 'participant_ID']

Sample labels:
   Participant_ID  PHQ8_Binary  PHQ8_Score  Gender  PHQ8_NoInterest  \
0           303.0          0.0         0.0       0              0.0   
1           304.0          0.0         6.0       0              0.0   
2           305.0          0.0         7.0       1              0.0   
3           310.0          0.0         4.0       1              1.0   
4           312.0          0.0         2.0       1              0.0   

   PHQ8_Depressed  PHQ8_Sleep  PHQ8_Tired  PHQ8_Appetite  PHQ8_Failure  \
0             0.0         0.0         0.0   