In [3]:
"""
DIAGNOSTIC NOTEBOOK - Find Data Issues

SAVE AS: notebooks/exploratory/09_diagnose_data_issue.ipynb

RUN THIS FIRST to identify the problem
"""

# ========== CELL 1: Import ==========
import pandas as pd
import numpy as np
from pathlib import Path

# NOTE: these are absolute, machine-specific Windows paths; adjust them when
# running the notebook on another machine.

# Directory the per-modality feature CSVs (audio/text/video) are read from.
FEATURES_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\features')
# Directory the merged train/val/test CSVs are read from.
PROCESSED_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\processed')
# Directory the label split CSV is read from.
DATA_DIR = Path(r'C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\raw\DAIC-WOZ')

# ========== CELL 2: Check Feature Files ==========
def _load_and_report(label, filename):
    """Read one feature CSV from FEATURES_DIR, print its summary, and return it.

    The summary lists the row count, the column count minus one (presumably
    to leave out the ``session_id`` column), and the sorted session IDs.

    Args:
        label: Modality name shown in the heading (e.g. ``"Audio"``).
        filename: CSV file name inside ``FEATURES_DIR``.

    Returns:
        The loaded DataFrame.
    """
    frame = pd.read_csv(FEATURES_DIR / filename)
    print(f"\n📊 {label} Features:")
    print(f"  Sessions: {len(frame)}")
    print(f"  Features: {len(frame.columns) - 1}")
    print(f"  Session IDs: {sorted(frame['session_id'].tolist())}")
    return frame


# Section banner.
print("="*60, "CHECKING FEATURE FILES", "="*60, sep="\n")

# One frame per modality, loaded and reported in audio -> text -> video order.
audio_df = _load_and_report("Audio", 'audio_features.csv')
text_df = _load_and_report("Text", 'text_features.csv')
video_df = _load_and_report("Video", 'video_features.csv')

# ========== CELL 3: Find Common Sessions ==========
# Compares the session IDs of the three feature tables and reports which
# sessions are complete (present in all three) and which are not.
print("\n" + "="*60)
print("FINDING COMMON SESSIONS")
print("="*60)

audio_ids = set(audio_df['session_id'])
text_ids = set(text_df['session_id'])
video_ids = set(video_df['session_id'])

common_ids = audio_ids & text_ids & video_ids
print(f"\n✓ Common sessions (have all 3 modalities): {len(common_ids)}")
print(f"  IDs: {sorted(common_ids)}")

# Sessions that appear in exactly one modality.
audio_only = audio_ids - text_ids - video_ids
text_only = text_ids - audio_ids - video_ids
video_only = video_ids - audio_ids - text_ids

if audio_only:
    print(f"\n⚠ Audio only (missing text/video): {sorted(audio_only)}")
if text_only:
    print(f"⚠ Text only (missing audio/video): {sorted(text_only)}")
if video_only:
    print(f"⚠ Video only (missing audio/text): {sorted(video_only)}")

# The "only" sets above skip sessions present in two of the three modalities:
# those are excluded from common_ids yet belong to none of the "only" sets.
# Report, per modality, every session seen somewhere but absent from it, so
# no incomplete session goes unreported.
all_ids = audio_ids | text_ids | video_ids
for modality_name, modality_ids in (
    ("audio", audio_ids),
    ("text", text_ids),
    ("video", video_ids),
):
    missing_ids = all_ids - modality_ids
    if missing_ids:
        print(f"⚠ Missing {modality_name} features: {sorted(missing_ids)}")

# ========== CELL 4: Check Labels ==========
# Section banner.
print("\n" + "="*60, "CHECKING PHQ-8 LABELS", "="*60, sep="\n")

# Labels are read from the train-split CSV under DATA_DIR.
labels_df = pd.read_csv(DATA_DIR / 'train_split_Depression_AVEC2017.csv')
print("\n📊 Labels file:")
print(f"  Total sessions with labels: {len(labels_df)}")
print(f"  Columns: {labels_df.columns.tolist()}")
print("\nFirst few rows:")
print(labels_df.head())

# Check overlap
# Use 'Participant_ID' as the ID column when the file has it; otherwise fall
# back to whatever the first column is.
if 'Participant_ID' in labels_df.columns:
    label_col = 'Participant_ID'
else:
    label_col = labels_df.columns[0]
label_ids = set(labels_df[label_col].tolist())

# Sessions that have all three modalities and also appear in the labels file.
overlap = common_ids.intersection(label_ids)
print(f"\n✓ Sessions with features AND labels: {len(overlap)}")
print(f"  IDs: {sorted(overlap)}")

# ========== CELL 5: Check Merged Data ==========
# Loads the merged train/val/test CSVs (when train_data.csv exists) and
# compares their combined row count with the number of sessions that have
# both features and labels.
print("\n" + "="*60)
print("CHECKING MERGED DATA")
print("="*60)

if (PROCESSED_DIR / 'train_data.csv').exists():
    train_df = pd.read_csv(PROCESSED_DIR / 'train_data.csv')
    val_df = pd.read_csv(PROCESSED_DIR / 'val_data.csv')
    test_df = pd.read_csv(PROCESSED_DIR / 'test_data.csv')

    # Compute both counts once instead of repeating the sums in every print.
    merged_total = len(train_df) + len(val_df) + len(test_df)
    # NOTE(review): `overlap` is built from the train-split labels file only;
    # confirm it is the right baseline for train + val + test combined.
    expected_total = len(overlap)

    print(f"\n📊 Current merged data:")
    print(f"  Train: {len(train_df)} samples")
    print(f"  Val: {len(val_df)} samples")
    print(f"  Test: {len(test_df)} samples")
    print(f"  Total: {merged_total}")

    # Only flag a problem when the counts really disagree.
    if merged_total == expected_total:
        print(f"\n✓ Merged data matches the expected {expected_total} sessions")
    else:
        print(f"\n⚠ PROBLEM FOUND!")
        print(f"  Expected: {expected_total} sessions")
        print(f"  Got: {merged_total} sessions")

        if merged_total < expected_total:
            print(f"\n  Missing {expected_total - merged_total} sessions!")
        else:
            # A surplus is also a mismatch worth investigating.
            print(f"\n  {merged_total - expected_total} more rows than expected sessions!")
else:
    # Say so explicitly instead of printing nothing under the section header.
    print(f"\n⚠ No merged data found: {PROCESSED_DIR / 'train_data.csv'}")

# ========== CELL 6: Summary ==========
# Recaps the counts gathered above and states a problem/solution only when
# the merged data really disagrees with the expected session count.
print("\n" + "="*60)
print("SUMMARY")
print("="*60)

print(f"\n✅ What we have:")
print(f"  Audio features: {len(audio_df)} sessions")
print(f"  Text features: {len(text_df)} sessions")
print(f"  Video features: {len(video_df)} sessions")
print(f"  Common (all 3): {len(common_ids)} sessions")
print(f"  With labels: {len(overlap)} sessions")

needs_remerge = True
if (PROCESSED_DIR / 'train_data.csv').exists():
    # train_df / val_df / test_df are loaded by the merged-data check, which
    # runs under this same file-existence condition.
    total = len(train_df) + len(val_df) + len(test_df)
    shortfall = len(overlap) - total

    if shortfall == 0:
        # Nothing is wrong: do not print a problem or a fix.
        needs_remerge = False
        print(f"\n✅ Merged data is consistent:")
        print(f"  Merged data has all {total} expected sessions")
    elif shortfall > 0:
        print(f"\n❌ What's wrong:")
        print(f"  Merged data only has: {total} sessions")
        print(f"  Should have: {len(overlap)} sessions")
        print(f"  Missing: {shortfall} sessions")
    else:
        # More rows than expected: report the surplus as a positive number
        # rather than a negative "Missing" count.
        print(f"\n❌ What's wrong:")
        print(f"  Merged data has: {total} sessions")
        print(f"  Should have: {len(overlap)} sessions")
        print(f"  Extra: {-shortfall} sessions")
else:
    print(f"\n❌ What's wrong:")
    print(f"  No merged data found in: {PROCESSED_DIR}")

if needs_remerge:
    print(f"\n💡 Solution:")
    print(f"  Re-run merge notebook (08) with fixed code")
    print(f"  Expected result: ~{len(overlap)} total samples")

print("="*60)

CHECKING FEATURE FILES

📊 Audio Features:
  Sessions: 26
  Features: 68
  Session IDs: [300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325]

📊 Text Features:
  Sessions: 26
  Features: 768
  Session IDs: [300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325]

📊 Video Features:
  Sessions: 26
  Features: 75
  Session IDs: [300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325]

FINDING COMMON SESSIONS

✓ Common sessions (have all 3 modalities): 26
  IDs: [300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325]

CHECKING PHQ-8 LABELS

📊 Labels file:
  Total sessions with labels: 107
  Columns: ['Participant_ID', 'PHQ8_Binary', 'PHQ8_Score', 'Gender', 'PHQ8_NoInterest', 'PHQ8_Depressed', 'PH