In [16]:
import os
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
import torch

# Load the pretrained 'bert-base-uncased' tokenizer and encoder once, at module
# level, so every feature-extraction call reuses the same objects.
device = torch.device('cpu')  # all inference in this script runs on the CPU
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# nn.Module.to() moves the parameters in place and returns the module itself.
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def extract_bert_features(text):
    """Return the BERT [CLS] embedding of ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the module-level ``model`` with gradient tracking disabled, and the hidden
    state at sequence position 0 is returned with size-1 dimensions squeezed
    away.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    # Every input tensor has to live on the same device as the model.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state

    # Position 0 along the sequence axis is the [CLS] token, used here as the
    # sentence-level feature.
    cls_vector = hidden_states[:, 0, :]
    return cls_vector.squeeze().cpu().numpy()

def process_session(session_id, base_dir):
    """Build one row of BERT text features from a session transcript.

    Reads ``<base_dir>/<session_id>_P/<session_id>_TRANSCRIPT.csv`` as a
    tab-separated file, joins every utterance whose speaker is not 'Ellie'
    into a single string, and embeds that string with
    ``extract_bert_features``.

    Args:
        session_id: Session identifier used to build the transcript path.
        base_dir: ``pathlib.Path`` of the directory that contains the
            ``<session_id>_P`` folders.

    Returns:
        A dict with ``'session_id'`` plus one ``'feat_<i>'`` entry per
        embedding component, or ``None`` (after printing a warning) when the
        transcript cannot be read, lacks a required column, or contains no
        participant text.
    """
    transcript_path = base_dir / f"{session_id}_P" / f"{session_id}_TRANSCRIPT.csv"
    try:
        # Transcript files are tab-separated despite the .csv extension.
        df = pd.read_csv(transcript_path, delimiter='\t')
    except Exception as e:
        # Deliberately broad: one unreadable transcript must not abort the
        # whole batch, so report it and let the caller skip this session.
        print(f"⚠️ Error reading {transcript_path}: {e}")
        return None

    # Check required columns exist
    required_cols = {'start_time', 'stop_time', 'speaker', 'value'}
    if not required_cols.issubset(df.columns):
        print(f"⚠️ Required columns missing in: {transcript_path.name}")
        print(f"   Found columns: {df.columns.tolist()}")
        return None

    # Keep everything not spoken by the interviewer ('Ellie'). Empty cells are
    # read as NaN (a float), which would make str.join raise TypeError, so
    # drop them and coerce the remaining values to str before joining.
    participant_values = df.loc[df['speaker'] != 'Ellie', 'value'].dropna()
    full_text = ' '.join(participant_values.astype(str))

    # An embedding of the empty string carries no information about the
    # session; skip it like the other unusable-transcript cases.
    if not full_text.strip():
        print(f"⚠️ No participant text found in: {transcript_path.name}")
        return None

    # Extract features from combined text
    features = extract_bert_features(full_text)

    # Return as dict with session_id for dataframe
    row = {'session_id': session_id}
    row.update({f'feat_{i}': value for i, value in enumerate(features)})
    return row

def main(base_dir=None, output_file=None, session_ids=None):
    """Extract BERT text features for a set of sessions and save them as CSV.

    Args:
        base_dir: Directory containing the ``<session_id>_P`` folders.
            Defaults to the project's raw DAIC-WOZ directory.
        output_file: Destination CSV path. Defaults to the project's
            ``text_features.csv``. Missing parent directories are created.
        session_ids: Iterable of session identifiers to process. Defaults to
            the strings '300' through '325'.

    Each session is handed to ``process_session``; sessions for which it
    returns ``None`` are skipped. If no session yields features, nothing is
    written and a warning is printed instead.
    """
    if base_dir is None:
        base_dir = r"C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\raw\DAIC-WOZ"
    if output_file is None:
        output_file = r"C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\features\text_features.csv"
    if session_ids is None:
        session_ids = [str(i) for i in range(300, 326)]  # Sessions 300 to 325

    # Accept plain strings as well as Path objects.
    base_dir = Path(base_dir)
    output_file = Path(output_file)

    all_features = []

    print("Processing all sessions...")
    for session_id in tqdm(session_ids):
        features = process_session(session_id, base_dir)
        if features is not None:
            all_features.append(features)

    if not all_features:
        print("⚠ No features extracted. DataFrame is empty.")
        return

    df_features = pd.DataFrame(all_features).sort_values('session_id')
    # Create the output directory up front so a missing folder does not throw
    # away the results of the (slow) feature extraction at the very last step.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df_features.to_csv(output_file, index=False)
    print(f"✅ Saved features to: {output_file}")

if __name__ == "__main__":
    main()


Processing all sessions...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:26<00:00,  1.03s/it]

✅ Saved features to: C:\Users\VIJAY BHUSHAN SINGH\depression_detection_project\data\features\text_features.csv



