# Imports + Config

In [1]:
import json
import requests
import pandas as pd
import numpy as np
#!pip install --upgrade typing_extensions
#!pip install pydantic
from pydantic import BaseModel
from typing import List
from typing import Union
import csv
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from matplotlib.gridspec import GridSpec
import math
import re

# Ollama Config
OLLAMA_NODE = "arctrdagn031"
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"

model = "gemma3-optimized:27b" # qwq:latest


#  Grabbing DAIC-WOZ data (Data Loader)

## Grabbing all participant IDs that have complete PHQ-8 questionnaire data

In [None]:
"""
Specifying which participant IDs should be analyzed
"""

# PHQ-8 label tables for the dev and train splits (AVEC2017 CSV files).
dev_split_phq8 = pd.read_csv(r"/data/users4/user/ai-psychiatrist/datasets/daic_woz_dataset/dev_split_Depression_AVEC2017.csv")
train_split_phq8 = pd.read_csv(r"/data/users4/user/ai-psychiatrist/datasets/daic_woz_dataset/train_split_Depression_AVEC2017.csv")

# Union of the IDs found in either split; the set drops any ID listed twice.
participant_ids = set(dev_split_phq8['Participant_ID']) | set(train_split_phq8['Participant_ID'])

# Ascending list of IDs, which is what the execution loop iterates over.
participant_list = sorted(participant_ids)

# Show the ID list and both label tables, with blank lines between the tables.
print(participant_list, dev_split_phq8, '\n\n\n', train_split_phq8, sep='\n')

[404, 422, 474]
    Participant_ID  PHQ8_Binary  PHQ8_Score  Gender  PHQ8_NoInterest  \
0              302            0           4       1                1   
1              307            0           4       0                0   
2              331            0           8       1                1   
3              335            1          12       0                1   
4              346            1          23       0                2   
5              367            1          19       1                3   
6              377            1          16       0                2   
7              381            1          16       1                2   
8              382            0           0       1                0   
9              388            1          17       1                1   
10             389            1          14       1                1   
11             390            0           9       1                2   
12             395            0           7     

## Quantitative Assessment

In [3]:
"""
Runs basic quantitative analysis on the participants. Instructions to grab evidence, give reasoning, and finally output a score. Outputs validated with pydantic.
"""

class PHQ8ScoreWithExplanation(BaseModel):
    """Assessment of a single PHQ-8 item: the evidence, the rationale, and the score."""
    evidence: str  # Direct quotes or references from interview
    reason: str  # Brief explanation for the score
    # The prompt asks for an integer 0-3 or the string "N/A"; the annotation
    # itself does not enforce the range or the exact string.
    score: Union[int, str]  # 0-3 or "N/A"

class PHQ8ScoresWithExplanations(BaseModel):
    """Complete PHQ-8 assessment: one PHQ8ScoreWithExplanation per questionnaire item.

    All eight fields are required, so validation fails when the parsed model
    output omits any item.
    """
    PHQ8_NoInterest: PHQ8ScoreWithExplanation  # Little interest or pleasure in doing things
    PHQ8_Depressed: PHQ8ScoreWithExplanation   # Feeling down, depressed, or hopeless
    PHQ8_Sleep: PHQ8ScoreWithExplanation       # Trouble falling/staying asleep, or sleeping too much
    PHQ8_Tired: PHQ8ScoreWithExplanation       # Feeling tired or having little energy
    PHQ8_Appetite: PHQ8ScoreWithExplanation    # Poor appetite or overeating
    PHQ8_Failure: PHQ8ScoreWithExplanation     # Feeling bad about self or like a failure
    PHQ8_Concentrating: PHQ8ScoreWithExplanation # Trouble concentrating
    PHQ8_Moving: PHQ8ScoreWithExplanation      # Moving/speaking slowly or being fidgety/restless

# System message sent with every chat request in this notebook. It defines the
# PHQ-8 0-3 frequency scale and instructs the model to answer "N/A" when the
# transcript holds no evidence for an item instead of defaulting to 0.
# NOTE: this text is part of the model input; editing it changes the results.
system_prompt = """You are a AI psychiatrist assessment tool specialized in analyzing interview transcripts to evaluate PHQ-8 (Patient Health Questionnaire-8) scores. The PHQ-8 is a validated depression screening instrument that assesses eight of the nine DSM-IV criteria for depressive disorders over the past 2 weeks.

PHQ-8 Scoring System:
- 0 = Not at all (0-1 days in the past 2 weeks)
- 1 = Several days (2-6 days)
- 2 = More than half the days (7-11 days)
- 3 = Nearly every day (12-14 days)

Key Principles:
1. Base all assessments strictly on evidence from the transcript
2. When no relevant evidence exists for a symptom, output "N/A" rather than assuming absence (score of 0)
3. Consider both explicit statements and behavioral indicators in the interview
4. Be conservative - only assign scores when there is clear evidence of frequency
5. Remember that absence of mention does not necessarily mean absence of symptom
6. Follow the format specified exactly"""

def run_phq8_analysis(patient_transcript, ground_truth, request_timeout=(10, 1800)):
    """
    Predicts PHQ8 values from a patients transcript, compares to ground truth, and calculates evaluation metrics

    Parameters
    ----------
    patient_transcript : string
        The patients transcript
    ground_truth : pandas series
        The ground truth PHQ8 values for a given participant
    request_timeout : float or (float, float) tuple, optional
        Forwarded to ``requests.post`` as ``timeout`` (connect, read) in seconds.
        Without a timeout an unresponsive server blocks the call forever; the
        default allows 10 s to connect and 30 min for the model to answer.

    Returns
    -------
    tuple
        A 5-tuple containing:
        - phq8_scores (PHQ8ScoresWithExplanations or None): Parsed PHQ8 scores with evidence and reasoning
        - avg_difference (float or None): Average absolute difference between predicted and ground truth scores
          (``float('inf')`` when no question received a numeric prediction)
        - accuracy_on_available (float or None): Accuracy score on questions that weren't marked N/A (0-1 scale)
        - num_questions_NA (int or None): Number of questions marked as N/A out of 8 total
        - overall_accuracy (float or None): Overall accuracy adjusted for N/A responses (0-1 scale)

        Returns (None, None, None, None, None) if the request or the parsing fails.
    """
    print("DEBUG: Starting run_phq8_analysis function")
    print(f"DEBUG: Patient transcript length: {len(patient_transcript) if patient_transcript else 'None'}")
    print(f"DEBUG: Ground truth type: {type(ground_truth)}")

    quantitative_prompt = f"""Analyze the following interview transcript and predict PHQ-8 scores for each symptom domain.

Interview transcript:
<transcript>
{patient_transcript}
</transcript>

Analyze each symptom using the following approach in <thinking> tags:
1. Search for direct quotes or behavioral evidence related to each PHQ-8 symptom
2. Evaluate the frequency/severity based on available evidence
3. If no relevant evidence exists, mark as "N/A" rather than assuming absence
4. Only assign numeric scores (0-3) when evidence clearly indicates frequency

After your analysis, provide your final assessment in <answer> tags as a JSON object.

For each symptom, provide:
1. "evidence": exact quotes from transcript (use "No relevant evidence found" if not discussed)
2. "reason": explanation of scoring decision, including why N/A if applicable
3. "score": integer 0-3 based on evidence, or "N/A" if no relevant evidence

Return ONLY a JSON object in <answer> tags with these exact keys:
- "PHQ8_NoInterest": {{evidence, reason, score}} for little interest or pleasure in doing things (anhedonia)
- "PHQ8_Depressed": {{evidence, reason, score}} for feeling down, depressed, or hopeless (depressed mood)
- "PHQ8_Sleep": {{evidence, reason, score}} for trouble falling or staying asleep, or sleeping too much (sleep problems)
- "PHQ8_Tired": {{evidence, reason, score}} for feeling tired or having little energy (fatigue)
- "PHQ8_Appetite": {{evidence, reason, score}} for poor appetite or overeating (appetite/weight changes)
- "PHQ8_Failure": {{evidence, reason, score}} for feeling bad about yourself or that you are a failure (negative self-perception)
- "PHQ8_Concentrating": {{evidence, reason, score}} for trouble concentrating on things like reading or watching TV (concentration problems)
- "PHQ8_Moving": {{evidence, reason, score}} for moving or speaking slowly, or being fidgety/restless (psychomotor changes)"""
    print("DEBUG: Created quantitative prompt")

    # Most deterministic temp, top_k, and top_p
    print("DEBUG: Making API request...")
    try:
        response = requests.post(
            BASE_URL,
            json={
                "model": model,
                "messages": [{"role": "system", "content": system_prompt},
                            {"role": "user", "content": quantitative_prompt}],
                "stream": False,
                "options": {
                    "temperature": 0,
                    "top_k": 1,
                    "top_p": 1.0
                }
            },
            # Bound the wait so a stalled server cannot hang the whole batch.
            timeout=request_timeout,
        )
        print(f"DEBUG: API request completed with status code: {response.status_code}")
    except Exception as e:
        # Best-effort boundary: report the failure and let the caller skip this participant.
        print(f"DEBUG: API request failed with error: {e}")
        return None, None, None, None, None

    def clean_unicode_quotes(text):
        """Replace typographic quotes and dashes with their ASCII equivalents."""
        print("DEBUG: Cleaning unicode quotes")
        replacements = {
            '\u201c': '"',  # Left double quotation mark
            '\u201d': '"',  # Right double quotation mark
            '\u2018': "'",  # Left single quotation mark
            '\u2019': "'",  # Right single quotation mark
            '\u2013': '-',  # En dash
            '\u2014': '-',  # Em dash
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        print("DEBUG: Unicode quotes cleaned")
        return text

    # Helper function to parse scores, handling N/A values
    def parse_score(score_value):
        """Return the score as an int, or "N/A" when it is an N/A marker or not a number."""
        print(f"DEBUG: Parsing score value: {score_value} (type: {type(score_value)})")
        if isinstance(score_value, str) and score_value.upper() in ['N/A', 'NA', 'NULL', 'NONE']:
            print("DEBUG: Score parsed as N/A")
            return "N/A"
        try:
            result = int(score_value)
            print(f"DEBUG: Score parsed as integer: {result}")
            return result
        except (ValueError, TypeError) as e:
            print(f"DEBUG: Score parsing failed, defaulting to N/A. Error: {e}")
            return "N/A"

    # Single source of truth for the item order used in the report and metrics.
    metrics = ['PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired',
               'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving']

    # Parse and validate the response
    print("DEBUG: Starting response parsing")
    try:
        print("DEBUG: Extracting JSON from response")
        response_data = response.json()
        print("DEBUG: Successfully parsed response JSON")

        content = response_data['message']['content']
        print(f"DEBUG: Extracted content, length: {len(content)}")
        print(f"DEBUG: Content preview (first 200 chars): {content[:200]}")

        # Extract content from <answer> tags if present
        print("DEBUG: Checking for <answer> tags")
        if '<answer>' in content and '</answer>' in content:
            print("DEBUG: Found <answer> tags, extracting content")
            content = content.split('<answer>')[1].split('</answer>')[0].strip()
            print(f"DEBUG: Extracted content from <answer> tags, new length: {len(content)}")
        else:
            print("DEBUG: No <answer> tags found")

        # Remove markdown code blocks if present. The fence is searched for
        # anywhere in the text (not only at its start) so that a reply with
        # leading prose still yields its JSON; content that already starts
        # with "{" is left alone.
        print("DEBUG: Checking for markdown code blocks")
        fence_match = None
        if not content.lstrip().startswith('{'):
            fence_match = re.search(r"```(?:json)?\s*(.*?)```", content, re.DOTALL | re.IGNORECASE)
        if fence_match:
            print("DEBUG: Found ``` code block, extracting")
            content = fence_match.group(1).strip()
            print(f"DEBUG: Extracted from ``` block, new length: {len(content)}")
        else:
            print("DEBUG: No markdown code blocks found")
            # Last resort: keep only the outermost {...} span, dropping any
            # text the model wrote before or after the JSON object.
            first_brace, last_brace = content.find('{'), content.rfind('}')
            if 0 <= first_brace < last_brace:
                content = content[first_brace:last_brace + 1]

        # Parse the JSON response. The raw text is tried first: curly quotes
        # inside string values are valid JSON, and rewriting them to '"' up
        # front would corrupt such values. Quote cleaning is only a fallback
        # for replies that used typographic quotes as JSON delimiters.
        print("DEBUG: Attempting to parse JSON")
        print(f"DEBUG: JSON content to parse: {content[:500]}...")  # Show first 500 chars
        try:
            scores_dict = json.loads(content)
        except json.JSONDecodeError as raw_error:
            print(f"DEBUG: JSON parsing of raw content failed: {raw_error}")
            print("DEBUG: About to clean unicode quotes")
            content = clean_unicode_quotes(content)
            try:
                scores_dict = json.loads(content)
            except json.JSONDecodeError as json_error:
                print(f"DEBUG: JSON parsing failed: {json_error}")
                print(f"DEBUG: Failed content: {content}")
                raise

        if not isinstance(scores_dict, dict):
            # A list/scalar payload cannot hold the eight named items.
            raise ValueError(f"Expected a JSON object, got {type(scores_dict).__name__}")
        print("DEBUG: Successfully parsed JSON")
        print(f"DEBUG: Parsed dict keys: {list(scores_dict.keys())}")

        # Convert N/A strings to consistent format before Pydantic validation
        print("DEBUG: Converting scores in dictionary")
        for key, entry in scores_dict.items():
            print(f"DEBUG: Processing key: {key}")
            # Non-dict entries are left untouched; Pydantic reports them below.
            if isinstance(entry, dict) and 'score' in entry:
                old_score = entry['score']
                entry['score'] = parse_score(old_score)
                print(f"DEBUG: Converted score for {key}: {old_score} -> {entry['score']}")
            else:
                print(f"DEBUG: No 'score' key found in {key}")

        print("DEBUG: Creating PHQ8ScoresWithExplanations object")
        try:
            phq8_scores = PHQ8ScoresWithExplanations(**scores_dict)
            print("DEBUG: Successfully created PHQ8ScoresWithExplanations object")
        except Exception as pydantic_error:
            print(f"DEBUG: Pydantic validation failed: {pydantic_error}")
            print(f"DEBUG: scores_dict structure: {scores_dict}")
            raise

        # Extract the 8 PHQ-8 score values
        print("DEBUG: Extracting score values from PHQ8 object")
        predicted_values = [getattr(phq8_scores, metric).score for metric in metrics]
        print(f"DEBUG: Extracted scores list: {predicted_values}")

        print("DEBUG: Starting comparison calculations")
        print("Comparison of Predicted vs Ground Truth:")
        print("Metric\t\t\tPredicted\tGround Truth\tDifference")
        print("-" * 65)

        differences = []
        n_available = 0
        num_questions_NA = 0

        print(f"DEBUG: Processing {len(metrics)} metrics")
        for position, (metric, pred_val) in enumerate(zip(metrics, predicted_values), start=1):
            print(f"DEBUG: Processing metric {position}/{len(metrics)}: {metric}")
            # Handle ground truth values that might be NaN
            gt_raw = ground_truth[metric]
            print(f"DEBUG: {metric} gt_raw: {gt_raw} (type: {type(gt_raw)})")

            if pd.isna(gt_raw):
                print(f"DEBUG: Ground truth for {metric} is NaN, skipping this metric")
                continue

            try:
                gt_val = int(gt_raw)
                print(f"DEBUG: Ground truth value converted to int: {gt_val}")
            except (ValueError, TypeError) as e:
                print(f"DEBUG: Failed to convert ground truth to int: {e}")
                continue

            if pred_val == "N/A":
                diff_str = "N/A"
                num_questions_NA += 1
                print(f"DEBUG: Predicted value is N/A, incrementing NA count to {num_questions_NA}")
            else:
                diff = abs(pred_val - gt_val)
                differences.append(diff)
                diff_str = str(diff)
                n_available += 1
                print(f"DEBUG: Calculated difference: {diff}, n_available now: {n_available}")
            print(f"{metric:<23} {str(pred_val):<12} {gt_val:<15} {diff_str}")

        print("DEBUG: Calculating final metrics")
        # Calculate metrics
        if n_available > 0:
            avg_difference = sum(differences) / n_available
            # Scores span 0-3, so 3 is the largest possible per-item error.
            accuracy_on_available = 1 - (avg_difference / 3)
            print(f"DEBUG: avg_difference: {avg_difference}, accuracy_on_available: {accuracy_on_available}")
        else:
            avg_difference = float('inf')
            accuracy_on_available = 0
            print("DEBUG: No available questions, setting defaults")

        # Accuracy * % available questions
        overall_accuracy = accuracy_on_available * (1 - (num_questions_NA / 8))
        print(f"DEBUG: overall_accuracy: {overall_accuracy}")

        print("-" * 65)
        if n_available > 0:
            print(f"Average Absolute Difference (on available): {avg_difference:.2f}")
            print(f"Accuracy on available questions: {accuracy_on_available:.2%}")
        print(f"Questions marked N/A: {num_questions_NA}/8")
        print(f"Overall accuracy: {overall_accuracy:.2%}")

        # Reasoning and evidence section
        print("DEBUG: Starting detailed reasoning section")
        print("\n\nDetailed Reasoning for Each Score:")
        print("=" * 80)

        symptom_names = {
            'PHQ8_NoInterest': 'Little Interest/Pleasure',
            'PHQ8_Depressed': 'Feeling Depressed',
            'PHQ8_Sleep': 'Sleep Problems',
            'PHQ8_Tired': 'Fatigue',
            'PHQ8_Appetite': 'Appetite Changes',
            'PHQ8_Failure': 'Negative Self-Perception',
            'PHQ8_Concentrating': 'Concentration Problems',
            'PHQ8_Moving': 'Psychomotor Changes'
        }

        for key, symptom_name in symptom_names.items():
            print(f"DEBUG: Processing reasoning for {key}")
            try:
                score_data = getattr(phq8_scores, key)
                print(f"\n{symptom_name} (Score: {score_data.score})")
                print("-" * 40)
                print(f"Evidence: {score_data.evidence}")
                print(f"Reason: {score_data.reason}")
            except Exception as e:
                print(f"DEBUG: Error processing reasoning for {key}: {e}")

        print("DEBUG: Function completed successfully")
        return phq8_scores, avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy

    except (json.JSONDecodeError, KeyError, ValueError) as e:
        # Expected failure modes: malformed reply, missing keys, invalid payload.
        print(f"DEBUG: Exception caught in main try block: {type(e).__name__}: {e}")
        print("DEBUG: Raw response object:", response)
        try:
            print("DEBUG: Response JSON:")
            print(json.dumps(response.json(), indent=2))
        except Exception as json_error:
            print(f"DEBUG: Could not parse response as JSON: {json_error}")
            print(f"DEBUG: Raw response text: {response.text}")
        return None, None, None, None, None
    except Exception as unexpected_error:
        print(f"DEBUG: Unexpected error: {type(unexpected_error).__name__}: {unexpected_error}")
        import traceback
        print("DEBUG: Full traceback:")
        traceback.print_exc()
        return None, None, None, None, None


## **Execution Loop**

In [None]:
"""
Execution loop 
"""

csv_file = "/data/users2/user/ai-psychiatrist/analysis_output/results.csv"
json_file = "/data/users2/user/ai-psychiatrist/analysis_output/results_detailed.jsonl"

# The eight PHQ-8 items, in the column order used by both output files.
phq8_items = ['PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired',
              'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving']


def handle_nan(value):
    """Return None for a missing scalar (NaN/None/NA); any other value unchanged."""
    if pd.isna(value):
        return None  # or return "N/A" if you prefer a string placeholder
    return value


def _transcript_to_text(transcript_df):
    """Render a transcript table as "speaker: text" lines joined by newlines.

    Rows with a missing speaker or text are dropped: a NaN entry would make
    ``str.join`` raise TypeError and abort the run.
    """
    rows = transcript_df.dropna(subset=['speaker', 'value'])
    return '\n'.join(rows['speaker'].astype(str) + ': ' + rows['value'].astype(str))


# Initialize CSV file with headers (overwrites the file from any previous run)
with open(csv_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['participant_id', 'timestamp', *phq8_items,
                     'avg_difference', 'accuracy_on_available', 'num_questions_na', 'overall_accuracy'])

# Execution loop
for participant_id in participant_list:
    try:
        current_transcript = pd.read_csv(fr"/data/users4/user/ai-psychiatrist/datasets/daic_woz_dataset/{participant_id}_P/{participant_id}_TRANSCRIPT.csv", sep="\t")
    except FileNotFoundError as missing_file:
        # One absent transcript should not abort the remaining participants.
        print(f"\nFailed to analyze participant {participant_id} - skipping")
        print(f"Transcript not found: {missing_file}")
        print("="*80)
        continue

    # Reformatting transcript data to be a string with speaker name + text
    current_patient_transcript = _transcript_to_text(current_transcript)

    # Get ground truth data for participant (train split takes precedence)
    if participant_id in train_split_phq8['Participant_ID'].values:
        ground_truth = train_split_phq8[train_split_phq8['Participant_ID'] == participant_id].iloc[0]
    else:
        ground_truth = dev_split_phq8[dev_split_phq8['Participant_ID'] == participant_id].iloc[0]

    # Run analysis
    print("RUNNING ANALYSIS!!!")
    phq8_scores, avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy = run_phq8_analysis(current_patient_transcript, ground_truth)
    print("ANALYSIS COMPLETE!!")

    if phq8_scores is not None:
        # Save to CSV: one row per participant, scores followed by the metrics
        timestamp = datetime.now().isoformat()
        with open(csv_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([
                participant_id, timestamp,
                *(handle_nan(getattr(phq8_scores, item).score) for item in phq8_items),
                handle_nan(avg_difference),
                handle_nan(accuracy_on_available),
                handle_nan(num_questions_NA),
                handle_nan(overall_accuracy)
            ])

        # Save detailed data to JSONL: evidence, reason and score for every item
        detailed_data = {"participant_id": participant_id, "timestamp": timestamp}
        for item in phq8_items:
            item_result = getattr(phq8_scores, item)
            detailed_data[item] = {
                "evidence": item_result.evidence,
                "reason": item_result.reason,
                "score": handle_nan(item_result.score),
            }

        with open(json_file, 'a') as f:
            f.write(json.dumps(detailed_data) + '\n')

        print(f"\nCompleted analysis for participant {participant_id}")
    else:
        print(f"\nFailed to analyze participant {participant_id} - skipping")
        print(f"Analysis Stats:\nPHQ8_Scores: {phq8_scores}\nAvg_Difference: {avg_difference}\nAccuracy_On_Available: {accuracy_on_available}\nNum_Questions_NA: {num_questions_NA}\nOverall_Accuracy: {overall_accuracy}")

    print("="*80)

RUNNING ANALYSIS!!!
DEBUG: Starting run_phq8_analysis function
DEBUG: Patient transcript length: 11920
DEBUG: Ground truth type: <class 'pandas.core.series.Series'>
DEBUG: Created quantitative prompt
DEBUG: Making API request...


KeyboardInterrupt: 

In [None]:

# Single-participant check: stream the model's reply for one transcript and
# echo it as it arrives, to inspect the raw output outside the batch loop.
participant_id = 422

current_transcript = pd.read_csv(fr"/data/users4/user/ai-psychiatrist/datasets/daic_woz_dataset/{participant_id}_P/{participant_id}_TRANSCRIPT.csv", sep="\t")

# Reformatting transcript data to be a string with speaker name + text.
# Rows missing a speaker or text are dropped first: a NaN entry would make
# str.join raise TypeError.
spoken_rows = current_transcript.dropna(subset=['speaker', 'value'])
current_patient_transcript = '\n'.join(spoken_rows['speaker'].astype(str) + ': ' + spoken_rows['value'].astype(str))

quantitative_prompt = f"""Analyze the following interview transcript and predict PHQ-8 scores for each symptom domain.

Interview transcript:
<transcript>
{current_patient_transcript}
</transcript>

Analyze each symptom using the following approach in <thinking> tags:
1. Search for direct quotes or behavioral evidence related to each PHQ-8 symptom
2. Evaluate the frequency/severity based on available evidence
3. If no relevant evidence exists, mark as "N/A" rather than assuming absence
4. Only assign numeric scores (0-3) when evidence clearly indicates frequency

After your analysis, provide your final assessment in <answer> tags as a JSON object.

For each symptom, provide:
1. "evidence": exact quotes from transcript (use "No relevant evidence found" if not discussed)
2. "reason": explanation of scoring decision, including why N/A if applicable
3. "score": integer 0-3 based on evidence, or "N/A" if no relevant evidence

Return ONLY a JSON object in <answer> tags with these exact keys:
- "PHQ8_NoInterest": {{evidence, reason, score}} for little interest or pleasure in doing things (anhedonia)
- "PHQ8_Depressed": {{evidence, reason, score}} for feeling down, depressed, or hopeless (depressed mood)
- "PHQ8_Sleep": {{evidence, reason, score}} for trouble falling or staying asleep, or sleeping too much (sleep problems)
- "PHQ8_Tired": {{evidence, reason, score}} for feeling tired or having little energy (fatigue)
- "PHQ8_Appetite": {{evidence, reason, score}} for poor appetite or overeating (appetite/weight changes)
- "PHQ8_Failure": {{evidence, reason, score}} for feeling bad about yourself or that you are a failure (negative self-perception)
- "PHQ8_Concentrating": {{evidence, reason, score}} for trouble concentrating on things like reading or watching TV (concentration problems)
- "PHQ8_Moving": {{evidence, reason, score}} for moving or speaking slowly, or being fidgety/restless (psychomotor changes)"""

# Process the streaming response. The context manager releases the HTTP
# connection even when the loop exits early or the cell is interrupted;
# a streamed response is otherwise kept open until fully consumed.
full_content = ""
with requests.post(
    BASE_URL,
    json={
        "model": model,
        "messages": [{"role": "system", "content": system_prompt},
                    {"role": "user", "content": quantitative_prompt}],
        "stream": True,  # Enable streaming
        "options": {
            "temperature": 0.1,
            "top_k": 10,
            "top_p": 0.8
        }
    },
    stream=True,  # Enable streaming in requests
    timeout=(10, 1800),  # (connect, read) seconds, so a stalled server cannot hang the cell
) as response:
    for line in response.iter_lines():
        if not line:
            continue
        try:
            # Parse the JSON chunk
            chunk = json.loads(line.decode('utf-8'))
        except json.JSONDecodeError:
            # Skip malformed lines
            continue

        # Extract content from the message if it exists
        if 'message' in chunk and 'content' in chunk['message']:
            content = chunk['message']['content']
            print(content, end='', flush=True)  # Print without newline, flush immediately
            full_content += content

        # Check if streaming is complete
        if chunk.get('done', False):
            print()  # Add final newline
            print(f"\nStreaming complete. Total tokens: {chunk.get('eval_count', 'unknown')}")
            break

print(f"\nFull response: {full_content}")


In [None]:
"""
Execution loop 
"""

csv_file = "/data/users2/user/ai-psychiatrist/analysis_output/results.csv"
json_file = "/data/users2/user/ai-psychiatrist/analysis_output/results_detailed.jsonl"

# The eight PHQ-8 items, in the column order used by both output files.
phq8_items = ['PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired',
              'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving']


def handle_nan(value):
    """Return None for a missing scalar (NaN/None/NA); any other value unchanged."""
    if pd.isna(value):
        return None  # or return "N/A" if you prefer a string placeholder
    return value


def _transcript_to_text(transcript_df):
    """Render a transcript table as "speaker: text" lines joined by newlines.

    Rows with a missing speaker or text are dropped: a NaN entry would make
    ``str.join`` raise TypeError and abort the run.
    """
    rows = transcript_df.dropna(subset=['speaker', 'value'])
    return '\n'.join(rows['speaker'].astype(str) + ': ' + rows['value'].astype(str))


# Initialize CSV file with headers (overwrites the file from any previous run)
with open(csv_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['participant_id', 'timestamp', *phq8_items,
                     'avg_difference', 'accuracy_on_available', 'num_questions_na', 'overall_accuracy'])

# Execution loop
for participant_id in participant_list:
    try:
        current_transcript = pd.read_csv(fr"/data/users4/user/ai-psychiatrist/datasets/daic_woz_dataset/{participant_id}_P/{participant_id}_TRANSCRIPT.csv", sep="\t")
    except FileNotFoundError as missing_file:
        # One absent transcript should not abort the remaining participants.
        print(f"\nFailed to analyze participant {participant_id} - skipping")
        print(f"Transcript not found: {missing_file}")
        print("="*80)
        continue

    # Reformatting transcript data to be a string with speaker name + text
    current_patient_transcript = _transcript_to_text(current_transcript)

    # Get ground truth data for participant (train split takes precedence)
    if participant_id in train_split_phq8['Participant_ID'].values:
        ground_truth = train_split_phq8[train_split_phq8['Participant_ID'] == participant_id].iloc[0]
    else:
        ground_truth = dev_split_phq8[dev_split_phq8['Participant_ID'] == participant_id].iloc[0]

    # Run analysis
    print("RUNNING ANALYSIS!!!")
    phq8_scores, avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy = run_phq8_analysis(current_patient_transcript, ground_truth)
    print("ANALYSIS COMPLETE!!")

    if phq8_scores is not None:
        # Save to CSV: one row per participant, scores followed by the metrics
        timestamp = datetime.now().isoformat()
        with open(csv_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([
                participant_id, timestamp,
                *(handle_nan(getattr(phq8_scores, item).score) for item in phq8_items),
                handle_nan(avg_difference),
                handle_nan(accuracy_on_available),
                handle_nan(num_questions_NA),
                handle_nan(overall_accuracy)
            ])

        # Save detailed data to JSONL: evidence, reason and score for every item
        detailed_data = {"participant_id": participant_id, "timestamp": timestamp}
        for item in phq8_items:
            item_result = getattr(phq8_scores, item)
            detailed_data[item] = {
                "evidence": item_result.evidence,
                "reason": item_result.reason,
                "score": handle_nan(item_result.score),
            }

        with open(json_file, 'a') as f:
            f.write(json.dumps(detailed_data) + '\n')

        print(f"\nCompleted analysis for participant {participant_id}")
    else:
        print(f"\nFailed to analyze participant {participant_id} - skipping")
        print(f"Analysis Stats:\nPHQ8_Scores: {phq8_scores}\nAvg_Difference: {avg_difference}\nAccuracy_On_Available: {accuracy_on_available}\nNum_Questions_NA: {num_questions_NA}\nOverall_Accuracy: {overall_accuracy}")

    print("="*80)

RUNNING ANALYSIS!!!
DEBUG: Starting run_phq8_analysis function
DEBUG: Patient transcript length: 11920
DEBUG: Ground truth type: <class 'pandas.core.series.Series'>
DEBUG: Created quantitative prompt
DEBUG: Making API request...


KeyboardInterrupt: 