# Imports + Config

In [1]:
import json
import requests
import pandas as pd
import numpy as np
#!pip install --upgrade typing_extensions
#!pip install pydantic
from pydantic import BaseModel
from typing import List
from typing import Union
import csv
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from matplotlib.gridspec import GridSpec

# Ollama Config
OLLAMA_NODE = "arctrdagn032"
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"

model = "gemma3-optimized:27b" # qwq:latest


#  Grabbing DAIC-WOZ data (Data Loader)

## Grabbing all participant IDs that have all the PHQ-8 questionare data

In [None]:
"""
Specifying which participant IDs should be analyzed
"""

dev_split_phq8 = pd.read_csv(r"/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/dev_split_Depression_AVEC2017.csv")
train_split_phq8 = pd.read_csv(r"/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/train_split_Depression_AVEC2017.csv")

participant_ids = set()

# Grabbing unique participant ID values and putting them in a list

participant_ids.update(dev_split_phq8['Participant_ID'])
participant_ids.update(train_split_phq8['Participant_ID'])

participant_list = sorted(list(participant_ids))

# Specified values already done, because the basic execution code doesn't check for old values. Also, certain values made the analysis freeze so they were included here as well.
values_already_done = {302, 303, 304, 305, 307, 310, 312, 313, 315, 316, 317, 318, 319, 320, 321, 322, 324, 325, 326, 327, 328, 330, 331, 333, 335, 336, 338, 339, 340, 341, 343, 344, 345, 346, 347, 348, 350, 351, 352, 353, 355, 356, 357, 358, 360, 362, 363, 364, 366, 367, 368, 369, 370, 371, 372, 374, 375, 376, 377, 379, 380, 381, 382, 383, 385, 386, 388, 389, 390, 391, 392, 393, 395, 397, 400, 401, 402, 403, 404, 406, 409, 412, 413, 414, 415, 416, 417, 418, 419, 420, 422, 423, 425, 426, 427, 428, 429, 430, 433, 434, 436, 437, 439, 440, 441, 443, 444, 445, 446, 447, 448, 449, 451, 454, 455, 456, 457, 458, 459, 463, 464, 468, 471, 472, 473}
for value in values_already_done:
    if value in participant_list:
        participant_list.remove(value)
print(participant_list)
print(dev_split_phq8)
print('\n\n\n')
print(train_split_phq8)

[474, 475, 476, 477, 478, 479, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492]
    Participant_ID  PHQ8_Binary  PHQ8_Score  Gender  PHQ8_NoInterest  \
0              302            0           4       1                1   
1              307            0           4       0                0   
2              331            0           8       1                1   
3              335            1          12       0                1   
4              346            1          23       0                2   
5              367            1          19       1                3   
6              377            1          16       0                2   
7              381            1          16       1                2   
8              382            0           0       1                0   
9              388            1          17       1                1   
10             389            1          14       1                1   
11             390            0           9       

## Quantitative Assesment

In [None]:
"""
Runs basic quantitative analysis on the participants. Instructions to grab evidence, give reasoning, and finally output a score. Outputs validated with pydantic.
"""

class PHQ8ScoreWithExplanation(BaseModel):
    evidence: str  # Direct quotes or references from interview
    reason: str  # Brief explanation for the score
    score: Union[int, str]  # 0-3 or "N/A"

class PHQ8ScoresWithExplanations(BaseModel):
    PHQ8_NoInterest: PHQ8ScoreWithExplanation  # Little interest or pleasure in doing things
    PHQ8_Depressed: PHQ8ScoreWithExplanation   # Feeling down, depressed, or hopeless
    PHQ8_Sleep: PHQ8ScoreWithExplanation       # Trouble falling/staying asleep, or sleeping too much
    PHQ8_Tired: PHQ8ScoreWithExplanation       # Feeling tired or having little energy
    PHQ8_Appetite: PHQ8ScoreWithExplanation    # Poor appetite or overeating
    PHQ8_Failure: PHQ8ScoreWithExplanation     # Feeling bad about self or like a failure
    PHQ8_Concentrating: PHQ8ScoreWithExplanation # Trouble concentrating
    PHQ8_Moving: PHQ8ScoreWithExplanation      # Moving/speaking slowly or being fidgety/restless

system_prompt = """You are a AI psychiatrist assessment tool specialized in analyzing interview transcripts to evaluate PHQ-8 (Patient Health Questionnaire-8) scores. The PHQ-8 is a validated depression screening instrument that assesses eight of the nine DSM-IV criteria for depressive disorders over the past 2 weeks.

PHQ-8 Scoring System:
- 0 = Not at all (0-1 days in the past 2 weeks)
- 1 = Several days (2-6 days)
- 2 = More than half the days (7-11 days)
- 3 = Nearly every day (12-14 days)

Key Principles:
1. Base all assessments strictly on evidence from the transcript
2. When no relevant evidence exists for a symptom, output "N/A" rather than assuming absence (score of 0)
3. Consider both explicit statements and behavioral indicators in the interview
4. Be conservative - only assign scores when there is clear evidence of frequency
5. Remember that absence of mention does not necessarily mean absence of symptom
6. Follow the format specified exactly"""

def run_phq8_analysis(patient_transcript, ground_truth):
    """
    Predicts PHQ8 values from a patients transcript, compares to ground truth, and calculates evaluation metrics

    Parameters
    ----------
    patient_transcript : string
        The patients transcript
    ground_truth : pandas series
        The ground truth PHQ8 values for a given participant

    Returns
    -------
    tuple
        A 5-tuple containing:
        - phq8_scores (PHQ8ScoresWithExplanations or None): Parsed PHQ8 scores with evidence and reasoning
        - avg_difference (float or None): Average absolute difference between predicted and ground truth scores
        - accuracy_on_available (float or None): Accuracy score on questions that weren't marked N/A (0-1 scale)
        - num_questions_NA (int or None): Number of questions marked as N/A out of 8 total
        - overall_accuracy (float or None): Overall accuracy adjusted for N/A responses (0-1 scale)
        
        Returns (None, None, None, None, None) if parsing fails.
    """
    quantitative_prompt = f"""Analyze the following interview transcript and predict PHQ-8 scores for each symptom domain.

Interview transcript:
<transcript>
{patient_transcript}
</transcript>

Analyze each symptom using the following approach in <thinking> tags:
1. Search for direct quotes or behavioral evidence related to each PHQ-8 symptom
2. Evaluate the frequency/severity based on available evidence
3. If no relevant evidence exists, mark as "N/A" rather than assuming absence
4. Only assign numeric scores (0-3) when evidence clearly indicates frequency

After your analysis, provide your final assessment in <answer> tags as a JSON object.

For each symptom, provide:
1. "evidence": exact quotes from transcript (use "No relevant evidence found" if not discussed)
2. "reason": explanation of scoring decision, including why N/A if applicable
3. "score": integer 0-3 based on evidence, or "N/A" if no relevant evidence

Return ONLY a JSON object in <answer> tags with these exact keys:
- "nointerest": {{evidence, reason, score}} for little interest or pleasure in doing things (anhedonia)
- "depressed": {{evidence, reason, score}} for feeling down, depressed, or hopeless (depressed mood)
- "sleep": {{evidence, reason, score}} for trouble falling or staying asleep, or sleeping too much (sleep problems)
- "tired": {{evidence, reason, score}} for feeling tired or having little energy (fatigue)
- "appetite": {{evidence, reason, score}} for poor appetite or overeating (appetite/weight changes)
- "failure": {{evidence, reason, score}} for feeling bad about yourself or that you are a failure (negative self-perception)
- "concentrating": {{evidence, reason, score}} for trouble concentrating on things like reading or watching TV (concentration problems)
- "moving": {{evidence, reason, score}} for moving or speaking slowly, or being fidgety/restless (psychomotor changes)"""

    # Most deterministic temp, top_k, and top_p
    response = requests.post(
        BASE_URL,
        json={
            "model": model,
            "messages": [{"role": "system", "content": system_prompt},
                        {"role": "user", "content": quantitative_prompt}],
            "stream": False,
            "options": {
                "temperature": 0,
                "top_k": 1,
                "top_p": 1.0
            }
        }
    )

    # Parse and validate the response
    try:
        response_data = response.json()
        content = response_data['message']['content']
        
        # Extract content from <answer> tags if present
        if '<answer>' in content and '</answer>' in content:
            content = content.split('<answer>')[1].split('</answer>')[0].strip()
        
        # Remove markdown code blocks if present
        if content.startswith('```json'):
            content = content.split('```json')[1].split('```')[0].strip()
        elif content.startswith('```'):
            content = content.split('```')[1].split('```')[0].strip()
        
        # Parse the JSON response and validate with Pydantic
        scores_dict = json.loads(content)
        phq8_scores = PHQ8ScoresWithExplanations(**scores_dict)
        
        # Extract the 8 PHQ-8 score values
        scores_list = [
            phq8_scores.nointerest.score,
            phq8_scores.depressed.score,
            phq8_scores.sleep.score,
            phq8_scores.tired.score,
            phq8_scores.appetite.score,
            phq8_scores.failure.score,
            phq8_scores.concentrating.score,
            phq8_scores.moving.score
        ]
        
        print("Comparison of Predicted vs Ground Truth:")
        print("Metric\t\t\tPredicted\tGround Truth\tDifference")
        print("-" * 65)

        differences = []
        n_available = 0
        num_questions_NA = 0
        metrics = ['PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired', 
                'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving']
        predicted_values = [phq8_scores.nointerest.score, phq8_scores.depressed.score, phq8_scores.sleep.score, 
                        phq8_scores.tired.score, phq8_scores.appetite.score, phq8_scores.failure.score,
                        phq8_scores.concentrating.score, phq8_scores.moving.score]

        for metric, pred_val in zip(metrics, predicted_values):
            gt_val = int(ground_truth[metric])
            if pred_val == "N/A":
                diff_str = "N/A"
                num_questions_NA += 1
            else:
                diff = abs(pred_val - gt_val)
                differences.append(diff)
                diff_str = str(diff)
                n_available += 1
            print(f"{metric:<23} {str(pred_val):<12} {gt_val:<15} {diff_str}")

        # Calculate metrics
        if n_available > 0:
            avg_difference = sum(differences) / n_available
            accuracy_on_available = 1 - (avg_difference / 3)
        else:
            avg_difference = float('inf')
            accuracy_on_available = 0
        
        # Accuracy * % available questions
        overall_accuracy = accuracy_on_available * (1 - (num_questions_NA / 8))
        
        print("-" * 65)
        if n_available > 0:
            print(f"Average Absolute Difference (on available): {avg_difference:.2f}")
            print(f"Accuracy on available questions: {accuracy_on_available:.2%}")
        print(f"Questions marked N/A: {num_questions_NA}/8")
        print(f"Overall accuracy: {overall_accuracy:.2%}")
        
        # Reasoning and evidence section
        print("\n\nDetailed Reasoning for Each Score:")
        print("=" * 80)
        
        symptom_names = {
            'PHQ8_NoInterest': 'Little Interest/Pleasure',
            'PHQ8_Depressed': 'Feeling Depressed',
            'PHQ8_Sleep': 'Sleep Problems',
            'PHQ8_Tired': 'Fatigue',
            'PHQ8_Appetite': 'Appetite Changes',
            'PHQ8_Failure': 'Negative Self-Perception',
            'PHQ8_Concentrating': 'Concentration Problems',
            'PHQ8_Moving': 'Psychomotor Changes'
        }
        
        for key, symptom_name in symptom_names.items():
            score_data = getattr(phq8_scores, key)
            print(f"\n{symptom_name} (Score: {score_data.score})")
            print("-" * 40)
            print(f"Evidence: {score_data.evidence}")
            print(f"Reason: {score_data.reason}")

        return phq8_scores, avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy

    except (json.JSONDecodeError, KeyError, ValueError) as e:
        print(f"Error parsing response: {e}")
        print("Raw response:", response)
        print(json.dumps(response.json(), indent=2))
        return None, None, None, None, None


## **Execution Loop (Slightly broken)**

In [None]:
"""
Execution loop 
"""

csv_file = f"/data/users2/agreene46/ai-psychiatrist/analysis_output/results.csv"
json_file = f"/data/users2/agreene46/ai-psychiatrist/analysis_output/results_detailed.jsonl"

# Initialize CSV file with headers
with open(csv_file, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['participant_id', 'timestamp', 'nointerest_score', 'depressed_score', 'sleep_score', 
                     'tired_score', 'appetite_score', 'failure_score', 'concentrating_score', 'moving_score',
                     'avg_difference', 'accuracy_on_available', 'num_questions_na', 'overall_accuracy'])

# Execution loop
for participant_id in participant_list:
    current_transcript = pd.read_csv(fr"/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/{participant_id}_P/{participant_id}_TRANSCRIPT.csv", sep="\t")
    
    # Reformatting transcript data to be a string with speaker name + text
    current_patient_transcript = '\n'.join(current_transcript['speaker'] + ': ' + current_transcript['value'])
    
    # Get ground truth data for participant
    if participant_id in train_split_phq8['Participant_ID'].values:
        ground_truth = train_split_phq8[train_split_phq8['Participant_ID'] == participant_id].iloc[0]
    else:
        ground_truth = dev_split_phq8[dev_split_phq8['Participant_ID'] == participant_id].iloc[0]

    # Run analysis
    phq8_scores, avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy = run_phq8_analysis(current_patient_transcript, ground_truth)

    if phq8_scores is not None:
        # Save to CSV
        timestamp = datetime.now().isoformat()
        with open(csv_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([
                participant_id, timestamp,
                phq8_scores.nointerest.score, phq8_scores.depressed.score, phq8_scores.sleep.score,
                phq8_scores.tired.score, phq8_scores.appetite.score, phq8_scores.failure.score,
                phq8_scores.concentrating.score, phq8_scores.moving.score,
                avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy
            ])
        
        # Save detailed data to JSONL
        detailed_data = {
            "participant_id": participant_id,
            "timestamp": timestamp,
            "PHQ8_NoInterest": {"evidence": phq8_scores.nointerest.evidence, "reason": phq8_scores.nointerest.reason, "score": phq8_scores.nointerest.score},
            "PHQ8_Depressed": {"evidence": phq8_scores.depressed.evidence, "reason": phq8_scores.depressed.reason, "score": phq8_scores.depressed.score},
            "PHQ8_Sleep": {"evidence": phq8_scores.sleep.evidence, "reason": phq8_scores.sleep.reason, "score": phq8_scores.sleep.score},
            "PHQ8_Tired": {"evidence": phq8_scores.tired.evidence, "reason": phq8_scores.tired.reason, "score": phq8_scores.tired.score},
            "PHQ8_Appetite": {"evidence": phq8_scores.appetite.evidence, "reason": phq8_scores.appetite.reason, "score": phq8_scores.appetite.score},
            "PHQ8_Failure": {"evidence": phq8_scores.failure.evidence, "reason": phq8_scores.failure.reason, "score": phq8_scores.failure.score},
            "PHQ8_Concentrating": {"evidence": phq8_scores.concentrating.evidence, "reason": phq8_scores.concentrating.reason, "score": phq8_scores.concentrating.score},
            "PHQ8_Moving": {"evidence": phq8_scores.moving.evidence, "reason": phq8_scores.moving.reason, "score": phq8_scores.moving.score}
        }

        with open(json_file, 'a') as f:
            f.write(json.dumps(detailed_data) + '\n')
        
        print(f"\nCompleted analysis for participant {participant_id}")
    else:
        print(f"\nFailed to analyze participant {participant_id} - skipping")
    
    print("="*80)

ConnectionError: HTTPConnectionPool(host='arctrdagn032', port=11434): Max retries exceeded with url: /api/chat (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x785906a65660>: Failed to establish a new connection: [Errno 111] Connection refused'))