# Imports + Config

In [30]:
import json
import requests
import pandas as pd
import numpy as np
#!pip install --upgrade typing_extensions
#!pip install pydantic
from pydantic import BaseModel
from typing import List
from typing import Union
import csv
from datetime import datetime
import pickle
import os
from sklearn.metrics.pairwise import cosine_similarity


# Ollama Config
OLLAMA_NODE = "arctrdagn039"
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"

model = "gemma3-optimized:27b" # qwq:latest


## Grabbing all participant IDs that have all the PHQ-8 questionare data

In [35]:
dev_split_phq8 = pd.read_csv(r"/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/dev_split_Depression_AVEC2017.csv")
train_split_phq8 = pd.read_csv(r"/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/train_split_Depression_AVEC2017.csv")
# Combining the 2 dataframes
phq8_ground_truths = pd.concat([dev_split_phq8, train_split_phq8], ignore_index=True)
# Sort by Participant_ID
phq8_ground_truths = phq8_ground_truths.sort_values('Participant_ID').reset_index(drop=True)

# Participants I was able to analyze without analysis breaking
unique_participants = [302, 303, 304, 305, 307, 310, 312, 313, 315, 316, 317, 318, 320, 321, 322, 324, 325, 326, 327, 328, 330, 331, 333, 335, 338, 339, 340, 341, 343, 344, 345, 346, 347, 348, 350, 351, 352, 353, 355, 357, 358, 360, 362, 363, 364, 366, 367, 368, 369, 370, 371, 374, 375, 376, 377, 379, 380, 381, 382, 383, 385, 386, 388, 389, 390, 391, 392, 393, 395, 397, 400, 401, 402, 403, 406, 409, 412, 413, 414, 415, 416, 417, 418, 419, 420, 423, 425, 426, 427, 428, 429, 430, 433, 434, 436, 437, 439, 440, 443, 444, 445, 446, 447, 448, 449, 451, 454, 455, 456, 457, 458, 459, 463, 464, 468, 471, 472, 473, 475, 476, 477, 478, 479, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492]

participant_id = 320
current_transcript = pd.read_csv(fr"/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/{participant_id}_P/{participant_id}_TRANSCRIPT.csv", sep="\t")
# Reformatting transcript data to be a string with speaker name + text
current_patient_transcript = '\n'.join(current_transcript['speaker'] + ': ' + current_transcript['value'])

print(current_patient_transcript[:500])
print("-"*50)
print(phq8_ground_truths.loc[phq8_ground_truths['Participant_ID'] == participant_id].T)

Participant: [syncing]
Ellie: hi i'm ellie thanks for coming in today
Ellie: i was created to talk to people in a safe and secure environment
Ellie: i'm not a therapist but i'm here to learn about people and would love to learn about you i'll ask a few questions to get us started 
Ellie: and please feel free to tell me anything your answers are totally confidential
Ellie: are you okay with this
Participant: yes
Ellie: okay
Ellie: so how are you doing today
Participant: i'm a little nervous
Ellie
--------------------------------------------------
                       13
Participant_ID      320.0
PHQ8_Binary           1.0
PHQ8_Score           11.0
Gender                0.0
PHQ8_NoInterest       1.0
PHQ8_Depressed        1.0
PHQ8_Sleep            3.0
PHQ8_Tired            1.0
PHQ8_Appetite         2.0
PHQ8_Failure          1.0
PHQ8_Concentrating    1.0
PHQ8_Moving           1.0


## Grab knowledgebase transcripts

In [None]:
94*

94


In [None]:
# Get first 40 participants (last processed participant is 357)
first_40_participants = unique_participants[:40]
print(first_40_participants)

# Dictionary with participant transcripts
participant_transcripts = {}

for participant_id in first_40_participants:
    try:
        current_transcript = pd.read_csv(fr"/data/{participant_id}_P/{participant_id}_TRANSCRIPT.csv", sep="\t")
        
        # Handle missing values by converting to string and replacing NaN
        current_transcript['speaker'] = current_transcript['speaker'].fillna('Unknown').astype(str)
        current_transcript['value'] = current_transcript['value'].fillna('').astype(str)
        
        # Reformatting transcript data to be a string with speaker name + text
        current_patient_transcript = '\n'.join(current_transcript['speaker'] + ': ' + current_transcript['value'])
        participant_transcripts[participant_id] = current_patient_transcript
        
    except FileNotFoundError:
        print(f"File for participant {participant_id} not found")
        participant_transcripts[participant_id] = None
    except Exception as e:
        print(f"Error processing participant {participant_id}: {e}")
        participant_transcripts[participant_id] = None

[302, 303, 304, 305, 307, 310, 312, 313, 315, 316, 317, 318, 320, 321, 322, 324, 325, 326, 327, 328, 330, 331, 333, 335, 338, 339, 340, 341, 343, 344, 345, 346, 347, 348, 350, 351, 352, 353, 355, 357]


## Embed knowledgebase transcripts

In [None]:
def load_existing_embeddings(pickle_file):
    if os.path.exists(pickle_file):
        try:
            with open(pickle_file, 'rb') as f:
                return pickle.load(f)
        except:
            print(f"Error loading {pickle_file}")
            return {}
    return {}

def save_embeddings(embeddings_dict, pickle_file):
    with open(pickle_file, 'wb') as f:
        pickle.dump(embeddings_dict, f)

def get_embedding(text, model="dengcao/Qwen3-Embedding-8B:Q8_0"):
    BASE_URL = f"http://{OLLAMA_NODE}:11434/api/embeddings"
    
    response = requests.post(
        BASE_URL,
        json={
            "model": model,
            "prompt": text
        }
    )
    
    if response.status_code == 200:
        return response.json()["embedding"]
    else:
        raise Exception(f"API call failed with status {response.status_code}: {response.text}")

def create_sliding_chunks(transcript_text, chunk_size=4, step_size=2):
    lines = transcript_text.split('\n')
    
    # Remove any empty lines at the end
    while lines and lines[-1] == '':
        lines.pop()
    
    chunks = []
    
    # If fewer lines than chunk_size, just return the whole thing
    if len(lines) <= chunk_size:
        return ['\n'.join(lines)]
    
    # Create sliding windows
    for i in range(0, len(lines) - chunk_size + 1, step_size):
        chunk = '\n'.join(lines[i:i + chunk_size])
        chunks.append(chunk)
    
    # Handle the case where the last chunk might not align perfectly
    # If the last chunk doesn't include the final lines, add one more chunk
    last_chunk_start = len(lines) - chunk_size
    if last_chunk_start > 0 and (last_chunk_start % step_size) != 0:
        final_chunk = '\n'.join(lines[last_chunk_start:])
        if final_chunk not in chunks:
            chunks.append(final_chunk)
    
    return chunks

def process_transcripts(participant_transcripts, pickle_file):
    
    # Load existing embeddings
    participant_embedded_transcripts = load_existing_embeddings(pickle_file)
    
    for participant_id, transcript in participant_transcripts.items():
        # Skip if already processed
        if participant_id in participant_embedded_transcripts:
            print(f"Skipping participant {participant_id} - already processed")
            continue
        
        print(f"Processing participant {participant_id}...")
        
        try:
            # Create sliding window chunks
            chunks = create_sliding_chunks(transcript)
            
            # Get embeddings for each chunk
            embeddings_list = []
            for i, chunk in enumerate(chunks):
                print(f"  Processing chunk {i+1}/{len(chunks)}")
                embedding = get_embedding(chunk)
                embeddings_list.append((chunk, embedding))
            
            # Convert to numpy array and store
            participant_embedded_transcripts[participant_id] = np.array(embeddings_list, dtype=object)
            
            # Save after each participant
            save_embeddings(participant_embedded_transcripts, pickle_file)
            print(f"Completed participant {participant_id} - saved to {pickle_file}")
            
        except Exception as e:
            print(f"Error processing participant {participant_id}: {e}")
            print("Stopping processing and saving current progress...")
            save_embeddings(participant_embedded_transcripts, pickle_file)
            break
    
    return participant_embedded_transcripts

pickle_file = "/data/users2/agreene46/ai-psychiatrist/participant_embedded_transcripts.pkl"
participant_embedded_transcripts = process_transcripts(participant_transcripts, pickle_file)

Processing participant 302...
  Processing chunk 1/92
  Processing chunk 2/92
  Processing chunk 3/92
  Processing chunk 4/92
  Processing chunk 5/92
  Processing chunk 6/92
  Processing chunk 7/92
  Processing chunk 8/92
  Processing chunk 9/92
  Processing chunk 10/92
  Processing chunk 11/92
  Processing chunk 12/92
  Processing chunk 13/92
  Processing chunk 14/92
  Processing chunk 15/92
  Processing chunk 16/92
  Processing chunk 17/92
  Processing chunk 18/92
  Processing chunk 19/92
  Processing chunk 20/92
  Processing chunk 21/92
  Processing chunk 22/92
  Processing chunk 23/92
  Processing chunk 24/92
  Processing chunk 25/92
  Processing chunk 26/92
  Processing chunk 27/92
  Processing chunk 28/92
  Processing chunk 29/92
  Processing chunk 30/92
  Processing chunk 31/92
  Processing chunk 32/92
  Processing chunk 33/92
  Processing chunk 34/92
  Processing chunk 35/92
  Processing chunk 36/92
  Processing chunk 37/92
  Processing chunk 38/92
  Processing chunk 39/92
  Pr

In [18]:
print(participant_embedded_transcripts[302][1])

["Ellie: think of me as a friend i don't judge i can't i'm a computer\nEllie: i'm here to learn about people and would love to learn about you i'll ask a few questions to get us started and please feel free to tell me anything your answers are totally confidential\nEllie: how are you doing today\nParticipant: i'm fine how about yourself "
 list([4.765899181365967, 2.4971964359283447, -0.23363368213176727, -3.7968978881835938, -0.25872907042503357, -3.7829790115356445, 0.31883659958839417, -1.1243340969085693, -1.9006824493408203, -0.15161703526973724, -1.1603024005889893, -2.9016284942626953, 2.5446739196777344, 0.8604804277420044, -1.0193727016448975, -0.7758960127830505, -3.4722187519073486, 0.2691505551338196, -2.91408371925354, 0.7065085768699646, -1.3638696670532227, -1.7111896276474, 0.9253491759300232, -0.16307295858860016, 2.4727628231048584, 0.9735618829727173, 0.2931584417819977, -2.979600191116333, -1.5958975553512573, 1.0229841470718384, 0.2534439265727997, 2.28615832328796

## Grabbing evidence from transcript

In [28]:
class PHQ8Evidence(BaseModel):
    nointerest_evidence: List[str]
    depressed_evidence: List[str]
    sleep_evidence: List[str]
    tired_evidence: List[str]
    appetite_evidence: List[str]
    failure_evidence: List[str]
    concentrating_evidence: List[str]
    moving_evidence: List[str]

evidence_extraction_prompt = f"""Analyze the following therapy transcript and extract specific text chunks that provide evidence for each PHQ-8 domain. Based on the evidence found, determine the appropriate PHQ-8 score for each domain.

PHQ-8 scoring reference:
- 0 = not at all (0-1 days)
- 1 = several days (2-6 days) 
- 2 = more than half the days (7-11 days)
- 3 = nearly every day (12-14 days)

For each domain below, extract direct quotes from the transcript that indicate the presence, absence, frequency, or severity of symptoms. Then determine the most appropriate score based on the evidence.

PHQ-8 domains:
- nointerest: little interest or pleasure in activities
- depressed: feeling down, depressed, or hopeless
- sleep: sleep problems (trouble falling/staying asleep or sleeping too much)
- tired: feeling tired or having little energy
- appetite: appetite changes (poor appetite or overeating)
- failure: negative self-perception or feeling like a failure
- concentrating: trouble concentrating on tasks
- moving: psychomotor changes (moving/speaking slowly or restlessness)

Return a JSON object with arrays of relevant transcript quotes for each domain. Each quote should be a direct excerpt from the transcript that supports your scoring decision. Include as many relevant quotes as you find - this could range from 0 quotes (if no evidence) to multiple quotes per domain.

Therapy transcript:
{current_patient_transcript}

Respond with valid JSON matching this structure:
{{
    "PHQ8_NoInterest": ["evidence_1", "evidence_2", "evidence_3", "evidence_4"],
    "PHQ8_Depressed": ["evidence_1"],
    "PHQ8_Sleep": ["evidence_1", "evidence_2", "evidence_3"],
    "PHQ8_Tired": ["evidence_1", "evidence_2"],
    "PHQ8_Appetite": [],
    "PHQ8_Failure": ["evidence_1", "evidence_2", "evidence_3", "evidence_4", "evidence_5"],
    "PHQ8_Concentrating": ["evidence_1"],
    "PHQ8_Moving": ["evidence_1", "evidence_2"]
}}

Important: Extract UNIQUE quotes only - do not repeat the same quote multiple times. Each quote should be different and provide distinct, related evidence. If no evidence exists for a domain, return an empty array for that domain. Also, do not format the evidence grabbed in any way, output it EXACTLY as it is in the transcript.
"""

response = requests.post(
    BASE_URL,
    json={
        "model": model,
        "messages": [{"role": "user", "content": evidence_extraction_prompt}],
        "stream": False,
        "options": {
            "temperature": 0.1,
            "top_k": 10,
            "top_p": 0.8
        }
    }
)

# Printing
response_data = response.json()
content = response_data['message']['content']
content = content.strip('```json\n').strip('\n```')

evidence_dict = json.loads(content)

# Add scores to the output
scores = phq8_ground_truths.iloc[13]
evidence_dict['scores'] = {
    'PHQ8_NoInterest': int(scores['PHQ8_NoInterest']),
    'PHQ8_Depressed': int(scores['PHQ8_Depressed']),
    'PHQ8_Sleep': int(scores['PHQ8_Sleep']),
    'PHQ8_Tired': int(scores['PHQ8_Tired']),
    'PHQ8_Appetite': int(scores['PHQ8_Appetite']),
    'PHQ8_Failure': int(scores['PHQ8_Failure']),
    'PHQ8_Concentrating': int(scores['PHQ8_Concentrating']),
    'PHQ8_Moving': int(scores['PHQ8_Moving'])
}

# Remove duplicate quotes in each evidence list
for key in evidence_dict:
    if isinstance(evidence_dict[key], list):
        evidence_dict[key] = list(dict.fromkeys(evidence_dict[key]))

formatted_output = json.dumps(evidence_dict, indent=4)
print(formatted_output)


{
    "PHQ8_NoInterest": [],
    "PHQ8_Depressed": [
        "i've been going through a hell of a thing lately",
        "how have you been feeling lately\na little uneasy"
    ],
    "PHQ8_Sleep": [
        "it's not easy for me to sleep\nto get a good night's sleep",
        "i didn't sleep"
    ],
    "PHQ8_Tired": [
        "i don't know maybe sluggish"
    ],
    "PHQ8_Appetite": [],
    "PHQ8_Failure": [
        "i don't really know now",
        "i don't have a best friend",
        "i don't remember",
        "i don't really know"
    ],
    "PHQ8_Concentrating": [],
    "PHQ8_Moving": [],
    "scores": {
        "PHQ8_NoInterest": 1,
        "PHQ8_Depressed": 1,
        "PHQ8_Sleep": 3,
        "PHQ8_Tired": 1,
        "PHQ8_Appetite": 2,
        "PHQ8_Failure": 1,
        "PHQ8_Concentrating": 1,
        "PHQ8_Moving": 1
    }
}


## Embedding the evidence given

In [None]:
def find_similar_chunks(evidence_text_embedding, participant_embedded_transcripts, top_k=3):
    similarities = []
    
    # Go through all participants and their embeddings
    for participant_id, embeddings_array in participant_embedded_transcripts.items():
        for i, (raw_text, embedding) in enumerate(embeddings_array):
            # Calculate cosine similarity
            similarity = cosine_similarity(
                [evidence_text_embedding], 
                [embedding]
            )[0][0]
            
            similarities.append({
                'participant_id': participant_id,
                'raw_text': raw_text,
                'similarity': similarity,
                'embedding': embedding
            })
    
    # Sort by similarity and get top 3
    similarities.sort(key=lambda x: x['similarity'], reverse=True)
    return similarities[:top_k]

def process_evidence_for_references(evidence_dict, participant_embedded_transcripts, phq8_ground_truths):
    
    evidence_keys = [
        'PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired',
        'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving'
    ]
    
    all_references = []
    
    for evidence_key in evidence_keys:
        # Get evidence texts for this key
        evidence_texts = evidence_dict.get(evidence_key, [])
        
        # Skip if empty
        if not evidence_texts:
            continue
        
        # Combine evidence texts into single string
        combined_text = '\n'.join(evidence_texts)
        
        # Skip if less than 15 characters
        if len(combined_text) < 15:
            continue
        
        print(f"Processing {evidence_key}...")
        
        try:
            # Get embedding for combined evidence text
            evidence_embedding = get_embedding(combined_text)
            
            # Find top 3 similar chunks
            similar_chunks = find_similar_chunks(
                evidence_embedding, 
                participant_embedded_transcripts, 
                top_k=3
            )
            
            # Add each reference with its own header
            for chunk_info in similar_chunks:
                participant_id = chunk_info['participant_id']
                raw_text = chunk_info['raw_text']
                
                # Get the ground truth score for this participant and evidence type
                participant_data = phq8_ground_truths.loc[
                    phq8_ground_truths['Participant_ID'] == participant_id
                ]
                
                if not participant_data.empty:
                    score = int(participant_data[evidence_key].values[0])
                    reference_entry = f"({evidence_key} Score: {score})\n{raw_text}"
                    all_references.append(reference_entry)
                else:
                    print(f"Warning: No ground truth data found for participant {participant_id}")
                
        except Exception as e:
            print(f"Error processing {evidence_key}: {e}")
            continue
    
    # Combine all references into a string
    if all_references:
        reference_evidence = "<Reference Examples>\n\n" + "\n\n".join(all_references) + "\n\n<Reference Examples>"
    else:
        reference_evidence = "<Reference Examples>\nNo valid evidence found\n<Reference Examples>"
    
    return reference_evidence

# Usage:
reference_evidence = process_evidence_for_references(
    evidence_dict, 
    participant_embedded_transcripts, 
    phq8_ground_truths
)
print()
print(reference_evidence)

Processing PHQ8_Depressed...
Processing PHQ8_Sleep...
Processing PHQ8_Tired...
Processing PHQ8_Failure...

<Reference Examples>

(PHQ8_Depressed Score: 1)
Participant: it's a little strange
Ellie: how have you been feeling lately
Participant: how i've been feeling lately
Participant: a little uneasy

(PHQ8_Depressed Score: 1)
Participant: yes
Ellie: okay
Ellie: so how are you doing today
Participant: i'm a little nervous

(PHQ8_Depressed Score: 1)
Participant: very <v> just very irritable
Ellie: okay
Ellie: how have you been feeling lately 
Participant: um i was feeling anxious but now i'm calmer

(PHQ8_Sleep Score: 2)
Ellie: how easy is it for you to get a good night's sleep
Participant: almost impossible ever since i was a little kid i had a i had trouble falling asleep and um
Participant: about four years ago it it became really really bad where i just couldn't sleep through the night i was getting about four hours of sleep 
Participant: sometimes i would wake up every hour and i'd 

## Run Quantitative Analysis

In [41]:
class PHQ8ScoreWithExplanation(BaseModel):
    evidence: str  # Direct quotes or references from interview
    reason: str  # Brief explanation for the score
    score: Union[int, str]  # 0-3 or "N/A"

class PHQ8ScoresWithExplanations(BaseModel):
    PHQ8_NoInterest: PHQ8ScoreWithExplanation  # Little interest or pleasure in doing things
    PHQ8_Depressed: PHQ8ScoreWithExplanation   # Feeling down, depressed, or hopeless
    PHQ8_Sleep: PHQ8ScoreWithExplanation       # Trouble falling/staying asleep, or sleeping too much
    PHQ8_Tired: PHQ8ScoreWithExplanation       # Feeling tired or having little energy
    PHQ8_Appetite: PHQ8ScoreWithExplanation    # Poor appetite or overeating
    PHQ8_Failure: PHQ8ScoreWithExplanation     # Feeling bad about self or like a failure
    PHQ8_Concentrating: PHQ8ScoreWithExplanation # Trouble concentrating
    PHQ8_Moving: PHQ8ScoreWithExplanation      # Moving/speaking slowly or being fidgety/restless

system_prompt = """You are a AI psychiatrist assessment tool specialized in analyzing interview transcripts to evaluate PHQ-8 (Patient Health Questionnaire-8) scores. The PHQ-8 is a validated depression screening instrument that assesses eight of the nine DSM-IV criteria for depressive disorders over the past 2 weeks.

PHQ-8 Scoring System:
- 0 = Not at all (0-1 days in the past 2 weeks)
- 1 = Several days (2-6 days)
- 2 = More than half the days (7-11 days)
- 3 = Nearly every day (12-14 days)

Key Principles:
1. Base all assessments strictly on evidence from the transcript
2. When no relevant evidence exists for a symptom, output "N/A" rather than assuming absence (score of 0)
3. Consider both explicit statements and behavioral indicators in the interview
4. Be conservative - only assign scores when there is clear evidence of frequency
5. Remember that absence of mention does not necessarily mean absence of symptom
6. When reference examples are provided, use them for cross-analysis to inform scoring decisions
7. Weight reference examples based on their similarity to the current evidence - highly similar references should be considered more heavily
8. Follow the format specified exactly"""

def run_phq8_analysis(patient_transcript, ground_truth, reference_evidence=""):
    quantitative_prompt = f"""Analyze the following interview transcript and predict PHQ-8 scores for each symptom domain.

Interview transcript:
<transcript>
{patient_transcript}
</transcript>

{reference_evidence}

Analyze each symptom using the following approach in <thinking> tags:
1. Search for direct quotes or behavioral evidence related to each PHQ-8 symptom
2. When reference examples are provided, compare the current evidence with similar reference cases
3. Evaluate the frequency/severity based on available evidence and reference comparisons
4. Consider how similar the reference examples are to the current evidence - if highly similar, give more weight to the reference scores; if less similar, rely more on direct analysis
5. If no relevant evidence exists, mark as "N/A" rather than assuming absence
6. Only assign numeric scores (0-3) when evidence clearly indicates frequency

After your analysis, provide your final assessment in <answer> tags as a JSON object.

For each symptom, provide:
1. "evidence": exact quotes from transcript (use "No relevant evidence found" if not discussed)
2. "reason": explanation of scoring decision, including cross-reference analysis when applicable and why N/A if applicable
3. "score": integer 0-3 based on evidence, or "N/A" if no relevant evidence

Return ONLY a JSON object in <answer> tags with these exact keys:
- "PHQ8_NoInterest": {{evidence, reason, score}} for little interest or pleasure in doing things (anhedonia)
- "PHQ8_Depressed": {{evidence, reason, score}} for feeling down, depressed, or hopeless (depressed mood)
- "PHQ8_Sleep": {{evidence, reason, score}} for trouble falling or staying asleep, or sleeping too much (sleep problems)
- "PHQ8_Tired": {{evidence, reason, score}} for feeling tired or having little energy (fatigue)
- "PHQ8_Appetite": {{evidence, reason, score}} for poor appetite or overeating (appetite/weight changes)
- "PHQ8_Failure": {{evidence, reason, score}} for feeling bad about yourself or that you are a failure (negative self-perception)
- "PHQ8_Concentrating": {{evidence, reason, score}} for trouble concentrating on things like reading or watching TV (concentration problems)
- "PHQ8_Moving": {{evidence, reason, score}} for moving or speaking slowly, or being fidgety/restless (psychomotor changes)"""

    # Most deterministic temp, top_k, and top_p
    response = requests.post(
        BASE_URL,
        json={
            "model": model,
            "messages": [{"role": "system", "content": system_prompt},
                        {"role": "user", "content": quantitative_prompt}],
            "stream": False,
            "options": {
                "temperature": 0,
                "top_k": 1,
                "top_p": 1.0
            }
        }
    )

    # Parse and validate the response
    try:
        response_data = response.json()
        content = response_data['message']['content']
        
        # Extract content from <answer> tags if present
        if '<answer>' in content and '</answer>' in content:
            content = content.split('<answer>')[1].split('</answer>')[0].strip()
        
        # Remove markdown code blocks if present
        if content.startswith('```json'):
            content = content.split('```json')[1].split('```')[0].strip()
        elif content.startswith('```'):
            content = content.split('```')[1].split('```')[0].strip()
        
        # Parse the JSON response and validate with Pydantic
        scores_dict = json.loads(content)
        phq8_scores = PHQ8ScoresWithExplanations(**scores_dict)
        
        # Extract the 8 PHQ-8 score values
        scores_list = [
            phq8_scores.PHQ8_NoInterest.score,
            phq8_scores.PHQ8_Depressed.score,
            phq8_scores.PHQ8_Sleep.score,
            phq8_scores.PHQ8_Tired.score,
            phq8_scores.PHQ8_Appetite.score,
            phq8_scores.PHQ8_Failure.score,
            phq8_scores.PHQ8_Concentrating.score,
            phq8_scores.PHQ8_Moving.score
        ]
        
        print("Comparison of Predicted vs Ground Truth:")
        print("Metric\t\t\tPredicted\tGround Truth\tDifference")
        print("-" * 65)

        differences = []
        n_available = 0
        num_questions_NA = 0
        metrics = ['PHQ8_NoInterest', 'PHQ8_Depressed', 'PHQ8_Sleep', 'PHQ8_Tired', 
                'PHQ8_Appetite', 'PHQ8_Failure', 'PHQ8_Concentrating', 'PHQ8_Moving']
        predicted_values = [phq8_scores.PHQ8_NoInterest.score, phq8_scores.PHQ8_Depressed.score, phq8_scores.PHQ8_Sleep.score, 
                        phq8_scores.PHQ8_Tired.score, phq8_scores.PHQ8_Appetite.score, phq8_scores.PHQ8_Failure.score,
                        phq8_scores.PHQ8_Concentrating.score, phq8_scores.PHQ8_Moving.score]

        for metric, pred_val in zip(metrics, predicted_values):
            print(f"DEBUG: Processing {metric}")
            print(f"DEBUG: pred_val = {pred_val}, type = {type(pred_val)}")
            print(f"DEBUG: ground_truth[metric] = {ground_truth[metric]}, type = {type(ground_truth[metric])}")
            
            gt_val = int(ground_truth[metric].iloc[0] if hasattr(ground_truth[metric], 'iloc') else ground_truth[metric])
            print(f"DEBUG: gt_val = {gt_val}, type = {type(gt_val)}")
            
            if pred_val == "N/A":
                diff_str = "N/A"
                num_questions_NA += 1
                print(f"DEBUG: pred_val is N/A")
            else:
                # Convert pred_val to int if it's a string representation of a number
                try:
                    pred_val_int = int(pred_val)
                    print(f"DEBUG: converted pred_val to int: {pred_val_int}")
                except (ValueError, TypeError):
                    print(f"DEBUG: Could not convert pred_val '{pred_val}' to int, treating as N/A")
                    diff_str = "N/A"
                    num_questions_NA += 1
                    continue
                    
                diff = abs(pred_val_int - gt_val)
                differences.append(diff)
                diff_str = str(diff)
                n_available += 1
                print(f"DEBUG: diff = {diff}")
            
            print(f"{metric:<23} {str(pred_val):<12} {gt_val:<15} {diff_str}")
            print("DEBUG: ---")

        # Calculate metrics
        if n_available > 0:
            avg_difference = sum(differences) / n_available
            accuracy_on_available = 1 - (avg_difference / 3)
        else:
            avg_difference = float('inf')
            accuracy_on_available = 0
        
        # Accuracy * % available questions
        overall_accuracy = accuracy_on_available * (1 - (num_questions_NA / 8))
        
        print("-" * 65)
        if n_available > 0:
            print(f"Average Absolute Difference (on available): {avg_difference:.2f}")
            print(f"Accuracy on available questions: {accuracy_on_available:.2%}")
        print(f"Questions marked N/A: {num_questions_NA}/8")
        print(f"Overall accuracy: {overall_accuracy:.2%}")
        
        # Reasoning and evidence section
        print("\n\nDetailed Reasoning for Each Score:")
        print("=" * 80)

        for key in metrics:
            score_data = getattr(phq8_scores, key)
            print(f"\n{key} (Score: {score_data.score})")
            print("-" * 40)
            print(f"Evidence: {score_data.evidence}")
            print(f"Reason: {score_data.reason}")

        return phq8_scores, avg_difference, accuracy_on_available, num_questions_NA, overall_accuracy

    except (json.JSONDecodeError, KeyError, ValueError) as e:
        print(f"Error parsing response: {e}")
        print("Raw response:", response)
        print(json.dumps(response.json(), indent=2))
        return None, None, None, None, None
    
run_phq8_analysis(current_patient_transcript, phq8_ground_truths, reference_evidence)
print()


Comparison of Predicted vs Ground Truth:
Metric			Predicted	Ground Truth	Difference
-----------------------------------------------------------------
DEBUG: Processing PHQ8_NoInterest
DEBUG: pred_val = 0, type = <class 'str'>
DEBUG: ground_truth[metric] = 0      1
1      0
2      0
3      0
4      0
      ..
137    0
138    0
139    0
140    1
141    0
Name: PHQ8_NoInterest, Length: 142, dtype: int64, type = <class 'pandas.core.series.Series'>
DEBUG: gt_val = 1, type = <class 'int'>
DEBUG: converted pred_val to int: 0
DEBUG: diff = 1
PHQ8_NoInterest         0            1               1
DEBUG: ---
DEBUG: Processing PHQ8_Depressed
DEBUG: pred_val = 1, type = <class 'str'>
DEBUG: ground_truth[metric] = 0      1
1      0
2      1
3      1
4      1
      ..
137    0
138    1
139    1
140    1
141    0
Name: PHQ8_Depressed, Length: 142, dtype: int64, type = <class 'pandas.core.series.Series'>
DEBUG: gt_val = 1, type = <class 'int'>
DEBUG: converted pred_val to int: 1
DEBUG: diff = 0
PHQ8_D

In [None]:
print(phq8_ground_truths.loc[phq8_ground_truths['Participant_ID'] == participant_id].T)

### Original ollama example script (for reference)

In [3]:
"""Available models:
- gemma3-optimized:27b
- gemma3_optimized:8bit
- gemma3:27b-it-qat
- gemma3:27b
- phi4-optimized:latest
- phi4:latest
- phi4-reasoning:plus
- qwen3_optimized:latest
- qwen3:32b
- llama3.3:latest
- llama4:scout
- mistral3.1-optimized:latest
- mistral-small3.1:latest
- qwq:latest

Notes:
1. The models marked as "optimized" are configured to use multiple GPUs for faster inference.
2. The llama4:maverick model is not available due to its size.
"""

OLLAMA_NODE = "arctrddgxa002" # TODO: Change this variable to the node where Ollama is running
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"

model = "qwq:latest" # TODO: Change this variable to the model you want to use
message = "What is the capital of France?" # TODO: Change this variable to the message you want to ask the model

response = requests.post(
  BASE_URL,
  json = {
    "model": model,
    "messages": [{"role": "user", "content": message}],
    "stream": False
  }
)

print(json.dumps(response.json(), indent=2))

{
  "model": "qwq:latest",
  "created_at": "2025-07-15T22:46:58.150903388Z",
  "message": {
    "role": "assistant",
    "content": "<think>\nOkay, the user is asking, \"What is the capital of France?\" Let me think about how to approach this.\n\nFirst, I need to confirm the correct answer. The capital of France is Paris. That's a well-known fact, but I should make sure there hasn't been any recent changes, which is unlikely. \n\nNext, I should consider why the user is asking this. It might be a straightforward question for someone learning basic geography. Alternatively, they could be testing the system's knowledge. Either way, the answer needs to be clear and accurate.\n\nI should also think about providing a bit more context. Maybe mention that Paris is not only the capital but also the most populous city in France. Including some landmarks like the Eiffel Tower or the Louvre could add value. But wait, the user didn't ask for extra details, so maybe keep it concise unless they ask f