In [None]:
import os
import pandas as pd
import requests
import time
import json
import re
from requests.exceptions import Timeout, RequestException

def parse_score_and_explanation(response_text):
    """Extract score and explanation from model response"""
    score_patterns = [
        r'score[:\s]*(\d+)',
        r'(\d+)[/\s]*(?:out of\s*)?5',
        r'(\d+)[/\s]*5',
        r'rating[:\s]*(\d+)',
        r'^(\d+)',  # Number at start of line
    ]
    
    score = None
    for pattern in score_patterns:
        match = re.search(pattern, response_text, re.IGNORECASE | re.MULTILINE)
        if match:
            potential_score = int(match.group(1))
            if 1 <= potential_score <= 5:
                score = potential_score
                break
    
    return score, response_text.strip()

def identify_low_scoring_participants(results_csv_path, threshold=3):
    """Identify participants who scored <= threshold in any category"""
    df = pd.read_csv(results_csv_path)
    
    # Identify participants with low scores
    low_scoring_mask = (
        (df['coherence'] <= threshold) | 
        (df['completeness'] <= threshold) | 
        (df['specificity'] <= threshold) | 
        (df['accuracy'] <= threshold)
    )
    
    low_scoring_participants = df[low_scoring_mask].copy()
    
    # Add columns to track which metrics were low
    low_scoring_participants['low_coherence'] = df['coherence'] <= threshold
    low_scoring_participants['low_completeness'] = df['completeness'] <= threshold
    low_scoring_participants['low_specificity'] = df['specificity'] <= threshold
    low_scoring_participants['low_accuracy'] = df['accuracy'] <= threshold
    
    print(f"Found {len(low_scoring_participants)} participants with scores <= {threshold}")
    
    return low_scoring_participants

def generate_improved_prompt(metric, original_explanation, original_score):
    """Generate an improved prompt based on the feedback from the original assessment"""
    
    # Extract common issues from the explanation
    feedback_analysis = f"""
    Based on the previous assessment that scored {original_score}/5, here are the identified issues:
    {original_explanation}
    
    Please pay special attention to avoiding these specific mistakes.
    """
    
    base_prompts = {
        'coherence': f"""Evaluate the following qualitative assessment output for COHERENCE only. Compare qualitative_assessment to the provided transcript.

{feedback_analysis}

Score from 1-5 based on these criteria:
- Score of 5: 0 mistakes (completely logically consistent)
- Score of 4: 1-2 mistakes (minor inconsistencies)
- Score of 3: 3-4 mistakes (moderate inconsistencies)
- Score of 2: 5-6 mistakes (significant inconsistencies)
- Score of 1: 7 or more mistakes (major logical contradictions)

Coherence focuses on logical consistency. Look for:
- Internal contradictions within the assessment
- Statements that conflict with each other
- Logical flow and reasoning consistency
- Whether conclusions follow from the evidence presented

Pay extra attention to ensuring all statements are internally consistent and support each other logically.

Format your response as:
Score: [your score]
Explanation: [detailed explanation focusing on logical consistency]""",

        'completeness': f"""Evaluate the following qualitative assessment output for COMPLETENESS only. Compare qualitative_assessment to the provided transcript.

{feedback_analysis}

Score from 1-5 based on these criteria:
- Score of 5: 0 mistakes (covers all relevant information)
- Score of 4: 1-2 mistakes (minor omissions)
- Score of 3: 3-4 mistakes (moderate gaps)
- Score of 2: 5-6 mistakes (significant omissions)
- Score of 1: 7 or more mistakes (major gaps in coverage)

Completeness focuses on thoroughness. Look for:
- Coverage of all relevant symptoms mentioned in transcript
- Inclusion of severity indicators where discussed
- Duration and frequency details when available
- Social, biological, and risk factors as appropriate
- Supporting quotes and evidence

Ensure the assessment doesn't miss important information from the transcript.

Format your response as:
Score: [your score]
Explanation: [detailed explanation focusing on completeness and thoroughness]""",

        'specificity': f"""Evaluate the following qualitative assessment output for SPECIFICITY only. Compare qualitative_assessment to the provided transcript.

{feedback_analysis}

Score from 1-5 based on these criteria:
- Score of 5: 0 mistakes (highly specific, no vague statements)
- Score of 4: 1-2 mistakes (mostly specific with minor vagueness)
- Score of 3: 3-4 mistakes (moderate use of vague language)
- Score of 2: 5-6 mistakes (frequent vague statements)
- Score of 1: 7 or more mistakes (predominantly vague and generic)

Specificity focuses on precision and detail. Look for:
- Use of specific examples and direct quotes
- Avoidance of vague terms like "seems depressed" or "appears sad"
- Concrete behavioral observations
- Specific symptoms rather than general mood descriptions
- Detailed supporting evidence

Ensure the assessment uses precise, specific language backed by concrete evidence.

Format your response as:
Score: [your score]
Explanation: [detailed explanation focusing on specificity and precision]""",

        'accuracy': f"""Evaluate the following qualitative assessment output for ACCURACY only. Compare qualitative_assessment to the provided transcript.

{feedback_analysis}

Score from 1-5 based on these criteria:
- Score of 5: 0 mistakes (completely accurate to DSM-5/PHQ-8)
- Score of 4: 1-2 mistakes (minor inaccuracies)
- Score of 3: 3-4 mistakes (moderate errors)
- Score of 2: 5-6 mistakes (significant inaccuracies)
- Score of 1: 7 or more mistakes (major errors in clinical accuracy)

Accuracy focuses on clinical correctness. Look for:
- Correct identification of symptoms per DSM-5 criteria
- Accurate duration and frequency assessments
- Proper use of clinical terminology
- Alignment with established diagnostic criteria
- Factual accuracy regarding what was actually said in transcript

Ensure all clinical observations and interpretations are accurate and evidence-based.

Format your response as:
Score: [your score]
Explanation: [detailed explanation focusing on clinical accuracy]"""
    }
    
    return base_prompts.get(metric, "")

def reassess_participant(participant_data, transcript, ollama_config):
    """Reassess a single participant with improved prompts for low-scoring metrics"""
    
    participant_id = participant_data['participant_id']
    qualitative_assessment = participant_data['qualitative_assessment']
    
    print(f"Reassessing participant {participant_id}")
    
    results = {'participant_id': participant_id}
    
    # Define which metrics need reassessment
    metrics_to_reassess = []
    if participant_data['low_coherence']:
        metrics_to_reassess.append('coherence')
    if participant_data['low_completeness']:
        metrics_to_reassess.append('completeness')
    if participant_data['low_specificity']:
        metrics_to_reassess.append('specificity')
    if participant_data['low_accuracy']:
        metrics_to_reassess.append('accuracy')
    
    print(f"  Reassessing metrics: {metrics_to_reassess}")
    
    # Reassess each low-scoring metric
    for metric in metrics_to_reassess:
        print(f"  Reassessing {metric}...")
        
        # Get original score and explanation
        original_score = participant_data[metric]
        original_explanation = participant_data[f'{metric}_explanation']
        
        # Generate improved prompt
        improved_prompt = generate_improved_prompt(metric, original_explanation, original_score)
        
        # Add transcript and assessment to prompt
        full_prompt = f"""{improved_prompt}

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""
        
        # Make API request
        request_data = {
            "model": ollama_config['model'],
            "messages": [{"role": "user", "content": full_prompt}],
            "stream": False,
            "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
        }
        
        try:
            response = requests.post(ollama_config['base_url'], json=request_data, timeout=290)
            if response.status_code == 200:
                content = response.json()['message']['content']
                score, _ = parse_score_and_explanation(content)
                results[f'{metric}_reassess'] = score
                results[f'{metric}_reassess_explanation'] = content
                print(f"    New {metric} score: {score} (was {original_score})")
            else:
                results[f'{metric}_reassess'] = None
                results[f'{metric}_reassess_explanation'] = f"API Error: {response.status_code}"
                print(f"    API error for {metric}: {response.status_code}")
            
            time.sleep(2)  # Rate limiting
            
        except Exception as e:
            results[f'{metric}_reassess'] = None
            results[f'{metric}_reassess_explanation'] = f"Error: {str(e)}"
            print(f"    Error reassessing {metric}: {str(e)}")
    
    # Copy over scores that weren't reassessed
    all_metrics = ['coherence', 'completeness', 'specificity', 'accuracy']
    for metric in all_metrics:
        if metric not in metrics_to_reassess:
            results[f'{metric}_reassess'] = participant_data[metric]
            results[f'{metric}_reassess_explanation'] = participant_data[f'{metric}_explanation']
    
    return results

def run_feedback_loop_reassessment(results_csv_path, qualitative_results_path, ollama_config, threshold=3):
    """Main function to run the feedback loop reassessment"""
    
    # Load original evaluation results
    print("Loading original evaluation results...")
    eval_results = pd.read_csv(results_csv_path)
    
    # Load qualitative assessments
    print("Loading qualitative assessments...")
    qual_results = pd.read_csv(qualitative_results_path)
    
    # Merge to get qualitative assessments for participants
    merged_data = eval_results.merge(qual_results, on='participant_id', how='left')
    
    # Identify low-scoring participants
    low_scoring_participants = identify_low_scoring_participants(results_csv_path, threshold)
    
    if len(low_scoring_participants) == 0:
        print("No participants found with low scores. Exiting.")
        return
    
    # Merge with qualitative data
    reassess_data = low_scoring_participants.merge(qual_results, on='participant_id', how='left')
    
    print(f"Starting reassessment of {len(reassess_data)} participants...")
    
    reassessment_results = []
    processed_count = 0
    
    for index, row in reassess_data.iterrows():
        participant_id = row['participant_id']
        
        print(f"\n--- Reassessing {index + 1}/{len(reassess_data)}: {participant_id} ---")
        
        # Load transcript
        id_transcript = os.path.join("/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/", 
                                   f"{participant_id}_P", f"{participant_id}_TRANSCRIPT.csv")
        
        if not os.path.exists(id_transcript):
            print(f"Transcript not found for {participant_id}")
            continue
        
        try:
            transcript_data = pd.read_csv(id_transcript, sep='\t')
            dialogue_txt = transcript_data.dropna(subset=['speaker', 'value'])
            dialogue_txt['dialogue'] = dialogue_txt['speaker'] + " : " + dialogue_txt['value']
            transcript = "\n".join(dialogue_txt['dialogue'])
            
            # Reassess participant
            result = reassess_participant(row, transcript, ollama_config)
            reassessment_results.append(result)
            processed_count += 1
            
            print(f"Completed reassessment for {participant_id} ({processed_count} total)")
            
            # Save progress every 5 participants
            if len(reassessment_results) % 5 == 0:
                temp_df = pd.DataFrame(reassessment_results)
                temp_output = results_csv_path.replace('.csv', '_reassessment_temp.csv')
                temp_df.to_csv(temp_output, index=False)
                print(f"Saved progress: {len(reassessment_results)} reassessments")
            
        except Exception as e:
            print(f"Error processing {participant_id}: {str(e)}")
            continue
        
        time.sleep(1)  # General rate limiting
    
    # Save final results
    if reassessment_results:
        output_path = results_csv_path.replace('.csv', '_reassessment_results.csv')
        final_df = pd.DataFrame(reassessment_results)
        final_df.to_csv(output_path, index=False)
        
        print(f"\n=== REASSESSMENT SUMMARY ===")
        print(f"Total participants reassessed: {len(reassessment_results)}")
        print(f"Results saved to: {output_path}")
        
        # Calculate improvement statistics
        improvement_stats = {}
        for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
            original_col = metric
            reassess_col = f'{metric}_reassess'
            
            if original_col in merged_data.columns and reassess_col in final_df.columns:
                # Get participants that had this metric reassessed
                reassessed_participants = final_df[final_df[reassess_col].notna()]
                original_scores = []
                new_scores = []
                
                for _, row in reassessed_participants.iterrows():
                    pid = row['participant_id']
                    orig_data = merged_data[merged_data['participant_id'] == pid]
                    if not orig_data.empty:
                        orig_score = orig_data[original_col].iloc[0]
                        new_score = row[reassess_col]
                        if pd.notna(orig_score) and pd.notna(new_score):
                            original_scores.append(orig_score)
                            new_scores.append(new_score)
                
                if original_scores and new_scores:
                    avg_original = sum(original_scores) / len(original_scores)
                    avg_new = sum(new_scores) / len(new_scores)
                    improvement = avg_new - avg_original
                    improvement_stats[metric] = {
                        'count': len(original_scores),
                        'avg_original': avg_original,
                        'avg_new': avg_new,
                        'improvement': improvement
                    }
        
        print("\nImprovement Statistics:")
        for metric, stats in improvement_stats.items():
            print(f"{metric.capitalize()}: {stats['count']} reassessed, "
                  f"avg score {stats['avg_original']:.2f} → {stats['avg_new']:.2f} "
                  f"(+{stats['improvement']:.2f})")
    
    else:
        print("No successful reassessments completed.")

# Configuration
OLLAMA_NODE = "arctrddgxa002"  # Change this to your Ollama node
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"
MODEL = "gemma3-optimized:27b"  # Change this to your model

ollama_config = {
    'base_url': BASE_URL,
    'model': MODEL
}

# File paths - update these to match your files
RESULTS_CSV_PATH = "/data/users2/nblair7/analysis_results/eval_results_new.csv"
QUALITATIVE_RESULTS_PATH = "/data/users2/nblair7/analysis_results/qual_resultsfin.csv"

if __name__ == "__main__":
    # Run the feedback loop reassessment
    run_feedback_loop_reassessment(
        results_csv_path=RESULTS_CSV_PATH,
        qualitative_results_path=QUALITATIVE_RESULTS_PATH,
        ollama_config=ollama_config,
        threshold=3  # Reassess participants with scores <= 3
    )

In [1]:
import os
import pandas as pd
import requests
import time
import json
import re
from requests.exceptions import Timeout, RequestException



def parse_score_and_explanation(response_text):
    """Extract score and explanation from model response"""
    score_patterns = [
        r'score[:\s]*(\d+)',
        r'(\d+)[/\s]*(?:out of\s*)?5',
        r'(\d+)[/\s]*5',
        r'rating[:\s]*(\d+)',
        r'^(\d+)',  # Number at start of line
    ]
    
    score = None
    for pattern in score_patterns:
        match = re.search(pattern, response_text, re.IGNORECASE | re.MULTILINE)
        if match:
            potential_score = int(match.group(1))
            if 1 <= potential_score <= 5:
                score = potential_score
                break
    
    return score, response_text.strip()

# Configuration
OLLAMA_NODE = "arctrddgxa004" # TODO: Change this variable to the node where Ollama is running
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"
model = "gemma3-optimized:27b" # TODO: Change this variable to the model you want to use

# Input file 
input_csv_path = "/data/users2/nblair7/analysis_results/qual_resultsfin.csv"  

#Output files
feedback_assessments_csv = "/data/users2/nblair7/analysis_results/feedback_qualitative_assessments2.csv"  # re-evaluated qualitative assessments
feedback_evaluations_csv = "/data/users2/nblair7/analysis_results/feedback_evaluation_scores2.csv"  # re-evaluated evaluation scores

print("=== ENHANCED FEEDBACK LOOP RE-EVALUATION SYSTEM ===")
print(f"Input file: {input_csv_path}")
print(f"Feedback assessments file: {feedback_assessments_csv}")
print(f"Feedback evaluations file: {feedback_evaluations_csv}")

# Load the CSV file
print("Loading CSV file...")
df = pd.read_csv(input_csv_path)
print(f"Loaded {len(df)} participants")

feedback_assessments = []  # Store re-evaluated qualitative assessments
feedback_evaluations = []  # Store re-evaluated evaluation scores
processed_count = 0
skipped_count = 0
feedback_count = 0
failed_evaluations = []

# Check for existing feedback files to resume processing
completed_subjects = set()
if os.path.exists(feedback_assessments_csv):
    existing_feedback_assessments = pd.read_csv(feedback_assessments_csv)
    completed_subjects.update(existing_feedback_assessments['participant_id'].tolist())
    feedback_assessments = existing_feedback_assessments.to_dict('records')
    print(f"Found existing feedback assessments: {len(feedback_assessments)} records")

if os.path.exists(feedback_evaluations_csv):
    existing_feedback_evaluations = pd.read_csv(feedback_evaluations_csv)
    completed_subjects.update(existing_feedback_evaluations['participant_id'].tolist())
    feedback_evaluations = existing_feedback_evaluations.to_dict('records')
    print(f"Found existing feedback evaluations: {len(feedback_evaluations)} records")

if completed_subjects:
    print(f"Already completed {len(completed_subjects)} subjects")
    df = df[~df['participant_id'].isin(completed_subjects)]
    print(f"Remaining subjects to process: {len(df)}")

for index, row in df.iterrows():
    participant_id = row['participant_id']
    qualitative_assessment = row['qualitative_assessment']
    
    print(f"\n--- Processing {index + 1}/{len(df)}: {participant_id} ---")
    
    # Load transcript for this participant
    id_transcript = os.path.join("/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/", f"{participant_id}_P", f"{participant_id}_TRANSCRIPT.csv")
    print(f"Looking for transcript at: {id_transcript}")
    
    if not os.path.exists(id_transcript):
        print(f"Transcript not found for {participant_id}")
        skipped_count += 1
        continue
    
    print(f"Transcript found, loading data...")
    
    try:
        transcript_data = pd.read_csv(id_transcript, sep='\t')
        dialogue_txt = transcript_data.dropna(subset=['speaker', 'value'])
        dialogue_txt['dialogue'] = dialogue_txt['speaker'] + " : " + dialogue_txt['value']
        transcript = "\n".join(dialogue_txt['dialogue'])
        
        print(f"Dialogue length: {len(transcript)} characters")
        
    except Exception as e:
        print(f"Error loading transcript for {participant_id}: {e}")
        skipped_count += 1
        continue
    
    start_time = time.time()
    
    # Original evaluation prompts (for initial assessment to determine if feedback is needed)
    coherence_prompt = f"""Evaluate the following qualitative assessment output for COHERENCE only. Compare qualitative_assessment to the provided transcript. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.
Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

Coherence (1–5): Is the response logically consistent? Mistakes are logically inconsistent statements or contradictions within the assessment.

Format your response as:
Score: [your score]
Explanation: [explanation]

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    completeness_prompt = f"""Evaluate the following qualitative assessment output for COMPLETENESS only. Compare qualitative_assessment to the provided transcript. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.

Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

Completeness (1–5): Does the assessment cover all relevant symptoms, severities, duration/frequency? Mistakes have missed PHQ-8 symptoms, or duration/frequency details.

Format your response as:
Score: [your score]
Explanation: [explanation]

Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    specificity_prompt = f"""Evaluate the following qualitative assessment output for SPECIFICITY only. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.

Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

specificity (1–5): Is the assessment specific? Mistakes include using vague/generic statements like 'the patient seems depressed'.

Format your response as:
Score: [your score]
Explanation: [explanation]

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    accuracy_prompt = f"""Evaluate the following qualitative assessment output for ACCURACY only. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.

Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

Accuracy (1–5): Are the signs/symptoms aligned with DSM-5 or PHQ-8? Mistakes are incorrect symptoms or incorrect duration/frequecy. 

Format your response as:
Score: [your score]
Explanation: [explanation]

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    # Initial evaluation requests
    coherence_request = {
        "model": model,
        "messages": [{"role": "user", "content": coherence_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    completeness_request = {
        "model": model,
        "messages": [{"role": "user", "content": completeness_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    specificity_request = {
        "model": model,
        "messages": [{"role": "user", "content": specificity_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    accuracy_request = {
        "model": model,
        "messages": [{"role": "user", "content": accuracy_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    timeout = 300  
    
    try:
        # Initial evaluation to check if feedback is needed
        initial_scores = {}
        initial_explanations = {}
        
        # coherence
        print("  Getting initial coherence response...")
        coherence_response = requests.post(BASE_URL, json=coherence_request, timeout=timeout-10)
        if coherence_response.status_code == 200:
            coherence_content = coherence_response.json()['message']['content']
            coherence_score, _ = parse_score_and_explanation(coherence_content)
            initial_scores['coherence'] = coherence_score
            initial_explanations['coherence'] = coherence_content
            print(f"  Initial coherence score: {coherence_score}")
        else:
            initial_scores['coherence'] = None
            initial_explanations['coherence'] = None
        
        time.sleep(2)
        
        # completeness
        print("  Getting initial completeness response...")
        completeness_response = requests.post(BASE_URL, json=completeness_request, timeout=timeout-10)
        if completeness_response.status_code == 200:
            completeness_content = completeness_response.json()['message']['content']
            completeness_score, _ = parse_score_and_explanation(completeness_content)
            initial_scores['completeness'] = completeness_score
            initial_explanations['completeness'] = completeness_content
            print(f"  Initial completeness score: {completeness_score}")
        else:
            initial_scores['completeness'] = None
            initial_explanations['completeness'] = None
        
        time.sleep(2)
        
        # specificity
        print("  Getting initial specificity response...")
        specificity_response = requests.post(BASE_URL, json=specificity_request, timeout=timeout-10)
        if specificity_response.status_code == 200:
            specificity_content = specificity_response.json()['message']['content']
            specificity_score, _ = parse_score_and_explanation(specificity_content)
            initial_scores['specificity'] = specificity_score
            initial_explanations['specificity'] = specificity_content
            print(f"  Initial specificity score: {specificity_score}")
        else:
            initial_scores['specificity'] = None
            initial_explanations['specificity'] = None
        
        time.sleep(2)
        
        # accuracy
        print("  Getting initial accuracy response...")
        accuracy_response = requests.post(BASE_URL, json=accuracy_request, timeout=timeout-10)
        if accuracy_response.status_code == 200:
            accuracy_content = accuracy_response.json()['message']['content']
            accuracy_score, _ = parse_score_and_explanation(accuracy_content)
            initial_scores['accuracy'] = accuracy_score
            initial_explanations['accuracy'] = accuracy_content
            print(f"  Initial accuracy score: {accuracy_score}")
        else:
            initial_scores['accuracy'] = None
            initial_explanations['accuracy'] = None
        
        time.sleep(2)
        
        # Check if any score is <= 3 to trigger feedback loop
        low_scores = []
        for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
            if initial_scores[metric] and initial_scores[metric] <= 3:
                low_scores.append(metric)
        
        if low_scores:
            print(f"  LOW SCORES DETECTED: {low_scores} - Starting enhanced feedback loop...")
            feedback_count += 1
            
            # Keep track of all attempts for this participant
            iteration = 0
            max_iterations = 10  # Prevent infinite loops
            current_assessment = qualitative_assessment
            current_scores = initial_scores.copy()
            current_explanations = initial_explanations.copy()
            all_low_scores = []  # Track which scores were low across iterations
            
            # Feedback loop - keep trying until all scores are 4 or 5
            while low_scores and iteration < max_iterations:
                iteration += 1
                print(f"    Iteration {iteration}: Improving assessment for low scores: {low_scores}")
                all_low_scores.extend(low_scores)
                
                # Build feedback message for low scoring metrics
                feedback_details = []
                for metric in low_scores:
                    if current_scores[metric] and current_explanations[metric]:
                        metric_description = {
                            'coherence': 'logical consistency (avoiding contradictions within the assessment)',
                            'completeness': 'comprehensive coverage (including all relevant PHQ-8 symptoms, severities, and duration/frequency details)',
                            'specificity': 'specific details (avoiding vague/generic statements like "the patient seems depressed")',
                            'accuracy': 'accuracy according to DSM-5 or PHQ-8 criteria (correct symptoms and duration/frequency)'
                        }
                        feedback_details.append(f"""
- **{metric.upper()}** scored {current_scores[metric]}/5 because: {current_explanations[metric]}
  Please specifically improve the {metric_description[metric]} in your revised assessment.""")
                
                # Enhanced regeneration prompt with feedback context
                system_prompt = "You are a psychiatrist. Your job is to assess and diagnose patients for depression. Be as objective, concise, and accurate as possible."
                
                feedback_prompt = f"""The following is an interview transcript from a psychiatric assessment of a subject who is being screened for depression. Please note that 'Ellie' is the interviewer, and 'Participant' is the subject being assessed.

TRANSCRIPT:
{transcript}

ORIGINAL ASSESSMENT:
{qualitative_assessment}

EVALUATION FEEDBACK:
The original assessment has been evaluated and needs improvement in the following areas:
{chr(10).join(feedback_details)}

TASK:
Please reassess the transcript and provide an improved qualitative assessment that addresses the specific feedback above. Build upon the original assessment but focus on improving the identified weaknesses.

Please:
1. Provide an overall qualitative assessment of the subject's mental health. 
2. Summarize social aspects that may influence the subject's mental health. (ex. familial relationships, friendship dynamics, work environment, etc. that are relevant to the subject's mental health)
3. Summarize biological aspects that may influence the subject's mental health. (ex. familial history of mental health issues, previous or pre-existing mental health issues, stress levels, etc. that are relevant to the subject's mental health)
4. Identify potential risk factors the subject may be experiencing.
5. Use exact quotes from the transcript to support your assessment for each tag.

Output should be formatted as bullet points with headings for each section using stars. Example: **Tiredness** <explanation of tiredness>. Do not include any additional text outside the bullet points.

Please answer in this XML format with each tag on a new line, properly indented. Use straight quotes instead of curly quotes, and do not include any additional text outside the XML tags:

<assessment>
  <!-- Summary of participant's overall mental health -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</assessment>

<social_factors>
  <!-- Summary of social influences on patient's health -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</social_factors>

<biological_factors>
  <!-- Summary of biological influences on patient's health -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</biological_factors>

<risk_factors>
  <!-- Summary of potential risk factors -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</risk_factors>
"""
                
                feedback_request = {
                    "model": model,
                    "messages": [{"role": "system", "content": system_prompt},
                               {"role": "user", "content": feedback_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                
                feedback_response = requests.post(BASE_URL, json=feedback_request, timeout=timeout)
                if feedback_response.status_code != 200:
                    print(f"    Failed to regenerate assessment: {feedback_response.status_code}")
                    failed_evaluations.append(participant_id)
                    break
                
                current_assessment = feedback_response.json()['message']['content']
                print(f"    New assessment generated with targeted improvements, re-evaluating...")
                
                # Re-evaluate with new assessment
                new_coherence_prompt = coherence_prompt.replace(qualitative_assessment, current_assessment)
                new_completeness_prompt = completeness_prompt.replace(qualitative_assessment, current_assessment)
                new_specificity_prompt = specificity_prompt.replace(qualitative_assessment, current_assessment)
                new_accuracy_prompt = accuracy_prompt.replace(qualitative_assessment, current_assessment)
                
                # Store new scores and explanations
                new_scores = {}
                new_explanations = {}
                
                # Re-evaluate coherence
                time.sleep(2)
                new_coherence_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_coherence_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_coherence_response = requests.post(BASE_URL, json=new_coherence_request, timeout=timeout-10)
                if new_coherence_response.status_code == 200:
                    new_coherence_content = new_coherence_response.json()['message']['content']
                    new_coherence_score, _ = parse_score_and_explanation(new_coherence_content)
                    new_scores['coherence'] = new_coherence_score
                    new_explanations['coherence'] = new_coherence_content
                
                # Re-evaluate completeness
                time.sleep(2)
                new_completeness_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_completeness_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_completeness_response = requests.post(BASE_URL, json=new_completeness_request, timeout=timeout-10)
                if new_completeness_response.status_code == 200:
                    new_completeness_content = new_completeness_response.json()['message']['content']
                    new_completeness_score, _ = parse_score_and_explanation(new_completeness_content)
                    new_scores['completeness'] = new_completeness_score
                    new_explanations['completeness'] = new_completeness_content
                
                # Re-evaluate specificity
                time.sleep(2)
                new_specificity_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_specificity_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_specificity_response = requests.post(BASE_URL, json=new_specificity_request, timeout=timeout-10)
                if new_specificity_response.status_code == 200:
                    new_specificity_content = new_specificity_response.json()['message']['content']
                    new_specificity_score, _ = parse_score_and_explanation(new_specificity_content)
                    new_scores['specificity'] = new_specificity_score
                    new_explanations['specificity'] = new_specificity_content
                
                # Re-evaluate accuracy
                time.sleep(2)
                new_accuracy_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_accuracy_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_accuracy_response = requests.post(BASE_URL, json=new_accuracy_request, timeout=timeout-10)
                if new_accuracy_response.status_code == 200:
                    new_accuracy_content = new_accuracy_response.json()['message']['content']
                    new_accuracy_score, _ = parse_score_and_explanation(new_accuracy_content)
                    new_scores['accuracy'] = new_accuracy_score
                    new_explanations['accuracy'] = new_accuracy_content
                
                # Update current scores and explanations for next iteration
                current_scores.update(new_scores)
                current_explanations.update(new_explanations)
                
                # Check which scores are still low
                low_scores = []
                for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
                    if metric in new_scores and new_scores[metric] and new_scores[metric] <= 3:
                        low_scores.append(metric)
                
                # Print current scores
                print(f"    Iteration {iteration} scores: " + 
                      f"Coherence={current_scores.get('coherence', 'N/A')}, " +
                      f"Completeness={current_scores.get('completeness', 'N/A')}, " +
                      f"Specificity={current_scores.get('specificity', 'N/A')}, " +
                      f"Accuracy={current_scores.get('accuracy', 'N/A')}")
                
                if low_scores:
                    print(f"    Still have low scores: {low_scores}, continuing with targeted feedback...")
                else:
                    print(f"    All scores now 4 or 5! Enhanced feedback loop complete after {iteration} iterations.")
            
            # Save final results after feedback loop completes
            if iteration >= max_iterations:
                print(f"    Reached max iterations ({max_iterations}), stopping feedback loop")
            
            # Save the final qualitative assessment
            feedback_assessment_record = {
                'participant_id': participant_id,
                'original_qualitative_assessment': qualitative_assessment,
                'feedback_qualitative_assessment': current_assessment,
                'iterations_required': iteration,
                'low_scores_detected': ', '.join(sorted(set(all_low_scores))),
                'initial_coherence_score': initial_scores.get('coherence'),
                'initial_completeness_score': initial_scores.get('completeness'),
                'initial_specificity_score': initial_scores.get('specificity'),
                'initial_accuracy_score': initial_scores.get('accuracy'),
                'final_coherence_score': current_scores.get('coherence'),
                'final_completeness_score': current_scores.get('completeness'),
                'final_specificity_score': current_scores.get('specificity'),
                'final_accuracy_score': current_scores.get('accuracy')
            }
            feedback_assessments.append(feedback_assessment_record)
            
            # Save the final evaluation scores
            feedback_eval_record = {
                'participant_id': participant_id,
                'iterations_required': iteration,
                'low_scores_detected': ', '.join(sorted(set(all_low_scores)))
            }
            
            # Add final scores and explanations to record
            for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
                if metric in current_scores:
                    feedback_eval_record[f'final_{metric}_score'] = current_scores[metric]
                    feedback_eval_record[f'final_{metric}_explanation'] = current_explanations[metric]
                if metric in initial_scores:
                    feedback_eval_record[f'initial_{metric}_score'] = initial_scores[metric]
                    feedback_eval_record[f'initial_{metric}_explanation'] = initial_explanations[metric]
            
            feedback_evaluations.append(feedback_eval_record)
            processed_count += 1
        else:
            print(f"  No low scores detected - skipping feedback loop")
        
        elapsed_time = time.time() - start_time
        print(f"Completed participant {participant_id} in {elapsed_time:.1f}s ({processed_count} with feedback applied)")
            
    except Exception as e:
        print(f"Error processing participant {participant_id}: {e}")
        failed_evaluations.append(participant_id)
    
    # Save progress every 10 participants
    if (len(feedback_assessments) % 10 == 0 and len(feedback_assessments) > 0) or len(feedback_assessments) == 1:
        # Save feedback assessments
        if feedback_assessments:
            feedback_assessments_df = pd.DataFrame(feedback_assessments)
            feedback_assessments_df.to_csv(feedback_assessments_csv, index=False)
            print(f"Saved feedback assessments: {len(feedback_assessments)} records to {feedback_assessments_csv}")
        
        # Save feedback evaluations
        if feedback_evaluations:
            feedback_evaluations_df = pd.DataFrame(feedback_evaluations)
            feedback_evaluations_df.to_csv(feedback_evaluations_csv, index=False)
            print(f"Saved feedback evaluations: {len(feedback_evaluations)} records to {feedback_evaluations_csv}")
    
    time.sleep(1)

# Final summary
print(f"\n=== PROCESSING SUMMARY ===")
print(f"Total subjects processed: {len(df)}")
print(f"Skipped (no transcript): {skipped_count}")
print(f"Feedback loops applied: {feedback_count}")
print(f"Successfully processed with feedback: {processed_count}")
print(f"Failed: {len(failed_evaluations)}")
print(f"Feedback assessments created: {len(feedback_assessments)}")
print(f"Feedback evaluations created: {len(feedback_evaluations)}")

if failed_evaluations:
    print(f"Failed participant IDs: {failed_evaluations}")

# Save final feedback files
if feedback_assessments:
    feedback_assessments_df = pd.DataFrame(feedback_assessments)
    feedback_assessments_df.to_csv(feedback_assessments_csv, index=False)
    print(f"Final feedback assessments saved: {feedback_assessments_csv}")
    print(f"Enhanced feedback assessments CSV columns:")
    print(f"- participant_id")
    print(f"- original_qualitative_assessment")
    print(f"- feedback_qualitative_assessment")
    print(f"- iterations_required")
    print(f"- low_scores_detected")
    print(f"- initial_[metric]_score and final_[metric]_score for comparison")

if feedback_evaluations:
    feedback_evaluations_df = pd.DataFrame(feedback_evaluations)
    feedback_evaluations_df.to_csv(feedback_evaluations_csv, index=False)
    print(f"Final feedback evaluations saved: {feedback_evaluations_csv}")
    print(f"Enhanced feedback evaluations CSV columns:")
    print(f"- participant_id")
    print(f"- iterations_required")
    print(f"- initial_[metric]_score / final_[metric]_score")
    print(f"- initial_[metric]_explanation / final_[metric]_explanation")
    print(f"- low_scores_detected")

if not feedback_assessments and not feedback_evaluations:
    print("No participants required feedback - no CSV files created!")

=== ENHANCED FEEDBACK LOOP RE-EVALUATION SYSTEM ===
Input file: /data/users2/nblair7/analysis_results/qual_resultsfin.csv
Feedback assessments file: /data/users2/nblair7/analysis_results/feedback_qualitative_assessments2.csv
Feedback evaluations file: /data/users2/nblair7/analysis_results/feedback_evaluation_scores2.csv
Loading CSV file...
Loaded 142 participants

--- Processing 1/142: 303 ---
Looking for transcript at: /data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/303_P/303_TRANSCRIPT.csv
Transcript found, loading data...
Dialogue length: 14183 characters
  Getting initial coherence response...


KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
import requests
import time
import json
import re
from requests.exceptions import Timeout, RequestException



def parse_score_and_explanation(response_text):
    """Extract score and explanation from model response"""
    score_patterns = [
        r'score[:\s]*(\d+)',
        r'(\d+)[/\s]*(?:out of\s*)?5',
        r'(\d+)[/\s]*5',
        r'rating[:\s]*(\d+)',
        r'^(\d+)',  # Number at start of line
    ]
    
    score = None
    for pattern in score_patterns:
        match = re.search(pattern, response_text, re.IGNORECASE | re.MULTILINE)
        if match:
            potential_score = int(match.group(1))
            if 1 <= potential_score <= 5:
                score = potential_score
                break
    
    return score, response_text.strip()

# Configuration
OLLAMA_NODE = "arctrddgxa004" # TODO: Change this variable to the node where Ollama is running
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"
model = "gemma3-optimized:27b" # TODO: Change this variable to the model you want to use

# FAILED IDs TO PROCESS - Add your failed participant IDs here
FAILED_IDS = [
    380, 383, 385, 386, 391, 392, 393, 397, 400, 401, 402, 409, 412, 414, 415, 416, 419, 423, 425, 426, 427, 428, 429, 430, 433, 434, 437, 441, 443, 444, 445, 446, 447, 448, 449, 454, 455, 456, 457, 459, 463, 464, 468, 471, 473, 474, 475, 478, 479, 485, 486, 487, 488, 491, 302, 307, 331, 335, 346, 367, 377, 381, 382, 388, 389, 390, 395, 403, 404, 406, 413, 417, 418, 420, 422, 436, 439, 440, 451, 458, 472, 476, 477, 482, 483, 484, 489, 490, 492
]

# Input file 
input_csv_path = "/data/users2/nblair7/analysis_results/qual_resultsfin.csv"  

#Output files
feedback_assessments_csv = "/data/users2/nblair7/analysis_results/ASSESSMENTT.csv"  # re-evaluated qualitative assessments
feedback_evaluations_csv = "/data/users2/nblair7/analysis_results/SCOREST.csv"  # re-evaluated evaluation scores

print("=== ENHANCED FEEDBACK LOOP RE-EVALUATION SYSTEM (FAILED IDs ONLY) ===")
print(f"Input file: {input_csv_path}")
print(f"Failed IDs to process: {FAILED_IDS}")
print(f"Feedback assessments file: {feedback_assessments_csv}")
print(f"Feedback evaluations file: {feedback_evaluations_csv}")

# Load the CSV file
print("Loading CSV file...")
df = pd.read_csv(input_csv_path)
print(f"Loaded {len(df)} participants")

# Filter to only process failed IDs
if FAILED_IDS:
    df = df[df['participant_id'].astype(str).isin([str(pid) for pid in FAILED_IDS])]
    print(f"Filtered to {len(df)} failed participants to reprocess")
else:
    print("WARNING: No failed IDs specified in FAILED_IDS list. Please add them to the FAILED_IDS variable.")
    exit(1)

feedback_assessments = []  # Store re-evaluated qualitative assessments
feedback_evaluations = []  # Store re-evaluated evaluation scores
processed_count = 0
skipped_count = 0
feedback_count = 0
failed_evaluations = []

# Check for existing feedback files to resume processing
completed_subjects = set()
if os.path.exists(feedback_assessments_csv):
    existing_feedback_assessments = pd.read_csv(feedback_assessments_csv)
    completed_subjects.update(existing_feedback_assessments['participant_id'].tolist())
    feedback_assessments = existing_feedback_assessments.to_dict('records')
    print(f"Found existing feedback assessments: {len(feedback_assessments)} records")

if os.path.exists(feedback_evaluations_csv):
    existing_feedback_evaluations = pd.read_csv(feedback_evaluations_csv)
    completed_subjects.update(existing_feedback_evaluations['participant_id'].tolist())
    feedback_evaluations = existing_feedback_evaluations.to_dict('records')
    print(f"Found existing feedback evaluations: {len(feedback_evaluations)} records")

if completed_subjects:
    print(f"Already completed {len(completed_subjects)} subjects")
    df = df[~df['participant_id'].isin(completed_subjects)]
    print(f"Remaining subjects to process: {len(df)}")

for index, row in df.iterrows():
    participant_id = row['participant_id']
    qualitative_assessment = row['qualitative_assessment']
    
    print(f"\n--- Processing {index + 1}/{len(df)}: {participant_id} ---")
    
    # Load transcript for this participant
    id_transcript = os.path.join("/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/", f"{participant_id}_P", f"{participant_id}_TRANSCRIPT.csv")
    print(f"Looking for transcript at: {id_transcript}")
    
    if not os.path.exists(id_transcript):
        print(f"Transcript not found for {participant_id}")
        skipped_count += 1
        continue
    
    print(f"Transcript found, loading data...")
    
    try:
        transcript_data = pd.read_csv(id_transcript, sep='\t')
        dialogue_txt = transcript_data.dropna(subset=['speaker', 'value'])
        dialogue_txt['dialogue'] = dialogue_txt['speaker'] + " : " + dialogue_txt['value']
        transcript = "\n".join(dialogue_txt['dialogue'])
        
        print(f"Dialogue length: {len(transcript)} characters")
        
    except Exception as e:
        print(f"Error loading transcript for {participant_id}: {e}")
        skipped_count += 1
        continue
    
    start_time = time.time()
    
    # Original evaluation prompts (for initial assessment to determine if feedback is needed)
    coherence_prompt = f"""Evaluate the following qualitative assessment output for COHERENCE only. Compare qualitative_assessment to the provided transcript. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.
Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

Coherence (1–5): Is the response logically consistent? Mistakes are logically inconsistent statements or contradictions within the assessment.

Format your response as:
Score: [your score]
Explanation: [explanation]

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    completeness_prompt = f"""Evaluate the following qualitative assessment output for COMPLETENESS only. Compare qualitative_assessment to the provided transcript. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.

Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

Completeness (1–5): Does the assessment cover all relevant symptoms, severities, duration/frequency? Mistakes have missed PHQ-8 symptoms, or duration/frequency details.

Format your response as:
Score: [your score]
Explanation: [explanation]

Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    specificity_prompt = f"""Evaluate the following qualitative assessment output for SPECIFICITY only. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.

Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

specificity (1–5): Is the assessment specific? Mistakes include using vague/generic statements like 'the patient seems depressed'.

Format your response as:
Score: [your score]
Explanation: [explanation]

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    accuracy_prompt = f"""Evaluate the following qualitative assessment output for ACCURACY only. Consider the qualitative assessment (qualitative_assessment) of the transcript (transcript) and compare qualitative_assessment to the provided transcript.

Score from 1-5 solely based on the following criteria: 
 - score of 5 : 0 mistakes
 - score of 4 : 1-2 mistakes
 - score of 3 : 3-4 mistakes
 - score of 2 : 5-6 mistakes
 - score of 1 : 7 or more mistakes

Accuracy (1–5): Are the signs/symptoms aligned with DSM-5 or PHQ-8? Mistakes are incorrect symptoms or incorrect duration/frequecy. 

Format your response as:
Score: [your score]
Explanation: [explanation]

---
Here is the transcript: 
{transcript}

Here is the assessment based on the transcript: 
{qualitative_assessment}
---"""

    # Initial evaluation requests
    coherence_request = {
        "model": model,
        "messages": [{"role": "user", "content": coherence_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    completeness_request = {
        "model": model,
        "messages": [{"role": "user", "content": completeness_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    specificity_request = {
        "model": model,
        "messages": [{"role": "user", "content": specificity_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    accuracy_request = {
        "model": model,
        "messages": [{"role": "user", "content": accuracy_prompt}],
        "stream": False,
        "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
    }
    
    timeout = 300  
    
    try:
        # Initial evaluation to check if feedback is needed
        initial_scores = {}
        initial_explanations = {}
        
        # coherence
        print("  Getting initial coherence response...")
        coherence_response = requests.post(BASE_URL, json=coherence_request, timeout=timeout-10)
        if coherence_response.status_code == 200:
            coherence_content = coherence_response.json()['message']['content']
            coherence_score, _ = parse_score_and_explanation(coherence_content)
            initial_scores['coherence'] = coherence_score
            initial_explanations['coherence'] = coherence_content
            print(f"  Initial coherence score: {coherence_score}")
        else:
            initial_scores['coherence'] = None
            initial_explanations['coherence'] = None
        
        time.sleep(2)
        
        # completeness
        print("  Getting initial completeness response...")
        completeness_response = requests.post(BASE_URL, json=completeness_request, timeout=timeout-10)
        if completeness_response.status_code == 200:
            completeness_content = completeness_response.json()['message']['content']
            completeness_score, _ = parse_score_and_explanation(completeness_content)
            initial_scores['completeness'] = completeness_score
            initial_explanations['completeness'] = completeness_content
            print(f"  Initial completeness score: {completeness_score}")
        else:
            initial_scores['completeness'] = None
            initial_explanations['completeness'] = None
        
        time.sleep(2)
        
        # specificity
        print("  Getting initial specificity response...")
        specificity_response = requests.post(BASE_URL, json=specificity_request, timeout=timeout-10)
        if specificity_response.status_code == 200:
            specificity_content = specificity_response.json()['message']['content']
            specificity_score, _ = parse_score_and_explanation(specificity_content)
            initial_scores['specificity'] = specificity_score
            initial_explanations['specificity'] = specificity_content
            print(f"  Initial specificity score: {specificity_score}")
        else:
            initial_scores['specificity'] = None
            initial_explanations['specificity'] = None
        
        time.sleep(2)
        
        # accuracy
        print("  Getting initial accuracy response...")
        accuracy_response = requests.post(BASE_URL, json=accuracy_request, timeout=timeout-10)
        if accuracy_response.status_code == 200:
            accuracy_content = accuracy_response.json()['message']['content']
            accuracy_score, _ = parse_score_and_explanation(accuracy_content)
            initial_scores['accuracy'] = accuracy_score
            initial_explanations['accuracy'] = accuracy_content
            print(f"  Initial accuracy score: {accuracy_score}")
        else:
            initial_scores['accuracy'] = None
            initial_explanations['accuracy'] = None
        
        time.sleep(2)
        
        # Check if any score is <= 3 to trigger feedback loop
        low_scores = []
        for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
            if initial_scores[metric] and initial_scores[metric] <= 3:
                low_scores.append(metric)
        
        if low_scores:
            print(f"  LOW SCORES DETECTED: {low_scores} - Starting enhanced feedback loop...")
            feedback_count += 1
            
            # Keep track of all attempts for this participant
            iteration = 0
            max_iterations = 10  # Prevent infinite loops
            current_assessment = qualitative_assessment
            current_scores = initial_scores.copy()
            current_explanations = initial_explanations.copy()
            all_low_scores = []  # Track which scores were low across iterations
            
            # Feedback loop - keep trying until all scores are 4 or 5
            while low_scores and iteration < max_iterations:
                iteration += 1
                print(f"    Iteration {iteration}: Improving assessment for low scores: {low_scores}")
                all_low_scores.extend(low_scores)
                
                # Build feedback message for low scoring metrics
                feedback_details = []
                for metric in low_scores:
                    if current_scores[metric] and current_explanations[metric]:
                        metric_description = {
                            'coherence': 'logical consistency (avoiding contradictions within the assessment)',
                            'completeness': 'comprehensive coverage (including all relevant PHQ-8 symptoms, severities, and duration/frequency details)',
                            'specificity': 'specific details (avoiding vague/generic statements like "the patient seems depressed")',
                            'accuracy': 'accuracy according to DSM-5 or PHQ-8 criteria (correct symptoms and duration/frequency)'
                        }
                        feedback_details.append(f"""
- **{metric.upper()}** scored {current_scores[metric]}/5 because: {current_explanations[metric]}
  Please specifically improve the {metric_description[metric]} in your revised assessment.""")
                
                # Enhanced regeneration prompt with feedback context
                system_prompt = "You are a psychiatrist. Your job is to assess and diagnose patients for depression. Be as objective, concise, and accurate as possible."
                
                feedback_prompt = f"""The following is an interview transcript from a psychiatric assessment of a subject who is being screened for depression. Please note that 'Ellie' is the interviewer, and 'Participant' is the subject being assessed.

TRANSCRIPT:
{transcript}

ORIGINAL ASSESSMENT:
{qualitative_assessment}

EVALUATION FEEDBACK:
The original assessment has been evaluated and needs improvement in the following areas:
{chr(10).join(feedback_details)}

TASK:
Please reassess the transcript and provide an improved qualitative assessment that addresses the specific feedback above. Build upon the original assessment but focus on improving the identified weaknesses.

Please:
1. Provide an overall qualitative assessment of the subject's mental health. 
2. Summarize social aspects that may influence the subject's mental health. (ex. familial relationships, friendship dynamics, work environment, etc. that are relevant to the subject's mental health)
3. Summarize biological aspects that may influence the subject's mental health. (ex. familial history of mental health issues, previous or pre-existing mental health issues, stress levels, etc. that are relevant to the subject's mental health)
4. Identify potential risk factors the subject may be experiencing.
5. Use exact quotes from the transcript to support your assessment for each tag.

Output should be formatted as bullet points with headings for each section using stars. Example: **Tiredness** <explanation of tiredness>. Do not include any additional text outside the bullet points.

Please answer in this XML format with each tag on a new line, properly indented. Use straight quotes instead of curly quotes, and do not include any additional text outside the XML tags:

<assessment>
  <!-- Summary of participant's overall mental health -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</assessment>

<social_factors>
  <!-- Summary of social influences on patient's health -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</social_factors>

<biological_factors>
  <!-- Summary of biological influences on patient's health -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</biological_factors>

<risk_factors>
  <!-- Summary of potential risk factors -->
  <exact_quotes>
    <!-- Quotes from the transcript that support the assessment -->
  </exact_quotes>
</risk_factors>
"""
                
                feedback_request = {
                    "model": model,
                    "messages": [{"role": "system", "content": system_prompt},
                               {"role": "user", "content": feedback_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                
                feedback_response = requests.post(BASE_URL, json=feedback_request, timeout=timeout)
                if feedback_response.status_code != 200:
                    print(f"    Failed to regenerate assessment: {feedback_response.status_code}")
                    failed_evaluations.append(participant_id)
                    break
                
                current_assessment = feedback_response.json()['message']['content']
                print(f"    New assessment generated with targeted improvements, re-evaluating...")
                
                # Re-evaluate with new assessment
                new_coherence_prompt = coherence_prompt.replace(qualitative_assessment, current_assessment)
                new_completeness_prompt = completeness_prompt.replace(qualitative_assessment, current_assessment)
                new_specificity_prompt = specificity_prompt.replace(qualitative_assessment, current_assessment)
                new_accuracy_prompt = accuracy_prompt.replace(qualitative_assessment, current_assessment)
                
                # Store new scores and explanations
                new_scores = {}
                new_explanations = {}
                
                # Re-evaluate coherence
                time.sleep(2)
                new_coherence_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_coherence_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_coherence_response = requests.post(BASE_URL, json=new_coherence_request, timeout=timeout-10)
                if new_coherence_response.status_code == 200:
                    new_coherence_content = new_coherence_response.json()['message']['content']
                    new_coherence_score, _ = parse_score_and_explanation(new_coherence_content)
                    new_scores['coherence'] = new_coherence_score
                    new_explanations['coherence'] = new_coherence_content
                
                # Re-evaluate completeness
                time.sleep(2)
                new_completeness_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_completeness_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_completeness_response = requests.post(BASE_URL, json=new_completeness_request, timeout=timeout-10)
                if new_completeness_response.status_code == 200:
                    new_completeness_content = new_completeness_response.json()['message']['content']
                    new_completeness_score, _ = parse_score_and_explanation(new_completeness_content)
                    new_scores['completeness'] = new_completeness_score
                    new_explanations['completeness'] = new_completeness_content
                
                # Re-evaluate specificity
                time.sleep(2)
                new_specificity_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_specificity_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_specificity_response = requests.post(BASE_URL, json=new_specificity_request, timeout=timeout-10)
                if new_specificity_response.status_code == 200:
                    new_specificity_content = new_specificity_response.json()['message']['content']
                    new_specificity_score, _ = parse_score_and_explanation(new_specificity_content)
                    new_scores['specificity'] = new_specificity_score
                    new_explanations['specificity'] = new_specificity_content
                
                # Re-evaluate accuracy
                time.sleep(2)
                new_accuracy_request = {
                    "model": model,
                    "messages": [{"role": "user", "content": new_accuracy_prompt}],
                    "stream": False,
                    "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
                }
                new_accuracy_response = requests.post(BASE_URL, json=new_accuracy_request, timeout=timeout-10)
                if new_accuracy_response.status_code == 200:
                    new_accuracy_content = new_accuracy_response.json()['message']['content']
                    new_accuracy_score, _ = parse_score_and_explanation(new_accuracy_content)
                    new_scores['accuracy'] = new_accuracy_score
                    new_explanations['accuracy'] = new_accuracy_content
                
                # Update current scores and explanations for next iteration
                current_scores.update(new_scores)
                current_explanations.update(new_explanations)
                
                # Check which scores are still low
                low_scores = []
                for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
                    if metric in new_scores and new_scores[metric] and new_scores[metric] <= 3:
                        low_scores.append(metric)
                
                # Print current scores
                print(f"    Iteration {iteration} scores: " + 
                      f"Coherence={current_scores.get('coherence', 'N/A')}, " +
                      f"Completeness={current_scores.get('completeness', 'N/A')}, " +
                      f"Specificity={current_scores.get('specificity', 'N/A')}, " +
                      f"Accuracy={current_scores.get('accuracy', 'N/A')}")
                
                if low_scores:
                    print(f"    Still have low scores: {low_scores}, continuing with targeted feedback...")
                else:
                    print(f"    All scores now 4 or 5! Enhanced feedback loop complete after {iteration} iterations.")
            
            # Save final results after feedback loop completes
            if iteration >= max_iterations:
                print(f"    Reached max iterations ({max_iterations}), stopping feedback loop")
            
            # Save the final qualitative assessment
            feedback_assessment_record = {
                'participant_id': participant_id,
                'original_qualitative_assessment': qualitative_assessment,
                'feedback_qualitative_assessment': current_assessment,
                'iterations_required': iteration,
                'low_scores_detected': ', '.join(sorted(set(all_low_scores))),
                'initial_coherence_score': initial_scores.get('coherence'),
                'initial_completeness_score': initial_scores.get('completeness'),
                'initial_specificity_score': initial_scores.get('specificity'),
                'initial_accuracy_score': initial_scores.get('accuracy'),
                'final_coherence_score': current_scores.get('coherence'),
                'final_completeness_score': current_scores.get('completeness'),
                'final_specificity_score': current_scores.get('specificity'),
                'final_accuracy_score': current_scores.get('accuracy')
            }
            feedback_assessments.append(feedback_assessment_record)
            
            # Save the final evaluation scores
            feedback_eval_record = {
                'participant_id': participant_id,
                'iterations_required': iteration,
                'low_scores_detected': ', '.join(sorted(set(all_low_scores)))
            }
            
            # Add final scores and explanations to record
            for metric in ['coherence', 'completeness', 'specificity', 'accuracy']:
                if metric in current_scores:
                    feedback_eval_record[f'final_{metric}_score'] = current_scores[metric]
                    feedback_eval_record[f'final_{metric}_explanation'] = current_explanations[metric]
                if metric in initial_scores:
                    feedback_eval_record[f'initial_{metric}_score'] = initial_scores[metric]
                    feedback_eval_record[f'initial_{metric}_explanation'] = initial_explanations[metric]
            
            feedback_evaluations.append(feedback_eval_record)
            processed_count += 1
        else:
            print(f"  No low scores detected - skipping feedback loop")
        
        elapsed_time = time.time() - start_time
        print(f"Completed participant {participant_id} in {elapsed_time:.1f}s ({processed_count} with feedback applied)")
            
    except Exception as e:
        print(f"Error processing participant {participant_id}: {e}")
        failed_evaluations.append(participant_id)
    
    # Save progress every 10 participants
    if (len(feedback_assessments) % 10 == 0 and len(feedback_assessments) > 0) or len(feedback_assessments) == 1:
        # Save feedback assessments
        if feedback_assessments:
            feedback_assessments_df = pd.DataFrame(feedback_assessments)
            feedback_assessments_df.to_csv(feedback_assessments_csv, index=False)
            print(f"Saved feedback assessments: {len(feedback_assessments)} records to {feedback_assessments_csv}")
        
        # Save feedback evaluations
        if feedback_evaluations:
            feedback_evaluations_df = pd.DataFrame(feedback_evaluations)
            feedback_evaluations_df.to_csv(feedback_evaluations_csv, index=False)
            print(f"Saved feedback evaluations: {len(feedback_evaluations)} records to {feedback_evaluations_csv}")
    
    time.sleep(1)

# Final summary
print(f"\n=== PROCESSING SUMMARY ===")
print(f"Total subjects processed: {len(df)}")
print(f"Skipped (no transcript): {skipped_count}")
print(f"Feedback loops applied: {feedback_count}")
print(f"Successfully processed with feedback: {processed_count}")
print(f"Failed: {len(failed_evaluations)}")
print(f"Feedback assessments created: {len(feedback_assessments)}")
print(f"Feedback evaluations created: {len(feedback_evaluations)}")

if failed_evaluations:
    print(f"Failed participant IDs: {failed_evaluations}")

# Save final feedback files
if feedback_assessments:
    feedback_assessments_df = pd.DataFrame(feedback_assessments)
    feedback_assessments_df.to_csv(feedback_assessments_csv, index=False)
    print(f"Final feedback assessments saved: {feedback_assessments_csv}")
    print(f"Enhanced feedback assessments CSV columns:")
    print(f"- participant_id")
    print(f"- original_qualitative_assessment")
    print(f"- feedback_qualitative_assessment")
    print(f"- iterations_required")
    print(f"- low_scores_detected")
    print(f"- initial_[metric]_score and final_[metric]_score for comparison")

if feedback_evaluations:
    feedback_evaluations_df = pd.DataFrame(feedback_evaluations)
    feedback_evaluations_df.to_csv(feedback_evaluations_csv, index=False)
    print(f"Final feedback evaluations saved: {feedback_evaluations_csv}")
    print(f"Enhanced feedback evaluations CSV columns:")
    print(f"- participant_id")
    print(f"- iterations_required")
    print(f"- initial_[metric]_score / final_[metric]_score")
    print(f"- initial_[metric]_explanation / final_[metric]_explanation")
    print(f"- low_scores_detected")

if not feedback_assessments and not feedback_evaluations:
    print("No participants required feedback - no CSV files created!")

=== ENHANCED FEEDBACK LOOP RE-EVALUATION SYSTEM (FAILED IDs ONLY) ===
Input file: /data/users2/nblair7/analysis_results/qual_resultsfin.csv
Failed IDs to process: [380, 383, 385, 386, 391, 392, 393, 397, 400, 401, 402, 409, 412, 414, 415, 416, 419, 423, 425, 426, 427, 428, 429, 430, 433, 434, 437, 441, 443, 444, 445, 446, 447, 448, 449, 454, 455, 456, 457, 459, 463, 464, 468, 471, 473, 474, 475, 478, 479, 485, 486, 487, 488, 491, 302, 307, 331, 335, 346, 367, 377, 381, 382, 388, 389, 390, 395, 403, 404, 406, 413, 417, 418, 420, 422, 436, 439, 440, 451, 458, 472, 476, 477, 482, 483, 484, 489, 490, 492]
Feedback assessments file: /data/users2/nblair7/analysis_results/ASSESSMENTT.csv
Feedback evaluations file: /data/users2/nblair7/analysis_results/SCOREST.csv
Loading CSV file...
Loaded 142 participants
Filtered to 89 failed participants to reprocess

--- Processing 54/89: 380 ---
Looking for transcript at: /data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/380_P/380_TRANSCRIPT.csv

In [3]:
import os
import pandas as pd
import requests
import time
import json
import re
from requests.exceptions import Timeout, RequestException
from datetime import datetime

def parse_score_and_explanation(response_text):
    """Extract score and explanation from model response"""
    score_patterns = [
        r'score[:\s]*(\d+)',
        r'(\d+)[/\s]*(?:out of\s*)?5',
        r'(\d+)[/\s]*5',
        r'rating[:\s]*(\d+)',
        r'^(\d+)',
    ]
    
    score = None
    for pattern in score_patterns:
        match = re.search(pattern, response_text, re.IGNORECASE | re.MULTILINE)
        if match:
            potential_score = int(match.group(1))
            if 1 <= potential_score <= 5:
                score = potential_score
                break
    
    return score, response_text.strip()

def make_api_call(prompt, metric_name, participant_id, max_retries=10):
    """Make API call with unlimited retries and extended timeout"""
    print(f"    [{datetime.now().strftime('%H:%M:%S')}] Starting {metric_name} evaluation...", end="", flush=True)
    
    retry_count = 0
    while retry_count < max_retries:
        try:
            request = {
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False,
                "options": {"temperature": 0, "top_k": 20, "top_p": 0.9}
            }
            
            start_time = time.time()
            # Increased timeout to 300 seconds (5 minutes)
            response = requests.post(BASE_URL, json=request, timeout=300)
            elapsed = time.time() - start_time
            
            if response.status_code == 200:
                content = response.json()['message']['content']
                score, explanation = parse_score_and_explanation(content)
                print(f" ✓ Score: {score} ({elapsed:.1f}s)")
                return score, explanation
            else:
                print(f" ⚠ HTTP {response.status_code}, retrying... (attempt {retry_count + 1})")
                
        except Timeout:
            retry_count += 1
            print(f" ⚠ TIMEOUT, retrying... (attempt {retry_count})")
            time.sleep(5)  # Wait before retrying
            continue
        except RequestException as e:
            retry_count += 1
            print(f" ⚠ REQUEST ERROR, retrying... (attempt {retry_count})")
            time.sleep(5)
            continue
        except Exception as e:
            retry_count += 1
            print(f" ⚠ ERROR, retrying... (attempt {retry_count})")
            time.sleep(5)
            continue
        
        retry_count += 1
        time.sleep(2)  # Brief pause between retries
    
    # If all retries failed
    print(f" ✗ Failed after {max_retries} attempts")
    return None, f"Failed after {max_retries} attempts"

# Configuration
OLLAMA_NODE = "arctrddgxa004"
BASE_URL = f"http://{OLLAMA_NODE}:11434/api/chat"
model = "gemma3-optimized:27b"

print(f"[{datetime.now().strftime('%H:%M:%S')}] Testing API connection...")
try:
    test_response = requests.get(f"http://{OLLAMA_NODE}:11434/api/tags", timeout=30)
    if test_response.status_code == 200:
        print("✓ API connection successful")
    else:
        print(f"⚠ API returned status {test_response.status_code}")
except Exception as e:
    print(f"✗ API connection failed: {e}")
    print("Script may hang on API calls!")

# FAILED IDs TO PROCESS
FAILED_IDS = [
    380, 383, 385, 386, 391, 392, 393, 397, 400, 401, 402, 409, 412, 414, 415, 416, 419, 423, 425, 426, 427, 428, 429, 430, 433, 434, 437, 441, 443, 444, 445, 446, 447, 448, 449, 454, 455, 456, 457, 459, 463, 464, 468, 471, 473, 474, 475, 478, 479, 485, 486, 487, 488, 491, 302, 307, 331, 335, 346, 367, 377, 381, 382, 388, 389, 390, 395, 403, 404, 406, 413, 417, 418, 420, 422, 436, 439, 440, 451, 458, 472, 476, 477, 482, 483, 484, 489, 490, 492
]

# Input/Output files
input_csv_path = "/data/users2/nblair7/analysis_results/qual_resultsfin.csv"
feedback_assessments_csv = "/data/users2/nblair7/analysis_results/ASSESSMENT.csv"
feedback_evaluations_csv = "/data/users2/nblair7/analysis_results/SCORES.csv"

print("=== FEEDBACK LOOP RE-EVALUATION (PERSISTENT) ===")
print(f"Processing {len(FAILED_IDS)} failed IDs")
print("Will continue until all scores reach 4 or 5")

# Load data
print(f"[{datetime.now().strftime('%H:%M:%S')}] Loading data...")
df = pd.read_csv(input_csv_path)
df = df[df['participant_id'].astype(str).isin([str(pid) for pid in FAILED_IDS])]
print(f"Found {len(df)} participants to process")

# Check existing files
completed_subjects = set()
if os.path.exists(feedback_assessments_csv):
    existing = pd.read_csv(feedback_assessments_csv)
    completed_subjects.update(existing['participant_id'].tolist())

if os.path.exists(feedback_evaluations_csv):
    existing = pd.read_csv(feedback_evaluations_csv)
    completed_subjects.update(existing['participant_id'].tolist())

if completed_subjects:
    df = df[~df['participant_id'].isin(completed_subjects)]
    print(f"Skipping {len(completed_subjects)} already completed. Remaining: {len(df)}")

assessments = []
scores = []
processed = 0
failed = []

for index, row in df.iterrows():
    participant_id = row['participant_id']
    qualitative_assessment = row['qualitative_assessment']
    
    print(f"\n[{datetime.now().strftime('%H:%M:%S')}] Processing {participant_id} ({processed + 1}/{len(df)})")
    
    # Load transcript
    print("  Loading transcript...", end="", flush=True)
    transcript_path = os.path.join("/data/users4/xli/ai-psychiatrist/datasets/daic_woz_dataset/", f"{participant_id}_P", f"{participant_id}_TRANSCRIPT.csv")
    
    if not os.path.exists(transcript_path):
        print(f" ✗ No transcript found")
        failed.append(participant_id)
        continue
    
    try:
        transcript_data = pd.read_csv(transcript_path, sep='\t')
        dialogue_txt = transcript_data.dropna(subset=['speaker', 'value'])
        dialogue_txt['dialogue'] = dialogue_txt['speaker'] + " : " + dialogue_txt['value']
        transcript = "\n".join(dialogue_txt['dialogue'])
        print(f" ✓ Loaded ({len(transcript)} chars)")
    except Exception as e:
        print(f" ✗ Error: {e}")
        failed.append(participant_id)
        continue
    
    # Evaluation prompts
    prompts = {
        'coherence': f"""Evaluate for COHERENCE (1-5): Is the response logically consistent? 
Score: 5=0 mistakes, 4=1-2 mistakes, 3=3-4 mistakes, 2=5-6 mistakes, 1=7+ mistakes
Mistakes = contradictions within assessment.

Format: Score: [number]\nExplanation: [brief explanation]

Transcript: {transcript[:2000]}...
Assessment: {qualitative_assessment[:1000]}...""",

        'completeness': f"""Evaluate for COMPLETENESS (1-5): Does assessment cover all relevant symptoms, severities, duration/frequency?
Score: 5=0 mistakes, 4=1-2 mistakes, 3=3-4 mistakes, 2=5-6 mistakes, 1=7+ mistakes
Mistakes = missed PHQ-8 symptoms or duration/frequency details.

Format: Score: [number]\nExplanation: [brief explanation]

Transcript: {transcript[:2000]}...
Assessment: {qualitative_assessment[:1000]}...""",

        'specificity': f"""Evaluate for SPECIFICITY (1-5): Is the assessment specific vs vague?
Score: 5=0 mistakes, 4=1-2 mistakes, 3=3-4 mistakes, 2=5-6 mistakes, 1=7+ mistakes
Mistakes = vague statements like "patient seems depressed".

Format: Score: [number]\nExplanation: [brief explanation]

Transcript: {transcript[:2000]}...
Assessment: {qualitative_assessment[:1000]}...""",

        'accuracy': f"""Evaluate for ACCURACY (1-5): Signs/symptoms aligned with DSM-5/PHQ-8?
Score: 5=0 mistakes, 4=1-2 mistakes, 3=3-4 mistakes, 2=5-6 mistakes, 1=7+ mistakes
Mistakes = incorrect symptoms or duration/frequency.

Format: Score: [number]\nExplanation: [brief explanation]

Transcript: {transcript[:2000]}...
Assessment: {qualitative_assessment[:1000]}..."""
    }
    
    # Get initial scores
    print("  Initial evaluation:")
    current_scores = {}
    current_explanations = {}
    
    for metric, prompt in prompts.items():
        score, explanation = make_api_call(prompt, metric, participant_id)
        if score is not None:
            current_scores[metric] = score
            current_explanations[metric] = explanation[:200] + "..." if len(explanation) > 200 else explanation
        else:
            # If API call completely fails, mark as failed participant
            print(f"  ✗ Failed to get initial score for {metric}")
            failed.append(participant_id)
            break
        
        time.sleep(2)  # Brief pause between API calls
    
    # Skip if we couldn't get initial scores
    if participant_id in failed:
        continue
    
    # Check if feedback needed - continue until ALL scores are 4 or 5
    low_scores = [metric for metric, score in current_scores.items() if score and score < 4]
    
    current_assessment = qualitative_assessment
    
    if low_scores:
        print(f"  Scores below 4 detected: {low_scores}")
        print("  Starting persistent feedback loop...")
        iteration = 0
        max_iterations = 50  # Increased limit, but will continue until target reached
        
        while low_scores and iteration < max_iterations:
            iteration += 1
            print(f"    Feedback iteration {iteration}:")
            
            # Build feedback prompt
            feedback_details = []
            for metric in low_scores:
                if metric in current_scores and metric in current_explanations:
                    feedback_details.append(f"- {metric.upper()}: scored {current_scores[metric]}/5. Must reach 4 or 5.")
            
            feedback_prompt = f"""Improve this psychiatric assessment based on feedback:

TRANSCRIPT: {transcript[:1500]}...

CURRENT ASSESSMENT: {current_assessment[:800]}...

FEEDBACK: The assessment MUST be improved in these areas to reach score 4 or 5:
{chr(10).join(feedback_details)}

Requirements for each score:
- Score 4: Maximum 1-2 mistakes/issues
- Score 5: No mistakes/issues

Provide significantly improved assessment in XML format:
<assessment><!-- detailed, comprehensive improved assessment --></assessment>
<social_factors><!-- social influences --></social_factors> 
<biological_factors><!-- biological influences --></biological_factors>
<risk_factors><!-- risk factors --></risk_factors>"""
            
            # Generate improved assessment - keep trying until successful
            improved_assessment = None
            improvement_attempts = 0
            while improved_assessment is None and improvement_attempts < 10:
                improvement_attempts += 1
                score, improved_assessment = make_api_call(feedback_prompt, f"improvement-{improvement_attempts}", participant_id)
                if improved_assessment is None:
                    print(f"    Retrying improvement generation (attempt {improvement_attempts})...")
                    time.sleep(5)
            
            if improved_assessment is None:
                print(f"    ✗ Could not generate improvement after {improvement_attempts} attempts")
                break
            
            current_assessment = improved_assessment
            
            # Re-evaluate with new assessment - only the low scoring metrics
            print(f"    Re-evaluating improved assessment:")
            for metric, base_prompt in prompts.items():
                if metric in low_scores:
                    new_prompt = base_prompt.replace(qualitative_assessment[:1000], current_assessment[:1000])
                    
                    # Keep trying until we get a score
                    eval_attempts = 0
                    score = None
                    while score is None and eval_attempts < 5:
                        eval_attempts += 1
                        score, explanation = make_api_call(new_prompt, f"{metric}-reeval-{eval_attempts}", participant_id)
                        if score is None:
                            time.sleep(3)
                    
                    if score is not None:
                        current_scores[metric] = score
                        current_explanations[metric] = explanation[:200] + "..." if len(explanation) > 200 else explanation
                    else:
                        print(f"    ⚠ Could not re-evaluate {metric} after {eval_attempts} attempts")
                    
                    time.sleep(2)
            
            # Check remaining low scores - must be 4 or 5
            low_scores = [metric for metric, score in current_scores.items() if score and score < 4]
            
            if not low_scores:
                print(f"    ✓ All scores reached 4 or 5!")
                break
            else:
                print(f"    Scores still below 4: {[(m, current_scores[m]) for m in low_scores]}")
        
        if iteration >= max_iterations:
            print(f"  ⚠ Reached maximum iterations ({max_iterations}) without achieving target scores")
        else:
            print(f"  ✓ Feedback completed successfully after {iteration} iterations")
    else:
        print("  ✓ All scores already 4 or 5, no feedback needed")
    
    # Store results
    assessments.append({
        'participant_id': participant_id,
        'dataset_type': 'feedback_improved',
        'qualitative_assessment': current_assessment
    })
    
    scores.append({
        'participant_id': participant_id,
        'coherence': current_scores.get('coherence'),
        'coherence_explanation': current_explanations.get('coherence', ''),
        'completeness': current_scores.get('completeness'),
        'completeness_explanation': current_explanations.get('completeness', ''),
        'specificity': current_scores.get('specificity'),
        'specificity_explanation': current_explanations.get('specificity', ''),
        'accuracy': current_scores.get('accuracy'),
        'accuracy_explanation': current_explanations.get('accuracy', '')
    })
    
    processed += 1
    
    # Save progress every participant (more frequent saves for long-running process)
    try:
        pd.DataFrame(assessments).to_csv(feedback_assessments_csv, index=False)
        pd.DataFrame(scores).to_csv(feedback_evaluations_csv, index=False)
        print(f"  ✓ Progress saved: {processed} completed")
    except Exception as e:
        print(f"  ⚠ Save failed: {e}")

# Final save
try:
    if assessments:
        pd.DataFrame(assessments).to_csv(feedback_assessments_csv, index=False)
        print(f"✓ Final assessments saved: {len(assessments)} records")

    if scores:
        pd.DataFrame(scores).to_csv(feedback_evaluations_csv, index=False)
        print(f"✓ Final scores saved: {len(scores)} records")
except Exception as e:
    print(f"✗ Final save failed: {e}")

print(f"\n=== SUMMARY ===")
print(f"Completed: {processed} participants")
print(f"Failed: {len(failed)} participants")
if failed:
    print(f"Failed IDs: {failed}")
print(f"Finished at: {datetime.now().strftime('%H:%M:%S')}")

[14:24:52] Testing API connection...
✓ API connection successful
=== FEEDBACK LOOP RE-EVALUATION (PERSISTENT) ===
Processing 89 failed IDs
Will continue until all scores reach 4 or 5
[14:24:52] Loading data...
Found 89 participants to process
Skipping 3 already completed. Remaining: 86

[14:24:53] Processing 386 (1/86)
  Loading transcript... ✓ Loaded (17714 chars)
  Initial evaluation:
    [14:24:53] Starting coherence evaluation... ✓ Score: 5 (61.2s)
    [14:25:56] Starting completeness evaluation... ✓ Score: 1 (29.9s)
    [14:26:28] Starting specificity evaluation... ✓ Score: 2 (30.1s)
    [14:27:00] Starting accuracy evaluation... ✓ Score: 1 (34.7s)
  Scores below 4 detected: ['completeness', 'specificity', 'accuracy']
  Starting persistent feedback loop...
    Feedback iteration 1:
    [14:27:36] Starting improvement-1 evaluation... ✓ Score: 4 (137.2s)
    Re-evaluating improved assessment:
    [14:29:54] Starting completeness-reeval-1 evaluation... ✓ Score: 1 (30.4s)
    [14:30:2

KeyboardInterrupt: 