In [None]:
# Install necessary packages (if needed)
!pip install openai python-dotenv pandas

# Imports and API setup
import openai
import pandas as pd
import json
import os
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Confirm successful setup
print("✅ Setup Completed Successfully!")


In [None]:
validation_data = []
with open("/workspace/data/embeddings/questions/qa_finetune.jsonl", "r") as f:
    for line in f:
        validation_data.append(json.loads(line))

df_validation = pd.DataFrame(validation_data)
print("✅ Loaded Validation Data:")
df_validation.head()



In [None]:
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def query_finetuned_model(question):
    response = client.chat.completions.create(
        model="ft:gpt-3.5-turbo-0125:fmai::BNlDwJo3",
        messages=[
            {"role": "system", "content": "You are an expert financial assistant."},
            {"role": "user", "content": question}
        ],
        temperature=0.0
    )
    return response.choices[0].message.content.strip()

# Quick test run
test_question = df_validation.iloc[0]['text']
test_response = query_finetuned_model(test_question)

print(f"✅ Test Question:\n{test_question}\n\n✅ Model Response:\n{test_response}")


In [None]:
# Perform inference on full dataset and store results
validation_results = []

for idx, row in df_validation.iterrows():
    question = row["text"]
    gpt4_answer = row.get("ai_answer", "N/A")  # Safely get GPT-4 answer if available
    finetuned_answer = query_finetuned_model(question)
    
    validation_results.append({
        "question": question,
        "gpt4_answer": gpt4_answer,
        "finetuned_answer": finetuned_answer
    })
    print(f"Processed question {idx+1}/{len(df_validation)}")

df_results = pd.DataFrame(validation_results)

# Preview results
df_results.head()


In [None]:
# Save clearly for detailed analysis
df_results.to_csv("/workspace/data/embeddings/questions/finetune_validation_results.csv", index=False)
print("✅ Results saved explicitly at: '/workspace/data/embeddings/questions/finetune_validation_results.csv'")


In [None]:
import json
import pandas as pd

# Correct file path (adjust if necessary)
validation_path = "/workspace/data/embeddings/questions/qa_finetune.jsonl"

# Load correctly structured validation data
validation_data = []
with open(validation_path, "r") as f:
    for line in f:
        validation_data.append(json.loads(line))

df_validation = pd.DataFrame(validation_data)

# Confirm explicitly correct data structure
print("✅ Properly loaded validation data:")
df_validation.head()


In [None]:
import re

# Function to explicitly extract question and answer
def parse_text(text):
    question_match = re.search(r"### Question\n(.*?)\n\n### Answer", text, re.DOTALL)
    answer_match = re.search(r"### Answer\n(.*)", text, re.DOTALL)

    question = question_match.group(1).strip() if question_match else None
    answer = answer_match.group(1).strip() if answer_match else None

    return question, answer

# Clearly apply parsing function to dataframe
df_validation[['question', 'gpt4_answer']] = df_validation.apply(
    lambda row: pd.Series(parse_text(row['text'])), axis=1)

# Clearly preview structured dataframe
print("✅ Parsed validation dataframe explicitly:")
df_validation[['question', 'gpt4_answer']].head()


In [None]:
# Perform inference clearly with corrected dataframe
validation_results = []

for idx, row in df_validation.iterrows():
    question = row["question"]
    gpt4_answer = row["gpt4_answer"]
    finetuned_answer = query_finetuned_model(question)
    
    validation_results.append({
        "question": question,
        "gpt4_answer": gpt4_answer,
        "finetuned_answer": finetuned_answer
    })
    print(f"Processed question {idx+1}/{len(df_validation)}")

# Clearly structured corrected results DataFrame
df_results_corrected = pd.DataFrame(validation_results)

# Preview explicitly corrected results
df_results_corrected.head()


In [None]:
import pandas as pd

# Explicitly load corrected results
df_results = pd.read_csv("/workspace/data/embeddings/questions/finetune_validation_results_corrected.csv")

# Preview clearly
df_results.head()


In [None]:
!pip install matplotlib


In [None]:
import matplotlib.pyplot as plt

# Calculate explicitly length of answers as quick proxy for depth/detail
df_results["gpt4_length"] = df_results["gpt4_answer"].apply(len)
df_results["finetuned_length"] = df_results["finetuned_answer"].apply(len)

# Explicit visual comparison (bar plot)
df_results[["gpt4_length", "finetuned_length"]].plot(
    kind='bar', figsize=(14,7), alpha=0.75)

plt.title('Response Length Comparison (GPT-4 vs. Fine-Tuned GPT-3.5)', fontsize=16)
plt.xlabel('Question Index', fontsize=14)
plt.ylabel('Length of Response', fontsize=14)
plt.legend(['GPT-4', 'Fine-Tuned GPT-3.5'])
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


In [None]:
# Explicitly print comparative review clearly for manual analysis
for idx, row in df_results.iterrows():
    print(f"📌 Q{idx+1}: {row['question']}\n")
    print(f"🟢 GPT-4 Answer (Reference):\n{row['gpt4_answer']}\n")
    print(f"🔵 Fine-Tuned GPT-3.5 Answer:\n{row['finetuned_answer']}\n")
    print("-"*80, "\n")


In [None]:
# Example structure for notes clearly documenting insights
evaluation_notes = [
    {'question_index': 1, 'issue': 'Less detailed answer'},
    {'question_index': 7, 'issue': 'Incorrect technical explanation'},
    # Add more explicitly after your review
]

df_notes = pd.DataFrame(evaluation_notes)
df_notes.to_csv("/workspace/data/embeddings/questions/fine_tune_evaluation_notes.csv", index=False)

print("✅ Evaluation notes clearly documented and saved.")


In [None]:
# Show first 20 flagged rows in plain text
cols = ['question', 'gpt4_length', 'finetuned_length', 'len_ratio']
print(flagged[cols].head(20).to_string(index=False))

# Optional: write the full flagged set to a quick‑view file
flagged.to_csv("/workspace/flagged_len_candidates.csv", index=False)
print("\n✅  Saved all flagged rows to /workspace/flagged_len_candidates.csv")


In [1]:
import pandas as pd

# 1. Load your corrected inference results
df = pd.read_csv("/workspace/data/embeddings/questions/finetune_validation_results_corrected.csv")

# 2. Re‑create the length‑based flags
df['gpt4_length'] = df['gpt4_answer'].str.len()
df['finetuned_length'] = df['finetuned_answer'].str.len()
df['len_ratio'] = df['finetuned_length'] / df['gpt4_length'].replace(0,1)
df['flag_len'] = (df['len_ratio'] < 0.60) | (df['finetuned_length'] < 150)

# 3. Keep only the flagged rows and reset their index to 0…N
flagged = df[df['flag_len']].copy().reset_index(drop=True)

# 4. Add an empty 'comment' column if it doesn’t exist
flagged['comment'] = ""



In [2]:
from IPython.display import display
cols = ['question', 'finetuned_answer', 'gpt4_length', 'finetuned_length', 'len_ratio', 'comment']
display(flagged.iloc[0:20][cols])


Unnamed: 0,question,finetuned_answer,gpt4_length,finetuned_length,len_ratio,comment
0,Analyze how policy limits in insurance contrac...,Policy limits mitigate adverse selection by cr...,3171,943,0.297383,
1,Develop a mathematical model demonstrating how...,The adverse selection model in insurance marke...,3101,962,0.310223,
2,Compare and contrast adverse selection challen...,Private insurance companies and public pension...,3868,1139,0.294467,
3,Evaluate how the increasing availability of ge...,The availability of genetic testing and predic...,3117,1108,0.35547,
4,Critically assess the effectiveness of policy ...,Policy limits effectively align interests by c...,2988,564,0.188755,
5,Develop a comprehensive framework for quantify...,The framework for quantifying adverse selectio...,4265,1425,0.334115,
6,Analyze how the structure of policy limits in ...,Optimal policy limit structures balance social...,3379,814,0.2409,
7,Compare the theoretical predictions of adverse...,Theoretical adverse selection models predict: ...,3112,1133,0.364075,
8,Analyze how trading halts function as market c...,Trading halts serve as circuit breakers by tem...,3224,1018,0.315757,
9,Analyze how trading halts function as market c...,Trading halts serve as circuit breakers by tem...,3224,994,0.308313,


In [3]:
comments_0_19 = [
    "Missing policy comparison details, lacks practical examples",
    "No mathematical formulation shown, missing market failure conditions",
    "Insufficient institutional structure analysis, lacks specific examples",
    "Lacks specific regulatory proposals, missing ethical framework details",
    "Too brief overall, missing scenario breakdown, lacks alignment mechanism details",
    "Missing quantification methodology, no practical implementation examples",
    "Insufficient optimization criteria, no social welfare measurement metrics",
    "Missing specific market comparisons, lacks empirical evidence details",
    "Lacks specific circuit breaker examples, missing effectiveness metrics",
    "Duplicate of above, similar deficiencies",
    "Missing auction theory formalization, needs more market information examples",
    "Despite longer length, missing specific volume profile indicators",
    "Insufficient contrast of volume patterns, lacks practical identification criteria",
    "Missing timeframe-specific analysis, insufficient transition indicators",
    "Lacks comparative methodology assessment, missing practical application",
    "Extremely brief, missing Gueant's core theoretical framework",
    "Duplicate of above, same deficiencies",
    "Critically brief, missing mathematical relationship explanation",
    "Duplicate of above, same deficiencies",
    "Too brief, missing statistical methodology comparison"
]


In [4]:
flagged.loc[0:19, 'comment'] = comments_0_19


In [5]:
flagged.to_csv("/workspace/fine_tune_evaluation_notes.csv", index=False)
print("✅  Saved 20 annotated rows!")



✅  Saved 20 annotated rows!


In [6]:
# 1️⃣  Reload the file you just saved
flagged_check = pd.read_csv("/workspace/fine_tune_evaluation_notes.csv")

# 2️⃣  Count rows with non‑empty comments
num_commented = (flagged_check['comment'].str.len() > 0).sum()
total_flagged = len(flagged_check)
print(f"✅  {num_commented}/{total_flagged} flagged rows now have comments.")

# 3️⃣  Show first 5 commented rows to confirm they match your entries
display(flagged_check[flagged_check['comment'].str.len() > 0].head())


✅  20/90 flagged rows now have comments.


Unnamed: 0,question,gpt4_answer,finetuned_answer,gpt4_length,finetuned_length,len_ratio,flag_len,comment
0,Analyze how policy limits in insurance contrac...,Policy limits in insurance contracts are a cru...,Policy limits mitigate adverse selection by cr...,3171,943,0.297383,True,"Missing policy comparison details, lacks pract..."
1,Develop a mathematical model demonstrating how...,Adverse selection in insurance markets occurs ...,The adverse selection model in insurance marke...,3101,962,0.310223,True,"No mathematical formulation shown, missing mar..."
2,Compare and contrast adverse selection challen...,Adverse selection is a significant challenge i...,Private insurance companies and public pension...,3868,1139,0.294467,True,"Insufficient institutional structure analysis,..."
3,Evaluate how the increasing availability of ge...,The increasing availability of genetic testing...,The availability of genetic testing and predic...,3117,1108,0.35547,True,"Lacks specific regulatory proposals, missing e..."
4,Critically assess the effectiveness of policy ...,Policy limits in insurance contracts are desig...,Policy limits effectively align interests by c...,2988,564,0.188755,True,"Too brief overall, missing scenario breakdown,..."


In [7]:
# --- Display rows 20–39 (0‑based index) ---
start, end = 20, 40          # slice is [start:end)
cols = ['question', 'gpt4_length', 'finetuned_length', 'len_ratio']

next_batch = flagged.iloc[start:end]
display(next_batch[cols])     # or print(next_batch[cols].to_markdown())


Unnamed: 0,question,gpt4_length,finetuned_length,len_ratio
20,What statistical methods would be most appropr...,2394,176,0.073517
21,How might the shape of the execution probabili...,2953,233,0.078903
22,How might the shape of the execution probabili...,2953,233,0.078903
23,Compare the theoretical predictions of how bid...,3338,572,0.17136
24,Evaluate the impact of different regulatory fr...,3123,694,0.222222
25,Derive the optimal order placement strategy fo...,3269,968,0.296115
26,Formulate and solve the optimization problem f...,3927,741,0.188694
27,How do price impact models account for the fee...,2883,514,0.178287
28,Critically evaluate the assumptions of homogen...,3189,732,0.229539
29,Analyze the execution priority rules governing...,2767,1071,0.387062


In [8]:
# Your second‑batch comments (rows 20‑39)
comments_20_39 = [
    "Too brief, missing statistical methodology comparison",
    "Insufficient execution probability modeling, lacks graphical representation",
    "Duplicate of above, same deficiencies",
    "Missing theoretical framework, insufficient bid-ask spread analysis",
    "Lacks comprehensive regulatory analysis, missing comparative examples",
    "Incomplete strategy derivation, missing mathematical formulation",
    "Severely abbreviated optimization problem, missing solution steps",
    "Insufficient fee impact analysis, missing model comparison",
    "Incomplete critical evaluation, missing empirical counterexamples",
    "Despite better length ratio, lacks execution priority rule specifics",
    "Missing mathematical formalization, incomplete optimization criteria",
    "Lacks complete implementation analysis, missing market impact details",
    "Despite longer length, missing specific iceberg order characteristics",
    "Insufficient microstructure comparison, lacks detailed market mechanics",
    "Missing game theory formalization, lacks strategic interaction details",
    "Incomplete statistical methodology, missing detection algorithm details",
    "Extremely brief, missing Gueant's theoretical framework",
    "Duplicate of above, same deficiencies",
    "Critically brief, missing execution probability relationship explanation",
    "Duplicate of above, same deficiencies"
]

# Assign them (flagged index is 0…N after reset_index)
flagged.loc[20:39, 'comment'] = comments_20_39


In [9]:
flagged.to_csv("/workspace/fine_tune_evaluation_notes.csv", index=False)
print("✅  Added comments for rows 20–39 and saved.")


✅  Added comments for rows 20–39 and saved.


In [10]:
flagged_check = pd.read_csv("/workspace/fine_tune_evaluation_notes.csv")
num_commented = (flagged_check['comment'].str.len() > 0).sum()
total_flagged = len(flagged_check)
print(f"Now {num_commented}/{total_flagged} flagged rows have comments.")

# Show a couple of the just‑updated rows to confirm
from IPython.display import display
display(flagged_check.iloc[20:25][['question', 'comment']])


Now 40/90 flagged rows have comments.


Unnamed: 0,question,comment
20,What statistical methods would be most appropr...,"Too brief, missing statistical methodology com..."
21,How might the shape of the execution probabili...,"Insufficient execution probability modeling, l..."
22,How might the shape of the execution probabili...,"Duplicate of above, same deficiencies"
23,Compare the theoretical predictions of how bid...,"Missing theoretical framework, insufficient bi..."
24,Evaluate the impact of different regulatory fr...,"Lacks comprehensive regulatory analysis, missi..."


In [11]:
from IPython.display import display
cols = ['question', 'finetuned_answer', 'gpt4_length', 'finetuned_length', 'len_ratio', 'comment']
display(flagged.iloc[40:60][cols])

Unnamed: 0,question,finetuned_answer,gpt4_length,finetuned_length,len_ratio,comment
40,What statistical methods would be most appropr...,"Survival analysis, logistic regression with pr...",2343,176,0.075117,
41,What statistical methods would be most appropr...,"Survival analysis, logistic regression with pr...",2343,176,0.075117,
42,How might the shape of the execution probabili...,The execution probability curve reveals inform...,2784,233,0.083693,
43,How might the shape of the execution probabili...,The execution probability curve reveals inform...,2784,233,0.083693,
44,Compare and contrast the methodological approa...,"In continuous double auction markets, executio...",3893,739,0.189828,
45,What are the implications of time-varying exec...,Time-varying execution probabilities necessita...,2739,367,0.133991,
46,Evaluate the relationship between liquidity fr...,Liquidity fragmentation and shadow-spread wide...,2570,1331,0.517899,
47,Design a comprehensive statistical methodology...,The methodology integrates three key component...,3861,999,0.258741,
48,Derive the mathematical relationship between s...,The OLS estimate of market impact (MI) in the ...,2600,1192,0.458462,
49,Analyze how institutional quantitative traders...,Institutional quantitative traders should modi...,2485,763,0.307042,


In [12]:
# Your third‑batch comments (rows 40‑59)
comments_40_59 = [
    "Too brief, missing comparative analysis of statistical methods, no implementation details",
    "Duplicate of above, same deficiencies",
    "Insufficient curve characterization, lacks mathematical framework",
    "Duplicate of above, same deficiencies",
    "Missing detailed methodological comparison, lacks empirical examples",
    "Missing temporal analysis framework, insufficient adaptation strategies",
    "Despite good ratio, lacks theoretical foundation for fragmentation-spread relationship",
    "Insufficient methodology components, missing validation approach",
    "Incomplete mathematical derivation, missing assumption analysis",
    "Missing specific adaptation techniques, lacks implementation details",
    "Missing mathematical formalization, incomplete venue interaction modeling",
    "Despite good ratio, lacks specific regulatory mechanism assessment",
    "Missing quantification methodology details, insufficient impact metrics",
    "Insufficient historical context, missing model evolution details",
    "Incomplete modeling comparison, lacks mathematical formulations",
    "Missing mathematical framework details, insufficient resilience metrics",
    "Lacks specific empirical challenges, missing solution methodologies",
    "Insufficient theoretical justification, missing mathematical proof",
    "Despite good ratio, lacks optimal execution algorithm details",
    "Missing game theory formalization, insufficient strategic equilibrium analysis"
]

# Assign them (flagged index is 0…N after reset_index)
flagged.loc[40:59, 'comment'] = comments_40_59

# Save progress
flagged.to_csv("/workspace/fine_tune_evaluation_notes.csv", index=False)
print("✅  Saved comments for rows 40–59!")



✅  Saved comments for rows 40–59!


In [13]:
flagged_check = pd.read_csv("/workspace/fine_tune_evaluation_notes.csv")
num_commented = (flagged_check['comment'].str.len() > 0).sum()
total_flagged = len(flagged_check)
print(f"Now {num_commented}/{total_flagged} flagged rows have comments.")

# Show a couple of the just‑updated rows to confirm
from IPython.display import display
display(flagged_check.iloc[40:45][['question', 'comment']])

Now 60/90 flagged rows have comments.


Unnamed: 0,question,comment
40,What statistical methods would be most appropr...,"Too brief, missing comparative analysis of sta..."
41,What statistical methods would be most appropr...,"Duplicate of above, same deficiencies"
42,How might the shape of the execution probabili...,"Insufficient curve characterization, lacks mat..."
43,How might the shape of the execution probabili...,"Duplicate of above, same deficiencies"
44,Compare and contrast the methodological approa...,"Missing detailed methodological comparison, la..."


In [None]:
from IPython.display import display
cols = ['question', 'finetuned_answer', 'gpt4_length', 'finetuned_length', 'len_ratio', 'comment']
display(flagged.iloc[60:80][cols])

In [17]:
# Your fourth‑batch comments (rows 60‑79)
comments_60_79 = [
    "Missing theoretical unification details, insufficient decomposition explanation",
    "Despite good ratio, lacks precise mechanism analysis",
    "Missing analytical methods, insufficient implementation details",
    "Lacks comparative analysis across specific asset classes",
    "Despite decent ratio, missing mathematical formulations",
    "Despite good ratio, lacks algorithm optimization criteria",
    "Despite good ratio, missing specific regulatory evaluation metrics",
    "Missing stress condition analysis, insufficient empirical support",
    "Missing specific circuit breaker mechanisms, lacks effectiveness metrics",
    "Duplicate of above, similar deficiencies",
    "Missing quantitative formulation, insufficient evaluation metrics",
    "Missing impact quantification, lacks specific algorithmic adaptations",
    "Despite good ratio, missing depth-volatility mathematical relationship",
    "Despite good ratio, missing specific regulatory framework evaluation",
    "Missing strategy backtesting results, insufficient optimization details",
    "Missing specific market-making adaptations, lacks empirical evidence",
    "Missing theoretical foundation, insufficient parameter calibration",
    "Missing specific technological innovations, lacks market impact analysis",
    "Despite good ratio, missing mathematical integration method",
    "Despite good ratio, lacks detailed regulatory implementation analysis"
]

# Assign them (flagged index is 0…N after reset_index)
flagged.loc[60:79, 'comment'] = comments_60_79

# Save progress
flagged.to_csv("/workspace/fine_tune_evaluation_notes.csv", index=False)
print("✅  Saved comments for rows 60–79!")

# Optional quick verification
done = (flagged['comment'].str.len() > 0).sum()
print(f"Progress: {done}/{len(flagged)} flagged rows now annotated.")


✅  Saved comments for rows 60–79!
Progress: 80/90 flagged rows now annotated.


In [20]:
from IPython.display import display
cols = ['question', 'finetuned_answer', 'gpt4_length', 'finetuned_length', 'len_ratio', 'comment']
display(flagged.iloc[80:90][cols])


Unnamed: 0,question,finetuned_answer,gpt4_length,finetuned_length,len_ratio,comment
80,Compare and contrast the methodological approa...,The methodological approaches in the cited wor...,3997,1720,0.430323,
81,Develop a research methodology to empirically ...,The research methodology integrates the volume...,3328,1038,0.311899,
82,Design a comprehensive high-frequency trading ...,The comprehensive strategy integrates volume c...,4373,1461,0.334096,
83,Given a stock with an expected VWAP of $25.50 ...,The 99% confidence interval for VWAP is calcul...,972,451,0.463992,
84,Analyze the relationship between confidence in...,The confidence interval width directly determi...,3004,561,0.186751,
85,"From the given VWAP confidence intervals, deri...",The standard deviation of VWAP is given by σ(V...,1759,286,0.162592,
86,Evaluate the statistical assumptions underlyin...,The VWAP confidence interval calculations assu...,2735,959,0.35064,
87,Compare and contrast the implications of the 6...,"The 67%, 90%, and 95% VWAP confidence interval...",3392,1398,0.412146,
88,Formulate a mathematical model for dynamic VWA...,The dynamic VWAP confidence interval model is:...,2593,978,0.377169,
89,Develop a comprehensive VWAP performance evalu...,A comprehensive VWAP performance evaluation fr...,3400,1734,0.51,


In [21]:
# === FINAL‑BATCH COMMENTS + SUMMARY NOTE ====================================

import pandas as pd, textwrap, os, pathlib

# Load the in‑progress annotations
notes_path = "/workspace/fine_tune_evaluation_notes.csv"
flagged = pd.read_csv(notes_path)

# ---------------- 1️⃣  Add comments for rows 80‑89 --------------------------
comments_80_89 = [
    "Despite good ratio, lacks comparative methodology analysis, missing specific approaches",
    "Insufficient methodology details, lacks empirical validation criteria",
    "Missing strategy backtest results, insufficient risk management components",
    "Missing confidence interval calculation steps, lacks statistical justification",
    "Critically brief, missing relationship explanation, no statistical framework",
    "Missing derivation steps, insufficient statistical foundation",
    "Missing key assumptions, lacks critical evaluation of limitations",
    "Despite good ratio, lacks specific trading implications for each interval",
    "Missing model parameters explanation, insufficient dynamic variable integration",
    "Despite good ratio, missing specific evaluation metrics, lacks implementation guide"
]

flagged.loc[80:89, 'comment'] = comments_80_89

# Save the fully‑annotated CSV
flagged.to_csv(notes_path, index=False)
print("✅  All flagged rows now annotated and saved!")

# ---------------- 2️⃣  Persist your overall diagnostic summary --------------
summary_text = textwrap.dedent("""
    ## Overall Fine‑Tune Diagnostic (2025‑04‑19)

    Observed recurring deficiencies in fine‑tuned GPT‑3.5 answers:

    1. Mathematical formulations frequently missing or incomplete
    2. Theoretical frameworks lack depth and rigor
    3. Practical implementation details often omitted
    4. Empirical evidence and concrete examples insufficient
    5. Comparative analyses lack thoroughness and specificity
    6. Validation methodologies under‑detailed
    7. Length ratio especially poor on technical topics
    8. Even when longer, key conceptual details still absent

    => Action items:
       • Augment dataset with ~200 Q&A focused on math rigor, examples, empirical studies
       • Include chain‑of‑thought exemplars and explicit formula derivations
       • Strengthen system prompt: “Provide mathematical derivations, empirical citations…”
""").strip()

summary_path = "/workspace/fine_tune_overall_analysis.txt"
pathlib.Path(summary_path).write_text(summary_text)
print(f"📄  Diagnostic summary saved to {summary_path}")


✅  All flagged rows now annotated and saved!
📄  Diagnostic summary saved to /workspace/fine_tune_overall_analysis.txt
