In [None]:
import pandas as pd
import anthropic
import ast
import json
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle

# === Config ===
# Leave CLAUDE_API_KEY empty to use the ANTHROPIC_API_KEY environment variable.
# Passing the empty string straight through would make the client send an
# empty key, so it is converted to None, which lets the SDK do its own
# environment lookup.
CLAUDE_API_KEY = ""
client = anthropic.Anthropic(api_key=CLAUDE_API_KEY or None)
# Sentence-embedding model used to vectorize the generated answers.
embedder = SentenceTransformer('all-roberta-large-v1')

# === Load Excel file ===
# NOTE(review): excel_path is not referenced anywhere in the visible code;
# kept in case another cell relies on it.
excel_path = "your_excel_path.xlsx"

# === Load Questions Data ===
# The code below reads the columns doc_ID, chunk_ID, chunk and questions.
df = pd.read_csv("C:/Users/theya/Downloads/data_Claude_questions_100(Sheet1).csv", encoding="latin1")

# Accumulates one dict per (question, answer) pair produced by the main loop.
results = []

def minimal_answer_prompt(question, chunk):
    """Build a one-sentence, text-only QA prompt for ``question`` over ``chunk``.

    The prompt tells the model to answer from ``chunk`` alone and to reply
    ``"Not mentioned in the text."`` when the answer is absent.

    NOTE: a second function with this same name is defined later in this
    file; once the module has run top to bottom, that later definition is
    the one bound to ``minimal_answer_prompt``.

    Args:
        question: The question to ask; interpolated after ``Question:``.
        chunk: The source text; interpolated between triple-quote fences.

    Returns:
        The complete prompt as a single string ending in ``Answer:``.
    """
    fence = '"' * 3
    prompt_lines = [
        "You are a biomedical assistant.",
        "",
        # The trailing space on the next line is part of the prompt text.
        "Use only the following text to answer the question. ",
        "Do not use outside knowledge or make assumptions.",
        'If the answer is not clearly mentioned, say: "Not mentioned in the text."',
        "",
        "TEXT:",
        fence,
        f"{chunk}",
        fence,
        "",
        f"Question: {question}",
        "Give your answer in one simple sentence, no bullet points, no quotes, no newlines.",
        "Answer:",
    ]
    return "\n".join(prompt_lines)

def strict_answer_prompt(question, chunk):
    """Build a single-paragraph-instruction QA prompt for ``question`` over ``chunk``.

    All instructions are packed into the first line of the prompt; the model
    is told to answer only from ``chunk`` and to say
    ``"Not mentioned in the text."`` when the answer is not clearly stated.

    Args:
        question: The question to ask; interpolated after ``Question:``.
        chunk: The source text; interpolated between triple-quote fences.

    Returns:
        The complete prompt as a single string ending in ``Answer:``.
    """
    triple_quote = '"""'
    instructions = (
        "You are a biomedical assistant. "
        "Answer the question only using the information in the text below. "
        "Do not make assumptions or use outside knowledge. "
        'If the answer is not clearly stated, say "Not mentioned in the text."'
    )
    style_rule = (
        "Give your answer in simple sentence, text style, "
        "no bullet points, no quotes, no newlines."
    )
    return (
        f"{instructions}\n"
        "\n"
        "TEXT:\n"
        f"{triple_quote}\n"
        f"{chunk}\n"
        f"{triple_quote}\n"
        "\n"
        f"Question: {question}\n"
        f"{style_rule}\n"
        "Answer:"
    )

def minimal_answer_prompt(question, chunk):
    """Build the QA prompt for ``question`` over ``chunk``.

    This definition rebinds ``minimal_answer_prompt``, replacing the earlier
    function of the same name in this file. The prompt asks for an answer
    based only on ``chunk``, with no formatting or extra explanation, and for
    the reply ``Not mentioned in the text.`` when the answer is not clearly
    stated.

    Args:
        question: The question to ask; interpolated after ``Question:``.
        chunk: The source text; interpolated between triple-quote fences.

    Returns:
        The complete prompt as a single string ending in ``Answer:``.
    """
    # Single-quoted triple string so the literal """ fences need no escaping.
    template = '''You are a biomedical assistant. Answer the question using only the information in the following text. Do not use outside knowledge. Do not add any explanations or formatting.
If the answer is not clearly stated, reply: Not mentioned in the text.

TEXT:
"""
{chunk}
"""

Question: {question}
Give your answer in simple sentence or sentences. No bullet points. No quotes. No newlines. No extra explanation, just stick to the question.

Answer:'''
    return template.format(question=question, chunk=chunk)


# === Claude QA function ===
def answer_with_claude(question, context):
    """Ask Claude to answer ``question`` using ``context`` as the only source.

    The prompt comes from ``minimal_answer_prompt`` and is sent as a single
    user message to ``claude-3-haiku-20240307`` with ``temperature=0`` and a
    300-token cap.

    Args:
        question: The question text.
        context: The text the answer must be drawn from.

    Returns:
        The stripped text of the first content block of the response, or the
        literal string ``"Claude error."`` if the request or the response
        handling raises any ``Exception`` (the error is printed, not raised).
    """
    user_message = {
        "role": "user",
        "content": minimal_answer_prompt(question, context),
    }
    try:
        reply = client.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=300,
            temperature=0,
            messages=[user_message],
        )
        first_block = reply.content[0]
        return first_block.text.strip()
    except Exception as err:
        # Best-effort: one failed request must not abort the whole batch.
        print(f"❌ Error from Claude: {err}")
        return "Claude error."


def _parse_questions(raw):
    """Turn one raw ``questions`` cell into a list of non-empty question strings.

    Args:
        raw: The cell value. A string starting with ``[`` is parsed as a
            Python list literal; any other value is stringified and split on
            newlines. ``None`` and NaN (a missing cell) yield no questions.

    Returns:
        A list of stripped, non-empty question strings. If a ``[``-prefixed
        string is not a valid literal, the whole cell is returned as a single
        question.
    """
    # NaN is the only float that differs from itself; without this check a
    # missing cell would be stringified to "nan" and sent as a question.
    if raw is None or (isinstance(raw, float) and raw != raw):
        return []

    text = raw if isinstance(raw, str) else str(raw)
    if text.startswith("["):
        try:
            candidates = ast.literal_eval(text)
        # The exception types ast.literal_eval is documented to raise on
        # malformed input; a bare except would also trap KeyboardInterrupt.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            candidates = [text]
    else:
        candidates = text.split("\n")

    # str() guards against non-string items inside a parsed list literal.
    stripped = (str(item).strip() for item in candidates)
    return [q for q in stripped if q]


#df = df.head(5)  # uncomment to try the pipeline on the first 5 rows only
# === Main loop ===
for idx, row in df.iterrows():
    doc_id = row["doc_ID"]
    chunk_id = row["chunk_ID"]
    chunk = str(row["chunk"])

    questions = _parse_questions(row["questions"])

    for q in questions:
        answer = answer_with_claude(q, chunk)
        results.append({
            "question": q,
            "answer": answer,
            # NOTE(review): pubmed_ID is filled from doc_ID, as in the
            # original code — confirm the two are meant to be identical.
            "pubmed_ID": doc_id,
            "doc_ID": doc_id,
            "chunk_ID": chunk_id
        })

# === Save to JSON ===
with open("C:/Users/theya/Downloads/qa_ground_truth.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(results, f, indent=2, ensure_ascii=False)
print("✅ Saved answers to 'qa_ground_truth.json'")

# === Vectorize answers and save index ===
# NOTE(review): answers equal to "Claude error." (failed requests) are
# embedded and indexed like real answers — confirm that this is intended.
answers = [r["answer"] for r in results]
if not answers:
    # With nothing to encode there is no embedding dimension to read from
    # shape[1]; skip indexing instead of failing after the JSON was written.
    print("⚠️ No answers to index; skipping vector index and metadata.")
else:
    answer_embeddings = embedder.encode(answers, convert_to_numpy=True)
    # FAISS works on contiguous float32 matrices; make that explicit.
    answer_embeddings = np.ascontiguousarray(answer_embeddings, dtype="float32")
    index = faiss.IndexFlatL2(answer_embeddings.shape[1])
    index.add(answer_embeddings)

    # Save FAISS index and metadata
    # Row i of the index corresponds to results[i] in the pickled metadata.
    faiss.write_index(index, "C:/Users/theya/Downloads/qa_ground_truth_answer_index.faiss")
    with open("C:/Users/theya/Downloads/qa_ground_truth_metadata.pkl", "wb") as f:
        pickle.dump(results, f)

    print("✅ Vector index and metadata saved.")
