In [None]:
import sys
from pathlib import Path
# Make the project's ../src directory importable; the relative entry is resolved
# against the kernel's working directory.
sys.path.append(str(Path("..") / "src"))

# Retrieval logic (similarity search over the gold-standard sentences)
from similarity_search import search_sentences
# Extract the text from the PDF
from extraction import extract_sangram
# NOTE(review): this path has no "../" prefix, while the sentences file read in the
# next cell uses "../outputs/..." - confirm which directory extract_pdf resolves it against.
extract_sangram.extract_pdf("data/raw_pdfs/sample.pdf")

In [None]:
# Retrieval step: search_sentences does the query embedding and the search in one call.

# Load the extracted sentences. (This loading/prefixing could live in
# similarity_search.py, next to the model loading.)
input_path = Path("../outputs/processed_text/sample.sentences.txt")
# Skip blank lines and add the "passage: " prefix, the format the model expects.
sentences = [
    f"passage: {text}"
    for text in map(str.strip, input_path.read_text(encoding="utf-8").splitlines())
    if text
]

# Run the retrieval logic; k=20 is passed explicitly (the function's default is reportedly 30).
results = search_sentences(sentences, k=20)

In [None]:
# For storing the Gold_Standard text in the database - do not uncomment this
#import store_embedding


# Minimum similarity score a gold-standard match must reach to be kept.
# Tuning note: 0.80 was tried first and let through many false matches, so the
# cut-off is 0.85 (which may miss a few true samples). Re-tune if the data changes.
threshold = 0.85
# Same queries as `results`, each keeping only the (match, score) pairs whose score
# is at or above the threshold; a query with no surviving pair maps to an empty list.
filtered_results = {
    q: [(m, s) for (m, s) in matches if s >= threshold]
    for q, matches in results.items()
}

In [None]:
# Count the extracted query sentences that kept at least one gold-standard match
# after thresholding (an empty match list is falsy, so it contributes 0).
matched_sentence = sum(map(bool, filtered_results.values()))
print(matched_sentence)

In [None]:
# Inspect the result: show every query that kept matches, followed by those matches
for query, matches in filtered_results.items():
    if not matches:
        continue  # nothing survived the threshold for this query
    print(f"\n Query: {query}")
    for match, score in matches:
        print(f"   → {match} (sim={score:.4f})")

In [None]:
# List the queries that have strong matches, i.e. the ones that go to the LLM
for query, matches in filtered_results.items():
    if not matches:
        continue  # skip queries without any match above the threshold
    print(" Queries to send to LLM:\n", query)

In [None]:
def build_prompt(query_sentence, matches):
    """Build the LLM prompt asking whether a sentence is a formal acknowledgement.

    Args:
        query_sentence: Extracted sentence, optionally carrying the leading
            "passage: " marker that was added for the embedding model.
        matches: Iterable of (gold_standard_sentence, similarity_score) pairs.

    Returns:
        The prompt text, with leading/trailing whitespace stripped.
    """
    # Rank by score so the five kept really are the most similar, whatever order
    # the caller supplied. sorted() is stable, so input that is already in
    # descending order comes out unchanged.
    top_matches = sorted(matches, key=lambda pair: pair[1], reverse=True)[:5]

    # m is the gold-standard sentence and s is its similarity score
    bullet_matches = "\n".join(f'- "{m}" (Similarity: {s:.2f})' for m, s in top_matches)

    # Remove the marker only where it is a prefix; a blanket replace() would also
    # delete any "passage: " that occurs inside the sentence itself.
    prefix = "passage: "
    if query_sentence.startswith(prefix):
        sentence = query_sentence[len(prefix):]
    else:
        sentence = query_sentence

    return f"""
You are acting as a **research compliance auditor**. 
Your role is to verify whether a given sentence from a scientific document constitutes a **formal acknowledgement** of institutional or financial support.

The sentence to review is:

**Extracted Sentence:**
"{sentence}"

This sentence was selected because it closely matches known acknowledgement phrases:
{bullet_matches}

### Your task
Determine whether the sentence contains a **formal acknowledgement** that refers specifically to **one or more of the following known entities**:
- CMCA (Centre for Microscopy Characterisation and Analysis)
- The University of Western Australia (UWA)
- Microscopy Australia or its nodes
- NCRIS or other national research infrastructure programs
Only classify as “Yes” if the sentence clearly refers to **these known institutions or programs**

### Decision Criteria
Classify the sentence as a **formal acknowledgement** only if it clearly refers to:
- Use or access of CMCA, UWA microscopy facilities, or Microscopy Australia
- Technical or analytical assistance provided by these institutions
- Institutional or national infrastructure support (e.g., NCRIS funding or facilities)

Strong signals include phrases like:
- “use of CMCA facilities”
- “supported by Microscopy Australia”
- “technical assistance from CMCA”
- “funded through the NCRIS program”

Do **not** classify it as an acknowledgement if it:
- Only thanks individuals without institutional affiliation
- Provides generic gratitude with no clear link to facilities, funding, or institutional support

### Respond in this exact format:
Answer: [Yes or No]  
Reason: Explain **why** this sentence qualifies (or not) based on the decision criteria above. Use 1-3 sentences only.
""".strip()

In [None]:

from openai import OpenAI

# Initialize the client. No key is hard-coded in the notebook: when api_key is
# omitted, the OpenAI client reads it from the OPENAI_API_KEY environment
# variable, so export that variable before starting the kernel.
client = OpenAI()

def verify_acknowledgement(filtered_results, model="gpt-3.5-turbo"):
    """Ask the LLM to judge every query sentence that kept at least one match.

    Args:
        filtered_results: Mapping of query sentence -> list of
            (gold_standard_sentence, similarity_score) pairs. Queries whose
            list is empty are skipped and make no API call.
        model: Chat model name passed to the completions endpoint.

    Returns:
        Dict mapping each verified query sentence to the model's reply text,
        stripped of surrounding whitespace ("" when the reply has no text).
    """
    verdicts = {}

    for query, matches in filtered_results.items():
        if not matches:
            continue

        prompt = build_prompt(query, matches)
        # build_prompt returns a string; the guard only matters if it is ever
        # changed to return None as a "skip this query" signal.
        if prompt is None:
            continue

        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a research auditor."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )

        # message.content is optional in the SDK's response type; fall back to
        # an empty string so a text-less reply does not raise on .strip().
        content = response.choices[0].message.content
        verdicts[query] = (content or "").strip()

    return verdicts

In [None]:
# Send the thresholded matches to the LLM, then print each reply
result = verify_acknowledgement(filtered_results)
for query, response in result.items():
    print("LLM Response:\n{}".format(response))

In [None]:
# TODO: write the summarise function that returns the results as JSON