In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path("..") / "src"))
#Retrival Logic
from similarity_search import search_sentences
#Find mentions of staff and instruments
from match import (staff_list, staff_mention, instrument_list, instrument_mention)
#Extract the text from the pdf
#from extraction import extract_sangram
#extract_sangram.extract_pdf("data/raw_pdfs/27_Negative.pdf")

In [None]:

#Load the extracted sentences, one list entry per line of the file
with open("../outputs/processed_text/27_Negative.sentences.txt", "r", encoding="utf-8") as f:
    sentences = f.read().splitlines()

#Staff
#The loaded list is kept under its own name. Rebinding `staff_list` to its own
#result would replace the helper, so a second run of this cell would call the
#previous result instead of the helper.
known_staff = staff_list("staffs.txt")
staff_name = staff_mention(sentences, known_staff)

#Instruments
#Same reasoning as above for `instrument_list`.
known_instruments = instrument_list("instruments.txt")
instruments = instrument_mention(sentences, known_instruments)

In [None]:
#Re-read the sentence file, drop blank lines and prefix every remaining line
#with "passage: " (the format the model expects)
input_path = Path("../outputs/processed_text/27_Negative.sentences.txt")
raw_lines = input_path.read_text(encoding="utf-8").splitlines()
sentences = ["passage: " + line.strip() for line in raw_lines if line.strip()]

#Call the retrieval logic with k=3
results = search_sentences(sentences, k=3)


In [None]:
#For Storing Gold_Standard text in Database - Dont uncomment this 
#import store_embedding


#Threshold the similarity score (can be raised or lowered)
threshold = 0.70

#Step 1: for every query keep only the (match, score) pairs at or above the threshold
above_threshold = {
    query: [(match, score) for match, score in matches if score >= threshold]
    for query, matches in results.items()
}
#Step 2: drop queries that have no pair left
filtered_results = {query: kept for query, kept in above_threshold.items() if kept}

#Number of extracted query sentences with at least one match above the threshold.
#Every entry that survived step 2 has one, so this is simply the dictionary size.
matched_sentence = len(filtered_results)
print(matched_sentence)

In [None]:
#Build (query, matches, best score) triples, where best score is the maximum
#similarity among the query's matches. Queries with an empty match list are
#skipped because max() has nothing to pick from.
ranked_queries = [
    (query_sentence, matches, max(score for _match, score in matches))
    for query_sentence, matches in filtered_results.items()
    if matches
]

#Sort queries by their best score, highest first
ranked_queries.sort(key=lambda triple: triple[2], reverse=True)

#Print every query followed by its matches.
#The loop targets are deliberately not `_` (IPython binds `_` to the last cell
#output) and not `matched_sentence` (an earlier cell stores the match count
#under that name), so neither is overwritten by running this cell.
print("The follwoing sentences matched the query with similarity above the threshold. Below i have listed the matched sentences(passage) with query they match to and their similarity score.")
for query_sentence, matches, _best_score in ranked_queries:
    print(f"\n{query_sentence}")
    for match_text, similarity_score in matches:
        print(f"{match_text} (sim={similarity_score:.4f})")

In [None]:
#Keep the query text of the first seven entries of the ranked list
best_queries = [entry[0] for entry in ranked_queries[:7]]
#Checking: list the queries that were picked
print("Queries to send to LLM:")
for query in best_queries:
    print(query)

In [None]:
def build_prompt(query_sentence, matches):
    """Build the auditor prompt for one extracted sentence.

    Args:
        query_sentence: The extracted sentence. A leading "passage: " marker,
            if present, is removed before the sentence is quoted in the prompt.
        matches: Non-empty sequence of (matched_text, similarity_score) pairs.
            Only the first pair is quoted in the prompt as context.

    Returns:
        The prompt text with leading and trailing whitespace stripped.

    Raises:
        IndexError: If `matches` is empty.
    """
    top_match, top_score = matches[0] #Several matches may be given; only the first is passed as context.
    bullet_matches = f'- "{top_match}" (Similarity: {top_score:.2f})'

    #Strip the marker only where it is a prefix. A blanket replace() would also
    #delete the text "passage: " from the middle of the sentence itself.
    prefix = "passage: "
    if query_sentence.startswith(prefix):
        sentence_text = query_sentence[len(prefix):]
    else:
        sentence_text = query_sentence

    return f"""
You are acting as a **research compliance auditor**. 
Your role is to verify whether a given sentence from a scientific document constitutes a **formal acknowledgement** of institutional or financial support.

The sentence to review is:

**Extracted Sentence:**
"{sentence_text}"

This sentence was selected because it closely matches known acknowledgement phrases:
{bullet_matches}

### Your task
Determine whether the sentence contains a **formal acknowledgement** that refers specifically to **one or more of the following known entities**:
- CMCA (Centre for Microscopy Characterisation and Analysis)
- The University of Western Australia (UWA)
- Microscopy Australia or its nodes
- NCRIS(National Collaborative Research Infrastructure Strategy) research infrastructure programs

### Decision Criteria
Classify the sentence as a **formal acknowledgement** only if it clearly refers to:
- Use or access of CMCA, UWA microscopy facilities, or Microscopy Australia
- Technical or analytical assistance provided by these institutions
- support by NCRIS funding

Do **not** classify it as an acknowledgement if it:
- Only thanks individuals without institutional affiliation
- Provides generic gratitude with no clear link to facilities, funding, or institutional support

### Respond in this exact format:
Answer: [Yes or No]  
Reason: Explain **why** this sentence qualifies (or not) based on the decision criteria above. Use 1-3 sentences only.
""".strip()

In [None]:

from openai import OpenAI
#Initialize client.
#No key is written into the notebook. When api_key is not passed, the client
#takes the key from the OPENAI_API_KEY environment variable, so export that
#variable before starting the kernel.
client = OpenAI()

def verify_acknowledgement(ranked_queries, top_k = 7, model = "gpt-4"):
    """Ask the LLM to verify the first `top_k` ranked queries.

    Uses the module-level `client` and `build_prompt`.

    Args:
        ranked_queries: Sequence of (query_sentence, matches, score) triples.
            Only the first `top_k` entries are sent; the third element is ignored.
        top_k: Number of leading entries of `ranked_queries` to verify.
        model: Name of the chat model to call. Defaults to "gpt-4".

    Returns:
        Dict mapping each query sentence to the stripped text of the model's
        reply ("" when the reply carries no text content).
    """
    results = {}

    for query_sentence, matches, _score in ranked_queries[:top_k]:
        #Build the prompt
        prompt = build_prompt(query_sentence, matches)
        #Send to LLM
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a research auditor."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0
        )
        #Save response. The message content is optional in the API response,
        #so fall back to "" instead of failing on None.
        content = response.choices[0].message.content
        results[query_sentence] = (content or "").strip()
    return results

In [None]:
# Verify the first 7 ranked queries and print each query with the LLM reply
result = verify_acknowledgement(ranked_queries, top_k = 7)
for query_sentence, response in result.items():
    print(f"Query: {query_sentence}\nLLM Response:\n{response}\n")

In [None]:
#Document-level decision rule:
#a single "Yes" verdict among the LLM responses is enough to qualify the document
def decision(responses):
    """Return True when at least one response contains the text "Answer: Yes".

    Args:
        responses: Dict whose values are the LLM response strings.

    Returns:
        True if any value contains "Answer: Yes" (case-sensitive), else False.
    """
    return any("Answer: Yes" in reply for reply in responses.values())

#Report the document-level verdict
print(
    "There is a formal acknowledgement in the document"
    if decision(result)
    else "There is no formal acknowledgement in the document"
)


In [None]:
import json
#Document-level verdict as a "Yes"/"No" string
document_status = "Yes" if decision(result) else "No"

#One record per verified sentence: the query, the raw LLM reply and the
#"Yes"/"No" answer derived from that reply
sentence_verifications = [
    {
        "query": query_sentence,
        "llm_response": response,
        "answer": "Yes" if "Answer: Yes" in response else "No"
    }
    for query_sentence, response in result.items()
]

#Assemble the report structure
output_json = {
    "Acknowledgement": document_status,
    "Sentence_verifications": sentence_verifications,
    "CMCA Staff": staff_name,
    "Instrument": instruments
}

#Save the report as result.json in the current working directory
with open("result.json", "w", encoding="utf-8") as f:
    json.dump(output_json, f, ensure_ascii=False, indent=2)