In [9]:
import sys
from pathlib import Path
# Put ../src on the module search path so the project-local modules imported
# below can be resolved. This must run before those imports.
sys.path.append(str(Path("..") / "src"))

# Retrieval logic
from similarity_search import search_sentences
# Extract the text from the PDF
from extraction import extract_sangram
# Per the recorded cell output, this call reports the directory the processed
# text was written to and prints a preview of the first sentences.
# NOTE(review): this path has no "../" prefix, unlike the sys.path entry above
# and the "../outputs/..." path read in the next cell - confirm which working
# directory extract_pdf resolves it against.
extract_sangram.extract_pdf("data/raw_pdfs/sample.pdf")

Files written to: /home/ubuntu/CITS5553/Project/outputs/processed_text
Preview (first 8 sentences):
Transcriptomic Analysis Reveals the Heterogeneous Role of Conducting Films Upon Electrical Stimulation Nicholas B. Lawler, Uditi Bhatt, Vipul Agarwal, Cameron W. Evans, Priya Kaluskar, Sebastian E. Amos, Kai Chen, Yin Yao, Haibo Jiang, Yu Suk Choi, Minghao Zheng, Dino Spagnoli, Irene Suarez-Martinez, Per B. Zetterlund, Vincent P. Wallace, Alan R. Harvey, Stuart I. Hodgetts, and K. Swaminathan Iyer* Central nervous system (CNS) injuries and neurodegenerative diseases have markedly poor prognoses and can result in permanent dysfunction due to the general inability of CNS neurons to regenerate.
Differentiation of transplanted stem cells has emerged as a therapeutic avenue to regenerate tissue architecture in damaged areas.
Electrical stimulation is a promising approach for directing the differentiation outcomes and pattern of outgrowth of transplanted stem cells, however traditional inorgan

In [10]:
# Query embedding and similarity search happen in a single call to search_sentences.
# The sentence-loading step below could later move into similarity_search.py so that
# model loading and input preparation live in one place.

input_path = Path("../outputs/processed_text/sample.sentences.txt")

# Prefix every non-blank line with "passage: " - the format the model expects.
sentences = []
for raw_line in input_path.read_text(encoding="utf-8").splitlines():
    cleaned_line = raw_line.strip()
    if cleaned_line:
        sentences.append(f"passage: {cleaned_line}")

# Run the retrieval logic, asking for k=3 matches per sentence.
results = search_sentences(sentences, k=3)

In [11]:
# For storing Gold_Standard text in the database - don't uncomment this
#import store_embedding


# Keep only matches whose similarity score reaches the threshold.
threshold = 0.70  # Can be lowered as well as raised
filtered_results = {}
for query, matches in results.items():
    kept_matches = [(match, score) for (match, score) in matches if score >= threshold]
    # Queries left with no match at or above the threshold are dropped entirely.
    if kept_matches:
        filtered_results[query] = kept_matches

# Number of extracted query sentences with at least one match above the
# threshold in the Gold Standards.
matched_sentence = len([kept for kept in filtered_results.values() if kept])
#print(matched_sentence)

In [None]:
# Pair every retained query with its matches and its best (maximum) similarity score.
ranked_queries = [
    (query_sentence, matches, max(score for _, score in matches))
    for query_sentence, matches in filtered_results.items()
]

# Order the queries so the one with the highest best-score comes first.
ranked_queries.sort(key=lambda triple: triple[2], reverse=True)

# Print each query followed by its matches.
for query_sentence, matches, _ in ranked_queries:
    print(f"\nQuery: {query_sentence}")
    # The loop variable is deliberately not called `matched_sentence`: that name
    # holds the match count computed in the thresholding cell, and reusing it
    # here would silently overwrite the count with a string.
    for match_text, similarity_score in matches:
        print(f"{match_text} (sim={similarity_score:.4f})")

In [14]:
# ranked_queries is already ordered by best similarity score, so the first
# seven entries are the strongest candidates.
top_ranked = ranked_queries[:7]
best_queries = [sentence for sentence, _matches, _best_score in top_ranked]

# Sanity check: list what will be sent to the LLM.
header = "Queries to send to LLM:"
print(header)
for query in best_queries:
    print(query)

Queries to send to LLM:
passage: The authors acknowledge the facilities and the scientific and technical assistance  of Microscopy Australia at the Electron Microscopy Unit (EMU) at the Mark Wainwright Analytical Centre (MWAC) at UNSW Sydney.
passage: An electric field of 0.1 V over 2 mm was Supporting Information is available from the Wiley Online Library or from The authors acknowledge the facilities and the scientific and technical assistance  of the Australian Microscopy & Microanalysis Research Facility at the Centre for Microscopy, Characterization & Analysis, The University Adv.
passage: V.P.W. was supported by the Australian Research Council  via a Future Fellowship (project number FT180100683) funded by the Australian Government.
passage: This work was supported by resources provided by the Pawsey Supercomputing Research Centre with funding from the Australian Government and the Government of Western Australia.
passage: Immunocytochemically stained samples were imaged using  a

In [15]:
def build_prompt(query_sentence, matches):
    """Build the LLM prompt asking whether a sentence is a formal acknowledgement.

    Args:
        query_sentence: The extracted sentence. A leading "passage: " marker
            is removed before the sentence is shown in the prompt.
        matches: Non-empty sequence of (matched_text, similarity_score) pairs.
            Only the first pair is quoted in the prompt as context.

    Returns:
        The prompt text with leading and trailing whitespace stripped.

    Raises:
        IndexError: If ``matches`` is empty.
    """
    if not matches:
        raise IndexError("build_prompt needs at least one (match, score) pair in `matches`")

    # Several matches may be supplied, but only the first one is passed as context.
    top_match, top_score = matches[0]
    bullet_matches = f'- "{top_match}" (Similarity: {top_score:.2f})'

    # Strip only the leading marker. A blanket str.replace would also delete any
    # "passage: " text that happens to occur inside the sentence itself.
    prefix = "passage: "
    if query_sentence.startswith(prefix):
        extracted_sentence = query_sentence[len(prefix):]
    else:
        extracted_sentence = query_sentence

    return f"""
You are acting as a **research compliance auditor**. 
Your role is to verify whether a given sentence from a scientific document constitutes a **formal acknowledgement** of institutional or financial support.

The sentence to review is:

**Extracted Sentence:**
"{extracted_sentence}"

This sentence was selected because it closely matches known acknowledgement phrases:
{bullet_matches}

### Your task
Determine whether the sentence contains a **formal acknowledgement** that refers specifically to **one or more of the following known entities**:
- CMCA (Centre for Microscopy Characterisation and Analysis)
- The University of Western Australia (UWA)
- Microscopy Australia or its nodes
- NCRIS(National Collaborative Research Infrastructure Strategy) research infrastructure programs

### Decision Criteria
Classify the sentence as a **formal acknowledgement** only if it clearly refers to:
- Use or access of CMCA, UWA microscopy facilities, or Microscopy Australia
- Technical or analytical assistance provided by these institutions
- support by NCRIS funding

Do **not** classify it as an acknowledgement if it:
- Only thanks individuals without institutional affiliation
- Provides generic gratitude with no clear link to facilities, funding, or institutional support

### Respond in this exact format:
Answer: [Yes or No]  
Reason: Explain **why** this sentence qualifies (or not) based on the decision criteria above. Use 1-3 sentences only.
""".strip()

In [None]:

from openai import OpenAI
# Initialise the OpenAI client without embedding a secret in the notebook.
# With no api_key argument the library reads the key from the OPENAI_API_KEY
# environment variable, so export that variable before starting the kernel.
client = OpenAI()

def verify_acknowledgement(ranked_queries, top_k=7, model="gpt-4"):
    """Ask the LLM to judge the first ``top_k`` ranked query sentences.

    Args:
        ranked_queries: Sequence of (query_sentence, matches, best_score)
            triples; only the first ``top_k`` entries are sent.
        top_k: Maximum number of queries to send to the LLM.
        model: Chat model name passed to the completions endpoint.

    Returns:
        Dict mapping each query sentence to the stripped text of the LLM reply
        (an empty string when the reply carries no text content).
    """
    results = {}

    for query_sentence, matches, _ in ranked_queries[:top_k]:
        # Build the prompt for this sentence.
        prompt = build_prompt(query_sentence, matches)
        # Send to the LLM; temperature 0.0 is requested for every call.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a research auditor."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,
        )
        # message.content can be None; fall back to "" so .strip() cannot fail.
        content = response.choices[0].message.content
        results[query_sentence] = (content or "").strip()
    return results



In [17]:
# Send the top-ranked queries to the LLM, then print each verdict followed by
# a row of asterisks as long as the response text.
result = verify_acknowledgement(ranked_queries, top_k=7)
for query_sentence in result:
    response = result[query_sentence]
    separator = "*" * len(response)
    print(
        f"Query: {query_sentence}",
        f"LLM Response:\n{response}\n",
        separator,
        sep="\n",
    )

Query: passage: The authors acknowledge the facilities and the scientific and technical assistance  of Microscopy Australia at the Electron Microscopy Unit (EMU) at the Mark Wainwright Analytical Centre (MWAC) at UNSW Sydney.
LLM Response:
Answer: Yes
Reason: The sentence clearly acknowledges the facilities and scientific and technical assistance of Microscopy Australia at the Electron Microscopy Unit (EMU) at the Mark Wainwright Analytical Centre (MWAC) at UNSW Sydney. This indicates the use of Microscopy Australia's facilities and assistance, meeting the criteria for a formal acknowledgement.

*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
Query: passage: An electric field o