In [1]:
import sys
sys.path.append('..')

In [2]:
from lib import answer_query_csv, setup_db, search_results

In [3]:
# Open the cache database. `c` is later handed to answer_query_csv as its
# second argument (presumably a DB cursor/connection - confirm in lib.setup_db).
c = setup_db("../data/cache.db")

In [4]:
# Load the raw evaluation queries, one list entry per line of the file
# (each entry still carries its trailing newline at this point).
with open("../test_queries.txt") as f:
    queries = list(f)

In [5]:
# Strip surrounding whitespace (including the trailing newline) from each line.
queries = [q.strip() for q in queries]

# Remove a single trailing comma if it is there. `str.endswith` is used instead
# of `q[-1]` so that a blank line in the file does not raise IndexError.
queries = [q[:-1] if q.endswith(",") else q for q in queries]

# Drop entries that are empty after cleaning; an empty query cannot be answered.
queries = [q for q in queries if q]

queries

['how does a hepatic adenoma look like on MRI?',
 'penumbra ischemic stroke perfuson imaging',
 'how to differentiate between a hepatic adenoma and focal nodular hyperplasia on MRI?',
 'how does japanese encephalitis look like on MRI?',
 'what is CLIPPERS?',
 'What is a normal callosal angle, and how do I measure it?',
 'Can you tell me more about the MTA score?',
 'what are the criteria of an liver abscess',
 'pneumonitis',
 'how does the penumbra look in ct perfusion?',
 'ganglioglioma mri appearance',
 'whats a normal critical shoulder angle?',
 'What are the classic imaging findings of multiple sclerosis on MRI?',
 'How do I differentiate between a meningioma and schwannoma on contrast-enhanced MRI?',
 "What's the difference in appearance between septic and degenerative disc changes on MRI?",
 'Can you explain the Li-RADS criteria for liver lesions on CT/MRI?',
 'What are the key features of posterior reversible encephalopathy syndrome (PRES) on imaging?',
 'How do I measure the ac

In [6]:
%env EVAL=1

env: EVAL=1


In [6]:
import asyncio

In [7]:
# Upper bound on the number of coroutines allowed inside run_with_semaphore
# at the same time.
# NOTE(review): the recorded run below logs HTTP 429 responses with this
# setting - consider whether 30 is too high for the API's rate limit.
sem = asyncio.Semaphore(30)


async def run_with_semaphore(f, *args, **kwargs):
    """Await ``f(*args, **kwargs)`` while holding one slot of ``sem``.

    The slot is released again whether ``f`` returns or raises, and the
    return value (or exception) of ``f`` is passed through unchanged.
    """
    await sem.acquire()
    try:
        outcome = await f(*args, **kwargs)
        return outcome
    finally:
        sem.release()
    

In [8]:
import asyncio
from tqdm import tqdm

async def process_queries(queries, limit=None, cursor=None):
    """Answer a batch of queries concurrently while showing a progress bar.

    One task is created per query; each task calls ``answer_query_csv``
    through ``run_with_semaphore`` and the results are collected with
    ``asyncio.gather``.

    Args:
        queries: Sequence of query strings.
        limit: If not ``None``, only the first ``limit`` queries are
            processed (so ``limit=0`` processes nothing). ``None`` processes
            every query.
        cursor: Second argument passed to ``answer_query_csv``. Defaults to
            the notebook-level ``c`` when omitted.

    Returns:
        List with one ``answer_query_csv`` result per processed query, in the
        same order as the processed queries.

    Raises:
        Any exception raised by one of the tasks, as propagated by
        ``asyncio.gather``.
    """
    # Compare against None instead of relying on truthiness: with the old
    # `if limit` check, limit=0 silently processed *all* queries.
    query_list = queries if limit is None else queries[:limit]

    # Fall back to the notebook-global cursor to stay backward compatible.
    if cursor is None:
        cursor = c

    async def wrapped_answer(query, pbar):
        result = await run_with_semaphore(answer_query_csv, query, cursor)
        pbar.update(1)
        return result

    # The context manager closes the progress bar even if gather() raises.
    with tqdm(total=len(query_list), desc="Processing queries") as pbar:
        tasks = [asyncio.create_task(wrapped_answer(q, pbar)) for q in query_list]
        return await asyncio.gather(*tasks)


In [None]:
# Schedule process_queries on the running event loop without awaiting it in
# this cell; `tasks` and `results` both refer to the same asyncio Task.
# The list of answers only becomes available once that Task has finished.
results = tasks = asyncio.create_task(process_queries(queries))

Processing queries:   0%|          | 0/98 [00:00<?, ?it/s]2025-04-10 10:37:27,570 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2025-04-10 10:37:27,571 - INFO - Retrying request to /chat/completions in 2.000000 seconds
2025-04-10 10:37:27,579 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2025-04-10 10:37:27,581 - INFO - Retrying request to /chat/completions in 2.000000 seconds
2025-04-10 10:37:27,698 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2025-04-10 10:37:27,735 - INFO - HTTP Request: POST https://api.together.xyz/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2025-04-10 10:37:27,750 - INFO - Retrying request to /chat/completions in 1.000000 seconds
2025-04-10 10:37:27,752 - INFO - Retrying request to /chat/completions in 1.000000 seconds
2025-04-10 10:37:27,752 - INFO - HTTP Request: POST https:

In [10]:
# `results` is the asyncio Task created above. Task.result() returns the value
# produced by process_queries and raises InvalidStateError if the task has not
# finished yet, so this cell must only be run after the task completed.
real_results = results.result()

In [11]:
# Show every answer dict that came back without a "urls" key.
[entry for entry in real_results if "urls" not in entry]

[{'query': 'how to recognize COVID-19 pneumonia on chest CT?',
  'search_terms': ['COVID-19 pneumonia'],
  'answer': 'The provided context does not contain the information to answer this query.'}]

In [12]:
import json

# Persist the collected answers as a single JSON document.
with open("results_6.json", "w") as f:
    serialized = json.dumps(real_results)
    f.write(serialized)

In [6]:
from fastlite import database

In [11]:
# Open the evaluation database file through fastlite.
db = database("eval_4.db")

In [13]:
# Handle for the `notes` table of the evaluation database
# (the same table that is queried with SQL further down).
notes = db.t.notes

In [33]:
# Fetch only the `note` column of every row in the notes table.
# NOTE: this rebinds `results`, which earlier in the notebook referred to the
# asyncio Task holding the query answers.
results = list(db.query("select note from notes"))

In [34]:
# Number of rows returned by the notes query.
len(results)


8

In [14]:
# Display the rows of the notes table (each row has `idx` and `note`).
notes()

[{'idx': 3,
  'note': "The best search result would be in position 2 here. The retrieved article about powassan virus doesn't answer the user query. "},
 {'idx': 9,
  'note': 'again, the fitting article would be number 2 and not number 1 in the search results. With our heuristic we get the wrong article. '},
 {'idx': 14,
  'note': "I'm not sure if the LLM uses the context to answer the question or not. "},
 {'idx': 15,
  'note': 'Again, the best article is at position 2 and not position 1'},
 {'idx': 18,
  'note': "The best article is further down in the search results (or linked in the article that is retrieved), but the LLM doesn't have the correct context to answer this question here."},
 {'idx': 24,
  'note': 'Again, the best search result is not number 1 and thus not retrieved'},
 {'idx': 32,
  'note': 'Again, the correct search result would be at position 2 and not position 1 (thus not retrieved)'},
 {'idx': 35,
  'note': 'Again, the correct search result would be at position 2 a

In [39]:
# Plain note strings, one per row returned by the notes query.
[row["note"] for row in results]


['The search term is too specific. The "MRI" suffix fucks up the search results. ',
 "The text is a little wonky and long, but the sources are just not good. The search terms are good, but radiopaedia search just isn't ideal. ",
 "Well, the information needed to answer the question isn't in the context, so it's impossible to answer it correctly. At least it doesn't hallucinate. ",
 'The answer is a little bit long and a little bit all over. The first paragraph (cannot be differentiated) would have been enough.',
 'Again the modality in the search term fucks up the results.',
 'The search term itself is okay, but somehow the radiopaedia search sucks here (the overview article is number 5 or so). ',
 "Again, the search term is actually good here, but radiopaedia's search is just bad here. ",
 "The search term is logically correct, but radiopaedia's search sucks here. "]

Okay, so using no reranker resulted in just two "reranking" problems, i.e. queries where the search term turns up the right article, but that article isn't ranked #1, so our simple heuristic doesn't choose it.

By contrast, using a reranker led to 12 reranking problems, so dropping the reranker is a win.

Now the problem is mostly with how the LLM works with the context and not with the context itself. This is a great win. And, of course, using no reranker also saves some latency and cost.

In [1]:
import openai

In [2]:
import os 
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
# Model identifier used for the chat completion requests.
model = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"

# OpenAI-compatible client pointed at the Together endpoint. The API key is
# read from the environment (presumably populated by load_dotenv() above).
client = openai.OpenAI(
    api_key=os.environ.get("TOGETHER_API_KEY"),
    base_url="https://api.together.xyz/v1",
)

In [16]:
# System prompt for the chat request: answer faithfully from the provided
# context, format the answer as markdown without a leading heading, say
# 'I don't know' instead of fabricating, and never mention the context.
sp = """Answer the user query faithfully using the information in the context. 
Structure the answer in a way that is easy to read and educational using markdown.
Do not start your answer with a markdown heading. You can use headings on the answer to mark sections.

If you don't know the answer don't fabricate an answer, just say 'I don't know'. 

Don't start your answer with something like 'Based on the context...'. Do not mention the context in your answer. This is very important! Return the answer directly."""

In [19]:
# Chat history for a single-turn request: the system prompt `sp` followed by
# one example user question.
msgs = [
    {"role": "system", "content": sp},
    {"role": "user", "content": "what is the critical shoulder angle?"},
]

In [6]:
import tqdm

In [9]:
# Sequential baseline: answer the queries one at a time, awaiting each call
# before starting the next. `tqdm` is the module here (imported with
# `import tqdm`), hence `tqdm.tqdm`.
# NOTE(review): the recorded output of this cell ends in
# "UnboundLocalError: cannot access local variable 'result'"; presumably that
# is raised inside answer_query_csv - confirm. `results` then only holds the
# answers collected before the failure.
results = []
for q in tqdm.tqdm(queries):
    result = await answer_query_csv(q, c)
    results.append(result)

  0%|          | 0/101 [00:00<?, ?it/s]2025-04-07 08:08:58,170 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/openai/chat/completions "HTTP/1.1 200 OK"
2025-04-07 08:08:58,177 - INFO - Searchterms for query: 'how does a hepatic adenoma look like on MRI?' are: '['hepatic adenoma']'
2025-04-07 08:08:58,208 - INFO - Cache hit for search query: 'hepatic adenoma'
2025-04-07 08:08:58,228 - INFO - Cache hit for url: 'https://radiopaedia.org/articles/hepatic-adenoma?lang=us'
2025-04-07 08:08:59,230 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/openai/chat/completions "HTTP/1.1 200 OK"
  1%|          | 1/101 [00:03<05:08,  3.08s/it]2025-04-07 08:09:01,361 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/openai/chat/completions "HTTP/1.1 200 OK"
2025-04-07 08:09:01,365 - INFO - Searchterms for query: 'penumbra ischemic stroke perfuson imaging' are: '['ischemic penumbra']'
2025-04-07 08:09:01,391 - INFO - Cac

UnboundLocalError: cannot access local variable 'result' where it is not associated with a value

In [10]:
results


[{'question': 'how does a hepatic adenoma look like on MRI?',
  'answer': 'In non-hemorrhagic adenomas, they typically appear as:\n*   **T1**: variable and can range from being hyper-, iso-, to hypointense (hyperintense in 35-77% of cases)\n*   **T2**: mildly hyperintense (in 47-74% of cases)\n*   **IP/OP**: the presence of fat typically leads to signal drop out on out-of-phase imaging\n*   **T1 C+ (Gd)**\n    *   some reports suggest that the enhancement becomes isointense to the rest of the liver by 1 minute\n    *   on the dynamic post-contrast sequence, adenomas show early arterial enhancement and become nearly isointense about liver on delayed images\n*   **T1 C+ (Eovist/Primovist)**: usually appears hypointense on hepatobiliary phase (20 mins after injection) due to reduced uptake of Eovist (whereas focal nodular hyperplasia appears iso- to hyperintense)\n\nIf hemorrhagic, blood products may cause significant heterogeneity in signal on all sequences.\n',
  'sources': [{'title': '