In [2]:
import apsw

In [None]:
import apsw.bestpractice
import requests
from bs4 import BeautifulSoup

apsw.bestpractice.apply(apsw.bestpractice.recommended)


In [60]:
connection = apsw.Connection("../data/cache.db")
cursor = connection.cursor()

In [64]:
# list all tables
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

[('radiopaedia_search_results',),
 ('radiopaedia_articles',),
 ('sqlite_stat1',),
 ('sqlite_stat4',)]

In [75]:
cursor.execute("Select count(*) from radiopaedia_search_results").fetchone()

(66,)

In [76]:
cursor.execute("Select count(*) from radiopaedia_articles").fetchone()

(54,)

In [6]:
cursor.execute("CREATE TABLE IF NOT EXISTS search_term_cache (query TEXT PRIMARY KEY, search_term TEXT)")

<apsw.Cursor at 0x1046c18a0>

In [7]:
from pydantic import BaseModel, Field

# Structured output model: the single search term the LLM extracts from a user query.
class SearchTerm(BaseModel):
    # The long guidance text belongs in `description`; `title` is only meant to be a
    # short display label for the field in the generated JSON schema.
    text: str = Field(
        ...,
        description="The search term extracted from the provided text. The search term should be the underlying disease or condition that the user is asking about.",
    )

In [9]:
cursor.execute("INSERT INTO search_term_cache (query, search_term) VALUES (?, ?)", ("What is the treatment for diabetes?", "diabetes"))

<apsw.Cursor at 0x1046c18a0>

In [10]:
cursor.execute("SELECT search_term FROM search_term_cache WHERE query = ?", ("What is the treatment for diabetes?",)).fetchall()

[('diabetes',)]

In [8]:

from dotenv import load_dotenv
from anthropic import Anthropic
import instructor

load_dotenv()

anthropic = Anthropic()

ANTHROPIC_MODEL = "claude-3-5-haiku-latest"

client = instructor.from_anthropic(Anthropic())

In [13]:
def get_search_term(query: str) -> SearchTerm:
    """Resolve ``query`` to a ``SearchTerm``, preferring the SQLite cache over the model.

    Rows of ``search_term_cache`` are looked up by exact ``query`` text. On a hit
    the stored term is wrapped in a ``SearchTerm`` and returned. On a miss the
    instructor-wrapped ``client`` asks ``ANTHROPIC_MODEL`` for a ``SearchTerm``,
    the extracted text is inserted into the cache, and the model's object is
    returned.
    """
    cached_rows = cursor.execute("SELECT search_term FROM search_term_cache WHERE query = ?", (query,)).fetchall()
    if len(cached_rows) > 0:
        print(f"Cache hit for query: '{query}'")
        stored_term = cached_rows[0][0]
        return SearchTerm(text=stored_term)

    # Cache miss: let the model extract the term
    # (client.chat.completions.create would work here as well).
    extracted = client.messages.create(
        model=ANTHROPIC_MODEL,
        max_tokens=1024,
        messages=[
            {
                "role": "system",
                "content": """Extract a search term from the provided text. The search term should be the underlying disease or condition that the user is asking about.
                
                Examples:
                text: "What is the most common cause of acute pancreatitis?"
                search term: "acute pancreatitis"
                
                text: "What MTA score is pathological for a 77 year old?"
                search term: "MTA score"
                
                text: "How to differentiate radiation necrosis from tumor recurrence?"
                search term: "radiation necrosis"
                
                text: "How do I measure the tibial tuberosity-trochlear groove distance?"
                search term: "tibial tuberosity-trochlear groove distance"
                
                """,
            },
            {
                "role": "user",
                "content": f"text: {query}",
            },
        ],
        response_model=SearchTerm,
    )

    # Store the extracted text so a repeated query is answered from the cache.
    cursor.execute("INSERT INTO search_term_cache (query, search_term) VALUES (?, ?)", (query, extracted.text))

    return extracted


In [16]:
get_search_term("MTA Score")

Cache hit for query: 'MTA Score'


SearchTerm(text='MTA Score')

In [29]:
import os

# Browser-like request headers sent with every Radiopaedia request in this notebook.
http_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "accept-language": "de-DE,de;q=0.7",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Chromium";v="128", "Not;A=Brand";v="24", "Brave";v="128"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"macOS"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "sec-gpc": "1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}

# A session cookie is a credential and must not be committed with the notebook.
# It is read from the RADIOPAEDIA_COOKIE environment variable instead and the
# "cookie" header is simply left out when that variable is not set.
_radiopaedia_cookie = os.environ.get("RADIOPAEDIA_COOKIE")
if _radiopaedia_cookie:
    http_headers["cookie"] = _radiopaedia_cookie

def search_radiopaedia(search_query: str, timeout: float = 30):
    """Run an article search on Radiopaedia and return the parsed result page.

    The raw page body is cached in the ``radiopaedia_search_results`` table,
    keyed by ``search_query``; a cached body is parsed and returned without any
    network access.

    Args:
        search_query: Text sent as the ``q`` parameter of the search.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        BeautifulSoup: The search result page parsed with ``html.parser``.
    """
    url = "https://radiopaedia.org/search"
    cache_hits = cursor.execute("SELECT search_results FROM radiopaedia_search_results WHERE search_query = ?", (search_query,)).fetchall()
    if cache_hits:
        print(f"Cache hit for search query: '{search_query}'")
        rbody = cache_hits[0][0]
        return BeautifulSoup(rbody, "html.parser")

    params = {"lang": "us", "q": search_query, "scope": "articles"}

    response = requests.get(url, params=params, headers=http_headers, timeout=timeout)

    # Only cache successful responses: a cached error page would otherwise be
    # returned for this query on every later call.
    if response.ok:
        cursor.execute("INSERT INTO radiopaedia_search_results (search_query, search_results) VALUES (?, ?)", (search_query, response.content))
    else:
        print(f"Not caching search query '{search_query}': HTTP {response.status_code}")

    return BeautifulSoup(response.content, "html.parser")

In [None]:
cursor.execute("CREATE TABLE IF NOT EXISTS radiopaedia_search_results (search_query TEXT PRIMARY KEY, search_results TEXT)")

<apsw.Cursor at 0x1046c18a0>

In [31]:
search_radiopaedia("MTA Score")

Cache hit for search query: 'MTA Score'


<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="https://prod-assets-static.radiopaedia.org/assets/logo-6ce5f7e4ea68d94a0d0c42e3b9ef044769601e5ddd6ed0e6d74dbd1b130d93fc.png" property="og:image"/><meta content="Radiopaedia" property="og:site_name"/><meta content="MTA Score | Search | Radiopaedia.org" property="og:title"/><meta content="website" property="og:type"/><meta content="https://radiopaedia.org/search?lang=us&amp;q=MTA+Score&amp;scope=articles" property="og:url"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>MTA Score | Search | Radiopaedia.org</title>
<meta content="" name="robots">
<link href="https://radiopaedia.org/?lang=usradiopaedia-icon-144.png" rel="apple-touch-icon-precomposed" sizes="144x144"/>
<link href="https://radiopaedia.org/?lang=usfavicon.ico" rel="shortcut icon"/>
<link href="https://radiopaedia.org/search?q=MTA+Score&amp;scope=articles&amp;lang=gb" hreflang="en-GB" rel="alternate">
<link href="h

In [32]:
cursor.execute("CREATE TABLE IF NOT EXISTS radiopaedia_articles (url TEXT PRIMARY KEY, content TEXT)")

<apsw.Cursor at 0x1046c18a0>

In [34]:
def get_article_text(url, timeout: float = 30):
    """Return the stripped body text of the article at ``url``.

    The text is cached in the ``radiopaedia_articles`` table, keyed by ``url``;
    a cached entry is returned without any network access.

    Args:
        url: Full URL of the article page.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        str: Text of the ``#content > div.body.user-generated-content`` element.

    Raises:
        IndexError: If the page contains no such element (same exception type
            as before, now with a message naming the URL and HTTP status).
    """
    cache_hits = cursor.execute("SELECT content FROM radiopaedia_articles WHERE url = ?", (url,)).fetchall()
    if cache_hits:
        print(f"Cache hit for url: '{url}'")
        return cache_hits[0][0]
    response = requests.get(url, headers=http_headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    body = soup.select_one("#content > div.body.user-generated-content")
    if body is None:
        raise IndexError(f"No article body found at '{url}' (HTTP {response.status_code})")
    content = body.text.strip()
    cursor.execute("INSERT INTO radiopaedia_articles (url, content) VALUES (?, ?)", (url, content))
    return content

In [36]:
get_article_text("https://radiopaedia.org/articles/hepatic-adenoma?lang=us")

Cache hit for url: 'https://radiopaedia.org/articles/hepatic-adenoma?lang=us'


'Hepatic adenomas,\xa0also referred to as hepatocellular adenomas, are benign,\xa0generally hormone-induced, liver tumors. The tumors are usually solitary, have a predilection for hemorrhage, and must be differentiated from other focal liver lesions.\nOn this page:\n\nArticle:\nEpidemiology\nClinical presentation\nPathology\nRadiographic features\nTreatment and prognosis\nDifferential diagnosis\nRelated articles\nReferences\n\n\nImages:\nCases and figures\nImaging differential diagnosis\n\n\nEpidemiologyThe incidence of hepatic adenomas is unknown, with studies showing migration from the classically described female predominance related to the use of oral contraceptives, to an increased prevalence in men, particularly recognizing that obesity and metabolic syndrome are emerging risk factors for adenomas 18.\xa0Hepatic adenoma is traditionally considered the most frequent hepatic tumor in young women on the oral contraceptive pill.AssociationsHepatic adenomas are associated with 3:\nora

In [None]:
def setup_db():
    """Create an in-memory cache database and return a cursor on it.

    Applies the recommended apsw best-practice settings, opens a ``:memory:``
    connection and creates the cache tables used in this notebook if they do
    not exist yet.

    Returns:
        apsw.Cursor: Cursor on the new connection. Returning it is what makes
        ``cursor = setup_db()`` usable; without a return value the freshly
        created database would be unreachable as soon as the function exits.
    """
    apsw.bestpractice.apply(apsw.bestpractice.recommended)
    connection = apsw.Connection(":memory:")
    cursor = connection.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS radiopaedia_search_results (search_query TEXT PRIMARY KEY, search_results TEXT)")
    cursor.execute("CREATE TABLE IF NOT EXISTS radiopaedia_articles (url TEXT PRIMARY KEY, content TEXT)")
    # get_search_term reads and writes this table, so it has to exist as well.
    cursor.execute("CREATE TABLE IF NOT EXISTS search_term_cache (query TEXT PRIMARY KEY, search_term TEXT)")
    return cursor

def get_article_text(url, timeout: float = 30):
    """Return the stripped body text of the article at ``url``.

    The text is cached in the ``radiopaedia_articles`` table, keyed by ``url``;
    a cached entry is returned without any network access.

    Args:
        url: Full URL of the article page.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        str: Text of the ``#content > div.body.user-generated-content`` element.

    Raises:
        IndexError: If the page contains no such element (same exception type
            as before, now with a message naming the URL and HTTP status).
    """
    cache_hits = cursor.execute("SELECT content FROM radiopaedia_articles WHERE url = ?", (url,)).fetchall()
    if cache_hits:
        print(f"Cache hit for url: '{url}'")
        return cache_hits[0][0]
    response = requests.get(url, headers=http_headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    body = soup.select_one("#content > div.body.user-generated-content")
    if body is None:
        raise IndexError(f"No article body found at '{url}' (HTTP {response.status_code})")
    content = body.text.strip()
    cursor.execute("INSERT INTO radiopaedia_articles (url, content) VALUES (?, ?)", (url, content))
    return content

def search_radiopaedia(search_query: str, timeout: float = 30):
    """Run an article search on Radiopaedia and return the parsed result page.

    The raw page body is cached in the ``radiopaedia_search_results`` table,
    keyed by ``search_query``; a cached body is parsed and returned without any
    network access.

    Args:
        search_query: Text sent as the ``q`` parameter of the search.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        BeautifulSoup: The search result page parsed with ``html.parser``.
    """
    url = "https://radiopaedia.org/search"
    cache_hits = cursor.execute("SELECT search_results FROM radiopaedia_search_results WHERE search_query = ?", (search_query,)).fetchall()
    if cache_hits:
        print(f"Cache hit for search query: '{search_query}'")
        rbody = cache_hits[0][0]
        return BeautifulSoup(rbody, "html.parser")

    params = {"lang": "us", "q": search_query, "scope": "articles"}

    response = requests.get(url, params=params, headers=http_headers, timeout=timeout)

    # Only cache successful responses: a cached error page would otherwise be
    # returned for this query on every later call.
    if response.ok:
        cursor.execute("INSERT INTO radiopaedia_search_results (search_query, search_results) VALUES (?, ?)", (search_query, response.content))
    else:
        print(f"Not caching search query '{search_query}': HTTP {response.status_code}")

    return BeautifulSoup(response.content, "html.parser")
    

In [38]:
from claudette import * 

In [49]:
c = Chat(models[-1])

In [42]:
c("What is the treatment for diabetes?")

Treatment for diabetes depends on the type of diabetes, but generally includes:

1. Type 1 Diabetes:
- Insulin therapy (daily injections or insulin pump)
- Blood sugar monitoring
- Healthy diet
- Regular exercise

2. Type 2 Diabetes:
- Lifestyle changes (diet and exercise)
- Oral medications
- Insulin therapy (if needed)
- Blood sugar monitoring
- Weight management

3. Common treatments for both types:
- Regular blood glucose monitoring
- Healthy eating plan
- Physical activity
- Stress management
- Medication to control blood sugar
- Medication to manage related conditions

4. Additional management strategies:
- Regular medical check-ups
- Foot care
- Eye examinations
- Managing cholesterol and blood pressure
- Diabetes education

5. Potential medications:
- Metformin
- Sulfonylureas
- DPP-4 inhibitors
- GLP-1 receptor agonists
- Insulin

The specific treatment plan is personalized based on individual health needs, diabetes type, and overall medical condition. Consulting with a healthcare professional is crucial for proper diabetes management.

<details>

- id: `msg_012t17bXqSUV5xPAp8ThNBq4`
- content: `[{'text': 'Treatment for diabetes depends on the type of diabetes, but generally includes:\n\n1. Type 1 Diabetes:\n- Insulin therapy (daily injections or insulin pump)\n- Blood sugar monitoring\n- Healthy diet\n- Regular exercise\n\n2. Type 2 Diabetes:\n- Lifestyle changes (diet and exercise)\n- Oral medications\n- Insulin therapy (if needed)\n- Blood sugar monitoring\n- Weight management\n\n3. Common treatments for both types:\n- Regular blood glucose monitoring\n- Healthy eating plan\n- Physical activity\n- Stress management\n- Medication to control blood sugar\n- Medication to manage related conditions\n\n4. Additional management strategies:\n- Regular medical check-ups\n- Foot care\n- Eye examinations\n- Managing cholesterol and blood pressure\n- Diabetes education\n\n5. Potential medications:\n- Metformin\n- Sulfonylureas\n- DPP-4 inhibitors\n- GLP-1 receptor agonists\n- Insulin\n\nThe specific treatment plan is personalized based on individual health needs, diabetes type, and overall medical condition. Consulting with a healthcare professional is crucial for proper diabetes management.', 'type': 'text'}]`
- model: `claude-3-5-haiku-20241022`
- role: `assistant`
- stop_reason: `end_turn`
- stop_sequence: `None`
- type: `message`
- usage: `{'input_tokens': 14, 'output_tokens': 260, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`

</details>

In [43]:
c("how do GLP-1 agonists work?")

GLP-1 (Glucagon-Like Peptide-1) agonists work through several mechanisms to help manage blood sugar levels and support weight loss:

1. Pancreatic Effects:
- Stimulate insulin secretion when blood glucose is high
- Suppress glucagon release
- Slow down insulin release to prevent hypoglycemia

2. Glucose Regulation:
- Reduce hepatic glucose production
- Improve insulin sensitivity
- Decrease blood sugar levels

3. Digestive System Effects:
- Slow gastric emptying
- Reduce appetite
- Increase feelings of fullness
- Decrease food intake

4. Weight Management:
- Promote weight loss
- Reduce body fat
- Decrease caloric intake

5. Metabolic Benefits:
- Improve beta-cell function
- Potentially protect pancreatic cells
- Reduce cardiovascular risk

6. Mechanism of Action:
- Mimic natural GLP-1 hormone
- Bind to GLP-1 receptors
- Enhance glucose-dependent insulin secretion

Common GLP-1 agonists include:
- Semaglutide (Ozempic, Wegovy)
- Liraglutide (Victoza)
- Dulaglutide (Trulicity)
- Exenatide (Byetta)

These medications are primarily used for type 2 diabetes and obesity management.

<details>

- id: `msg_01DeAi8zzggFVMjd1VHJVZmF`
- content: `[{'text': 'GLP-1 (Glucagon-Like Peptide-1) agonists work through several mechanisms to help manage blood sugar levels and support weight loss:\n\n1. Pancreatic Effects:\n- Stimulate insulin secretion when blood glucose is high\n- Suppress glucagon release\n- Slow down insulin release to prevent hypoglycemia\n\n2. Glucose Regulation:\n- Reduce hepatic glucose production\n- Improve insulin sensitivity\n- Decrease blood sugar levels\n\n3. Digestive System Effects:\n- Slow gastric emptying\n- Reduce appetite\n- Increase feelings of fullness\n- Decrease food intake\n\n4. Weight Management:\n- Promote weight loss\n- Reduce body fat\n- Decrease caloric intake\n\n5. Metabolic Benefits:\n- Improve beta-cell function\n- Potentially protect pancreatic cells\n- Reduce cardiovascular risk\n\n6. Mechanism of Action:\n- Mimic natural GLP-1 hormone\n- Bind to GLP-1 receptors\n- Enhance glucose-dependent insulin secretion\n\nCommon GLP-1 agonists include:\n- Semaglutide (Ozempic, Wegovy)\n- Liraglutide (Victoza)\n- Dulaglutide (Trulicity)\n- Exenatide (Byetta)\n\nThese medications are primarily used for type 2 diabetes and obesity management.', 'type': 'text'}]`
- model: `claude-3-5-haiku-20241022`
- role: `assistant`
- stop_reason: `end_turn`
- stop_sequence: `None`
- type: `message`
- usage: `{'input_tokens': 289, 'output_tokens': 329, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`

</details>

In [54]:
if c.h:
    print("got history")
else:
    print("no history")

no history


In [55]:
c("happy birthday")

Thank you! However, I want to be direct with you - I don't actually have a birthday, as I'm an AI. I appreciate the kind sentiment, though. Is there something I can help you with today?

<details>

- id: `msg_01WdsszVvYuLgt7jgaPqq382`
- content: `[{'text': "Thank you! However, I want to be direct with you - I don't actually have a birthday, as I'm an AI. I appreciate the kind sentiment, though. Is there something I can help you with today?", 'type': 'text'}]`
- model: `claude-3-5-haiku-20241022`
- role: `assistant`
- stop_reason: `end_turn`
- stop_sequence: `None`
- type: `message`
- usage: `{'input_tokens': 9, 'output_tokens': 48, 'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0}`

</details>

In [56]:
c.h

[{'role': 'user', 'content': 'happy birthday'},
 {'role': 'assistant',
  'content': [TextBlock(text="Thank you! However, I want to be direct with you - I don't actually have a birthday, as I'm an AI. I appreciate the kind sentiment, though. Is there something I can help you with today?", type='text')]}]

In [57]:
c.h[-1]["content"][0].text

"Thank you! However, I want to be direct with you - I don't actually have a birthday, as I'm an AI. I appreciate the kind sentiment, though. Is there something I can help you with today?"

In [58]:
for m in c.h:
    if m["role"] == "user":
        print(m["content"])
    else:
        print(m["content"][0].text)

happy birthday
Thank you! However, I want to be direct with you - I don't actually have a birthday, as I'm an AI. I appreciate the kind sentiment, though. Is there something I can help you with today?


In [2]:
answer = """Blood on MRI has highly variable imaging characteristics depending on the age of the blood, type of hemoglobin, and MRI sequence. Key stages include:
1. Hyperacute (<1 day): Intracellular oxyhemoglobin, isointense on T1, hyperintense on T2.
2. Acute (1-3 days): Intracellular deoxyhemoglobin, hypointense on T1 and T2.
3. Early subacute (3-7 days): Intracellular methemoglobin, hyperintense on T1, hypointense on T2.
4. Late subacute (7-28 days): Extracellular methemoglobin, hyperintense on T1 and T2.
5. Chronic (>28 days): Peripheral hemosiderin (hypointense) and central hemichromes (isointense/hyperintense).
<sources>
https://radiopaedia.org/articles/haemorrhage-on-mri-1?lang=us
</sources>"""

answer.split("<sources>")[0].strip()

'Blood on MRI has highly variable imaging characteristics depending on the age of the blood, type of hemoglobin, and MRI sequence. Key stages include:\n1. Hyperacute (<1 day): Intracellular oxyhemoglobin, isointense on T1, hyperintense on T2.\n2. Acute (1-3 days): Intracellular deoxyhemoglobin, hypointense on T1 and T2.\n3. Early subacute (3-7 days): Intracellular methemoglobin, hyperintense on T1, hypointense on T2.\n4. Late subacute (7-28 days): Extracellular methemoglobin, hyperintense on T1 and T2.\n5. Chronic (>28 days): Peripheral hemosiderin (hypointense) and central hemichromes (isointense/hyperintense).'

In [3]:
import re

def extract_xml(text: str, tag: str) -> str:
    """
    Extracts the content of the specified XML tag from the given text. Used for parsing structured responses

    Only the first ``<tag>...</tag>`` pair is considered, and the match may span
    several lines.

    Args:
        text (str): The text containing the XML.
        tag (str): The XML tag to extract content from. It is matched literally,
            so characters that are special in regular expressions (``.``, ``+``, ...)
            are safe to use.

    Returns:
        str: The content of the specified XML tag, or an empty string if the tag is not found.
    """
    # Escape the tag so it cannot be interpreted as a regex pattern.
    escaped_tag = re.escape(tag)
    match = re.search(f"<{escaped_tag}>(.*?)</{escaped_tag}>", text, re.DOTALL)
    return match.group(1) if match else ""

In [5]:
extract_xml(answer, "sources").strip().splitlines()

['https://radiopaedia.org/articles/haemorrhage-on-mri-1?lang=us']

In [42]:
with open("../test_queries.txt", "r") as f:
    queries = f.readlines()

queries = [q.strip() for q in queries]

In [43]:
# Drop only the trailing comma(s) left over from the query file; commas inside a
# query are part of the question and must be kept (str.replace removed them all).
queries = [q.rstrip(",") for q in queries]
queries

['how does a hepatic adenoma look like on MRI?',
 'penumbra ischemic stroke perfuson imaging',
 'how to differentiate between a hepatic adenoma and focal nodular hyperplasia on MRI?',
 'how does japanese encephalitis look like on MRI?',
 'what is CLIPPERS?',
 'What is a normal callosal angle and how do I measure it?',
 'Can you tell me more about the MTA score?',
 'what are the criteria of an liver abscess',
 'pneumonitis',
 'how does the penumbra look in ct perfusion?',
 'ganglioglioma mri appearance',
 'whats a normal critical shoulder angle?',
 'What are the classic imaging findings of multiple sclerosis on MRI?',
 'How do I differentiate between a meningioma and schwannoma on contrast-enhanced MRI?',
 "What's the difference in appearance between septic and degenerative disc changes on MRI?",
 'Can you explain the Li-RADS criteria for liver lesions on CT/MRI?',
 'What are the key features of posterior reversible encephalopathy syndrome (PRES) on imaging?',
 'How do I measure the ace

In [44]:
len(queries)

137

In [9]:
import sys
sys.path.append('..')

In [35]:
from lib import get_search_term, search_results, setup_db, SearchTerm

In [14]:
import random 

query = random.choice(queries)
query

'how to recognize COVID-19 pneumonia on chest CT?'

In [17]:
import tqdm
import pandas as pd

In [45]:
results = []
for q in tqdm.tqdm(queries):
    st = get_search_term(q)
    sr = search_results(st, cursor)
    results.append({"query": q, "search_term": st.text, "search_results": json.dumps(sr)})
    
df = pd.DataFrame(results)

  0%|          | 0/137 [00:00<?, ?it/s]2025-03-12 13:59:55,325 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-03-12 13:59:55,337 - INFO - SearchTerm for query: 'how does a hepatic adenoma look like on MRI?' is: 'hepatic adenoma'
  1%|          | 1/137 [00:01<04:29,  1.98s/it]2025-03-12 13:59:57,414 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-03-12 13:59:57,420 - INFO - SearchTerm for query: 'penumbra ischemic stroke perfuson imaging' is: 'penumbra ischemic stroke perfusion imaging'
  1%|▏         | 2/137 [00:04<04:40,  2.08s/it]2025-03-12 13:59:59,829 - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-03-12 13:59:59,832 - INFO - SearchTerm for query: 'how to differentiate between a hepatic adenoma and focal nodular hyperplasia on MRI?' is: 'hepatic adenoma versus focal nodular hyperplasia MRI differentiation'
  2%|▏         | 3/137 [00:06<04:34,  2.05s/it]2025-03

In [46]:
df

Unnamed: 0,query,search_term,search_results
0,how does a hepatic adenoma look like on MRI?,hepatic adenoma,"[{""id"": 0, ""title"": ""Hepatic adenoma"", ""body"":..."
1,penumbra ischemic stroke perfuson imaging,penumbra ischemic stroke perfusion imaging,"[{""id"": 0, ""title"": ""Central nervous system cu..."
2,how to differentiate between a hepatic adenoma...,hepatic adenoma versus focal nodular hyperplas...,[]
3,how does japanese encephalitis look like on MRI?,japanese encephalitis,"[{""id"": 0, ""title"": ""Japanese encephalitis"", ""..."
4,what is CLIPPERS?,CLIPPERS,"[{""id"": 0, ""title"": ""Chronic lymphocytic infla..."
...,...,...,...
132,how does a extraventricular neurocytoma look l...,extraventricular neurocytoma,"[{""id"": 0, ""title"": ""Extraventricular neurocyt..."
133,what can you tell me about pulmonary endometri...,pulmonary endometriosis,"[{""id"": 0, ""title"": ""Thoracic endometriosis"", ..."
134,how does myocarditis look like on cardiac mri?,myocarditis cardiac MRI,"[{""id"": 0, ""title"": ""Myocarditis protocol (MRI..."
135,what are normal values for a t1 mapping of the...,T1 mapping heart normal values,"[{""id"": 0, ""title"": ""Hypertensive heart diseas..."


In [30]:
import apsw
import apsw.bestpractice
apsw.bestpractice.apply(apsw.bestpractice.recommended)

In [31]:
cursor = setup_db()

In [48]:
df[df.search_results == "[]"]

Unnamed: 0,query,search_term,search_results
2,how to differentiate between a hepatic adenoma...,hepatic adenoma versus focal nodular hyperplas...,[]
14,What's the difference in appearance between se...,septic disc changes versus degenerative disc c...,[]
34,how to tell if disc infection or just degenera...,disc infection versus degeneration,[]
47,avf bone marrow signal,avf bone marrow signal,[]
58,how to assess vertebral compression fracture a...,vertebral compression fracture acuity,[]
84,what are the imaging findings of Lemierre’s sy...,Lemierre's syndrome,[]


In [50]:
df.to_csv("test_queries.csv", index=False)

In [8]:
import pandas as pd

df = pd.read_csv("test_queries.csv")

In [2]:
import cohere

In [3]:
import os
from getpass import getpass

# Never commit API keys to the notebook: the key that used to be hard-coded in this
# cell (and echoed in its output) should be revoked. Use the value already present
# in the environment and only prompt for it when it is missing.
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass("Cohere API key: ")


In [1]:
import os
import json

In [4]:
client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))        


In [9]:
row = df.iloc[85]
row

query                     how does adrenal hemorrhage appear on CT?
search_term                                      adrenal hemorrhage
search_results    [{"id": 0, "title": "Adrenal hemorrhage", "bod...
Name: 85, dtype: object

In [27]:
def get_docs(search_results):
    """Build one text document per search result for reranking.

    Each entry of ``search_results`` is indexed with ``"title"`` and ``"body"``;
    the two values are concatenated with a single space, in input order.
    """
    combined = []
    for result in search_results:
        combined.append(result["title"] + " " + result["body"])
    return combined

In [29]:
sr = json.loads(row.search_results)
sr

[{'id': 0,
  'title': 'Hepatic adenoma',
  'body': 'Hepatic adenomas,\xa0or hepatocellular adenomas, are benign,\xa0generally hormone-induced liver tumors. They are usually solitary but can be multiple. Most adenomas have a predilection for hemorrhage, and they must be differentiated from other focal liver lesions due to the risk of HCC transformation....',
  'href': '/articles/hepatic-adenoma?lang=us'},
 {'id': 1,
  'title': 'Inflammatory hepatic adenoma',
  'body': 'Inflammatory hepatic adenomas are a genetic and pathological subtype of hepatic adenoma. Their appearance and prognosis is different than other subtypes and they have the highest incidence of hemorrhage amongst hepatic adenoma subtypes.\n\nEpidemiology\n\nInflammatory hepatic adenomas are the most c...',
  'href': '/articles/inflammatory-hepatic-adenoma?lang=us'},
 {'id': 2,
  'title': 'Unclassified hepatic adenoma',
  'body': 'Unclassified hepatic adenomas refer to the 5-10% of hepatocellular adenomas\xa0subtype that lac

In [None]:
def get_best_article(results, query):
    """Return the search result most relevant to ``query`` together with its score.

    The results are converted to documents with ``get_docs`` and ranked by the
    Cohere rerank endpoint; the top-ranked entry is returned.

    Args:
        results: List of search result dicts (as consumed by ``get_docs``).
        query: Text the results are ranked against.

    Returns:
        tuple: ``(result, relevance_score)`` for the top-ranked entry, or
        ``(None, 0.0)`` when ``results`` is empty.
    """
    # Some queries in this notebook yield no search results at all; in that case
    # there is nothing to send to the reranker.
    if not results:
        return None, 0.0
    client = cohere.Client(api_key=os.getenv("COHERE_API_KEY"))
    reranked = client.rerank(query=query, documents=get_docs(results), top_n=5)
    best = reranked.results[0]
    return results[best.index], best.relevance_score
    

In [30]:
row

query                  how does a hepatic adenoma look like on MRI?
search_term                                         hepatic adenoma
search_results    [{"id": 0, "title": "Hepatic adenoma", "body":...
Name: 0, dtype: object

In [36]:
get_best_article(sr, row.search_term)

({'id': 0,
  'title': 'Hepatic adenoma',
  'body': 'Hepatic adenomas,\xa0or hepatocellular adenomas, are benign,\xa0generally hormone-induced liver tumors. They are usually solitary but can be multiple. Most adenomas have a predilection for hemorrhage, and they must be differentiated from other focal liver lesions due to the risk of HCC transformation....',
  'href': '/articles/hepatic-adenoma?lang=us'},
 0.9996444)

In [24]:
row = df.iloc[0]

# get_docs expects the list of result dicts, not the DataFrame row itself; the
# results are stored as a JSON string in the `search_results` column.
r = client.rerank(query=row["query"], documents=get_docs(json.loads(row["search_results"])), top_n=5)

In [25]:
r.results

[RerankResponseResultsItem(document=None, index=0, relevance_score=0.9168571),
 RerankResponseResultsItem(document=None, index=1, relevance_score=0.67006874),
 RerankResponseResultsItem(document=None, index=4, relevance_score=0.5614252),
 RerankResponseResultsItem(document=None, index=3, relevance_score=0.50600404),
 RerankResponseResultsItem(document=None, index=2, relevance_score=0.3744475)]

In [26]:
sr[r.results[0].index]

{'id': 0,
 'title': 'Adrenal hemorrhage',
 'body': 'Adrenal hemorrhage can result from a variety of traumatic and non-traumatic causes. When unilateral,\xa0it is often clinically silent. In contrast, bilateral adrenal hemorrhage can lead to catastrophic adrenal insufficiency.\n\nClinical presentation\n\nThe clinical signs of adrenal hemorrhage are very ...',
 'href': '/articles/adrenal-haemorrhage?lang=us'}

In [18]:
[] + [3,4]

[3, 4]

In [15]:
import anthropic
import re

def extract_all_query_content(text: str) -> list[str]:
    """Return the non-empty lines found inside the first ``<search_terms>`` block.

    Every line is stripped of surrounding whitespace. An empty list is returned
    when ``text`` has no ``<search_terms>...</search_terms>`` pair.
    """
    found = re.search(r"<search_terms>(.*?)</search_terms>", text, re.DOTALL)
    if found is None:
        return []

    # One candidate term per line of the tag body; blank lines are dropped.
    stripped_lines = (line.strip() for line in found.group(1).strip().splitlines())
    return [line for line in stripped_lines if line]

def get_search_terms(query: str) -> list[str]:
    """Ask the Anthropic model for Radiopaedia search terms for ``query``.

    A client is created with the key from the ``ANTHROPIC_API_KEY`` environment
    variable. The request consists of one user turn (a block of worked examples
    followed by the instructions with ``query`` embedded in ``<user_query>`` tags)
    and an assistant turn that contains only the opening ``<analysis>`` tag.

    Args:
        query: The user's question.

    Returns:
        list[str]: The lines found in the ``<search_terms>`` section of the first
        content block of the reply, as parsed by ``extract_all_query_content``
        (an empty list if that section is missing).
    """
    client = anthropic.Anthropic(
        api_key=os.getenv("ANTHROPIC_API_KEY"),
    )

    message = client.messages.create(
        model="claude-3-5-haiku-20241022",
        max_tokens=8192,
        temperature=0.6,
        messages=[
            {
                "role": "user",
                "content": [
                    # First text block: worked examples of query -> analysis -> search terms.
                    {
                        "type": "text",
                        "text": "<examples>\n<example>\n<query>\nWhat are the findings of acute pancreatitis on contrast-enhanced CT?\n</query>\n<ideal_output>\n<analysis>\n1. Main radiological concept: acute pancreatitis\n2. The query does not require a comparison or multiple distinct concepts.\n3. The most appropriate technical term is \"acute pancreatitis\".\n4. This search term is justified because it directly addresses the main concept in the query. Although the query asks about the cause, searching for \"acute pancreatitis\" will likely provide information about its etiology, including the most common cause.\n</analysis>\n<search_terms>acute pancreatitis</search_terms>\n</ideal_output>\n</example>\n<example>\n<query>\nWhat is the MTA score?\n</query>\n<ideal_output>\n<analysis>\n1. Main radiological concept: MTA score\n2. The query does not require a comparison or multiple distinct concepts.\n3. The most appropriate technical term is \"MTA score\".\n4. This search term is justified because it directly addresses the main concept in the query. While the query asks about a specific age and pathological score, searching for \"MTA score\" will likely provide comprehensive information about the scoring system, including age-related norms and pathological thresholds.\n</analysis>\n<search_terms>MTA score</search_terms>\n</ideal_output>\n</example>\n<example>\n<query>\nHow to differentiate radiation necrosis from tumor recurrence on MRI?\n</query>\n<ideal_output>\n<analysis>\n1. Main radiological concepts: radiation necrosis, tumor recurrence\n2. The query implies a comparison between two conditions, but does not explicitly ask for both to be searched.\n3. The most appropriate technical term is \"radiation necrosis\".\n4. This search term is justified because it focuses on one of the main concepts in the query. 
While the question involves differentiating between radiation necrosis and tumor recurrence, searching for \"radiation necrosis\" will likely provide information on its characteristics and how to distinguish it from tumor recurrence. As per the guidelines, we provide only one search term unless explicitly asked for multiple terms.\n</analysis>\n<search_terms>radiation necrosis</search_terms>\n</ideal_output>\n</example>\n<example>\n<query>\nHow to differentiate primary CNS lymphoma from glioblastoma on MRI?\n</query>\n<ideal_output>\n<analysis>\n1. Main radiological concepts: primary CNS lymphoma, glioblastoma\n2. The query explicitly asks for a comparison between two conditions.\n3. The most appropriate technical terms are \"CNS lymphoma\" and \"glioblastoma\".\n4. These search terms are justified because the query specifically asks for a comparison between these two conditions. As per the guidelines, we provide multiple search terms when the query explicitly requires a comparison.\n</analysis>\n<search_terms>CNS lymphoma\nglioblastoma</search_terms>\n</ideal_output>\n</example>\n<example>\n<query>\nHow does a TOF MRA work?\n</query>\n<ideal_output>\n<analysis>\n1. Main radiological concept: TOF MRA (Time-of-Flight Magnetic Resonance Angiography)\n2. The query does not require a comparison or multiple distinct concepts.\n3. The most appropriate technical term is \"TOF MRA\".\n4. This search term is justified because it directly addresses the main concept in the query. While the question asks about the working principle, searching for \"TOF MRA\" will likely provide comprehensive information about the technique, including how it works.\n</analysis>\n<search_terms>TOF MRA</search_terms>\n</ideal_output>\n</example>\n<example>\n<query>\nWhat's chemical shift imaging?\n</query>\n<ideal_output>\n<analysis>\n1. Main radiological concept: chemical shift imaging\n2. The query does not require a comparison or multiple distinct concepts.\n3. 
The most appropriate technical term is \"chemical shift imaging\".\n4. This search term is justified because it directly matches the concept asked about in the query. It is a concise noun phrase that captures the specific imaging technique in question.\n</analysis>\n<search_terms>chemical shift imaging</search_terms>\n</ideal_output>\n</example>\n</examples>\n\n"
                    },
                    # Second text block: the task instructions with the user's query embedded.
                    {
                        "type": "text",
                        "text": f"You are an experienced radiologist tasked with generating relevant search terms for Radiopaedia based on user queries about radiological topics. Your goal is to identify the core concept(s) or condition(s) mentioned in the query and provide concise, targeted search terms.\n\nHere is the user's query:\n\n<user_query>\n{query}\n</user_query>\n\nPlease analyze the query and determine the most appropriate search term(s). Follow these guidelines:\n\n1. Identify the main radiological concept, condition, or imaging technique mentioned in the query.\n2. If the query explicitly asks for a comparison between two conditions or concepts, provide search terms for both.\n3. In most cases, provide only one search term unless the query specifically demands multiple terms.\n4. Search terms should be concise noun phrases or technical terms, not full questions.\n5. Focus on the underlying medical concept, even if the query is about a specific aspect of that concept.\n\nBefore providing your final output, wrap your analysis in <analysis> tags. In this analysis:\n\n1. List the main radiological concepts or conditions mentioned in the query.\n2. Determine if the query requires a comparison or multiple distinct concepts.\n3. Consider the most appropriate technical term or concise noun phrase for each identified concept.\n4. Justify your choice of search term(s) based on the guidelines provided.\n\nAfter your analysis, provide the search term(s) in the following format:\n\n<search_terms>\n[One search term per line]\n</search_terms>\n\nRemember to only include multiple search terms if the query explicitly requires a comparison or multiple distinct concepts."
                    }
                ]
            },
            # Assistant turn holding only the opening <analysis> tag
            # (presumably so the reply continues with the analysis section; confirm against the API docs).
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "text": "<analysis>"
                    }
                ]
            }
        ]
    )

    # Only the <search_terms> section of the first content block is used.
    return extract_all_query_content(message.content[0].text)

In [16]:
get_search_terms("What is the difference between a type 1 and type 2 endoleak?")


['type 1 endoleak', 'type 2 endoleak']