In [13]:
from collections import Counter
from utils import load_jsonl, load_jsonl_iteratively

# Root directory of the eval-QA data (absolute, machine-specific path).
jstage_root = "/data/xzhao/experiments/roman-pretrain/datasets/kg-datasets/ja-0.5/eval_qa"
# JSONL file holding the document ids; they are loaded into a set for membership tests.
docid_path = f"{jstage_root}/docids.jsonl"
# NOTE(review): set() requires every record returned by load_jsonl to be hashable
# (presumably plain id strings) — confirm against the file.
eval_docids = set(load_jsonl(docid_path))

# NER output to analyse. Exactly one doc_path assignment is active; the commented-out
# lines point at alternative NER outputs in the same directory.
# doc_path = "/data/xzhao/experiments/roman-pretrain/datasets/kg-datasets/ja-0.5/eval_qa/01_fact_check/ja_ners.jsonl"
doc_path = "/data/xzhao/experiments/roman-pretrain/datasets/kg-datasets/ja-0.5/eval_qa/01_fact_check/bionlp_ners.jsonl"
# doc_path = "/data/xzhao/experiments/roman-pretrain/datasets/kg-datasets/ja-0.5/eval_qa/01_fact_check/scibert_ners.jsonl"
# Records of the selected file (presumably the whole file as a list — confirm in utils.load_jsonl).
docs = load_jsonl(doc_path)

In [7]:
from tqdm import tqdm

# Entities whose "type" starts with any of these prefixes are not counted.
ignored_type_prefixes = ("time", "cc_", "t-val_", "t-key_other", "f_")

match_sents = []
unmatch_sents = []
no_sentences = 0
for doc in tqdm(load_jsonl_iteratively(doc_path)):
    # Only documents that carry a docid belonging to the evaluation id set are considered.
    in_eval_set = "docid" in doc and doc["docid"] in eval_docids
    if not in_eval_set:
        continue

    # Documents without a "sentences" key are tallied and skipped.
    if "sentences" not in doc:
        no_sentences += 1
        continue

    for sentence in doc["sentences"]:
        # Names of the entities that survive the type-prefix filter.
        entities = [
            entity['name']
            for entity in sentence["entities"]
            if not entity["type"].startswith(ignored_type_prefixes)
        ]
        # A sentence "matches" when more than two entities remain.
        bucket = match_sents if len(entities) > 2 else unmatch_sents
        bucket.append(sentence)

print(f"Matched sentences: {len(match_sents)}")
print(f"Unmatched sentences: {len(unmatch_sents)}")
print(f"Number of documents without sentences: {no_sentences}")


10000it [00:06, 1436.97it/s]

Matched sentences: 34591
Unmatched sentences: 37069
Number of documents without sentences: 1





In [None]:
# TUIs (semantic-type ids) that do not count towards a sentence being "matched".
filter_tuis = ["T073", "T098", "T097"]
tui2ents = {}                           # tui -> set of raw entity texts seen with that tui
match_sents, unmatch_sents = [], []
cuis = Counter()                        # cui -> number of occurrences over all linked candidates
ents = set()                            # normalised (stripped, lower-cased) entity texts
for doc in docs:
    # Documents may lack a "sentences" key (an earlier cell counts such documents),
    # so treat a missing key as "no sentences" instead of raising KeyError.
    for sentence in doc.get("sentences", []):
        entities = []
        for entity in sentence["entities"]:
            ents.add(entity['text'].strip().lower())
            for ent in entity["ents"]:
                cuis[ent["cui"]] += 1
                # TUIs of this linked candidate that are not filtered out.
                kept_tuis = [tui for tui in ent["tui"] if tui not in filter_tuis]
                for tui in kept_tuis:
                    tui2ents.setdefault(tui, set()).add(entity['text'])
                # A candidate counts when at least one of its TUIs survives the filter.
                if kept_tuis:
                    entities.append(ent)
        # Note: the threshold is applied to linked candidates, not to entity mentions.
        if len(entities) > 2:
            match_sents.append(sentence)
        else:
            unmatch_sents.append(sentence)

# Guard the average against an empty counter (no documents / no linked candidates).
avg_cui_occurrence = sum(cuis.values()) / len(cuis) if cuis else 0.0

print(f"Matched sentences: {len(match_sents)}")
print(f"Unmatched sentences: {len(unmatch_sents)}")
print(f"Unique CUIs: {len(cuis)}")
print(f"Averge occurrence of CUIs: {avg_cui_occurrence}")
print(f"Unique entities: {len(ents)}")


Matched sentences: 40762
Unmatched sentences: 41008
Unique CUIs: 31135
Averge occurrence of CUIs: 11.928344307049944
Unique entities: 27856


In [14]:
# Normalised (stripped, lower-cased) entity texts; filled as a side effect of get_sentences().
ents = set()
# TUIs that do not count towards the entity threshold.
filter_tuis = ["T073", "T098", "T097"]
def get_sentences():
    """Yield ``(request_id, sentence_text)`` for sentences with enough linked entities.

    Iterates over the module-level ``docs``. For every sentence, each linked
    candidate in ``entity["ents"]`` is counted when at least one of its TUIs is
    not listed in ``filter_tuis``; the sentence is yielded when more than two
    candidates are counted. Note that the threshold applies to linked
    candidates, so one mention with several candidates can satisfy it alone.

    Documents without a ``"sentences"`` key are skipped instead of raising
    ``KeyError``.

    Side effect: every entity text (stripped, lower-cased) is added to the
    module-level ``ents`` set, including entities of sentences not yielded.

    Yields:
        tuple[str, str]: ``"<docid>_sentid:<index>"`` and the sentence text.
    """
    for doc in docs:
        for i, sentence in enumerate(doc.get("sentences", [])):
            entities = []
            for entity in sentence["entities"]:
                ents.add(entity['text'].strip().lower())
                # Keep candidates that have at least one TUI outside the filter list.
                entities.extend(
                    ent
                    for ent in entity["ents"]
                    if any(tui not in filter_tuis for tui in ent["tui"])
                )
            if len(entities) <= 2:
                continue
            request_id = f"{doc['docid']}_sentid:{i}"
            yield request_id, sentence['text']

In [15]:
from langchain_openai import ChatOpenAI
from langchain_core.rate_limiters import InMemoryRateLimiter

# In-memory rate limiter configured for 10 requests per second.
# NOTE(review): it is never attached to the client — the `rate_limiter=` argument
# in the ChatOpenAI call below is commented out, so this object is currently unused.
rate_limiter = InMemoryRateLimiter(requests_per_second=10, check_every_n_seconds=1, max_bucket_size=10)

# Chat client pointed at an OpenAI-compatible HTTP endpoint on gpu-node13:8080
# (presumably a locally served model — confirm).
llm = ChatOpenAI(
    model="0068_QA-Gen",
    openai_api_key="",  # empty key; NOTE(review): presumably the endpoint does not check it — confirm
    openai_api_base="http://gpu-node13:8080/v1",
    temperature=0.5,
    top_p=0.9,
    # rate_limiter=rate_limiter,
    max_retries=1,
    timeout=60,
    logprobs=True,  # later cells read per-token logprobs from response_metadata['logprobs']
)

In [16]:
from typing import Optional

from pydantic import BaseModel, Field

# One extracted fact expressed as a (subject, relation, object) triple of strings.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic may copy a docstring into the generated JSON schema, which would change
# the format instructions sent to the model — confirm before converting.
class Triple(BaseModel):
    subject: str = Field(description="The subject of the extracted fact.")
    relation: str = Field(description="The relation in the extracted fact.")
    object: str = Field(description="The object of the extracted fact.")

# Structured answer expected from the model for one sentence: a factuality flag,
# an optional extracted Triple, and a free-text reason.
# NOTE(review): kept as comments rather than a class docstring because pydantic may
# copy a docstring into the generated JSON schema used for the prompt — confirm.
class TripleExtract(BaseModel):
    factuality: bool = Field(
        description="A Boolean value indicating whether the sentence contains generalizable triple-like factual knowledge."
    )
    # NOTE(review): no default is given, so presumably the "triple" key must be
    # present in the model output even when its value is null — confirm.
    triple: Optional[Triple] = Field(
        description="A nested JSON object with three fields: subject, relation, and object representing the extracted fact. Please note that if factuality is false, make sure this field is null."
    )
    reason: str = Field(
        description="A brief explanation for why the sentence was or wasn't considered factual, referring to the criteria provided."
    )

import re
import json
from pydantic import ValidationError


def extract_first_json_block(text: str) -> str:
    """
    Extract the first balanced ``{...}`` block from a string.

    Scanning starts at the first ``{`` and tracks brace depth. Braces that occur
    inside double-quoted string literals (including after backslash escapes such
    as ``\\"``) are ignored, so a value like ``"uses } here"`` does not end the
    block prematurely.

    Args:
        text: Arbitrary text that is expected to contain a JSON object.

    Returns:
        The substring from the first ``{`` to its matching ``}`` (inclusive).

    Raises:
        ValueError: If there is no ``{`` in ``text`` or the block never closes.
    """
    start = text.find("{")
    if start == -1:
        raise ValueError("No opening brace found in text")

    depth = 0
    in_string = False   # currently inside a double-quoted string literal
    escaped = False     # previous character inside the string was a backslash
    for i in range(start, len(text)):
        ch = text[i]
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:i + 1]
    raise ValueError("Braces do not match, incomplete JSON block")

def robust_parse(output_str, parser):
    """
    Preprocess LLM output string and try to parse it robustly using a LangChain parser.

    Steps:
      1. Strip Markdown code fences (```` ``` ```` / ```` ```json ````).
      2. Extract the first balanced ``{...}`` block.
      3. Lower-case bare ``null`` / ``true`` / ``false`` literals written in any
         casing (e.g. Python-style ``False``). Quoted strings are left untouched,
         so words such as "True" inside the ``reason`` text are not rewritten.
      4. Hand the block to ``parser.parse`` and return ``.model_dump()``.

    Args:
        output_str: Raw text returned by the model.
        parser: Object exposing ``parse(str)`` whose result has ``model_dump()``.

    Returns:
        The parsed object as a plain dict.

    Raises:
        RuntimeError: If no JSON block can be extracted or the parser rejects it.
            The original exception is attached as ``__cause__``.
    """
    cleaned = re.sub(r"^```(?:json)?|```$", "", output_str.strip(), flags=re.MULTILINE).strip()

    try:
        json_block = extract_first_json_block(cleaned)
    except ValueError as e:
        print("⚠️ Failed to extract valid JSON block")
        raise RuntimeError(str(e)) from e

    def _lower_bare_literal(match):
        # Quoted strings are matched first and returned unchanged; only bare
        # literals outside of strings are normalised.
        token = match.group(0)
        return token if token.startswith('"') else token.lower()

    json_block = re.sub(
        r'"(?:\\.|[^"\\])*"|\b(?:null|true|false)\b',
        _lower_bare_literal,
        json_block,
        flags=re.IGNORECASE,
    )

    try:
        return parser.parse(json_block).model_dump()
    except ValueError as e:
        # ValueError is the common base of json.JSONDecodeError, pydantic's
        # ValidationError and LangChain's OutputParserException (the exception
        # the parser actually raises in this notebook's recorded outputs).
        print("⚠️  Parser failed after cleanup.")
        raise RuntimeError(f"Robust parsing failed: {e}") from e
# structured_llm = llm.with_structured_output(method="json_mode")
# structured_llm = llm.with_structured_output(schema=TripleExtract, method="json_schema", include_raw=True, strict=False)

In [17]:
from langchain.prompts import PromptTemplate


from langchain.output_parsers import PydanticOutputParser

from utils import load_json

# Prompt texts for the factuality check are read from a local JSON file.
instructions = load_json("./instructions.json")
english_prompts = instructions["factuality-check"]["en"]
system_prompt = english_prompts['system']
user_prompt = english_prompts['user']

# The parser supplies the format instructions that are injected into the prompt.
parser = PydanticOutputParser(pydantic_object=TripleExtract, include_raw=True)
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    template=f'{system_prompt}\n{user_prompt}',
    input_variables=["sentence"],
    partial_variables={"schema_instruction": format_instructions},
)

# Format at most 100 prompts. zip() consults range() first, so the generator is
# not advanced beyond the 100th sentence.
messages = []
for _, (request_id, sentence) in zip(range(100), get_sentences()):
    messages.append(prompt.format(sentence=sentence))

In [31]:
# Two single-sentence probes: a clearly factual statement and a trivial non-fact.
response1 = llm.invoke(prompt.format(sentence="Vitamin D supplementation reduces the risk of osteoporosis in elderly patients."))
# NOTE(review): messages[0] is already a fully formatted prompt, so the disabled
# line below would nest one prompt inside another.
# response2 = llm.invoke(prompt.format(sentence=messages[0]))
response2 = llm.invoke(prompt.format(sentence="a test"))

In [39]:
# Token strings of response2, taken from the per-token logprob entries.
tokens = [token["token"] for token in response2.response_metadata['logprobs']['content']]
# Bare expression: displayed as the cell output.
tokens

['`',
 '``',
 'json',
 '<0x0A>',
 '{',
 '<0x0A>',
 '▁▁▁',
 '▁"',
 'f',
 'actual',
 'ity',
 '":',
 '▁false',
 ',',
 '<0x0A>',
 '▁▁▁',
 '▁"',
 'tri',
 'ple',
 '":',
 '▁null',
 ',',
 '<0x0A>',
 '▁▁▁',
 '▁"',
 'reason',
 '":',
 '▁"',
 'The',
 '▁input',
 '▁sentence',
 '▁is',
 '▁too',
 '▁vague',
 '▁to',
 '▁determine',
 '▁any',
 '▁general',
 'izable',
 '▁fact',
 'ual',
 '▁knowledge',
 '.',
 '▁It',
 '▁does',
 '▁not',
 '▁provide',
 '▁enough',
 '▁context',
 '▁to',
 '▁identify',
 '▁a',
 '▁subject',
 ',',
 '▁relation',
 ',',
 '▁or',
 '▁object',
 ',',
 '▁nor',
 '▁does',
 '▁it',
 '▁assert',
 '▁a',
 '▁fact',
 'ual',
 '▁relationship',
 '▁or',
 '▁state',
 '▁a',
 '▁fact',
 '."',
 '<0x0A>',
 '}',
 '<0x0A>',
 '`',
 '``',
 '</s>']

In [None]:
# Token sequences that spell `"factuality": false` / `"factuality": true` in the
# model output; measure_prob searches for them in the logprob token list.
# The commented-out pair is an alternative spelling with 'Ġ'-prefixed tokens
# (presumably for a different tokenizer — confirm).
# false_token_array = ['fact', 'uality', '":', 'Ġfalse']
# true_token_array = ['fact', 'uality', '":', 'Ġtrue']

# Active pair; it matches the '▁'-prefixed tokens shown in the recorded `tokens` output.
false_token_array =  ['▁"', 'f', 'actual', 'ity', '":', '▁false']
true_token_array = ['▁"', 'f', 'actual', 'ity', '":', '▁true']

def tokens_match(tokens, target_array):
    """Locate the first occurrence of ``target_array`` as a contiguous run in ``tokens``.

    Returns:
        The index in ``tokens`` of the LAST element of the first matching run,
        or ``-1`` when there is no match (an empty ``target_array`` also gives ``-1``).
    """
    width = len(target_array)
    last_start = len(tokens) - width
    start = 0
    while start <= last_start:
        window = tokens[start:start + width]
        if window == target_array:
            return start + width - 1
        start += 1
    return -1


In [51]:
def measure_prob(response):
    """Read the model's factuality decision and the logprob of the deciding token.

    The token list in ``response.response_metadata['logprobs']['content']`` is
    searched for ``true_token_array`` and ``false_token_array`` via ``tokens_match``.

    Returns:
        ``(True, logprob)`` or ``(False, logprob)``, where ``logprob`` is the
        ``'logprob'`` entry of the final token of the matched sequence. Note that
        this is a log-probability as reported by the API, not a probability.

    Raises:
        ValueError: If neither sequence, or both sequences, are found.
    """
    entries = response.response_metadata['logprobs']['content']
    tokens = [entry['token'] for entry in entries]
    token_logprobs = [entry['logprob'] for entry in entries]

    true_pos = tokens_match(tokens, true_token_array)
    false_pos = tokens_match(tokens, false_token_array)
    found_true = true_pos != -1
    found_false = false_pos != -1

    if not (found_true or found_false):
        raise ValueError("Neither true nor false token arrays matched, which is unexpected.")
    if found_true and found_false:
        raise ValueError("Both true and false token arrays matched, which is unexpected.")
    if found_true:
        return True, token_logprobs[true_pos]
    return False, token_logprobs[false_pos]

In [52]:
# Smoke-run measure_prob on the probe response; the return value is discarded.
# NOTE(review): the recorded output below ("Target length: 6" and token windows)
# is not printed by the tokens_match/measure_prob code visible in this notebook,
# so it presumably comes from an earlier debugging version — re-run to refresh.
for response in [response2]:
    measure_prob(response)

Target length: 6
['▁"', 'f', 'actual', 'ity', '":', '▁false']
['▁"', 'tri', 'ple', '":', '▁null', ',']
['▁"', 'reason', '":', '▁"', 'The', '▁input']
['▁"', 'The', '▁input', '▁sentence', '▁is', '▁too']
Target length: 6
['▁"', 'f', 'actual', 'ity', '":', '▁false']


In [183]:
# Display the raw logprob payload of response2.
# NOTE(review): the recorded output uses 'Ġ'-prefixed tokens, unlike the
# '▁'-prefixed tokens shown earlier, so response2 was presumably regenerated
# with a different model before this cell ran — confirm.
response2.response_metadata['logprobs']

{'content': [{'token': 'Okay',
   'bytes': [79, 107, 97, 121],
   'logprob': -0.57659912109375,
   'top_logprobs': []},
  {'token': ',',
   'bytes': [44],
   'logprob': -7.629365427419543e-06,
   'top_logprobs': []},
  {'token': 'Ġso',
   'bytes': [196, 160, 115, 111],
   'logprob': -0.6446996331214905,
   'top_logprobs': []},
  {'token': 'ĠI',
   'bytes': [196, 160, 73],
   'logprob': -0.0012866322649642825,
   'top_logprobs': []},
  {'token': "'m",
   'bytes': [39, 109],
   'logprob': -1.3614983558654785,
   'top_logprobs': []},
  {'token': 'Ġtrying',
   'bytes': [196, 160, 116, 114, 121, 105, 110, 103],
   'logprob': -0.5268017649650574,
   'top_logprobs': []},
  {'token': 'Ġto',
   'bytes': [196, 160, 116, 111],
   'logprob': -4.768370445162873e-07,
   'top_logprobs': []},
  {'token': 'Ġfigure',
   'bytes': [196, 160, 102, 105, 103, 117, 114, 101],
   'logprob': -0.24157193303108215,
   'top_logprobs': []},
  {'token': 'Ġout',
   'bytes': [196, 160, 111, 117, 116],
   'logprob': -0

In [123]:
# Parse the first collected response into a plain dict and display it.
# NOTE(review): `responses` is assigned in a later cell of this notebook, so this
# cell depends on that cell having been executed first.
robust_parse(responses[0].content, parser)

{'factuality': False,
 'triple': None,
 'reason': 'This sentence describes an experimental setup (measuring cortisol and SOD activity in saliva under mental stress) but does not report any results or assert a generalizable biomedical fact. It lacks a subject-relation-object structure that could represent a causal or mechanistic relationship, clinical technique, or universally applicable knowledge.'}

In [115]:
from langchain_core.runnables import RunnableConfig

# Send the first two prompts with at most two concurrent requests.
config = RunnableConfig(max_concurrency=2)

# Results are appended in the order they are yielded, not by input index; the
# recorded output shows index 1 arriving first, so responses[k] does not
# necessarily belong to messages[k].
responses = []
async for idx, result in llm.abatch_as_completed(messages[:2], config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

Got result 1: content='<think>\nOkay, let\'s tackle this input. The sentence is: "In experiment 1, we measured the concentration of cortisol and activity of SOD in saliva after the addition of mental stress."\n\nFirst, I need to determine if this expresses a generalizable factual knowledge. The user mentioned that if it\'s a specific case observation or mentions an experimental setup without results, it shouldn\'t be marked as factual.\n\nThe sentence talks about an experiment (experiment 1) where they measured cortisol and SOD activity in saliva after adding mental stress. The key here is that they\'re describing what they did in an experiment but not stating the results. The sentence is about the setup or the process of the study, not the findings. \n\nLooking at the criteria, if the sentence only mentions experimental setups without reporting results, it\'s not factual. Since the input doesn\'t mention any outcomes or conclusions, just the method, it falls under that category. \n\nS

In [116]:
# The direct parser call is disabled; the recorded outputs further down show
# parser.parse rejecting fenced model output with OutputParserException.
# result = parser.parse(responses[0].content).model_dump()

# Parse via the cleanup helper instead and display the resulting dict.
robust_parse(responses[0].content, parser)

{'factuality': False,
 'triple': None,
 'reason': 'This sentence describes an experimental setup (measuring cortisol and SOD activity in saliva under mental stress) but does not report any results or assert a generalizable biomedical fact. It lacks a subject-relation-object structure that could represent a causal or mechanistic relationship, clinical technique, or universally applicable knowledge.'}

In [100]:
# Display `result`.
# NOTE(review): the dict shown in the recorded output was presumably produced by an
# earlier run of a now commented-out `result = parser.parse(...).model_dump()` line;
# on a fresh run `result` would be whatever the last `async for` loop left behind — confirm.
result

{'factuality': False,
 'triple': None,
 'reason': "This sentence describes the study's objectives and methodology but does not report any results or assert generalizable factual knowledge. It mentions the intention to examine relationships and effects without stating the findings, thus it does not express a subject-relation-object fact."}

In [90]:
# Scratch cell: manual version of the cleanup steps, used to inspect one raw response.
# robust_parse(responses[0].content.strip(), parser)
output_str = responses[0].content

# Remove a leading ``` / ```json fence and a trailing ``` fence (per line, MULTILINE).
cleaned = re.sub(r"^```(json)?|```$", "", output_str.strip(), flags=re.MULTILINE).strip()
# Greedy match from the first "{" to the last "}" across newlines (DOTALL).
# NOTE(review): re.search returns None when no braces are present, in which case
# .group() below raises AttributeError.
json_like_match = re.search(r"\{.*\}", cleaned, re.DOTALL)
print(json_like_match.group())

# Raw model output for comparison with the extracted block above.
print(output_str)

{
    "factuality": False,
    "triple": null,
    "reason": "This sentence mentions an experimental setup and the measurements taken but does not report any results or assert a generalizable fact. It describes a specific experimental procedure without stating a subject-relation-object relationship that conveys factual knowledge."
}
```json
{
    "factuality": False,
    "triple": null,
    "reason": "This sentence mentions an experimental setup and the measurements taken but does not report any results or assert a generalizable fact. It describes a specific experimental procedure without stating a subject-relation-object relationship that conveys factual knowledge."
}
```


In [91]:
# Scratch cell: feed the raw (still fenced) output straight to the parser.
# parser.parse(json_like_match.group())
# The recorded output shows this call raising OutputParserException, so the
# print on the next line was not reached in that run.
parser.parse(output_str)
print(json_like_match.group())

OutputParserException: Invalid json output: ```json
{
    "factuality": False,
    "triple": null,
    "reason": "This sentence mentions an experimental setup and the measurements taken but does not report any results or assert a generalizable fact. It describes a specific experimental procedure without stating a subject-relation-object relationship that conveys factual knowledge."
}
```
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 

In [70]:
# Scratch cell: confirm the content is a str, then try the parser on the stripped text.
print(type(responses[1].content))
# The recorded output shows OutputParserException ("Invalid json output") for this input.
parser.parse(responses[1].content.strip())

<class 'str'>


OutputParserException: Invalid json output: ```
{
    "factuality": False,
    "triple": null,
    "reason": "This sentence describes the experimental setup and the intention to study the relationship between mental stress, reactive oxygen scavenging enzymes, and the effect of mental stress reduction by makeup. However, it does not report any results or assert a generalizable fact. It only mentions what was examined, without providing a conclusion or a factual finding that can be structured as a subject-relation-object triple."
}
```
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 

/tmp/ipykernel_989689/3026971379.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  result.dict()


{'factuality': False,
 'triple': None,
 'reason': 'This sentence describes an experimental setup (measuring cortisol and SOD activity in saliva under mental stress) without reporting results or asserting a generalizable biomedical fact. It outlines a methodological approach rather than stating a subject-relation-object relationship that applies broadly.'}

In [None]:
import asyncio
import time
from langchain_core.runnables import RunnableConfig

async def adaptive_batch_process(llm, messages, start_concurrency=1, max_concurrency=20, step=1):
    """Send ``messages`` to ``llm`` in consecutive batches with an adaptive batch size.

    Each batch holds ``concurrency`` messages and is run through
    ``llm.abatch_as_completed`` with ``max_concurrency=concurrency``. After a
    batch, ``concurrency`` grows by ``step`` (capped at ``max_concurrency``) when
    the measured throughput exceeded 1 request/second, otherwise it shrinks by
    ``step`` (never below 1).

    Args:
        llm: Object exposing ``abatch_as_completed(inputs, config=...)`` that
            asynchronously yields ``(input_index, result)`` pairs.
        messages: Sliceable sequence of inputs.
        start_concurrency: Batch size / concurrency of the first batch.
        max_concurrency: Upper bound for the batch size / concurrency.
        step: Amount by which the concurrency is raised or lowered per batch.

    Returns:
        list: One result per message, in the same order as ``messages``.
    """
    concurrency = start_concurrency
    index = 0
    responses = []

    while index < len(messages):
        current_batch = messages[index:index + concurrency]
        config = RunnableConfig(max_concurrency=concurrency)

        # perf_counter is monotonic, so the duration cannot go negative on clock changes.
        start_time = time.perf_counter()

        # Results are yielded in completion order; store each one at its input
        # position so the returned list stays aligned with `messages`.
        batch_results = [None] * len(current_batch)
        async for idx, result in llm.abatch_as_completed(current_batch, config=config):
            print(f"[{concurrency=} | {index + idx=}] Got result: {result}")
            batch_results[idx] = result

        duration = time.perf_counter() - start_time
        # Guard against a zero duration reported by a coarse timer.
        throughput = len(batch_results) / duration if duration > 0 else float("inf")

        print(f"Processed {len(batch_results)} items in {duration:.2f}s (Throughput: {throughput:.2f} req/s)")

        responses.extend(batch_results)
        index += len(current_batch)

        # Raise concurrency while throughput stays above 1 req/s, otherwise back off.
        if throughput > 1:  # Tune this threshold as needed
            concurrency = min(concurrency + step, max_concurrency)
        else:
            concurrency = max(1, concurrency - step)

        await asyncio.sleep(0.1)  # Optional pause to avoid rate limits

    return responses

In [None]:
from langchain_core.runnables import RunnableConfig

# Send prompts 20-29 with at most five concurrent requests.
config = RunnableConfig(max_concurrency=5)

# Results are appended in the order they are yielded (the loop also receives the
# input index `idx`), so list positions need not match the slice positions.
responses = []
async for idx, result in llm.abatch_as_completed(messages[20:30], config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

Got result 6: content='```json\n{\n    "request_id": "nisshoshi1964@@88/9/88_9_2119_sentid:0",\n    "sentence": "We performed percutaneous transhepatic gallbladder drainage (PTGBD) in 71 of 129 patients with acute cholecystitis.",\n    "factuality": false,\n    "triple": null,\n    "reason": "This sentence describes a specific case observation, detailing the number of patients who underwent a particular procedure. It does not express generalizable factual knowledge that can be structured as a subject-relation-object triple. Instead, it reports on the application of a procedure to a specific group of patients, which is more about a case study or observational data rather than asserting a widely applicable fact or relationship."\n}\n```' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 166, 'prompt_tokens': 640, 'total_tokens': 806, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': '0068_QA-Gen', 'system_fingerprin

In [None]:
from langchain_core.runnables import RunnableConfig

# Send every prepared prompt with at most ten concurrent requests.
config = RunnableConfig(max_concurrency=10)

# Results are appended in the order they are yielded, not by input index.
# The recorded run ended with CancelledError, so `responses` was presumably
# left incomplete in that session.
responses = []
async for idx, result in llm.abatch_as_completed(messages, config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

Got result 24: content='```json\n{\n    "request_id": "jjabcs1992@@6/3/6_3_283_sentid:6",\n    "sentence": "It was possible to calculate the average glandular dose with exposure in air.",\n    "factuality": False,\n    "triple": null,\n    "reason": "This sentence describes a possibility or capability related to calculating a specific dose but does not assert a generalizable factual relationship between entities. It lacks a clear subject-relation-object structure that represents widely applicable knowledge."\n}\n```' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 112, 'prompt_tokens': 622, 'total_tokens': 734, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': '0068_QA-Gen', 'system_fingerprint': None, 'id': 'chatcmpl-793834cb27c44f1c8dd5e3002e276b57', 'service_tier': None, 'finish_reason': 'stop', 'logprobs': None} id='run--8a20231b-0277-4ab7-89ae-d521b249c652-0' usage_metadata={'input_tokens': 622, 'output_tok

CancelledError: 

In [None]:
from langchain_core.runnables import RunnableConfig

# Send the first 50 prompts one at a time through the structured-output client.
config = RunnableConfig(max_concurrency=1)

# NOTE(review): `structured_llm` is only assigned in commented-out lines in the
# visible part of this notebook, so this cell raises NameError on a fresh kernel
# unless one of those lines is re-enabled.
# Results are appended in the order they are yielded, not by input index.
responses = []
async for idx, result in structured_llm.abatch_as_completed(messages[:50], config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

Got result 4: request_id='jji1950@@25/2/25_2_101_sentid:0' sentence='Population structure and gonad histology were investigated in six species of the anemonefish genus Amphiprion occurring in Japanese waters.' factuality=False triple=None reason='This sentence describes a specific investigation or study setup without reporting the results or asserting a general, factual relationship between entities. It mentions the subjects of the investigation (population structure, gonad histology, and species of anemonefish) but does not provide a conclusion or fact that can be structured as a subject-relation-object triple.'
Got result 2: request_id='sccj1979@@42/2/42_2_121_sentid:3' sentence='In this result, the concentration of cortisol increased and the activity of SOD decreased significantly.' factuality=False triple=None reason='This sentence describes specific case observations, reporting changes in cortisol concentration and SOD activity, but does not explicitly express generalizable factua

In [None]:
from langchain_core.runnables import RunnableConfig

# Send prompts 50-59 with at most two concurrent requests through the structured-output client.
config = RunnableConfig(max_concurrency=2)

# NOTE(review): `structured_llm` is only assigned in commented-out lines in the
# visible part of this notebook, so this cell raises NameError on a fresh kernel
# unless one of those lines is re-enabled.
# Results are appended in the order they are yielded, not by input index.
responses = []
async for idx, result in structured_llm.abatch_as_completed(messages[50:60], config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

Got result 7: request_id='example-1' sentence='In order to investigate the effects of a new drug on blood pressure in hypertensive rats, the researchers aimed to evaluate its potential benefits and risks.' factuality=False triple=None reason='This sentence only states the intention to study something and does not assert any general fact. It does not describe a subject-relation-object fact but rather the experimental purpose.'
Got result 4: request_id='request_id": "4"' sentence='Single administration of a large dose of Dex tended to reduce EAA and significantly reduced the EA-induced increase in plasma Cort and ACTH.' factuality=True triple={'subject': 'Single administration of a large dose of Dex', 'relation': 'tended to reduce', 'object': 'EAA and EA-induced increase in plasma Cort and ACTH'} reason='This sentence expresses generalizable factual knowledge about the effect of a large dose of Dex on EAA and plasma Cort and ACTH levels, which can be structured as a triple. It describes 

In [15]:
from langchain_core.runnables import RunnableConfig

# Send prompts 60-69 with at most five concurrent requests through the structured-output client.
config = RunnableConfig(max_concurrency=5)

# NOTE(review): `structured_llm` is only assigned in commented-out lines in the
# visible part of this notebook, so this cell raises NameError on a fresh kernel
# unless one of those lines is re-enabled.
# Results are appended in the order they are yielded, not by input index; the
# recorded run ended with CancelledError, so the list was presumably left incomplete.
responses = []
async for idx, result in structured_llm.abatch_as_completed(messages[60:70], config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

Got result 4: request_id='jjabcs1992@@6/3/6_3_283_sentid:6' sentence='It was possible to calculate the average glandular dose with exposure in air.' factuality=False triple=None reason='This sentence describes a specific capability or feasibility without asserting a general fact or a causal relationship. It mentions the possibility of calculating a dose under certain conditions but does not provide a generalizable factual statement about biomedical knowledge, mechanisms, or relationships.'
Got result 9: request_id='nisshoshi1964@@88/9/88_9_2119_sentid:7' sentence='In 24 of 71 patients, Percutaneous Transhepatic Gallbladder Scope (PTGBS) were attempted to retrieve stones, and it was completely successful in 16 patients.' factuality=False triple=None reason='This sentence describes specific case observations, reporting the outcomes of a procedure (PTGBS) in a particular group of patients.'
Got result 8: request_id='nisshoshi1964@@88/9/88_9_2119_sentid:4' sentence='Based on these data, we

CancelledError: 

In [None]:
from langchain_core.runnables import RunnableConfig

# Send prompts 70-79 with at most ten concurrent requests through the structured-output client.
config = RunnableConfig(max_concurrency=10)

# NOTE(review): `structured_llm` is only assigned in commented-out lines in the
# visible part of this notebook, so this cell raises NameError on a fresh kernel
# unless one of those lines is re-enabled.
# Results are appended in the order they are yielded, not by input index.
responses = []
async for idx, result in structured_llm.abatch_as_completed(messages[70:80], config=config):
    print(f"Got result {idx}: {result}")
    responses.append(result)

In [9]:
# import asyncio

# print("Before invoking the LLM...")
# tasks = [structured_llm.ainvoke(prompt) for prompt in messages]
# print("Invoking the LLM...")
# results = await asyncio.gather(*tasks, return_exceptions=True)
# print("LLM invocation completed.")

In [10]:
# for i, result in enumerate(results):
#     print(f"[{i}] Response: {result.dict()}")
#     # if isinstance(result, Exception):
#     #     print(f"[{i}] Error: {result}")
#     # else:
#     #     print(f"[{i}] Response: {result.content}")