# LLM split sentences

Check the project `2_llm_split_sent`, which uses an LLM to split report sentences that contain conjunctions into multiple sentences.

Empty reports (sections) are omitted.

Example output:

```
{"doc_key": "train#2#findings",
 "sent_idx": 0,
 "original_sent": "Normal cardiomedastinal silhouette without evidence of pulmonary infiltrates or occupation of the costophrenic sinuses.",
 "sent_splits": ["Normal cardiomedastinal silhouette.","No evidence of pulmonary infiltrates.","No occupation of the costophrenic sinuses."]}
```

# Interpret-cxr sentences

After getting the split sentences from `arrg_sentgen`, we need to process them for `bioportal`, `cxrgraph` and `radcoref`.

We take the LLM output as input.

Example output:

```
{"doc_key": "train#0#impression#0#0",
 "split_sent_text": "Decreased bibasilar parenchymal opacities are seen.",
 "split_sent_toks": [["Decreased", "bibasilar", "parenchymal", "opacities", "are", "seen", "."]], 
 "tok_char_indices": [[[0, 9], [10, 19], [20, 31], [32, 41], [42, 45], [46, 50], [50, 51]]]}
 ```

Every output item is non-empty.



In [10]:
import json
import os

import spacy
from tqdm import tqdm

In [2]:
llm_sents_dir = "/home/yuxiang/liao/workspace/arrg_sentgen/outputs/interpret_cxr"

# Gather the records of all three JSON-lines shards into a single list.
docs = []
for partition in (1, 2, 3):
    shard_path = os.path.join(llm_sents_dir, f"llm_sent_splits_{partition}_of_3.json")
    with open(shard_path) as f:
        # Each line of a shard holds one JSON object.
        for line in f:
            docs.append(json.loads(line.strip()))

In [11]:
# Pipeline components that are left out when loading the model;
# the remaining component names are printed for inspection.
excluded_components = ["tagger", "attribute_ruler", "lemmatizer", "ner"]
nlp = spacy.load("en_core_web_sm", exclude=excluded_components)
print(nlp.pipe_names)

['tok2vec', 'parser']


In [None]:
# Build (text, context) pairs, one per split sentence, in the shape expected by
# `nlp.pipe(..., as_tuples=True)`. The context is the source record plus a
# unique `data_id` of the form "<doc_key>#<sent_idx>#<split_sent_idx>".
text_tuples = []
for record in tqdm(docs):
    key_prefix = f'{record["doc_key"]}#{record["sent_idx"]}'
    for position, sentence in enumerate(record["sent_splits"]):
        context = {"data_id": f"{key_prefix}#{position}", **record}
        text_tuples.append((sentence, context))

In [None]:
output_file_dir = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_sents"
os.makedirs(output_file_dir, exist_ok=True)


def _non_empty_tokens(doc):
    """Return the stripped token texts of `doc` and their character spans.

    Tokens that are empty after stripping whitespace are skipped. Each span is
    a `(start, end)` pair computed from `tok.idx` plus the offset of the
    stripped text inside the raw token text.
    """
    tok_texts = []
    tok_spans = []
    for tok in doc:
        tok_text = tok.text.strip()
        if not tok_text:
            continue
        tok_start_char = tok.idx + tok.text.index(tok_text)
        tok_texts.append(tok_text)
        tok_spans.append((tok_start_char, tok_start_char + len(tok_text)))
    return tok_texts, tok_spans


def _warning_record(sents, info_dict):
    """Build a JSON-serialisable warning entry: token texts per sentence plus the metadata."""
    # Token objects cannot be passed to `json.dumps`, so store their text instead.
    return {"nlp": [[tok.text for tok in sent] for sent in sents], **info_dict}


# Context managers guarantee that all three output files are flushed and closed,
# even if processing stops halfway through.
with open(os.path.join(output_file_dir, "raw_sents.json"), "w", encoding="utf-8") as f, \
        open(os.path.join(output_file_dir, "warn_empty_sent.json"), "w", encoding="utf-8") as f_warn_empty, \
        open(os.path.join(output_file_dir, "warn_multi_sent.json"), "w", encoding="utf-8") as f_warn_multi:

    # Each returned item is one split sentence together with its metadata dict.
    parsed_docs = nlp.pipe(text_tuples, as_tuples=True, n_process=8)
    for doc, info_dict in tqdm(parsed_docs, total=len(text_tuples)):
        # `doc.sents` is a generator, so materialise it before counting or reusing it.
        sents = list(doc.sents)
        split_sent_toks, tok_char_indices = _non_empty_tokens(doc)

        # Sentences without any non-whitespace token are recorded as warnings
        # and kept out of the main output, so every written item is non-empty.
        if not sents or not split_sent_toks:
            f_warn_empty.write(json.dumps(_warning_record(sents, info_dict)))
            f_warn_empty.write("\n")
            continue

        # There should be only one sentence per input text.
        # If spaCy finds multiple sentences, they are still treated as a single one,
        # hence the single inner list in both token fields.
        output_dict = {
            "doc_key": info_dict["data_id"],
            "split_sent_text": doc.text,
            "split_sent_toks": [split_sent_toks],
            "tok_char_indices": [tok_char_indices],
        }
        f.write(json.dumps(output_dict))
        f.write("\n")

        # Extra warning recorded if there are multiple sentences.
        if len(sents) > 1:
            f_warn_multi.write(json.dumps(_warning_record(sents, info_dict)))
            f_warn_multi.write("\n")