# Interpret-cxr reports

Takes the interpret-cxr dataset (Hugging Face `datasets` format) as input.

The output is used as the input for arrg_sentgen.

Example output:

```
{"doc_key": "train#0#impression", 
 "sent_toks": [["1.DECREASED", "BIBASILAR", "PARENCHYMAL", "OPACITIES", ",", "NOW", "MINIMAL", "."], ["STABLE", "SMALL", "LEFT", "PLEURAL", "EFFUSION", "."], ["2", ".", "FEEDING", "TUBE", "AND", "STERNAL", "PLATES", "AGAIN", "SEEN", "."]], 
 "tok_char_indices": [[[0, 11], [12, 21], [22, 33], [34, 43], [43, 44], [45, 48], [49, 56], [56, 57]], [[58, 64], [65, 70], [71, 75], [76, 83], [84, 92], [92, 93]], [[94, 95], [95, 96], [97, 104], [105, 109], [110, 113], [114, 121], [122, 128], [129, 134], [135, 139], [139, 140]]], 
 "sents": ["1.DECREASED BIBASILAR PARENCHYMAL OPACITIES, NOW MINIMAL.", "STABLE SMALL LEFT PLEURAL EFFUSION.", "2. FEEDING TUBE AND STERNAL PLATES AGAIN SEEN."], 
 "sent_char_indices": [[0, 57], [58, 93], [94, 140]]}
 ```

## Load datasets

In [1]:
import datasets
from datasets import load_dataset, Sequence, Image, DatasetDict, concatenate_datasets
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Both constants point at the local mimic-cxr-jpg/2.1.0 directory: `data_dir`
# locates the split JSON files, `PATH_PREFIX` is prepended to image paths.
PATH_PREFIX = "/home/yuxiang/liao/mimic/mimic-cxr-jpg/2.1.0"
data_dir = "/home/yuxiang/liao/mimic/mimic-cxr-jpg/2.1.0"

# Interpret-CXR splits previously stored on local disk.
dataset_interpret = datasets.load_from_disk(
    "/home/yuxiang/liao/resources/datasets/interpret-cxr"
)
dataset_interpret_test_public = datasets.load_from_disk(
    "/home/yuxiang/liao/resources/datasets/interpret-cxr-test-public"
)

# MIMIC train/validation records are read from JSON files under `data_dir`.
mimic_json_files = {
    "train": os.path.join(data_dir, "train_mimic.json"),
    "validation": os.path.join(data_dir, "val_mimic.json"),
}
dataset_mimic = load_dataset("json", data_files=mimic_json_files)


def add_prefix(example):
    """Join ``PATH_PREFIX`` onto every entry of ``example["images"]``.

    The example is modified in place and also returned, which is the shape
    ``Dataset.map`` expects from its mapping function.
    """
    prefixed_paths = []
    for relative_path in example["images"]:
        prefixed_paths.append(os.path.join(PATH_PREFIX, relative_path))
    example["images"] = prefixed_paths
    return example


# Prefix the MIMIC image paths, then mark the column as a sequence of images.
dataset_mimic = dataset_mimic.map(add_prefix, num_proc=8)
dataset_mimic = dataset_mimic.cast_column("images", Sequence(Image()))

# Train/validation are Interpret-CXR followed by MIMIC; the test split comes
# from the public Interpret-CXR test set only.
merged_splits = {
    split_name: concatenate_datasets([dataset_interpret[split_name], dataset_mimic[split_name]])
    for split_name in ("train", "validation")
}
merged_splits["test"] = dataset_interpret_test_public["test"]
dataset_final = DatasetDict(merged_splits)

In [3]:
# The image column is not needed at the moment, and keeping it slows down
# iteration over the dataset.
unused_columns = ["images"]
dataset_final = dataset_final.remove_columns(unused_columns)

In [4]:
# Display the splits with their remaining columns and row counts.
dataset_final

DatasetDict({
    train: Dataset({
        features: ['source', 'images_path', 'impression', 'findings'],
        num_rows: 550395
    })
    validation: Dataset({
        features: ['source', 'images_path', 'impression', 'findings'],
        num_rows: 14111
    })
    test: Dataset({
        features: ['findings', 'impression'],
        num_rows: 3677
    })
})

## Generate raw_reports.json file

In [5]:
import spacy
from tqdm import tqdm
import json

In [6]:
# The export below only reads sentence spans and token offsets, so the
# components listed here are excluded when loading the pipeline.
excluded_components = ["tagger", "attribute_ruler", "lemmatizer", "ner"]
nlp = spacy.load("en_core_web_sm", exclude=excluded_components)
print(nlp.pipe_names)

['tok2vec', 'parser']


In [7]:
# Collect (text, context) pairs: one per report section of every row. The
# context dict carries the keys that identify where the text came from.
text_tuples = []
for split in ("train", "validation", "test"):
    for idx, data in enumerate(tqdm(dataset_final[split])):
        doc_key_prefix = f"{split}#{idx}"
        # The test split has no "source"/"images_path" columns, so its
        # valid_key is left empty.
        valid_key = "" if split == "test" else f'{data["source"]}#{data["images_path"][0]}'

        for section in ("findings", "impression"):
            context = {"doc_key": f"{doc_key_prefix}#{section}", "valid_key": valid_key}
            text_tuples.append((data[section], context))

100%|██████████| 550395/550395 [00:13<00:00, 39426.53it/s]
100%|██████████| 14111/14111 [00:00<00:00, 41530.18it/s]
100%|██████████| 3677/3677 [00:00<00:00, 71681.01it/s]


In [8]:
# Write one JSON object per line (findings and impression of every report),
# holding the stripped sentences, their tokens, and the character offsets of
# both relative to the original text.
output_file_path = "/home/yuxiang/liao/workspace/arrg_preprocessing/outputs/interpret_reports/raw_reports.json"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# With as_tuples=True, spaCy yields (doc, context) pairs, so each parsed doc
# comes back together with the info dict built for it.
doc_tuples = nlp.pipe(text_tuples, as_tuples=True, n_process=8)

# The context manager guarantees the file is flushed and closed even if the
# pipeline raises part-way through the run.
with open(output_file_path, "w", encoding="utf-8") as f:
    # `doc_tuples` is a generator, so pass the known length to get a real progress bar.
    for doc, info_dict in tqdm(doc_tuples, total=len(text_tuples)):
        output_dict = {"doc_key": info_dict["doc_key"], "sent_toks": [], "tok_char_indices": [], "sents": [], "sent_char_indices": []}
        for sent in doc.sents:
            # Remove leading & trailing whitespaces in a sentence.
            sent_text = sent.text.strip()
            # index() gives the number of leading characters removed by strip(),
            # which shifts the start offset accordingly.
            sent_start_char = sent.start_char + sent.text.index(sent_text)
            sent_end_char = sent_start_char + len(sent_text)
            output_dict["sents"].append(sent_text)
            output_dict["sent_char_indices"].append((sent_start_char, sent_end_char))

            sent_toks = []
            tok_char_indices = []
            for tok in sent:
                tok_text = tok.text.strip()
                # Omit tokens that are empty after stripping (whitespace-only tokens).
                if tok_text == "":
                    continue
                tok_start_char = tok.idx + tok.text.index(tok_text)
                tok_end_char = tok_start_char + len(tok_text)
                sent_toks.append(tok_text)
                tok_char_indices.append((tok_start_char, tok_end_char))
            output_dict["sent_toks"].append(sent_toks)
            output_dict["tok_char_indices"].append(tok_char_indices)

        f.write(json.dumps(output_dict))
        f.write("\n")

1136366it [09:47, 1935.78it/s]
