In [13]:
import pandas as pd
from constants import (
    FILE_NAMES,
    SPECIAL_HANDLING,
    SPECIAL_RECOMMENDABLES,
    CORE_MODALITIES
)

In [3]:
# Load the four parquet inputs; every path is looked up in FILE_NAMES.
# mappings is ordered by index label, with missing labels placed first.
mappings = pd.read_parquet(FILE_NAMES["mappings"]).sort_index(na_position="first")
report_codes = pd.read_parquet(FILE_NAMES["report_codes"])
recommendables = pd.read_parquet(FILE_NAMES["recommendables"])
modality_mappings = pd.read_parquet(FILE_NAMES["modality_mappings"])

In [4]:
# Distinct recommendable names per source. classify_recommendable tests
# membership against both of these module-level collections.
mapped_recommendables, report_code_recommendables = (
    mappings["Recommendable"].unique(),
    report_codes["InsertedRecommendable"].unique(),
)

In [14]:
# Keep only the rows whose Modality is one of CORE_MODALITIES.
in_core_modality = recommendables["Modality"].isin(CORE_MODALITIES)
core_recommendables = recommendables[in_core_modality]

In [16]:
# Show the core-modality subset as this cell's output.
core_recommendables

Unnamed: 0,Name,Category,Modality,Region
4,Recommendation for Additional Imaging - Latera...,special,MG,
5,Recommendation for Additional Imaging - Latera...,special,MR,
6,Recommendation for Additional Imaging - Latera...,special,MRA,
9,Recommendation for Additional Imaging - Latera...,special,US,
37,MRA Popliteal Entrapment Left,cv,MRA,leg
...,...,...,...,...
1492,XR Chest Lordotic,thoracic,Radiography,chest
1501,XR Chest Nipple Markers,msk,Radiography,chest
1510,XR Chest Nipple Markers,thoracic,Radiography,chest
1519,XR Hips with Pelvis Bilateral,msk,Radiography,pelvis


In [11]:
def get_code_for_recommendable(
    report_codes: pd.DataFrame, recommendable: str
) -> str | None:
    try:
        codes = report_codes[report_codes["InsertedRecommendable"] == recommendable][
            "code"
        ]
        return codes.iloc[0] if not codes.empty else None
    except IndexError:
        return None

def classify_recommendable(recommendable: str) -> str:
    """Label a recommendable name according to where it is already known.

    The checks run in this order and the first match wins:

    1. Equal to ``SPECIAL_HANDLING``: returns ``"SpecialHandling"``.
    2. Present in ``mapped_recommendables`` or ``SPECIAL_RECOMMENDABLES``:
       returns ``"—"``.
    3. Present in ``report_code_recommendables``: returns the code found by
       ``get_code_for_recommendable`` (asserted to be non-``None``).
    4. Otherwise: returns ``"Unknown"``.

    Reads the module-level ``report_codes`` frame and the collections
    named above.
    """
    if recommendable == SPECIAL_HANDLING:
        return "SpecialHandling"

    already_covered = (
        recommendable in mapped_recommendables
        or recommendable in SPECIAL_RECOMMENDABLES
    )
    if already_covered:
        return "—"

    if recommendable not in report_code_recommendables:
        return "Unknown"

    code = get_code_for_recommendable(report_codes, recommendable)
    # Membership was just confirmed, so a lookup miss would be a logic error.
    assert code is not None
    return code

In [None]:
# Classify every recommendable name, then store the labels in a new "Code" column.
recommendable_codes = recommendables["Name"].apply(classify_recommendable)
recommendables["Code"] = recommendable_codes

In [17]:
# Distinct recommendable names referenced by the mappings frame ...
unique_recommendables = mappings["Recommendable"].unique()
# ... and by the report_codes frame.
unique_report_codes = report_codes["InsertedRecommendable"].unique()
# Every name that appears in either source.
all_recommendables = set(unique_recommendables) | set(unique_report_codes)

In [18]:
# Load the recommendables CSV stored under data/old/.
old_recommendables = pd.read_csv("data/old/recommendables.csv")

In [20]:
# Names in the all_recommendables set that have no row in old_recommendables.
old_names = set(old_recommendables["name"].unique())
missing_recommendables = all_recommendables - old_names

In [21]:
# One name per line; the first name follows the label on the same line.
missing_listing = "\n".join(missing_recommendables)
print("Missing recommendables:", missing_listing)

Missing recommendables: US Upper Extremity Non-Venous Right
US Lower Extremity Non-Venous Right
XR Thoracic Spine
MR Pulmonary vein
XR Sacrum and Coccyx
US Lower Back
XR Toe Right
MR Pelvis
US Lower Extremity Non-Venous Left
US Breast Bilateral
US Axilla Not Breast Left
XR Sacroiliac Joints
MRI Toe Right
XR Toe Left
CT Pulmonary veins
XR Skull
US Upper Back
NM HIDA
MRI Toe Left
US Axilla Not Breast Right
US Neck Lymph Node Survey
CT Chest Esophagus
XR Sternum
US Upper Extremity Non-Venous Left
US Infant Spinal Canal
US Infant Hips
XR Skeletal Survey


In [22]:
# Load data/recommendables.csv; the row-building cell below reads its
# "name", "ID", "Modality", "Region" and "Category" columns.
new_recommendables = pd.read_csv("data/recommendables.csv")

In [23]:
# Summarise the columns, dtypes and non-null counts of old_recommendables.
old_recommendables.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 427 entries, 0 to 426
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        427 non-null    int64 
 1   name      427 non-null    object
 2   category  427 non-null    object
 3   modality  422 non-null    object
 4   region    422 non-null    object
dtypes: int64(1), object(4)
memory usage: 16.8+ KB


In [30]:
# For each missing recommendable, copy its fields from the first matching row
# of new_recommendables into a record that uses old_recommendables' column names.
# Names with no match in new_recommendables are skipped without any message.
# NOTE(review): missing_recommendables is a set, so the order of new_rows
# follows set iteration order -- confirm nothing downstream relies on it.
new_rows = []
for recommendable in missing_recommendables:
    matches = new_recommendables[new_recommendables["name"] == recommendable]
    if matches.empty:
        continue
    new_row = matches.iloc[0]
    record = {
        "id": new_row["ID"],
        "name": recommendable,
        "modality": new_row["Modality"],
        "region": new_row["Region"],
        "category": new_row["Category"],
    }
    new_rows.append(record)

In [31]:
# Build all_recommendables_df from old_recommendables plus the rows collected
# in new_rows, renumbering the index from 0.
if new_rows:
    new_recommendables_df = pd.DataFrame(new_rows)
    all_recommendables_df = pd.concat(
        [old_recommendables, new_recommendables_df], ignore_index=True
    )
else:
    # Nothing to append: still define all_recommendables_df so the cells
    # below do not fail with NameError. reset_index returns a new frame, so
    # later in-place edits leave old_recommendables untouched.
    all_recommendables_df = old_recommendables.reset_index(drop=True)

In [34]:
# Which names occur more than once in all_recommendables_df?
# keep=False marks every occurrence, not only the repeats.
name_is_repeated = all_recommendables_df.duplicated(subset=["name"], keep=False)
duplicated_names = all_recommendables_df[name_is_repeated]
if not duplicated_names.empty:
    print("Duplicated names in all_recommendables_df:", duplicated_names, sep="\n")

In [35]:
# Why are there 451 entries in the all_recommendables set but 453 in
# all_recommendables_df? List the names present in the frame but not in the set.
names_in_frame = set(all_recommendables_df["name"].unique())
extra_recommendables = names_in_frame - all_recommendables
if extra_recommendables:
    print(
        "Extra recommendables in all_recommendables_df that are not in all_recommendables set:",
        "\n".join(extra_recommendables),
        sep="\n",
    )

Extra recommendables in all_recommendables_df that are not in all_recommendables set:
NM Gallbladder HIDA
Interventional Procedure Recommendation
Additional Imaging Needed


In [37]:
# Find the recommendables in all_recommendables_df whose modality is not a
# two- or three-letter uppercase code. Rows with a missing modality are
# included, because na=False makes them count as non-matching.
has_short_code = all_recommendables_df["modality"].str.match(r"^[A-Z]{2,3}$", na=False)
non_two_letter_modality = all_recommendables_df[~has_short_code]
if not non_two_letter_modality.empty:
    print(
        "Recommendables with non-two-letter modality:",
        non_two_letter_modality[["name", "modality"]],
        sep="\n",
    )

Recommendables with non-two-letter modality:
                                                  name                modality
0                            Additional Imaging Needed                     NaN
141            Interventional Procedure Recommendation                     NaN
325                       Non-Radiology Recommendation                     NaN
326  Recommendation for Additional Imaging - Latera...                     NaN
327  Recommendation for Additional Imaging - Specia...                     NaN
427                US Upper Extremity Non-Venous Right                   Elbow
428                US Lower Extremity Non-Venous Right        US • Transrectal
429                                  XR Thoracic Spine  X-Ray - Absorptiometry
431                               XR Sacrum and Coccyx             Radiography
432                                      US Lower Back        US • Transrectal
433                                       XR Toe Right             Radiography
434    

In [42]:
# Corrected modality codes, grouped by code for readability.
# The numbers are row labels (index values) of all_recommendables_df, not
# values of its "id" column.
# NOTE(review): the rows these labels point at were appended by iterating a
# set, so the labels may not line up after a kernel restart -- verify before
# re-running.
_rows_by_modality = {
    "US": (427, 428, 432, 434, 436, 445, 449, 450, 451),
    "XR": (429, 431, 433, 437, 439, 441, 448, 452),
    "MR": (430, 438, 444),
    "CT": (440, 447),
    "NM": (443,),
}
# Flatten to {row label: modality}, ordered by ascending row label.
fixed_modalities = dict(
    sorted(
        (row, modality)
        for modality, rows in _rows_by_modality.items()
        for row in rows
    )
)

In [43]:
# Write each corrected modality into all_recommendables_df. The keys of
# fixed_modalities are row labels of the frame, not values of its "id" column.
for index, modality in fixed_modalities.items():
    all_recommendables_df.at[index, "modality"] = modality

In [45]:
# Set the category and region of row label 443 in a single assignment.
all_recommendables_df.loc[443, ["category", "region"]] = ["nm", "abdomen"]

In [46]:
# Show every row whose id is shared with at least one other row
# (keep=False marks all occurrences).
id_is_repeated = all_recommendables_df.duplicated(subset=["id"], keep=False)
duplicate_ids = all_recommendables_df[id_is_repeated]
if not duplicate_ids.empty:
    print(
        "Duplicate IDs in all_recommendables_df:",
        duplicate_ids[["id", "name"]],
        sep="\n",
    )

Duplicate IDs in all_recommendables_df:
       id                        name
333  6573   US Axilla Non Breast Left
334  6574  US Axilla Non Breast Right
436  6573   US Axilla Not Breast Left
445  6574  US Axilla Not Breast Right


In [47]:
# Rows 333 and 334 ("US Axilla Non Breast Left/Right") carry the same ids
# (6573, 6574) as the appended rows 436 and 445, so drop the earlier pair.
duplicate_id_rows = [333, 334]
all_recommendables_df.drop(index=duplicate_id_rows, inplace=True)