In [2]:
import numpy as np
import pandas as pd

## Transform


In [33]:
# Modality labels that count as "core"; load_mappings() compares the raw
# (un-normalized) Modality column against this list.
CORE_MODALITIES = [
    "CT",
    "CTA",
    "DEXA",
    "Fluoroscopy",
    "MG",
    "MR",
    "MRA",
    "Myelography",
    "Nuclear Medicine",
    "PET",
    "Radiography",
    "US",
    "US Duplex",
]


def load_mappings(filename: str) -> pd.DataFrame:
    """Load the body-part / modality / laterality -> recommendable table.

    The CSV is read with BodyPart, Modality, Laterality and Recommendable as
    strings. For the first three, a normalized key column is derived by
    lowercasing and removing every non-word character (regex ``\\W``, so
    letters, digits and underscores are kept). All label and key columns are
    then converted to ``category``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame indexed by (BodyPartNorm, ModalityNorm, LateralityNorm),
        with the RecID column removed and a boolean CoreModality column that
        is True where Modality is one of CORE_MODALITIES.

    Raises:
        KeyError: If the CSV has no RecID column (the drop is not lenient).
    """
    label_columns = ["BodyPart", "Modality", "Laterality"]
    norm_columns = [f"{label}Norm" for label in label_columns]

    table = pd.read_csv(
        filename,
        dtype=dict.fromkeys([*label_columns, "Recommendable"], "str"),
    )

    # Derive the lookup keys while the labels are still plain strings; the
    # .str accessor chain is applied before any categorical conversion.
    for label, norm in zip(label_columns, norm_columns):
        table[norm] = table[label].str.lower().str.replace(r"\W", "", regex=True)

    categorical = [*label_columns, *norm_columns, "Recommendable"]
    table = (
        table.astype(dict.fromkeys(categorical, "category"))
        .set_index(norm_columns)
        .drop(columns=["RecID"])
    )
    table["CoreModality"] = table["Modality"].isin(CORE_MODALITIES)
    return table

In [34]:
# Build the body-part/modality/laterality lookup table from its source CSV.
mappings = load_mappings("data/body_part_modality_laterality_recommendable.csv")

In [15]:
mappings.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11100 entries, ('abdomen', 'ct', 'left') to ('wholebody', 'petwct', 'unspecified')
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   BodyPart       11100 non-null  category
 1   Modality       11100 non-null  category
 2   Laterality     11100 non-null  category
 3   Recommendable  11100 non-null  category
 4   CoreModality   11100 non-null  bool    
dtypes: bool(1), category(4)
memory usage: 143.3 KB


In [16]:
mappings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BodyPart,Modality,Laterality,Recommendable,CoreModality
BodyPartNorm,ModalityNorm,LateralityNorm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abdomen,ct,left,Abdomen,CT,Left,CT Abdomen Pelvis,True
abdomen,ct,right,Abdomen,CT,Right,CT Abdomen Pelvis,True
abdomen,ct,bilateral,Abdomen,CT,Bilateral,CT Abdomen Pelvis,True
abdomen,ct,unilateral,Abdomen,CT,Unilateral,CT Abdomen Pelvis,True
abdomen,ct,unspecified,Abdomen,CT,Unspecified,CT Abdomen Pelvis,True


In [24]:
# Persist the mappings table to Parquet; the "Using Parquet Files" section
# reads this same path back with pd.read_parquet.
mappings.to_parquet("data/body_part_modality_laterality_recommendable.parquet")

In [23]:
# Core-modality rows that map to "CT Abdomen Pelvis", showing only the
# human-readable label columns.
mappings.loc[
    (mappings["Recommendable"] == "CT Abdomen Pelvis") & mappings["CoreModality"],
    ["BodyPart", "Modality", "Laterality"],
]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BodyPart,Modality,Laterality
BodyPartNorm,ModalityNorm,LateralityNorm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abdomen,ct,left,Abdomen,CT,Left
abdomen,ct,right,Abdomen,CT,Right
abdomen,ct,bilateral,Abdomen,CT,Bilateral
abdomen,ct,unilateral,Abdomen,CT,Unilateral
abdomen,ct,unspecified,Abdomen,CT,Unspecified
abdomenpelvis,ct,left,Abdomen Pelvis,CT,Left
abdomenpelvis,ct,right,Abdomen Pelvis,CT,Right
abdomenpelvis,ct,bilateral,Abdomen Pelvis,CT,Bilateral
abdomenpelvis,ct,unilateral,Abdomen Pelvis,CT,Unilateral
abdomenpelvis,ct,unspecified,Abdomen Pelvis,CT,Unspecified


In [35]:
# Load data/modality_mappings.csv into a DataFrame


def load_modality_mappings(filename: str) -> pd.DataFrame:
    """Load the modality mapping table keyed by normalized modality name.

    Modality is read as a string, and a ModalityNorm key is derived from it by
    lowercasing and removing every non-word character (regex ``\\W``). Both
    columns are converted to ``category`` and ModalityNorm becomes the index.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame indexed by ModalityNorm, without the ID, MapTo and Active
        columns, and without rows whose Modality is missing.

    Raises:
        KeyError: If any of the ID, MapTo or Active columns is absent.
    """
    raw = pd.read_csv(filename, dtype={"Modality": "str"})
    normalized = raw["Modality"].str.lower().str.replace(r"\W", "", regex=True)
    return (
        raw.assign(ModalityNorm=normalized)
        .astype({"Modality": "category", "ModalityNorm": "category"})
        .set_index(["ModalityNorm"])
        .drop(columns=["ID", "MapTo", "Active"])
        # Rows with no Modality name carry no usable key, so they are removed.
        .dropna(subset=["Modality"])
    )


# Build the modality table from its source CSV.
modality_mappings = load_modality_mappings("data/modality_mappings.csv")

In [36]:
modality_mappings.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 106 entries, absorptiometry to consult
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Modality  106 non-null    category
 1   Class     106 non-null    object  
dtypes: category(1), object(1)
memory usage: 10.8+ KB


In [30]:
# Persist the modality table to Parquet; the same path is read back later
# with pd.read_parquet.
modality_mappings.to_parquet("data/modality_mappings.parquet")

In [37]:
def load_report_codes(filename) -> pd.DataFrame:
    """Load the report-code table with categorical recommendable columns.

    The inserted_recommendable and replace_recommendable columns are read as
    strings and re-emitted as ``category`` columns named InsertedRecommendable
    and ReplaceRecommendable, appended after the remaining columns.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame without inserted_id, replace_id and the two original
        snake_case recommendable columns.

    Raises:
        KeyError: If inserted_id or replace_id is absent from the CSV.
    """
    renamed = {
        "inserted_recommendable": "InsertedRecommendable",
        "replace_recommendable": "ReplaceRecommendable",
    }
    raw = pd.read_csv(filename, dtype=dict.fromkeys(renamed, "str"))
    categorical = {
        new_name: raw[old_name].astype("category")
        for old_name, new_name in renamed.items()
    }
    # assign() appends the new columns at the end, then the id columns and the
    # original snake_case columns are discarded.
    return raw.assign(**categorical).drop(
        columns=["inserted_id", "replace_id", *renamed]
    )


# Build the report-code table from its source CSV.
report_codes = load_report_codes("data/report_codes.csv")

In [38]:
report_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788 entries, 0 to 787
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   code                   788 non-null    object  
 1   InsertedRecommendable  788 non-null    category
 2   ReplaceRecommendable   788 non-null    category
dtypes: category(2), object(1)
memory usage: 20.5+ KB


In [39]:
# Persist the report-code table to Parquet; the same path is read back later
# with pd.read_parquet.
report_codes.to_parquet("data/report_codes.parquet")

In [41]:
def load_recommendables(filename: str) -> pd.DataFrame:
    """Load the recommendables table as four categorical columns.

    Only the name, category, modality and region columns are read from the
    CSV; any other columns are ignored.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame with ``category`` columns Name, Category, Modality and
        Region, in that order.
    """
    renamed = {
        "name": "Name",
        "category": "Category",
        "modality": "Modality",
        "region": "Region",
    }
    raw = pd.read_csv(filename, usecols=list(renamed))
    # Select by the new names so the output order is fixed regardless of the
    # column order in the file, then convert every column to category.
    return raw.rename(columns=renamed)[list(renamed.values())].astype("category")


# Build the recommendables table from its source CSV.
recommendables = load_recommendables("data/recommendables.csv")

In [42]:
# Persist the recommendables table to Parquet; the same path is read back
# later with pd.read_parquet.
recommendables.to_parquet("data/recommendables.parquet")

## Using Parquet Files


In [3]:
# Reload the four tables from the Parquet files written in the Transform
# section, so the cells below do not need the original CSVs.
mappings = pd.read_parquet("data/body_part_modality_laterality_recommendable.parquet")
modality_mappings = pd.read_parquet("data/modality_mappings.parquet")
report_codes = pd.read_parquet("data/report_codes.parquet")
recommendables = pd.read_parquet("data/recommendables.parquet")

In [4]:
# List (code, InsertedRecommendable) tuples for every report-code row whose
# ReplaceRecommendable is "MR Brain".
list(
    report_codes.loc[
        report_codes["ReplaceRecommendable"] == "MR Brain",
        ["code", "InsertedRecommendable"],
    ].itertuples(index=False, name=None)
)

[('MRBrainCranialNerves', 'MR Brain Cranial Nerves'),
 ('MRBrainIAC', 'MR Brain IAC'),
 ('WithContrast', 'MR Brain I+'),
 ('MRBrainTrigeminal', 'MR Brain Trigeminal'),
 ('MRFaceCranialNerves', 'MR Face Cranial Nerves'),
 ('MRSacralPlexus', 'MR Sacral Plexus'),
 ('MRBrainPituitary', 'MR Brain Pituitary'),
 ('MRBrainSpectroscopy', 'MR Brain Spectroscopy')]

In [10]:
report_codes.head()

Unnamed: 0,code,InsertedRecommendable,ReplaceRecommendable
0,SpecialHandling,Recommendation for Additional Imaging - Specia...,All recommendations
1,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Skull Base
2,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Sinus
3,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Parotid
4,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Orbits


In [5]:
recommendables.head()

Unnamed: 0,Name,Category,Modality,Region
0,Additional Imaging Needed,special,,
1,CT Abdomen,abdominal,CT,abdomen
2,CT Abdomen Pelvis,abdominal,CT,abdomen
3,CT Abdomen Pelvis Hypervascular Tumor,abdominal,CT,abdomen
4,CT Adrenal,abdominal,CT,abdomen


In [24]:
def get_code_for_recommendable(report_codes: pd.DataFrame, recommendable: str) -> str:
    try:
        codes = report_codes[report_codes["ReplaceRecommendable"] == recommendable][
            "ReplaceRecommendable"
        ]
        return codes.iloc[0] if not codes.empty else None
    except IndexError:
        return None

In [25]:
get_code_for_recommendable(report_codes, "MR Brain")

'MR Brain'

In [20]:
# NOTE(review): in the visible cells `codes` is only ever assigned inside
# get_code_for_recommendable, never at notebook level, so this scratch cell
# appears to rely on leftover kernel state and would raise NameError on a
# fresh kernel - confirm, then remove it or define `codes` explicitly.
codes.iloc[0] if len(codes) > 0 else None

'MR Brain'

In [26]:
mappings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,BodyPart,Modality,Laterality,Recommendable,CoreModality
BodyPartNorm,ModalityNorm,LateralityNorm,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abdomen,ct,left,Abdomen,CT,Left,CT Abdomen Pelvis,True
abdomen,ct,right,Abdomen,CT,Right,CT Abdomen Pelvis,True
abdomen,ct,bilateral,Abdomen,CT,Bilateral,CT Abdomen Pelvis,True
abdomen,ct,unilateral,Abdomen,CT,Unilateral,CT Abdomen Pelvis,True
abdomen,ct,unspecified,Abdomen,CT,Unspecified,CT Abdomen Pelvis,True


In [37]:
def get_mappings_for_recommendable(
    mappings: pd.DataFrame, recommendable: str
) -> pd.DataFrame:
    """Return the distinct core-modality (BodyPart, Modality) pairs for a recommendable.

    Args:
        mappings: Table with ``Recommendable``, ``CoreModality``, ``BodyPart``
            and ``Modality`` columns.
        recommendable: Recommendable value to match exactly.

    Returns:
        A two-column DataFrame (BodyPart, Modality) with duplicates removed.
        The positional index is assigned before de-duplication, so the
        surviving row labels may have gaps.
    """
    is_match = mappings["Recommendable"] == recommendable
    selected = mappings.loc[
        is_match & mappings["CoreModality"], ["BodyPart", "Modality"]
    ]
    # Discard the incoming index first, then keep one row per pair.
    return selected.reset_index(drop=True).drop_duplicates()

In [49]:
result = get_mappings_for_recommendable(mappings, "CT Abdomen")

In [50]:
result

Unnamed: 0,BodyPart,Modality
0,Gallbladder,CT
5,Liver,CT
10,Spleen,CT
15,Stomach,CT


In [47]:
# Convert the DataFrame held in `result` into a list of plain row tuples.
# NOTE(review): this rebinds `result` from a DataFrame to a list, so running
# the cell a second time fails (a list has no itertuples) - consider a new name.
result = list(result.itertuples(index=False, name=None))

In [48]:
result

[('Ankle', 'CT')]

In [52]:
report_codes.head()

Unnamed: 0,code,InsertedRecommendable,ReplaceRecommendable
0,SpecialHandling,Recommendation for Additional Imaging - Specia...,All recommendations
1,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Skull Base
2,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Sinus
3,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Parotid
4,MRBrainCranialNerves,MR Brain Cranial Nerves,MR Orbits


In [51]:
def get_code_for_recommendable(
    report_codes: pd.DataFrame, recommendable: str
) -> str | None:
    try:
        codes = report_codes[report_codes["InsertedRecommendable"] == recommendable][
            "code"
        ]
        return codes.iloc[0] if not codes.empty else None
    except IndexError:
        return None

In [53]:
# Distinct ReplaceRecommendable values listed for this inserted recommendable.
report_codes.loc[
    report_codes["InsertedRecommendable"] == "CT Abdomen Pelvis Hypervascular Tumor",
    "ReplaceRecommendable",
].unique()

['CT Cystogram', 'CT Abdomen Pelvis', 'CT Renal', 'CT Abdomen', 'CT Colonography', ..., 'CT Pancreas', 'CTA Abdomen', 'CTA Abdomen Pelvis Venogram', 'CT Liver', 'CT Lumbar Spine']
Length: 13
Categories (194, object): ['All recommendations', 'CT Abdomen', 'CT Abdomen Pelvis', 'CT Adrenal', ..., 'XR Tibia Fibula Right', 'XR Wrist Bilateral', 'XR Wrist Left', 'XR Wrist Right']

In [55]:
def get_mappings_for_unmapped_recommendable(
    mappings: pd.DataFrame, report_codes: pd.DataFrame, recommendable: str
) -> pd.DataFrame:
    """Return core-modality (BodyPart, Modality) pairs reached via report codes.

    The lookup is indirect: ``report_codes`` rows whose InsertedRecommendable
    equals ``recommendable`` supply a set of ReplaceRecommendable values, and
    the mappings for any of those values are returned.

    Args:
        mappings: Table with ``Recommendable``, ``CoreModality``, ``BodyPart``
            and ``Modality`` columns.
        report_codes: Table with ``InsertedRecommendable`` and
            ``ReplaceRecommendable`` columns.
        recommendable: InsertedRecommendable value to resolve.

    Returns:
        A two-column DataFrame (BodyPart, Modality) with duplicates removed;
        empty when ``recommendable`` is not an InsertedRecommendable. The
        positional index is assigned before de-duplication, so row labels may
        have gaps.
    """
    # Match on InsertedRecommendable. Matching on ReplaceRecommendable could
    # only ever yield `recommendable` itself, which makes the indirection a
    # no-op.
    replace_recommendables = report_codes.loc[
        report_codes["InsertedRecommendable"] == recommendable,
        "ReplaceRecommendable",
    ].unique()
    selected = mappings.loc[
        mappings["Recommendable"].isin(replace_recommendables)
        & mappings["CoreModality"],
        ["BodyPart", "Modality"],
    ]
    # Discard the incoming index first, then keep one row per pair.
    return selected.reset_index(drop=True).drop_duplicates()

In [56]:
get_mappings_for_unmapped_recommendable(
    mappings, report_codes, "CT Abdomen Pelvis Hypervascular Tumor"
)

Unnamed: 0,BodyPart,Modality
0,Abdomen,CT
5,Abdomen,CTA
10,Abdomen Pelvis,CT
15,Abdomen Pelvis,CTA
20,Celiac Plexus,CT
...,...,...
325,Visceral Arteries,CTA
330,IVC,CT
335,IVC,CTA
340,Ovarian Vein,CT


In [24]:
# One row per distinct InsertedRecommendable (the first occurrence in
# report_codes wins), keeping only its report code.
unique_inserted_recommendables = report_codes.drop_duplicates(
    subset=["InsertedRecommendable"]
)[["code", "InsertedRecommendable"]]

# Attach Category/Modality/Region from `recommendables`. merge() defaults to an
# inner join, so inserted recommendables with no matching Name are dropped.
report_code_info = unique_inserted_recommendables.merge(
    recommendables, left_on="InsertedRecommendable", right_on="Name"
).drop(columns=["Name"])

# Rows coded Venogram, WithContrast or Arthrogram get Category "special" and
# have Modality and Region blanked to NaN (the NaNs are left as-is here).
report_code_info.loc[
    report_code_info["code"].isin(["Venogram", "WithContrast", "Arthrogram"]),
    ["Category", "Modality", "Region"],
] = ["special", np.nan, np.nan]
report_code_info.head()

Unnamed: 0,code,InsertedRecommendable,Category,Modality,Region
0,SpecialHandling,Recommendation for Additional Imaging - Specia...,special,,
1,MRBrainCranialNerves,MR Brain Cranial Nerves,neuro,MR,head
2,CTChestILD,CT Chest ILD,thoracic,CT,chest
3,Venogram,CTA Lower Extremity Venogram Bilateral,special,,
4,XRPelvisHipRight,XR Pelvis Hip Right,msk,XR,pelvis


In [25]:
# Export the per-code table as CSV, omitting the positional index column.
report_code_info.to_csv("data/report_code_info.csv", index=False)

In [26]:
report_code_info.value_counts("code")

code
WithContrast         66
Arthrogram           14
Venogram             12
USAbdomenLimited      1
NMWBCScan             1
                     ..
MRAbdominalWall       1
MRAbdomenwithMRCP     1
MRAbdomenPelvis       1
MRAThoracicOutlet     1
XRPelvisHipRight      1
Name: count, Length: 103, dtype: int64