Import libraries that will be used

In [None]:
import pandas as pd
import re

Inspired by: https://www.drivendata.org/competitions/217/cdc-fall-narratives/community-code/13/
whose author used a medical dictionary (e.g. https://medical-dictionary.thefreedictionary.com/) and a lot of googling to work out what the main terms mean.

The dictionary below includes a fair number of modifications and additions to that original list.


In [None]:
# Abbreviation -> expansion lookup used by clean_narrative().
# Keys are lower-case because clean_narrative() lower-cases the narrative before
# looking anything up. clean_narrative() walks this dict with .items(), so the
# insertion order below is the order in which the replacements are applied.
medical_terms = {
    # Symbols: clean_narrative() replaces these wherever they occur and pads the
    # replacement with a space on each side.
    "&": "and",
    "***": "",
    ">>": "clinical diagnosis",
    "@": "at",
    "+": "with",
    "?": "unknown if",
    # Abbreviations: matched as stand-alone tokens, not inside longer words.
    "abd": "abdomen",
    "af": "accidental fall",
    "afib": "atrial fibrillation",
    "alf": "assisted living facility",
    "aki": "acute kidney injury",
    "am": "morning",
    "ams": "altered mental status",
    "bac": "blood alcohol content",
    "biba": "brought in by ambulance",
    "bwd": "backwards",
    "c/o": "complains of",
    "chi": "closed-head injury",
    "clsd": "closed",
    "cpk": "creatine phosphokinase",
    "cva": "cerebral vascular accident",
    "dn": "down",
    "dtr": "daughter",
    "dx": "clinical diagnosis",
    "ecf": "extended care facility",
    "er": "emergency room",
    "etoh": "ethyl alcohol",
    "eval": "evaluation",
    "fd": "found",
    "ft": "foot",
    "fx": "fracture",
    "fxs": "fractures",
    "fwd": "forwards",
    "glf": "ground level fall",
    "h/o": "history of",
    "hr": "hours",
    "htn": "hypertension",
    "hx": "history of",
    "inj": "injury",
    "inr": "international normalized ratio",
    "intox": "intoxication",
    "lac": "laceration",
    "loc": "loss of consciousness",
    "lt": "left",
    "mech": "mechanical",
    "mult": "multiple",
    # NOTE(review): this key ends with a space - presumably it targets "n.h."
    # after clean_narrative() has turned periods into spaces; confirm.
    "n h ": "nursing home",
    "nh": "nursing home",
    "p/w": "presents with",
    "pm": "afternoon",
    "pt": "patient",
    "pta": "prior to arrival",
    "pts": "patient's",
    "px": "physical examination", # chosen over the alternative reading "procedure"
    "r/o": "rules out",
    "rt": "right",
    "s/p": "after",
    "sah": "subarachnoid hemorrhage",
    "sdh": "acute subdural hematoma",
    "sts": "sit to stand",
    "tr": "trauma",
    "uti": "urinary tract infection",
    "unwit'd": "unwitnessed",
    "w/o": "without",
    "w/": "with",
    "wks": "weeks"
}

In [None]:
def add_space_after(text, target=","):
    """Insert one space after each ``target`` that is directly followed by non-whitespace.

    Args:
        text: String to process.
        target: Literal substring (not a regex) to put a space after.

    Returns:
        A copy of ``text`` in which every ``target`` immediately followed by a
        non-whitespace character is followed by a single space instead.
    """
    # A lookahead leaves the following character unconsumed, so back-to-back
    # targets ("a,,b") are each handled instead of every second one being skipped.
    pattern = r'{}(?=\S)'.format(re.escape(target))
    # A callable replacement inserts the matched text literally; building a
    # replacement template from ``target`` would break on a backslash.
    return re.sub(pattern, lambda match: match.group(0) + ' ', text)

def add_space_before(text, target=","):
    """Insert one space before each ``target`` that directly follows non-whitespace.

    Args:
        text: String to process.
        target: Literal substring (not a regex) to put a space in front of.

    Returns:
        A copy of ``text`` in which every ``target`` immediately preceded by a
        non-whitespace character is preceded by a single space instead.
    """
    # A lookbehind leaves the preceding character unconsumed, so back-to-back
    # targets ("a,,b") are each handled instead of every second one being skipped.
    pattern = r'(?<=\S){}'.format(re.escape(target))
    # A callable replacement inserts the matched text literally; building a
    # replacement template from ``target`` would break on a backslash.
    return re.sub(pattern, lambda match: ' ' + match.group(0), text)

# cleaning
def clean_narrative(text):
    """Normalize one raw narrative into upper-case, abbreviation-expanded text.

    Steps, in order: lower-case; isolate ``dx``; replace the first age/sex
    token (e.g. ``72yof``) and any identical repeats of it with ``patient``;
    pad ``,`` ``;`` ``:`` ``--`` and numbers with spaces; turn hyphens and
    periods into spaces; expand ``t'd&f`` / ``s'd&f``; expand every entry of
    the module-level ``medical_terms`` mapping in its insertion order; pad
    ``/``; collapse whitespace; upper-case.

    Relies on the module-level ``medical_terms`` dict and on the
    ``add_space_after`` / ``add_space_before`` helpers.

    Args:
        text: The narrative string, or a missing value (anything ``pd.isna``
            reports as missing).

    Returns:
        The cleaned narrative in upper case without leading or trailing
        whitespace, or the literal string ``"NA"`` for missing input.
    """
    if pd.isna(text):
        return "NA"

    # lowercase everything: the patterns and dictionary keys below are lower-case
    text = text.lower()

    # unglue DX from whatever punctuation surrounds it.
    # NOTE(review): the "ˆ" inside the character classes is a non-ASCII
    # circumflex look-alike, not the ASCII caret, so the classes are NOT negated;
    # they match runs of non-word characters. Left untouched on purpose.
    regex_dx = r"([ˆ\W]*(dx)[ˆ\W]*)"
    text = re.sub(regex_dx, r". dx: ", text)

    # remove age and sex identification (not perfect but captures almost all cases).
    # The trailing \b keeps the short alternatives ("yo m", "yo f") from matching
    # the start of a longer word: "72 yo man" must not turn into "patientan".
    regex_age_sex = r"(\d+)\s*?(yof|yf|yo\s*female|yo\s*f|yom|ym|yo\s*male|yo\s*m)\b"
    age_sex_match = re.search(regex_age_sex, text)
    if age_sex_match:
        # Age and sex are both dropped; the whole token becomes "patient".
        text = text.replace(age_sex_match.group(0), "patient")

    # make sure punctuation is followed by a space
    for mark in (",", ";", ":"):
        text = add_space_after(text, target=mark)
    text = add_space_before(text, target="--")
    text = add_space_after(text, target="--")

    # hyphens become spaces, numbers (optionally decimal) get a space on each
    # side, and only then are the remaining periods turned into spaces
    text = re.sub("-", " ", text)
    text = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", text).strip()
    text = re.sub(r"\.", " ", text)  # raw string: "\." is an invalid str escape

    # These two are handled before the dictionary because its "&" entry would
    # otherwise split them apart.
    text = re.sub("t'd&f", "tripped and fell", text)
    text = re.sub("s'd&f", "slipped and fell", text)

    # translate medical terms, in dictionary order
    for term, replacement in medical_terms.items():
        if not term:
            continue
        starts_with_word = re.match(r"\w", term) is not None
        ends_with_word = re.search(r"\w\Z", term) is not None
        # An edge of the term that is a word character must sit on a token
        # boundary (and not touch a hyphen). An edge that is a symbol or space
        # ("@", ">>", "w/", "n h ") cannot be anchored that way, so that side is
        # left open and the replacement is padded with a space there instead;
        # this keeps "w/cane" from collapsing into "withcane".
        pattern = (
            (r"(?<![\w-])" if starts_with_word else "")
            + re.escape(term)
            + (r"(?![\w-])" if ends_with_word else "")
        )
        padded = (
            ("" if starts_with_word else " ")
            + replacement
            + ("" if ends_with_word else " ")
        )
        text = re.sub(pattern, lambda _match, _padded=padded: _padded, text)

    # This is done after translating medical terms because some of them use a '/'
    text = add_space_before(text, target="/")
    text = add_space_after(text, target="/")

    # remove extra white spaces, including any left at either end
    text = re.sub(r'\s+', ' ', text).strip()

    return text.upper()

Test the function on a couple of sample narratives

In [None]:
# Sample 1: contains an age/sex token, ">>", "+", hyphenated labels and the
# AM / LOC / RT abbreviations.
text = '72 YOF SLIPPED AND FELL ON THE FLOOR THIS AM>>L-3, L-4 FRACTURE, +LOC, RT RIB FRACTURES X 3.'
print(f"Original text: {text}")
print(f"Clean text: {clean_narrative(text)}")

In [None]:
# Sample 2: contains a glued age/sex token, "@", "&", "?", ">>" and the
# ALF / PM / FD / AM / ECF / PT / LOC / CHI abbreviations.
text = "84YF @ ALF, LAST PM SLID OUT OF THE BED&FD THIS AM ON THE FLOOR, PER ECF PT HIT HEAD, ?LOC>>CHI"
print(f"Original text: {text}")
print(f"Clean text: {clean_narrative(text)}")

## Clean all narratives

In [None]:
# pandas is already imported in the first cell; the repeat here is harmless.
import pandas as pd

# Load the primary data set. The two columns that can be null are read as
# nullable ints.
df = pd.read_csv(
    "primary_data.csv",
    dtype=dict.fromkeys(("body_part_2", "diagnosis_2"), "Int64"),
)
df.head()

Subset to just the first two columns

In [None]:
# Take the first two columns as an independent frame. Without .copy() this is a
# slice of df, and the later assignment to its 'narrative' column triggers
# pandas' SettingWithCopyWarning.
df_narratives = df.iloc[:, 0:2].copy()
df_narratives.head(2)

In [None]:
# Clean every narrative; clean_narrative is passed directly, one value per row.
df_narratives['narrative'] = df_narratives['narrative'].apply(clean_narrative)

In [None]:
# Keep the untouched narrative under a new name so it does not collide with the
# cleaned 'narrative' column in the merge that follows.
df_nn = df.rename(columns=dict(narrative="narrative_original"))

In [None]:
# Attach all original columns (with the renamed narrative) to the cleaned
# narratives, matching rows on the case number.
df_final = df_narratives.merge(df_nn, on="cpsc_case_number", how="left")

In [None]:
# Row count of the merged frame (equals len(df) when cpsc_case_number is unique).
df_final.shape[0]

In [None]:
# Write the merged frame to disk without the positional index column.
df_final.to_csv(path_or_buf="corrected_narrative_primary.csv", index=False)