In [71]:
import pandas as pd

In [72]:
def clean_text_and_return_concept_indices(input_string, meds_allergies=False):

    if meds_allergies:
        # Find the indices of "{r ... }" and "{m ... }" pairs
        r_start = input_string.find("{r ")
        r_end = input_string.find("}", r_start)
        m_start = input_string.find("{m ")
        m_end = input_string.find("}", m_start)

        # Remove "{r ... }" and "{m ... }" patterns from the input string
        result_string = input_string.replace("{r ", "").replace("}", "").replace("{m ", "").replace("}", "")

        # Calculate the start and end indices for "{m ... }" pattern
        m_end -= 3
        r_end -= 3

        # Adjust the start and end indices based on the removal of "{r ... }" pattern
        if r_start != -1 and r_start < m_start:
            m_start -= 4  # Account for the removal of "{r " and "}"
            m_end -= 4

        if m_start != -1 and m_start < r_start:
            r_start -= 4
            r_end -= 4

        return result_string, m_start, m_end, r_start, r_end
    else:
        p_start = input_string.find("{p ")
        p_end = input_string.find("}", p_start)
        result_string = input_string.replace("{p ", "").replace("}", "")
        p_end -= 3

        return result_string, p_start, p_end

In [73]:
def preprocess_miade_synthetic_data(data, lower_case=True, prefix="p"):
    # drop any NaNs in the concepts column
    data = data.dropna(subset=[prefix]).reset_index(drop=True)

    if lower_case:
        data = data.astype(str).apply(lambda x: x.str.lower())  # all lower case

    # extract cui and name in separate columns
    data[["cui", "name"]] = data[prefix].str.extract(r"^(\d+)\s*\|\s*(.+)$")
    # remove words inside brackets e.g.(disease)
    data["name"].replace(r"\s*\([^)]*\)", "", regex=True, inplace=True)
    # drop the original column
    data.drop(prefix, axis=1, inplace=True)
    # some entries end with | - remove that
    data['name'] = data['name'].str.rstrip('|')

    # extract the start and end indices of concept from text and remove the annotations e.g. {p...}
    start = []
    end = []
    text = []
    for i in range(len(data)):
        if prefix == "m":
            result, m_start, m_end, r_start, r_end = clean_text_and_return_concept_indices(data.text.values[i], meds_allergies=True)
            start.append(m_start)
            end.append(m_end)
        elif prefix == "r":
            result, m_start, m_end, r_start, r_end = clean_text_and_return_concept_indices(data.text.values[i], meds_allergies=True)
            start.append(r_start)
            end.append(r_end)
        elif prefix == "p":
            result, p_start, p_end = clean_text_and_return_concept_indices(data.text.values[i])
            start.append(p_start)
            end.append(p_end)

        text.append(result)

    data["start"] = start
    data["end"] = end
    data["text"] = text

    if prefix == "p":
        # convert labels
        data.replace("no laterality", "none", inplace=True)
        data.replace("positive", "present", inplace=True)
        # tidy up columns
        data.rename(columns={"p_meta_relevance": "relevance", "p_meta_confirmed": "presence", "p_meta_laterality": "laterality (generic)"}, inplace=True)
        data = data[["text", "cui", "name", "start", "end", "relevance", "presence", "laterality (generic)"]]
    elif prefix == "m":
        data.rename(columns={"m_meta_category": "substance_category", "m_meta_allergytype": "allergy_type", "m_meta_severity": "severity"}, inplace=True)
        data = data.fillna("unspecified")
        data = data[["text", "cui", "name", "start", "end", "substance_category", "allergy_type", "severity"]]
    elif prefix == "r":
        data.replace("not a reaction", "none", inplace=True)
        data.rename(columns={"r_meta_reactionpos": "reaction_pos"}, inplace=True)
        data = data[["text", "cui", "name", "start", "end", "reaction_pos"]]


    return data

In [74]:
def balance_classes(df, feature_name):
    min_size = min(df.groupby([feature_name]).size())
    df = df[['text', 'cui', 'name', 'start', 'end', feature_name]]
    return pd.concat([
        df[
            df[feature_name] == class_name
        ].iloc[:min_size]
        for
            class_name
        in
            df[feature_name].unique()
    ])

In [82]:
DATA_DIR = "../medications_and_allergies/2023-08-09/"
data = pd.read_csv(DATA_DIR + "/patterns_medallerg.csv")
reactions_df = preprocess_miade_synthetic_data(data, prefix="r")
reactions_df

Unnamed: 0,text,cui,name,start,end,reaction_pos
0,no personal hist hernia . taking promazine 50m...,197118003,constipation - functional,246,269,before
1,diag list :\nhyperhidrosis of palms\ntinea ami...,200963002,psoriasis circinata,344,363,none
2,problem is sciatic nerve lesion\ndiag : snake ...,67233009,middle insomnia,335,352,before
3,attended clinic today with her grandmother . h...,65124004,swelling,243,251,before
4,"diag : conductive hearing loss,\nincomplete ri...",39579001,anaphylaxis,169,180,after
...,...,...,...,...,...,...
15069,on examination - bladder not palpable . intole...,162031009,indigestion,98,109,after
15070,no cyst of semilunar cartilage or morbid obesi...,95655001,ophthalmic migraine,209,228,before
15071,family history of cocaine-induced psychosis .\...,68962001,muscle pain,55,64,before
15072,"feeling unwell today., standard chest x-ray re...",,,-1,-4,none


In [83]:
DATA_DIR = "../medications_and_allergies/2023-08-09/"
data = pd.read_csv(DATA_DIR + "/patterns_medallerg.csv")
med_allergy_df = preprocess_miade_synthetic_data(data, prefix="m")

allergy_severity_df = balance_classes(med_allergy_df, 'severity')
substance_category_df = balance_classes(med_allergy_df, 'substance_category')
allergy_type_df = balance_classes(med_allergy_df, 'allergy_type')

In [84]:
DATA_DIR = "../problems/2023-08-09/"
data = pd.read_csv(DATA_DIR + "patterns_problems.csv")
problems_df = preprocess_miade_synthetic_data(data, prefix="p")
problems_df

Unnamed: 0,text,cui,name,start,end,relevance,presence,laterality (generic)
0,issue list\nilio-inguinal nerve entrapment\npa...,22033007,fetal growth retardation,551,574,present,negated,
1,diagnosis list :\ncolitis presumed infectious\...,89538001,helicobacter-associated gastritis,73,106,present,confirmed,
2,problems list hypoglossia\ninjury following al...,31268005,thrombophlebitis migrans,106,130,historic,confirmed,
3,no open wound of flank without complication . ...,275459007,open wound of flank without complication,3,43,present,negated,
4,"differential\nsuspected corn, diverticulitis o...",253866002,crossed ectopia of kidney with fusion anomaly,62,107,present,negated,
...,...,...,...,...,...,...,...,...
47442,dx :\nno loeys-dietz syndrome\n\n\nactive meds...,90465004,crushing injury of face,174,192,present,confirmed,
47443,frequent dsap - disseminated superficial actin...,41495000,disseminated superficial actinic porokeratosis,9,62,historic,confirmed,
47444,never had splinter in foot . currently on isos...,287121002,splinter in foot,10,26,historic,negated,
47445,had anhydramnios\ndifferential anterior shin s...,201137002,alopecia localis,60,76,present,confirmed,


In [85]:
problems_df.to_csv("problems_synthetic_train_data.csv")

In [86]:
reactions_df.to_csv("reactions_synthetic_train_data.csv")

In [87]:
med_allergy_df.to_csv("meds_synthetic_train_data.csv")
allergy_severity_df.to_csv("allergy_severity_synthetic_train_data.csv")
substance_category_df.to_csv("substance_category_synthetic_train_data.csv")
allergy_type_df.to_csv("allergy_type_synthetic_train_data.csv")