In [69]:
import json
import re
import pandas as pd
import numpy as np
import seaborn as sns

from typing import Dict, Optional, Tuple, Iterable, List
from matplotlib import pyplot as plt
from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.config_meta_cat import ConfigMetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT
from tokenizers import ByteLevelBPETokenizer
from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBase

In [70]:
def clean_text_and_return_concept_indices(input_string, meds_allergies=False):

    if meds_allergies:
        # Find the indices of "{r ... }" and "{m ... }" pairs
        r_start = input_string.find("{r ")
        r_end = input_string.find("}", r_start)
        m_start = input_string.find("{m ")
        m_end = input_string.find("}", m_start)

        # Remove "{r ... }" and "{m ... }" patterns from the input string
        result_string = input_string.replace("{r ", "").replace("}", "").replace("{m ", "").replace("}", "")

        # Calculate the start and end indices for "{m ... }" pattern
        m_end -= 3
        r_end -= 3

        # Adjust the start and end indices based on the removal of "{r ... }" pattern
        if r_start != -1 and r_start < m_start:
            m_start -= 4  # Account for the removal of "{r " and "}"
            m_end -= 4

        if m_start != -1 and m_start < r_start:
            r_start -= 4
            r_end -= 4

        return result_string, m_start, m_end, r_start, r_end
    else:
        p_start = input_string.find("{p ")
        p_end = input_string.find("}", p_start)
        result_string = input_string.replace("{p ", "").replace("}", "")
        p_end -= 3

        return result_string, p_start, p_end

In [71]:
def preprocess_miade_synthetic_data(data, lower_case=True, prefix="p"):
    # drop any NaNs in the concepts column
    data = data.dropna(subset=[prefix]).reset_index(drop=True)

    if lower_case:
        data = data.astype(str).apply(lambda x: x.str.lower())  # all lower case

    # extract cui and name in separate columns
    data[["cui", "name"]] = data[prefix].str.extract(r"^(\d+)\s*\|\s*(.+)$")
    # remove words inside brackets e.g.(disease)
    data["name"].replace(r"\s*\([^)]*\)", "", regex=True, inplace=True)
    # drop the original column
    data.drop(prefix, axis=1, inplace=True)
    # some entries end with | - remove that
    data['name'] = data['name'].str.rstrip('|')

    # extract the start and end indices of concept from text and remove the annotations e.g. {p...}
    start = []
    end = []
    text = []
    for i in range(len(data)):
        if prefix == "m":
            result, m_start, m_end, r_start, r_end = clean_text_and_return_concept_indices(data.text.values[i], meds_allergies=True)
            start.append(m_start)
            end.append(m_end)
        elif prefix == "r":
            result, m_start, m_end, r_start, r_end = clean_text_and_return_concept_indices(data.text.values[i], meds_allergies=True)
            start.append(r_start)
            end.append(r_end)
        elif prefix == "p":
            result, p_start, p_end = clean_text_and_return_concept_indices(data.text.values[i])
            start.append(p_start)
            end.append(p_end)

        text.append(result)

    data["start"] = start
    data["end"] = end
    data["text"] = text

    if prefix == "p":
        # convert labels
        data.replace("no laterality", "none", inplace=True)
        data.replace("positive", "present", inplace=True)
        # tidy up columns
        data.rename(columns={"p_meta_relevance": "relevance", "p_meta_confirmed": "presence", "p_meta_laterality": "laterality (generic)"}, inplace=True)
        data = data[["text", "cui", "name", "start", "end", "relevance", "presence", "laterality (generic)"]]
    elif prefix == "m":
        data.rename(columns={"m_meta_category": "substance_category", "m_meta_allergytype": "allergy_type", "m_meta_severity": "severity"}, inplace=True)
        data = data.fillna("unspecified")
        data = data[["text", "cui", "name", "start", "end", "substance_category", "allergy_type", "severity"]]
    elif prefix == "r":
        data.replace("not a reaction", "none", inplace=True)
        data.rename(columns={"r_meta_reactionpos": "reaction_pos"}, inplace=True)
        data = data[["text", "cui", "name", "start", "end", "reaction_pos"]]


    return data

In [81]:
DATA_DIR = "../medications_and_allergies/2023-07-19/"
data = pd.read_csv(DATA_DIR + "/patterns_medallerg.csv")
reactions_df = preprocess_miade_synthetic_data(data, prefix="r")
reactions_df

Unnamed: 0,text,cui,name,start,end,reaction_pos
0,feeling breathless today.,267036007,dyspnea,8,18,none
1,allergies: severe severe depression with co-te...,310497006,severe depression,18,35,before
2,allergies: severe liver palms with cannabidiol.,248413004,liver palms,18,29,before
3,allergies: severe oral dyspraxia with dapoxetine.,361275004,oral dyspraxia,18,32,before
4,allergies: severe postcholecystectomy diarrhea...,53156005,postcholecystectomy diarrhea,18,46,before
...,...,...,...,...,...,...
356,experienced anaphylaxis with romosozumab,39579001,anaphylaxis,12,23,before
357,had anaphylaxis due to multivitamin capsules,39579001,anaphylaxis,4,15,before
358,had anaphylaxis due to homeopathic hamamelis,39579001,anaphylaxis,4,15,before
359,had anaphylaxis with larvae sterile,39579001,anaphylaxis,4,15,before


In [82]:
DATA_DIR = "../medications_and_allergies/2023-07-19/"
data = pd.read_csv(DATA_DIR + "/patterns_medallerg.csv")
med_allergy_df = preprocess_miade_synthetic_data(data, prefix="m")
med_allergy_df

Unnamed: 0,text,cui,name,start,end,substance_category,allergy_type,severity
0,severe egg allergy.,102263004,eggs,7,10,adverse reaction,allergy,severe
1,allergic to pineapple.,256313003,pineapple,12,21,adverse reaction,allergy,unspecified
2,meds: penicillin v 500mg qds,39359008,product containing phenoxymethylpenicillin,6,18,taking,unspecified,unspecified
3,allergies: severe severe depression with co-te...,18037411000001106,co-tenidone,41,52,adverse reaction,allergy,severe
4,allergies: severe liver palms with cannabidiol.,35022511000001100,cannabidiol,35,46,adverse reaction,allergy,severe
...,...,...,...,...,...,...,...,...
558,not keen on taking alpelisib,38924711000001103,alpelisib,19,28,irrelevant,,
559,not keen on taking selpercatinib,39482711000001104,selpercatinib,19,32,irrelevant,,
560,not keen on taking kaolin poultices,14614411000001101,kaolin poultices,19,35,irrelevant,,
561,not keen on taking gilteritinib,37898211000001109,gilteritinib,19,31,irrelevant,,


In [84]:
DATA_DIR = "../problems/2023-08-09/"
data = pd.read_csv(DATA_DIR + "patterns_problems.csv")
problems_df = preprocess_miade_synthetic_data(data, prefix="p")
problems_df

Unnamed: 0,text,cui,name,start,end,relevance,presence,laterality (generic)
0,issue list\nilio-inguinal nerve entrapment\npa...,22033007,fetal growth retardation,551,574,present,negated,
1,diagnosis list :\ncolitis presumed infectious\...,89538001,helicobacter-associated gastritis,73,106,present,confirmed,
2,problems list hypoglossia\ninjury following al...,31268005,thrombophlebitis migrans,106,130,historic,confirmed,
3,no open wound of flank without complication . ...,275459007,open wound of flank without complication,3,43,present,negated,
4,"differential\nsuspected corn, diverticulitis o...",253866002,crossed ectopia of kidney with fusion anomaly,62,107,present,negated,
...,...,...,...,...,...,...,...,...
47442,dx :\nno loeys-dietz syndrome\n\n\nactive meds...,90465004,crushing injury of face,174,192,present,confirmed,
47443,frequent dsap - disseminated superficial actin...,41495000,disseminated superficial actinic porokeratosis,9,62,historic,confirmed,
47444,never had splinter in foot . currently on isos...,287121002,splinter in foot,10,26,historic,negated,
47445,had anhydramnios\ndifferential anterior shin s...,201137002,alopecia localis,60,76,present,confirmed,


In [92]:
# Concatenate with previous dataset
previous_problems_dataset = pd.read_csv("problems_synthetic_train_data.csv")
df = pd.concat([previous_problems_dataset, problems_df])
df.to_csv("problems_synthetic_train_data.csv")
df

  previous_problems_dataset = pd.read_csv("problems_synthetic_train_data.csv")


Unnamed: 0.1,Unnamed: 0,text,cui,name,start,end,relevance,presence,laterality (generic)
0,0.0,decidual endometritis from age 12,75585005,decidual endometritis,0,21,historic,confirmed,none
1,1.0,bilateral metatarsus adductus prev,15667441000119108,bilateral metatarsus adductus,0,29,historic,confirmed,bilateral
2,2.0,crohn's disease of pylorus a few years previously,61424003,crohn's disease of pylorus,0,26,historic,confirmed,none
3,3.0,no prev hist of congenital facial nerve palsy,230542008,congenital facial nerve palsy,16,45,historic,negated,none
4,4.0,no prev hx accidental fusidic acid overdose,296643001,accidental fusidic acid overdose,11,43,historic,negated,none
...,...,...,...,...,...,...,...,...,...
47442,,dx :\nno loeys-dietz syndrome\n\n\nactive meds...,90465004,crushing injury of face,174,192,present,confirmed,
47443,,frequent dsap - disseminated superficial actin...,41495000,disseminated superficial actinic porokeratosis,9,62,historic,confirmed,
47444,,never had splinter in foot . currently on isos...,287121002,splinter in foot,10,26,historic,negated,
47445,,had anhydramnios\ndifferential anterior shin s...,201137002,alopecia localis,60,76,present,confirmed,


In [93]:
# Concatenate with previous dataset
previous_reactions_dataset = pd.read_csv("reactions_synthetic_train_data.csv")
df = pd.concat([previous_reactions_dataset, reactions_df])
df.to_csv("reactions_synthetic_train_data.csv")
df

Unnamed: 0,text,cui,name,start,end,reaction_pos
0,feeling breathless today.,267036007,dyspnea,8,18,none
1,allergies: severe severe depression with co-te...,310497006,severe depression,18,35,before
2,allergies: severe liver palms with cannabidiol.,248413004,liver palms,18,29,before
3,allergies: severe oral dyspraxia with dapoxetine.,361275004,oral dyspraxia,18,32,before
4,allergies: severe postcholecystectomy diarrhea...,53156005,postcholecystectomy diarrhea,18,46,before
...,...,...,...,...,...,...
356,experienced anaphylaxis with romosozumab,39579001,anaphylaxis,12,23,before
357,had anaphylaxis due to multivitamin capsules,39579001,anaphylaxis,4,15,before
358,had anaphylaxis due to homeopathic hamamelis,39579001,anaphylaxis,4,15,before
359,had anaphylaxis with larvae sterile,39579001,anaphylaxis,4,15,before


In [94]:
# Concatenate with previous dataset
previous_medications_dataset = pd.read_csv("meds_synthetic_train_data.csv")
df = pd.concat([previous_medications_dataset, med_allergy_df])
df.to_csv("meds_synthetic_train_data.csv")
df

Unnamed: 0,text,cui,name,start,end,substance_category,allergy_type,severity
0,severe egg allergy.,102263004,eggs,7,10,adverse reaction,allergy,severe
1,allergic to pineapple.,256313003,pineapple,12,21,adverse reaction,allergy,unspecified
2,meds: penicillin v 500mg qds,39359008,product containing phenoxymethylpenicillin,6,18,taking,unspecified,unspecified
3,allergies: severe severe depression with co-te...,18037411000001106,co-tenidone,41,52,adverse reaction,allergy,severe
4,allergies: severe liver palms with cannabidiol.,35022511000001100,cannabidiol,35,46,adverse reaction,allergy,severe
...,...,...,...,...,...,...,...,...
558,not keen on taking alpelisib,38924711000001103,alpelisib,19,28,irrelevant,,
559,not keen on taking selpercatinib,39482711000001104,selpercatinib,19,32,irrelevant,,
560,not keen on taking kaolin poultices,14614411000001101,kaolin poultices,19,35,irrelevant,,
561,not keen on taking gilteritinib,37898211000001109,gilteritinib,19,31,irrelevant,,
