<a href="https://colab.research.google.com/github/victormurcia/CTS_Test/blob/main/Testing_Parsing_of_Patient_EHR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [71]:
#I need to import locale to ensure that the encoding is set to UTF-8 (weird Google Colab bug)
import locale
locale.getpreferredencoding = lambda: "UTF-8"

#Check the current build in Google Colab
!cat /etc/*release
print('\n')

#Check CUDA version
!nvcc --version
print('\n')

#Ensure that the required packages are installed in the current environment
!pip install ipywidgets --quiet
!pip install spacy==3.4.4 --quiet
!pip install scispacy --quiet
!pip install medspacy --quiet
!pip install negspacy --quiet
!pip install transformers
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet 
print('\n')

#Spacy models used for processing biomedical, scientific, or clinical text 
#Spacy pipeline for biomedical data.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz --quiet

print('\n')

DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=20.04
DISTRIB_CODENAME=focal
DISTRIB_DESCRIPTION="Ubuntu 20.04.5 LTS"
NAME="Ubuntu"
VERSION="20.04.5 LTS (Focal Fossa)"
ID=ubuntu
ID_LIKE=debian
PRETTY_NAME="Ubuntu 20.04.5 LTS"
VERSION_ID="20.04"
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
VERSION_CODENAME=focal
UBUNTU_CODENAME=focal


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  Preparing metadata (setup.py) ... [?25l[?25hdone




In [72]:
#Import the required libraries/packages
#General utilities
import numpy as np
import pandas as pd
import dask.dataframe as dd
import seaborn as sns
from ipywidgets import widgets, interact, interactive, fixed, interact_manual
import random, string

#NLP Stuff
#Spacy
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #Load stopwords
from spacy.language import Language
from spacy.tokenizer import Tokenizer
#Scispacy
import scispacy
from scispacy.linking import EntityLinker
from scispacy.abbreviation import AbbreviationDetector
from scispacy.hyponym_detector import HyponymDetector
#Medspacy
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent
from negspacy.negation import Negex

#To use Transformers models from HuggingFace
import transformers
from transformers import AutoTokenizer, AutoModel,AutoModelForTokenClassification

In [73]:
#Enable data to be extracted from my Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load the Restructured Patient Dataframe
I have the .csv file containing a dataframe I made after restructuring the patient EHRs to play with in my GitHub repository. 

In [74]:
url ='https://raw.githubusercontent.com/victormurcia/CTS_Test/main/multi_veteran_df.csv'
patients_df = pd.read_csv(url)
patients_df

Unnamed: 0,PATIENT,CODE_als,DESCRIPTION_als,Id_cps,CODE_cps,DESCRIPTION_cps,REASONCODE_cps,REASONDESCRIPTION_cps,CODE_cds,DESCRIPTION_cds,...,LAT_pts,LON_pts,HEALTHCARE_EXPENSES_pts,HEALTHCARE_COVERAGE_pts,CODE_prs,DESCRIPTION_prs,REASONCODE_prs,REASONDESCRIPTION_prs,CODE_sps,DESCRIPTION_sps
0,98de4759-8225-4160-adb6-2559305fe1df,"[300916003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 91930004.0, 300913006.0]","['Latex allergy', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy', 'All...","['3015755d-ed8c-4d24-9351-c62a2c70c4cb', 'af95a5e1-36cc-41ba-b037-85aec558cf77', 'df21521c-2625-...","[711282006.0, 384758001.0, 699728000.0, 225358003.0, 170836005.0, 734163000.0, 225358003.0, 5395...","['Skin condition care', 'Self-care interventions (procedure)', 'Asthma self management', 'Wound ...","[24079001.0, nan, 233678006.0, 284549007.0, nan, 55680006.0, 283371005.0, 10509002.0, 15777000.0...","['Atopic dermatitis', nan, 'Childhood asthma', 'Laceration of hand', nan, 'Drug overdose', 'Lace...","[24079001.0, 65363002.0, 233678006.0, 367498001.0, 195662009.0, 284549007.0, 43878008.0, 1956620...","['Atopic dermatitis', 'Otitis media', 'Childhood asthma', 'Seasonal allergic rhinitis', 'Acute v...",...,[38.11572396207773],[-122.29443682927533],[1015063.23],[15986.36],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 395142003.0, 430193006.0, 1712...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, nan, nan, 233678006.0, nan, 195662009.0, nan, nan, nan, 284549007.0, n...","[nan, nan, nan, nan, nan, nan, nan, 'Childhood asthma', nan, 'Acute viral pharyngitis (disorder)...","[409534002.0, 713779008.0, 469673003.0, 706724001.0, 419343004.0, 470618009.0, 409534002.0, 7137...","['Disposable air-purifying respirator (physical object)', 'Nitrile examination/treatment glove ..."
1,ad5977c9-1260-495b-aa55-dc09860ec783,"[419474003.0, 232350006.0, 232347008.0, 418689008.0, 419263009.0, 425525006.0, 91934008.0]","['Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy', 'Allergy to grass pol...","['b18fecf0-42c3-4c45-8693-168720f9390a', '9e610fdb-a195-4bab-95c4-08750c2fc1ca', 'a9326bbb-1b14-...","[384758001.0, 699728000.0, 53950000.0, 385691007.0, 386522008.0, 385691007.0, 91251008.0, 539500...","['Self-care interventions (procedure)', 'Asthma self management', 'Respiratory therapy', 'Fractu...","[nan, 233678006.0, 10509002.0, 33737001.0, 192127007.0, 16114001.0, 44465007.0, 10509002.0]","[nan, 'Childhood asthma', 'Acute bronchitis (disorder)', 'Fracture of rib', 'Child attention def...","[65363002.0, 446096008.0, 444814009.0, 233678006.0, 65363002.0, 65363002.0, 10509002.0, 33737001...","['Otitis media', 'Perennial allergic rhinitis', 'Viral sinusitis (disorder)', 'Childhood asthma'...",...,[34.07864131788067],[-117.68830169526343],[30604.94],[0.0],"[430193006.0, 395142003.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 171231001.0, 4301...","['Medication Reconciliation (procedure)', 'Allergy screening test', 'Medication Reconciliation (...","[nan, nan, nan, nan, nan, nan, 233678006.0, nan, nan, 33737001.0, nan, nan, nan, nan, nan, 19212...","[nan, nan, nan, nan, nan, nan, 'Childhood asthma', nan, nan, 'Fracture of rib', nan, nan, nan, n...",[],[]
2,af5f7e54-ddd4-4203-833e-d6e2987be0b0,"[419474003.0, 232347008.0, 418689008.0, 419263009.0, 425525006.0, 420174000.0]","['Allergy to mould', 'Dander (animal) allergy', 'Allergy to grass pollen', 'Allergy to tree poll...","['06cff475-6242-48da-85f8-33400552168f', '5e3a362a-855b-40ee-9fa6-ffee45aa860a', '0d6944bc-6e99-...","[384758001.0, 47387005.0, 53950000.0, 385691007.0, 385691007.0, 698360004.0, 91251008.0, 5395000...","['Self-care interventions (procedure)', 'Head injury rehabilitation', 'Respiratory therapy', 'Fr...","[nan, 62106007.0, 10509002.0, 65966004.0, 263102004.0, 15777000.0, 44465007.0, 10509002.0, 28338...","[nan, 'Concussion with no loss of consciousness', 'Acute bronchitis (disorder)', 'Fracture of fo...","[65363002.0, 62106007.0, 10509002.0, 195662009.0, 65966004.0, 444814009.0, 263102004.0, 44481400...","['Otitis media', 'Concussion with no loss of consciousness', 'Acute bronchitis (disorder)', 'Acu...",...,[37.34491726453704],[-121.9326999347838],[1424871.8],[6762.9],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 2342...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, nan, nan, 10509002.0, nan, nan, nan, nan, nan, nan, 65966004.0, nan, n...","[nan, nan, nan, nan, nan, nan, nan, 'Acute bronchitis (disorder)', nan, nan, nan, nan, nan, nan,...",[],[]
3,3d8909a0-651c-4e62-bf3e-1482390bdc58,"[300916003.0, 424213003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 419263009.0, 4255...","['Latex allergy', 'Allergy to bee venom', 'Allergy to mould', 'House dust mite allergy', 'Dander...","['d8250f49-a205-4e93-bcd4-892ca1e42ea0', '508ab50c-a8de-4ce8-8b7b-98dd7f4375c1', 'f31dbc3b-0f08-...","[384758001.0, 53950000.0, 711282006.0, 53950000.0, 385691007.0, 386522008.0, 734163000.0, 698360...","['Self-care interventions (procedure)', 'Respiratory therapy', 'Skin condition care', 'Respirato...","[nan, 10509002.0, 24079001.0, 10509002.0, 65966004.0, 192127007.0, 449868002.0, 15777000.0, 2398...","[nan, 'Acute bronchitis (disorder)', 'Atopic dermatitis', 'Acute bronchitis (disorder)', 'Fractu...","[10509002.0, 43878008.0, 24079001.0, 65363002.0, 10509002.0, 65966004.0, 232353008.0, 43878008.0...","['Acute bronchitis (disorder)', 'Streptococcal sore throat (disorder)', 'Atopic dermatitis', 'Ot...",...,[38.41288433052555],[-121.47075316015002],[1610676.85],[10712.75],"[430193006.0, 269911007.0, 117015009.0, 430193006.0, 430193006.0, 23426006.0, 430193006.0, 12250...","['Medication Reconciliation (procedure)', 'Sputum examination (procedure)', 'Throat culture (pro...","[nan, 10509002.0, 43878008.0, nan, nan, 10509002.0, nan, nan, 65966004.0, nan, nan, nan, 1921270...","[nan, 'Acute bronchitis (disorder)', 'Streptococcal sore throat (disorder)', nan, nan, 'Acute br...",[],[]
4,a8fcc478-a986-47bb-8a49-47387a3e15ab,"[424213003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 419263009.0, 91930004.0, 91934...","['Allergy to bee venom', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy...","['ab529498-2354-4110-aa5a-6f5c8abad668', 'c609caad-a030-4524-aaa3-0e4140e40129', 'f548eabb-1202-...","[384758001.0, 699728000.0, 170836005.0, 53950000.0, 91251008.0, 53950000.0, 47387005.0, 38569100...","['Self-care interventions (procedure)', 'Asthma self management', 'Allergic disorder monitoring'...","[nan, 233678006.0, nan, 10509002.0, 44465007.0, 10509002.0, 62106007.0, 65966004.0, 284551006.0,...","[nan, 'Childhood asthma', nan, 'Acute bronchitis (disorder)', 'Sprain of ankle', 'Acute bronchit...","[65363002.0, 195662009.0, 233678006.0, 65363002.0, 65363002.0, 232353008.0, 444814009.0, 4448140...","['Otitis media', 'Acute viral pharyngitis (disorder)', 'Childhood asthma', 'Otitis media', 'Otit...",...,[26.0539759874286],[-80.24179482942706],[1159901.93],[35541.49999999999],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 171231001.0, 430193006.0, 430193006.0, 4301...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, 233678006.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na...","[nan, nan, nan, nan, 'Childhood asthma', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[],[]
5,797e0e78-121c-4523-bfc5-c1fac631e504,"[232347008.0, 91930004.0]","['Dander (animal) allergy', 'Allergy to eggs']","['d90cf202-537f-4938-88c2-e8e02627bafe', 'da6922e5-9803-464d-9efb-1aea50b29991', '26eab1e8-a9fa-...","[384758001.0, 53950000.0, 443402002.0, 736254008.0, 718347000.0, 91251008.0, 736353004.0, 539500...","['Self-care interventions (procedure)', 'Respiratory therapy', 'Lifestyle education regarding hy...","[nan, 10509002.0, 59621000.0, 47505003.0, 47505003.0, 44465007.0, nan, 10509002.0, 44054006.0, 2...","[nan, 'Acute bronchitis (disorder)', 'Hypertension', 'Posttraumatic stress disorder', 'Posttraum...","[65363002.0, 65363002.0, 65363002.0, 195662009.0, 241929008.0, 195662009.0, 10509002.0, 44481400...","['Otitis media', 'Otitis media', 'Otitis media', 'Acute viral pharyngitis (disorder)', 'Acute al...",...,[33.92408161999087],[-118.05437508893844],[1624346.03],[543863.5699999991],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 117015009.0, 313191000.0, 4301...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, 195662009.0, nan, nan, 195662009.0, nan, nan, 10509002.0, nan, nan, na...","[nan, nan, nan, nan, nan, 'Acute viral pharyngitis (disorder)', nan, nan, 'Acute viral pharyngit...",[],[]
6,285ac536-7075-4743-868e-4fb185323cef,[91934008.0],['Allergy to nut'],"['e96495b9-0d9e-4031-b5d7-79b389a8ceba', 'cb2a1552-0936-4bf7-86fa-349d0d584cac', '4783ad20-43be-...","[384758001.0, 47387005.0, 91251008.0, 53950000.0, 91251008.0, 443402002.0, 718347000.0, 73743400...","['Self-care interventions (procedure)', 'Head injury rehabilitation', 'Physical therapy procedur...","[nan, 62106007.0, 44465007.0, 10509002.0, 44465007.0, 59621000.0, 36923009.0, 370143000.0, 62106...","[nan, 'Concussion with no loss of consciousness', 'Sprain of ankle', 'Acute bronchitis (disorder...","[241929008.0, 65363002.0, 65363002.0, 195662009.0, 43878008.0, 62106007.0, 43878008.0, 444814009...","['Acute allergic reaction', 'Otitis media', 'Otitis media', 'Acute viral pharyngitis (disorder)'...",...,[29.670226410773584],[-82.40880503150285],[1485517.82],[7659.919999999999],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 313191000.0, 430193006.0, 430193006.0, 4301...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 10509002.0, nan, nan, nan, nan, 369...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'Acute bronchitis (disorder)', nan,...",[],[]
7,1e97a5cd-22a2-4205-95af-9567cefbd5d8,"[300916003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 419263009.0, 91934008.0]","['Latex allergy', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy', 'All...","['f79d6834-9f3d-4538-a0a0-3b54ce4f1244', 'c973520f-43a7-4532-a8ae-c6eec6796661', '9659aed0-fcf4-...","[384758001.0, 711282006.0, 53950000.0, 53950000.0, 170836005.0, 698360004.0, 53950000.0, 5395000...","['Self-care interventions (procedure)', 'Skin condition care', 'Respiratory therapy', 'Respirato...","[nan, 24079001.0, 10509002.0, 10509002.0, nan, 15777000.0, 10509002.0, 10509002.0, 370247008.0, ...","[nan, 'Atopic dermatitis', 'Acute bronchitis (disorder)', 'Acute bronchitis (disorder)', nan, 'P...","[232353008.0, 24079001.0, 10509002.0, 65363002.0, 444814009.0, 10509002.0, 444814009.0, 19566200...","['Perennial allergic rhinitis with seasonal variation', 'Atopic dermatitis', 'Acute bronchitis (...",...,[27.732385680635296],[-82.69091589800718],[936662.5],[4484.039999999999],"[430193006.0, 430193006.0, 430193006.0, 395142003.0, 430193006.0, 430193006.0, 23426006.0, 43019...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, nan, 10509002.0, nan, nan, nan, 10509002.0, nan, nan, 195662009.0, nan...","[nan, nan, nan, nan, nan, nan, 'Acute bronchitis (disorder)', nan, nan, nan, 'Acute bronchitis (...",[],[]
8,462b6335-9775-456f-8573-5ab03500770b,"[300916003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 419263009.0, 420174000.0, 9193...","['Latex allergy', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy', 'All...","['e37cbdd8-bb79-46ed-9f45-8ee33f2751b6', '3e444799-8081-4bbb-9e7b-2144c3d347c3', 'e31dfdb7-25a0-...","[384758001.0, 699728000.0, 53950000.0, 225358003.0, 170836005.0, 53950000.0, 736376001.0, 736376...","['Self-care interventions (procedure)', 'Asthma self management', 'Respiratory therapy', 'Wound ...","[nan, 233678006.0, 10509002.0, 284549007.0, nan, 10509002.0, 840544004.0, 840539006.0]","[nan, 'Childhood asthma', 'Acute bronchitis (disorder)', 'Laceration of hand', nan, 'Acute bronc...","[65363002.0, 65363002.0, 195662009.0, 233678006.0, 232353008.0, 43878008.0, 43878008.0, 10509002...","['Otitis media', 'Otitis media', 'Acute viral pharyngitis (disorder)', 'Childhood asthma', 'Pere...",...,[32.697143538002976],[-117.1466232011608],[564257.87],[11213.899999999998],"[430193006.0, 430193006.0, 395142003.0, 430193006.0, 430193006.0, 430193006.0, 171231001.0, 1170...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Allergy scre...","[nan, nan, nan, nan, nan, nan, 233678006.0, 43878008.0, 10509002.0, 284549007.0, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, 'Childhood asthma', 'Streptococcal sore throat (disorder)', 'Acut...",[],[]
9,57a4c01b-5f84-417f-ae96-61a6537e21f6,"[424213003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 419263009.0, 91930004.0]","['Allergy to bee venom', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy...","['ef602217-11a9-49ef-b09d-42da7aea1ec8', 'f1f37e9c-bea7-4122-ac73-bf802352278d', 'b1b364a4-0748-...","[384758001.0, 53950000.0, 443402002.0, 53950000.0, 718347000.0, 737434004.0, 53950000.0, 7362540...","['Self-care interventions (procedure)', 'Respiratory therapy', 'Lifestyle education regarding hy...","[nan, 10509002.0, 59621000.0, 10509002.0, 36923009.0, 370143000.0, 10509002.0, 47505003.0, 22544...","[nan, 'Acute bronchitis (disorder)', 'Hypertension', 'Acute bronchitis (disorder)', 'Major depre...","[65363002.0, 241929008.0, 444814009.0, 367498001.0, 10509002.0, 59621000.0, 10509002.0, 36923009...","['Otitis media', 'Acute allergic reaction', 'Viral sinusitis (disorder)', 'Seasonal allergic rhi...",...,[30.35098270891739],[-84.49276761429394],[626520.63],[2617.42],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 313191000.0, 395142003.0, 430193006.0, 4301...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[],[]


In [75]:
# select a single row from the DataFrame
row_data = patients_df.loc[0]

# create a new DataFrame with the single row
patient_df = pd.DataFrame([row_data], columns=row_data.index)

patient_df

Unnamed: 0,PATIENT,CODE_als,DESCRIPTION_als,Id_cps,CODE_cps,DESCRIPTION_cps,REASONCODE_cps,REASONDESCRIPTION_cps,CODE_cds,DESCRIPTION_cds,...,LAT_pts,LON_pts,HEALTHCARE_EXPENSES_pts,HEALTHCARE_COVERAGE_pts,CODE_prs,DESCRIPTION_prs,REASONCODE_prs,REASONDESCRIPTION_prs,CODE_sps,DESCRIPTION_sps
0,98de4759-8225-4160-adb6-2559305fe1df,"[300916003.0, 419474003.0, 232350006.0, 232347008.0, 418689008.0, 91930004.0, 300913006.0]","['Latex allergy', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy', 'All...","['3015755d-ed8c-4d24-9351-c62a2c70c4cb', 'af95a5e1-36cc-41ba-b037-85aec558cf77', 'df21521c-2625-...","[711282006.0, 384758001.0, 699728000.0, 225358003.0, 170836005.0, 734163000.0, 225358003.0, 5395...","['Skin condition care', 'Self-care interventions (procedure)', 'Asthma self management', 'Wound ...","[24079001.0, nan, 233678006.0, 284549007.0, nan, 55680006.0, 283371005.0, 10509002.0, 15777000.0...","['Atopic dermatitis', nan, 'Childhood asthma', 'Laceration of hand', nan, 'Drug overdose', 'Lace...","[24079001.0, 65363002.0, 233678006.0, 367498001.0, 195662009.0, 284549007.0, 43878008.0, 1956620...","['Atopic dermatitis', 'Otitis media', 'Childhood asthma', 'Seasonal allergic rhinitis', 'Acute v...",...,[38.11572396207773],[-122.29443682927533],[1015063.23],[15986.36],"[430193006.0, 430193006.0, 430193006.0, 430193006.0, 430193006.0, 395142003.0, 430193006.0, 1712...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication R...","[nan, nan, nan, nan, nan, nan, nan, 233678006.0, nan, 195662009.0, nan, nan, nan, 284549007.0, n...","[nan, nan, nan, nan, nan, nan, nan, 'Childhood asthma', nan, 'Acute viral pharyngitis (disorder)...","[409534002.0, 713779008.0, 469673003.0, 706724001.0, 419343004.0, 470618009.0, 409534002.0, 7137...","['Disposable air-purifying respirator (physical object)', 'Nitrile examination/treatment glove ..."


In [76]:
patient_df.columns

Index(['PATIENT', 'CODE_als', 'DESCRIPTION_als', 'Id_cps', 'CODE_cps',
       'DESCRIPTION_cps', 'REASONCODE_cps', 'REASONDESCRIPTION_cps',
       'CODE_cds', 'DESCRIPTION_cds', 'CODE_dvs', 'DESCRIPTION_dvs', 'UDI_dvs',
       'Id_iss', 'BODYSITE_CODE_iss', 'BODYSITE_DESCRIPTION_iss',
       'MODALITY_CODE_iss', 'MODALITY_DESCRIPTION_iss', 'SOP_CODE_iss',
       'SOP_DESCRIPTION_iss', 'CODE_ims', 'DESCRIPTION_ims', 'CODE_mds',
       'DESCRIPTION_mds', 'DISPENSES_mds', 'TOTALCOST_mds', 'REASONCODE_mds',
       'REASONDESCRIPTION_mds', 'CODE_obs', 'DESCRIPTION_obs', 'VALUE_obs',
       'UNITS_obs', 'Id_pts', 'BIRTHDATE_pts', 'PREFIX_pts', 'MARITAL_pts',
       'RACE_pts', 'ETHNICITY_pts', 'GENDER_pts', 'BIRTHPLACE_pts', 'CITY_pts',
       'STATE_pts', 'COUNTY_pts', 'ZIP_pts', 'LAT_pts', 'LON_pts',
       'HEALTHCARE_EXPENSES_pts', 'HEALTHCARE_COVERAGE_pts', 'CODE_prs',
       'DESCRIPTION_prs', 'REASONCODE_prs', 'REASONDESCRIPTION_prs',
       'CODE_sps', 'DESCRIPTION_sps'],
      dtype

In [77]:
#Select columns that summarize patient profile and put them into a list
allergies     = patient_df['DESCRIPTION_als']
condition     = patient_df['DESCRIPTION_cds']
devices       = patient_df['DESCRIPTION_dvs']
immunizations = patient_df['DESCRIPTION_ims']
medications   = patient_df['DESCRIPTION_mds']
observations  = patient_df['DESCRIPTION_obs']
procedures    = patient_df['DESCRIPTION_prs']
birthday      = patient_df['BIRTHDATE_pts']
marital       = patient_df['MARITAL_pts']
race          = patient_df['RACE_pts']
ethnicity     = patient_df['ETHNICITY_pts']
gender        = patient_df['GENDER_pts']
city          = patient_df['CITY_pts']
county        = patient_df['COUNTY_pts']

#Make list for patient profile
patient_prof_list = [allergies, condition, devices, immunizations, medications, observations, procedures,birthday,marital, race, ethnicity, gender, city, county]
patient_prof_cols = ['allergies', 'condition', 'devices', 'immunizations', 'medications', 'observations', 'procedures', 'birthday', 'marital', 'race', 'ethnicity', 'gender', 'city', 'county']

# create a dictionary with column names and Series data
data_dict = dict(zip(patient_prof_cols, patient_prof_list))

#print(col_data)
# create a new DataFrame with the single column
patient_prof = pd.DataFrame(data_dict)

# set the maximum column width to 300 characters
pd.set_option('display.max_colwidth', 150)
patient_prof

Unnamed: 0,allergies,condition,devices,immunizations,medications,observations,procedures,birthday,marital,race,ethnicity,gender,city,county
0,"['Latex allergy', 'Allergy to mould', 'House dust mite allergy', 'Dander (animal) allergy', 'Allergy to grass pollen', 'Allergy to eggs', 'Shellfi...","['Atopic dermatitis', 'Otitis media', 'Childhood asthma', 'Seasonal allergic rhinitis', 'Acute viral pharyngitis (disorder)', 'Laceration of hand'...",[],"['Hep B adolescent or pediatric', 'Hep B adolescent or pediatric', 'Hib (PRP-OMP)', 'IPV', 'Hib (PRP-OMP)', 'IPV', 'IPV', 'Influenza seasonal ...","['Astemizole 10 MG Oral Tablet', 'Amoxicillin 250 MG Oral Capsule', 'Acetaminophen 160 MG Chewable Tablet', '120 ACTUAT Fluticasone propionate 0.0...","['Body Height', 'Pain severity - 0-10 verbal numeric rating [Score] - Reported', 'Body Weight', 'Weight-for-length Per age and sex', 'Head Occipit...","['Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication Reconciliation (procedure)', 'Medication Reconcilia...",['1983-08-07'],['M'],['white'],['nonhispanic'],['M'],['Vallejo'],['Solano County']


In [78]:
# concatenate the Series data into a single Series
combined_series = pd.concat(patient_prof_list)

# create a DataFrame with a single column using the combined Series
df = pd.DataFrame({'Patient_Profile': combined_series})

# set the maximum column width to 300 characters
pd.set_option('display.max_colwidth', 100)
df['aspects'] = patient_prof_cols

# Change the order of columns,reset the index, and drop the index column
df = df.reindex(columns=['aspects', 'Patient_Profile']).reset_index().drop('index',axis=1)

# Convert the list column to a string column
df['Patient_Profile'] = df['Patient_Profile'].apply(lambda x: ''.join(map(str, eval(x))))

df

Unnamed: 0,aspects,Patient_Profile
0,allergies,Latex allergyAllergy to mouldHouse dust mite allergyDander (animal) allergyAllergy to grass poll...
1,condition,Atopic dermatitisOtitis mediaChildhood asthmaSeasonal allergic rhinitisAcute viral pharyngitis (...
2,devices,
3,immunizations,Hep B adolescent or pediatricHep B adolescent or pediatricHib (PRP-OMP)IPVHib (PRP-OMP)IPVIPVI...
4,medications,Astemizole 10 MG Oral TabletAmoxicillin 250 MG Oral CapsuleAcetaminophen 160 MG Chewable Tablet1...
5,observations,Body HeightPain severity - 0-10 verbal numeric rating [Score] - ReportedBody WeightWeight-for-le...
6,procedures,Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication Reconciliat...
7,birthday,1983-08-07
8,marital,M
9,race,white


In [84]:
# Load the pre-trained spaCy model with sci-spaCy
ss_sm = spacy.load("en_core_sci_sm")

def get_umls_codes(text: str):
    
    #Add the EntityLinker pipe to spacy pipeline
    if 'scispacy_linker' not in ss_sm.pipe_names:
      ss_sm.add_pipe("scispacy_linker", config={"linker_name": "umls", "max_entities_per_mention": 1})
    
    # Process the text and extract UMLS codes
    doc = ss_sm(text)
    umls_codes = [
        {
            "text": entity.text,
            #"start": entity.start_char,
            #"end": entity.end_char,
            "umls_id": umls_ent[0],
            "score": umls_ent[1]
        }
        for entity in doc.ents
        for umls_ent in entity._.kb_ents
    ]
    
    return umls_codes

def extract_values(dicts, key):
    return [d.get(key, None) for d in dicts]

In [85]:
df['umls_codes'] = df['Patient_Profile'].apply(get_umls_codes)

# Create new columns from the keys in the dictionaries within the 'info' column lists
unique_keys = set().union(*(d.keys() for dicts in df['umls_codes'] for d in dicts))

for key in unique_keys:
    df[key] = df['umls_codes'].apply(lambda dicts: extract_values(dicts, key))

df

Unnamed: 0,aspects,Patient_Profile,umls_codes,umls_id,score,start,text,end
0,allergies,Latex allergyAllergy to mouldHouse dust mite allergyDander (animal) allergyAllergy to grass poll...,"[{'text': 'Latex allergyAllergy', 'umls_id': 'C0577628', 'score': 0.8627872467041016}, {'text': ...","[C0577628, C0998367, C0003062, C0018621, C0577625]","[0.8627872467041016, 0.7111788988113403, 1.0, 0.7985321283340454, 0.7244556546211243]","[0, 24, 60, 86, 109]","[Latex allergyAllergy, mouldHouse dust mite, animal, grass pollenAllergy, eggsShellfish allergy]","[20, 44, 66, 105, 130]"
1,condition,Atopic dermatitisOtitis mediaChildhood asthmaSeasonal allergic rhinitisAcute viral pharyngitis (...,"[{'text': 'Atopic dermatitisOtitis', 'umls_id': 'C0011615', 'score': 0.8733875155448914}, {'text...","[C0011615, C0231335, C0276143, C0036689, C0276143, C1827629, C0036689, C0276143, C0029944, C0149...","[0.8733875155448914, 0.8372848033905029, 1.0, 0.8360282778739929, 1.0, 1.0, 0.7170243859291077, ...","[0, 24, 77, 119, 165, 201, 243, 303, 321, 424, 479, 626, 644, 755, 787]","[Atopic dermatitisOtitis, mediaChildhood, viral pharyngitis, handStreptococcal sore throat, vira...","[23, 38, 94, 148, 182, 221, 286, 320, 359, 450, 504, 643, 651, 785, 794]"
2,devices,,[],[],[],[],[],[]
3,immunizations,Hep B adolescent or pediatricHep B adolescent or pediatricHib (PRP-OMP)IPVHib (PRP-OMP)IPVIPVI...,"[{'text': 'Hep B', 'umls_id': 'C0162569', 'score': 0.8482469916343689}, {'text': 'adult', 'umls_...","[C0162569, C0001675, C0033086, C0001675, C0033086]","[0.8482469916343689, 1.0, 1.0, 1.0, 1.0]","[0, 1231, 1238, 1802, 1809]","[Hep B, adult, preservative, adult, preservative]","[5, 1236, 1250, 1807, 1821]"
4,medications,Astemizole 10 MG Oral TabletAmoxicillin 250 MG Oral CapsuleAcetaminophen 160 MG Chewable Tablet1...,"[{'text': 'Astemizole', 'umls_id': 'C0085170', 'score': 1.0}, {'text': 'MG', 'umls_id': 'C002444...","[C0085170, C0024443, C0002645, C0024443, C0226896, C0024443, C0117996, C0178602, C4055499, C0117...","[1.0, 1.0, 0.7262410521507263, 1.0, 0.9999998807907104, 1.0, 1.0, 0.9999999403953552, 0.81247895...","[0, 14, 17, 44, 47, 77, 106, 219, 235, 242, 355, 371, 378, 491, 507, 514, 627, 643, 650, 763, 77...","[Astemizole, MG, Oral TabletAmoxicillin, MG, Oral, MG, Fluticasone propionate, Dose, ACTUAT, Flu...","[10, 16, 39, 46, 51, 79, 128, 223, 241, 264, 359, 377, 400, 495, 513, 536, 631, 649, 672, 767, 7..."
5,observations,Body HeightPain severity - 0-10 verbal numeric rating [Score] - ReportedBody WeightWeight-for-le...,"[{'text': 'severity', 'umls_id': 'C0439793', 'score': 1.0}, {'text': 'Score', 'umls_id': 'C04498...","[C0439793, C0449820, C0001779, C0262499, C0871470, C0005767, C0014772, C0005767, C0560267, C0560...","[1.0, 1.0, 0.8169045448303223, 0.9019030332565308, 0.8903989195823669, 1.0, 0.7105526328086853, ...","[16, 55, 101, 121, 162, 250, 259, 301, 337, 370, 390, 419, 438, 458, 496, 512, 522, 552, 559, 61...","[severity, Score, Per age, Occipital-frontal circumferenceDiastolic, Blood PressureSystolic, Blo...","[24, 60, 108, 161, 184, 255, 286, 306, 348, 385, 395, 433, 456, 470, 507, 521, 538, 557, 573, 62..."
6,procedures,Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication Reconciliat...,[{'text': 'Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication ...,"[C2317067, C2317067, C2317067, C2317067, C0185115, C0199230, C0199230, C0199230, C0199230, C2238...","[0.8900972008705139, 0.8144897222518921, 0.7232307195663452, 0.7518997192382812, 0.8148413300514...","[0, 286, 441, 2337, 2400, 2595, 2744, 2809, 2930, 3160, 3218, 3320, 3417, 3422, 3531, 3666, 3699...",[Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication Reconcilia...,"[158, 407, 476, 2399, 2421, 2604, 2753, 2818, 2939, 3193, 3225, 3329, 3421, 3435, 3540, 3680, 37..."
7,birthday,1983-08-07,[],[],[],[],[],[]
8,marital,M,[],[],[],[],[],[]
9,race,white,"[{'text': 'white', 'umls_id': 'C0007457', 'score': 1.0}]",[C0007457],[1.0],[0],[white],[5]


In [86]:
df = df.drop(['start','end'],axis=1)
df

Unnamed: 0,aspects,Patient_Profile,umls_codes,umls_id,score,text
0,allergies,Latex allergyAllergy to mouldHouse dust mite allergyDander (animal) allergyAllergy to grass poll...,"[{'text': 'Latex allergyAllergy', 'umls_id': 'C0577628', 'score': 0.8627872467041016}, {'text': ...","[C0577628, C0998367, C0003062, C0018621, C0577625]","[0.8627872467041016, 0.7111788988113403, 1.0, 0.7985321283340454, 0.7244556546211243]","[Latex allergyAllergy, mouldHouse dust mite, animal, grass pollenAllergy, eggsShellfish allergy]"
1,condition,Atopic dermatitisOtitis mediaChildhood asthmaSeasonal allergic rhinitisAcute viral pharyngitis (...,"[{'text': 'Atopic dermatitisOtitis', 'umls_id': 'C0011615', 'score': 0.8733875155448914}, {'text...","[C0011615, C0231335, C0276143, C0036689, C0276143, C1827629, C0036689, C0276143, C0029944, C0149...","[0.8733875155448914, 0.8372848033905029, 1.0, 0.8360282778739929, 1.0, 1.0, 0.7170243859291077, ...","[Atopic dermatitisOtitis, mediaChildhood, viral pharyngitis, handStreptococcal sore throat, vira..."
2,devices,,[],[],[],[]
3,immunizations,Hep B adolescent or pediatricHep B adolescent or pediatricHib (PRP-OMP)IPVHib (PRP-OMP)IPVIPVI...,"[{'text': 'Hep B', 'umls_id': 'C0162569', 'score': 0.8482469916343689}, {'text': 'adult', 'umls_...","[C0162569, C0001675, C0033086, C0001675, C0033086]","[0.8482469916343689, 1.0, 1.0, 1.0, 1.0]","[Hep B, adult, preservative, adult, preservative]"
4,medications,Astemizole 10 MG Oral TabletAmoxicillin 250 MG Oral CapsuleAcetaminophen 160 MG Chewable Tablet1...,"[{'text': 'Astemizole', 'umls_id': 'C0085170', 'score': 1.0}, {'text': 'MG', 'umls_id': 'C002444...","[C0085170, C0024443, C0002645, C0024443, C0226896, C0024443, C0117996, C0178602, C4055499, C0117...","[1.0, 1.0, 0.7262410521507263, 1.0, 0.9999998807907104, 1.0, 1.0, 0.9999999403953552, 0.81247895...","[Astemizole, MG, Oral TabletAmoxicillin, MG, Oral, MG, Fluticasone propionate, Dose, ACTUAT, Flu..."
5,observations,Body HeightPain severity - 0-10 verbal numeric rating [Score] - ReportedBody WeightWeight-for-le...,"[{'text': 'severity', 'umls_id': 'C0439793', 'score': 1.0}, {'text': 'Score', 'umls_id': 'C04498...","[C0439793, C0449820, C0001779, C0262499, C0871470, C0005767, C0014772, C0005767, C0560267, C0560...","[1.0, 1.0, 0.8169045448303223, 0.9019030332565308, 0.8903989195823669, 1.0, 0.7105526328086853, ...","[severity, Score, Per age, Occipital-frontal circumferenceDiastolic, Blood PressureSystolic, Blo..."
6,procedures,Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication Reconciliat...,[{'text': 'Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication ...,"[C2317067, C2317067, C2317067, C2317067, C0185115, C0199230, C0199230, C0199230, C0199230, C2238...","[0.8900972008705139, 0.8144897222518921, 0.7232307195663452, 0.7518997192382812, 0.8148413300514...",[Medication Reconciliation (procedure)Medication Reconciliation (procedure)Medication Reconcilia...
7,birthday,1983-08-07,[],[],[],[]
8,marital,M,[],[],[],[]
9,race,white,"[{'text': 'white', 'umls_id': 'C0007457', 'score': 1.0}]",[C0007457],[1.0],[white]


In [88]:
#Save parsed patient EHR to .csv
df.to_csv('parsed_patient_ehr.csv', index=False)