# Example Notebook To Read PhantomDB Database

---

This notebook includes code to:

+ download the compressed archive from Zenodo
+ uncompress it into a JSON file on disk
+ read the JSON file in parts
+ list patient IDs, DX codes, and other attributes



In [17]:
# Install the Python packages below if needed
import requests
import tarfile
import json
import os
import ijson
import collections

def extract_first_n_patients(file_path, n, output_filename="first_n_patients.json"):
    """Stream the first ``n`` patients out of a JSON array and save them to disk.

    The input is parsed incrementally with ``ijson`` so the whole file never
    has to be loaded into memory; parsing stops as soon as ``n`` records have
    been collected.

    Args:
        file_path: Path to a JSON file whose top-level value is an array of
            patient records.
        n: Maximum number of patients to extract. Values <= 0 produce an
            empty output list.
        output_filename: Path of the JSON file to write. Defaults to
            ``"first_n_patients.json"``.

    Returns:
        None. The extracted records are written to ``output_filename`` and a
        confirmation line is printed.
    """
    patients = []
    if n > 0:
        # Binary mode: ijson is designed to read bytes; per its documentation a
        # text-mode file still works but is slower and triggers a warning.
        with open(file_path, 'rb') as file:
            for patient in ijson.items(file, 'item'):
                patients.append(patient)
                if len(patients) >= n:
                    break

    # Save the extracted patients to a new JSON file
    with open(output_filename, 'w') as output_file:
        json.dump(patients, output_file, indent=4)

    print(f"Extracted first {n} patients and saved to {output_filename}")

def list_patient_ids(file_path):
    """Return the ``patient_id`` of every record stored in a JSON file.

    Args:
        file_path: Path to a JSON file containing a list of patient records.

    Returns:
        A list with one ``patient_id`` per record, in file order. If the file
        does not exist, a message is printed and an empty list is returned.
    """
    # Bail out early when there is nothing to read
    if not os.path.exists(file_path):
        print(f"{file_path} not found.")
        return []

    with open(file_path, 'r') as handle:
        records = json.load(handle)

    ids = []
    for record in records:
        ids.append(record["patient_id"])
    return ids
    
    
def get_timestamped_codes(file_path, patient_id):
    """Look up the ``DX_record`` list of a single patient in a JSON file.

    Args:
        file_path: Path to a JSON file containing a list of patient records.
        patient_id: Value to match against each record's ``patient_id``.

    Returns:
        The ``DX_record`` value of the first record whose ``patient_id``
        equals ``patient_id``. If the file is missing or no record matches,
        a message is printed and an empty list is returned.
    """
    if not os.path.exists(file_path):
        print(f"{file_path} not found.")
        return []

    with open(file_path, 'r') as handle:
        records = json.load(handle)

    # First matching record, or None when the ID is absent
    match = next(
        (record for record in records if record["patient_id"] == patient_id),
        None,
    )
    if match is not None:
        return match["DX_record"]

    print(f"Patient ID {patient_id} not found.")
    return []
    
def list_patient_races(file_path):
    """Collect the race attribute of every patient in a JSON file.

    Args:
        file_path: Path to a JSON file containing a list of patient records.

    Returns:
        A list of ``{"patient_id": ..., "race": ...}`` dicts, one per record,
        in file order. ``race`` is ``None`` for records without a ``"race"``
        key. If the file does not exist, a message is printed and an empty
        list is returned.
    """
    if not os.path.exists(file_path):
        print(f"{file_path} not found.")
        return []

    with open(file_path, 'r') as handle:
        records = json.load(handle)

    return [
        {"patient_id": record["patient_id"], "race": record.get("race")}
        for record in records
    ]
    
    

In [None]:
# Zenodo URL of the compressed archive
url = "https://zenodo.org/records/10598052/files/phantomDB.tgz?download=1"
filename = "phantomDB.tgz"
json_filename = "full_dataset.json"  # Replace with the actual name of the JSON file inside the tar.gz archive

# Download compressed file (1.8G)
# The body is streamed into a temporary ".part" file and renamed only after the
# transfer finishes, so an interrupted download never leaves a truncated
# archive under `filename`.
partial_filename = filename + ".part"

# `with` releases the connection when done; the timeout is (connect, read)
# seconds so a stalled server cannot hang the notebook indefinitely.
with requests.get(url, stream=True, timeout=(10, 120)) as response:
    if response.status_code == 200:
        with open(partial_filename, 'wb') as file:
            # 1 MiB chunks: far fewer iterations/writes than 1 KiB chunks
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
        os.replace(partial_filename, filename)
        print(f"Downloaded {filename}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

In [None]:
# Extract json file (30G)
if not os.path.exists(filename):
    # tarfile.is_tarfile() raises on a missing path, so check first
    print(f"{filename} not found.")
elif tarfile.is_tarfile(filename):
    with tarfile.open(filename) as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: the "data" filter rejects
            # members that would be written outside the target directory.
            tar.extractall(filter="data")
        else:
            # Older Python without extraction filters
            tar.extractall()
    print(f"Extracted {filename}")
else:
    print(f"{filename} is not a valid tar file.")

In [11]:
# Write a small subset of the full dataset to its own JSON file; set N as desired.
# NOTE(review): this name is not passed to extract_first_n_patients below; the
# helper has to write to the same path for the following cells to find the file.
output_json_filename = "first_n_patients.json"

# How many patients to extract
N = 30  # Replace with the desired number of patients

# Only attempt the extraction when the full dataset is on disk
if not os.path.exists(json_filename):
    print(f"{json_filename} not found.")
else:
    extract_first_n_patients(json_filename, N)
    print("Patients extracted successfully.")

Extracted first 30 patients and saved to first_n_patients.json
Patients extracted successfully.


In [13]:
# List the patient IDs stored in the extracted subset file
# (list_patient_ids returns an empty list if the file is missing)
patient_ids = list_patient_ids(output_json_filename)
print("Patient IDs:", patient_ids)

Patient IDs: ['NU126_00000', 'NU126_00001', 'NU126_00002', 'NU126_00003', 'NU126_00004', 'NU126_00005', 'NU126_00006', 'NU126_00007', 'NU126_00008', 'NU126_00009', 'NU126_00010', 'NU126_00011', 'NU126_00012', 'NU126_00013', 'NU126_00014', 'NU126_00015', 'NU126_00016', 'NU126_00017', 'NU126_00018', 'NU126_00019', 'NU126_00020', 'NU126_00021', 'NU126_00022', 'NU126_00023', 'NU126_00024', 'NU126_00025', 'NU126_00026', 'NU126_00027', 'NU126_00028', 'NU126_00029']


In [14]:
# Patient whose diagnosis history should be shown
patient_id = "NU126_00000"  # Replace with the desired patient ID

# Look up the timestamped codes recorded for that patient in the subset file
timestamped_codes = get_timestamped_codes(output_json_filename, patient_id)

# Print one line per record, or a notice when nothing was returned
if not timestamped_codes:
    print("No timestamped codes found.")
else:
    print(f"Timestamped codes for patient ID {patient_id}:")
    for record in timestamped_codes:
        print(f"Date: {record['date']}, Code: {record['code']}")

Timestamped codes for patient ID NU126_00000:
Date: 2010-09-11, Code: E78.1
Date: 2011-09-08, Code: E78.5
Date: 2014-02-20, Code: E78.4
Date: 2018-07-28, Code: E78.0
Date: 2015-02-20, Code: Z12.3
Date: 2008-03-28, Code: M54.1
Date: 2010-03-14, Code: M54.1
Date: 2011-09-06, Code: M54.1
Date: 2014-02-22, Code: M54.8
Date: 2023-01-09, Code: M54.5
Date: 2013-03-02, Code: M25.5
Date: 2014-02-23, Code: M25.5
Date: 2021-01-18, Code: M25.5
Date: 2011-09-09, Code: Z00.0
Date: 2017-02-06, Code: Z00.0
Date: 2015-08-15, Code: Z01.4
Date: 2019-07-25, Code: Z01.8
Date: 2009-09-16, Code: Z23
Date: 2018-02-01, Code: Z23
Date: 2018-07-30, Code: R10.8
Date: 2020-07-22, Code: R10.8
Date: 2014-08-20, Code: R06.0
Date: 2021-01-19, Code: R06.0
Date: 2022-07-08, Code: R06.0
Date: 2009-03-22, Code: Z79.8
Date: 2021-07-18, Code: Z79.4
Date: 2008-09-22, Code: G47.3
Date: 2012-03-03, Code: G47.3
Date: 2016-08-10, Code: G47.3
Date: 2018-02-02, Code: G47.0
Date: 2021-01-16, Code: G47.3
Date: 2022-07-09, Code: R53.

In [19]:
# List the races for all patients in the extracted subset file
patient_races = list_patient_races(output_json_filename)

# Display the races for all patients, one line per patient.
# The race prints as None when a record has no "race" key
# (list_patient_races reads it with .get).
for patient in patient_races:
    print(f"Patient ID: {patient['patient_id']} {patient['race']}")

Patient ID: NU126_00000 None
Patient ID: NU126_00001 None
Patient ID: NU126_00002 None
Patient ID: NU126_00003 None
Patient ID: NU126_00004 None
Patient ID: NU126_00005 None
Patient ID: NU126_00006 None
Patient ID: NU126_00007 None
Patient ID: NU126_00008 None
Patient ID: NU126_00009 None
Patient ID: NU126_00010 None
Patient ID: NU126_00011 None
Patient ID: NU126_00012 None
Patient ID: NU126_00013 None
Patient ID: NU126_00014 None
Patient ID: NU126_00015 None
Patient ID: NU126_00016 None
Patient ID: NU126_00017 None
Patient ID: NU126_00018 None
Patient ID: NU126_00019 None
Patient ID: NU126_00020 None
Patient ID: NU126_00021 None
Patient ID: NU126_00022 None
Patient ID: NU126_00023 None
Patient ID: NU126_00024 None
Patient ID: NU126_00025 None
Patient ID: NU126_00026 None
Patient ID: NU126_00027 None
Patient ID: NU126_00028 None
Patient ID: NU126_00029 None


In [22]:
# In the database, African-American patients
# (generated as digital twins of AA patients from Chicago)
# occur as the last 1M
def extract_last_n_patients(file_path, n):
    """Return the last ``n`` patients of a JSON array, read as a stream.

    The file is parsed incrementally with ``ijson`` and only the most recent
    ``n`` records are kept in memory at any time. The whole file is still
    read from start to end.

    Args:
        file_path: Path to a JSON file whose top-level value is an array of
            patient records.
        n: Number of trailing patients to keep (used as the ``maxlen`` of a
            ``collections.deque``).

    Returns:
        A list with at most ``n`` patient records in file order. If the file
        does not exist, a message is printed and an empty list is returned.
    """
    if not os.path.exists(file_path):
        print(f"{file_path} not found.")
        return []

    # A bounded deque silently drops the oldest record once it holds n items
    last_n_patients = collections.deque(maxlen=n)

    # Binary mode: ijson is designed to read bytes; per its documentation a
    # text-mode file still works but is slower and triggers a warning.
    with open(file_path, 'rb') as file:
        for patient in ijson.items(file, 'item'):
            last_n_patients.append(patient)

    return list(last_n_patients)

# How many trailing patients to pull from the full dataset
N = 3  # Replace with the desired number of patients

# Stream through the full JSON file and keep only the last N patients
last_n_patients = extract_last_n_patients(json_filename, N)

# Print every attribute of each patient except the bulky/internal ones
for patient in last_n_patients:
    print(f"Patient ID: {patient['patient_id']}")
    for key, value in patient.items():
        if key in ["seeded", "DX_record"]:
            continue
        print(f"{key}: {value}")
    print("\n")

Patient ID: AS456_00997
patient_id: AS456_00997
race: African American


Patient ID: AS456_00998
patient_id: AS456_00998
race: African American


Patient ID: AS456_00999
patient_id: AS456_00999
race: African American




In [23]:
# Display the complete records of the last N patients as the cell output
last_n_patients

[{'patient_id': 'AS456_00997',
  'DX_record': [{'date': '2023-01-05', 'code': 'Z01.8'},
   {'date': '2010-09-10', 'code': 'M79.6'},
   {'date': '2009-09-18', 'code': 'R53.8'},
   {'date': '2021-07-17', 'code': 'N40.1'},
   {'date': '2014-08-19', 'code': 'N18.9'},
   {'date': '2021-01-14', 'code': 'Z95.0'},
   {'date': '2019-01-26', 'code': 'R49.0'},
   {'date': '2020-01-23', 'code': 'K86.2'}],
  'race': 'African American',
  'seeded': True},
 {'patient_id': 'AS456_00998',
  'DX_record': [{'date': '2015-08-13', 'code': 'M54.6'},
   {'date': '2022-07-13', 'code': 'B35.1'},
   {'date': '2015-08-18', 'code': 'R41.3'},
   {'date': '2015-08-15', 'code': 'T14.9'},
   {'date': '2008-09-22', 'code': 'N13.3'},
   {'date': '2008-03-23', 'code': 'N17.9'},
   {'date': '2014-08-23', 'code': 'D46.9'}],
  'race': 'African American',
  'seeded': True},
 {'patient_id': 'AS456_00999',
  'DX_record': [{'date': '2022-01-08', 'code': 'E78.5'},
   {'date': '2020-01-20', 'code': 'I10'},
   {'date': '2019-07-2