In [None]:
import os
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

# Optional: FHIR validation
USE_VALIDATION = False
if USE_VALIDATION:
    from fhir.resources.patient import Patient

DATA_DIR = Path("C:\\Users\\tonim\\Downloads\\output\\fhir\\Patients")
OUTPUT_FILE = "C:\\Users\\tonim\\Downloads\\output\\parquet\\patients.parquet"
    

def process_file(file):
    try:
        # your parsing code
        return parsed_rows
    except Exception as e:
        print(f"X Error in {file}: {e}")
        return None

with ProcessPoolExecutor(max_workers=1) as executor:
    results = list(executor.map(process_file, batch_files))

In [None]:

from pathlib import Path
import os
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

# Load a single FHIR JSON
with open("C:\\Users\\tonim\\Downloads\\output\\fhir\\Patients\\Floyd420_Streich926_42f4db2f-b049-c9a1-a961-7a944ea72e48.json") as f:
    data = json.load(f)

# Flatten JSON
df = pd.json_normalize(data)

#print(df.head())
df.to_parquet(
    "C:\\Users\\tonim\\Downloads\\output\\parquet\\patient.parquet",
    engine="fastparquet",
    index=False
)


In [None]:
import os
import json
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ProcessPoolExecutor, as_completed

INPUT_DIR = "C:\\Users\\tonim\\Downloads\\output\\fhir\\Patients"
OUTPUT_DIR = "C:\\Users\\tonim\\Downloads\\output\\parquet"
BATCH_SIZE = 500
PARQUET_FILE = os.path.join(OUTPUT_DIR, "patients.parquet")

os.makedirs(OUTPUT_DIR, exist_ok=True)

def process_file(filepath):
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Example flatten (customize per schema)
        record = {
            "id": data.get("id"),
            "name": data.get("name", {}).get("given", [None])[0],
            "gender": data.get("gender"),
            "birthDate": data.get("birthDate")
        }
        return record

    except Exception as e:
        print(f" Failed: {filepath} ({e})")
        return None

def process_batch(batch_files):
    rows = []
    with ProcessPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_file, f) for f in batch_files]
        for future in as_completed(futures):
            result = future.result()
            if result:
                rows.append(result)

    if rows:
        df = pd.DataFrame(rows)
        table = pa.Table.from_pandas(df)

        if not os.path.exists(PARQUET_FILE):
            pq.write_table(table, PARQUET_FILE)
        else:
            pq.write_table(table, PARQUET_FILE, append=True)

def main():
    all_files = [os.path.join(INPUT_DIR, f) for f in os.listdir(INPUT_DIR) if f.endswith(".json")]

    for i in range(0, len(all_files), BATCH_SIZE):
        batch_files = all_files[i:i + BATCH_SIZE]
        print(f"▶ Processing batch {i // BATCH_SIZE + 1}: {len(batch_files)} files")
        process_batch(batch_files)

if __name__ == "__main__":
    main()


In [None]:
import logging

# Global logging configuration (optional, you can set handlers per logger too)
logging.basicConfig(
    level=logging.DEBUG,  # Set to DEBUG to capture all debug messages
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Create individual loggers
patient_logger = logging.getLogger('parser.patient')
encounter_logger = logging.getLogger('parser.encounter')
document_logger = logging.getLogger('parser.document')
care_plan_logger = logging.getLogger('parser.care_plan')



In [None]:
import os
import json
import pandas as pd
import logging
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
import psycopg2
from psycopg2.extras import execute_values

import logging
import json

####-----------------------------------09/03/2025------------------------
# Parsers ---> parqut

# Create a dedicated logger for claims parsing
#claims_logger = logging.getLogger("claims_parser")
#claims_logger.setLevel(logging.DEBUG)  # or INFO if you prefer

# Create a file handler that writes to a dedicated file
#file_handler = logging.FileHandler("claims_parsing.log", mode="w", encoding="utf-8")
#file_handler.setLevel(logging.DEBUG)

# Optional: add a formatter
#formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
#file_handler.setFormatter(formatter)

# Add handler to the logger
#claims_logger.addHandler(file_handler)
#---------------------------------------------
#Logger
#--------------------------------------------

def get_parser_logger(name, log_file, level=logging.DEBUG):
    """Create a dedicated logger for a parser method."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    
    # Avoid adding multiple handlers if logger already exists
    if not logger.handlers:
        file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
        file_handler.setLevel(level)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    
    return logger

# -------------------------
# Helper Functions
# -------------------------

def extract_codings(resource_field):
    """
    Given a FHIR resource field that may have a 'coding' list,
    return a list of dicts with 'system' and 'code'.
    """
    codings = resource_field.get("coding", [])
    result = []
    for coding in codings:
        system = coding.get("system")
        code = coding.get("code")
        result.append({"system": system, "code": code})
    return result

def extract_diagnoses_from_claim(resource):
    """
    Extract diagnoses directly from claim resource.
    Returns a list of dicts with sequence, reference, and code.
    """
    diagnoses = []

    for diag in resource.get("diagnosis", []):
        sequence = diag.get("sequence")
        diagnosis_ref = diag.get("diagnosisReference", {}).get("reference")
        codeable_concept = diag.get("diagnosisCodeableConcept", {}).get("coding", [{}])[0].get("code")
        diagnoses.append({
            "sequence": sequence,
            "diagnosisReference": diagnosis_ref,
            "diagnosisCodeableConcept": codeable_concept
        })

    return diagnoses

#---getting diagnostic and insurance info from the sequences
def combine_claim_info_by_sequence(resource):
    """
    Combine supportingInfo, diagnoses, and insurance by sequence.
    Returns a dictionary keyed by sequence with all associated info.
    """
    combined = {}

    # Start with supporting info
    info_list, _ = extract_supporting_info_combined(resource)
    for entry in info_list:
        seq = entry["sequence"]
        combined.setdefault(seq, {})
        combined[seq].setdefault("supporting_info", []).append(entry)

    # Add diagnoses
    for diag in extract_diagnoses_from_claim(resource):
        seq = diag["sequence"]
        combined.setdefault(seq, {})
        combined[seq].setdefault("diagnoses", []).append({
            "diagnosisReference": diag.get("diagnosisReference"),
            "diagnosisCodeableConcept": diag.get("diagnosisCodeableConcept")
        })

    # Add insurance
    for ins in extract_insurance_from_claim(resource):
        seq = ins["sequence"]
        combined.setdefault(seq, {})
        combined[seq].setdefault("insurance", []).append({
            "focal": ins.get("focal"),
            "coverage": ins.get("coverage")
        })

    return combined


def extract_insurance_from_claim(resource):
    """
    Extract insurance entries directly from claim resource.
    Returns a list of dicts with sequence, focal, and coverage.
    """
    insurance_list = []

    for ins in resource.get("insurance", []):
        insurance_list.append({
            "sequence": ins.get("sequence"),
            "focal": ins.get("focal"),
            "coverage": ins.get("coverage", {}).get("reference") 
                        or ins.get("coverage", {}).get("display")
        })

    return insurance_list


def extract_supporting_info_combined(resource):
    """
    Extract supportingInfo entries from a FHIR resource.
    Returns both:
      - a list preserving original order
      - a dict grouped by sequence
    """
    supporting_info = resource.get("supportingInfo", [])
    info_list = []
    info_by_sequence = {}

    for info in supporting_info:
        seq = info.get("sequence")
        if seq is None:
            continue

        category = []
        for coding in info.get("category", {}).get("coding", []):
            category.append({
                "system": coding.get("system"),
                "code": coding.get("code"),
                "display": coding.get("display")
            })

        entry = {
            "sequence": seq,
            "category": category,
            "valueString": info.get("valueString"),
            "valueBoolean": info.get("valueBoolean"),
            "valueQuantity": info.get("valueQuantity"),
            "valueAttachment": info.get("valueAttachment"),
            "timingDate": info.get("timingDate"),
            "timingPeriod": info.get("timingPeriod")
        }

        info_list.append(entry)
        info_by_sequence.setdefault(seq, []).append(entry)

    return info_list, info_by_sequence

def extract_EOB_info(resource):
    
    contained_info = resource.get("contained", [])
    """for contained in contained_info:
        type = contained.get("resourceType")
        id = contained.get("id")
        status = contained.get("status")
        intent = contained.get("intent")
        reference = contained.get("subject").get("reference")
        requester = contained.get("requester").get("reference")
        performer = contained.get("performer").get("performer")
        EOB_extracted_info. append({
            "type": type,
            "id": id,
            "status": status,
            "intent": intent,
            "reference": reference,
            "requester":requester,
            "performer": performer    
        })"""
    return contained_info


    #resource["diagnosis_parsed"] = diagnoses    

def extract_supporting_info_combined(resource):
    """
    Extract supportingInfo entries from a FHIR resource.
    Returns both:
      - a list preserving original order
      - a dict grouped by sequence
    """
    supporting_info = resource.get("supportingInfo", [])
    info_list = []
    info_by_sequence = {}

    for info in supporting_info:
        seq = info.get("sequence")
        if seq is None:
            continue  # skip entries without sequence

        # Extract all codings under category
        category = []
        for coding in info.get("category", {}).get("coding", []):
            category.append({
                "system": coding.get("system"),
                "code": coding.get("code"),
                "display": coding.get("display")
            })

        entry = {
            "sequence": seq,
            "category": category,
            "valueString": info.get("valueString"),
            "valueBoolean": info.get("valueBoolean"),
            "valueQuantity": info.get("valueQuantity"),
            "valueAttachment": info.get("valueAttachment"),
            "timingDate": info.get("timingDate"),
            "timingPeriod": info.get("timingPeriod")
        }

        # Add to list (preserves order)
        info_list.append(entry)

        # Add to dict (grouped by sequence)
        if seq not in info_by_sequence:
            info_by_sequence[seq] = []
        info_by_sequence[seq].append(entry)

    return info_list, info_by_sequence


def safe_get(resource, *keys, default=None):
    """Safely navigate nested dicts and lists."""
    try:
        for key in keys:
            if isinstance(resource, list) and isinstance(key, int):
                resource = resource[key]
            else:
                resource = resource[key]
        return resource
    except (KeyError, IndexError, TypeError):
        return default

def safe_get_first(lst, default=None):
    """Return first element of a list if it exists."""
    if isinstance(lst, list) and lst:
        return lst[0]
    return default

#def safe_get_first_coding(resource, field):
    """Return the first coding dict from a field."""
#    coding = safe_get(resource, field, 0, "coding", 0, default={})
#    logging.info(f"coding: {coding}")
#    return coding if coding else {}

def get_first_coding(resource, field):
    """Safely extract first coding (code, system, display) from a FHIR element."""
    if field in resource and isinstance(resource[field], dict):
        coding_list = resource[field].get("coding", [])
        if coding_list and isinstance(coding_list, list):
            return coding_list[0]  # first coding dict
    return {}

def extract_identifier(resource, code_to_find):
    """Return identifier value for a given code (SS, DL, PPN)."""
    for ident in resource.get("identifier", []):
        for c in ident.get("type", {}).get("coding", []):
            if c.get("code") == code_to_find:
                return ident.get("value")
    return None

def extract_extensions(resource, url_to_find):
    """Extract the 'valueString' from a specific extension url."""
    for ext in resource.get("extension", []):
        if ext.get("url") == url_to_find:
            for sub in ext.get("extension", []):
                if sub.get("url") == "text":
                    return sub.get("valueString")
    return None

# -------------------------
# Parsers
# -------------------------
def parse_patient_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    if "entry" not in data:
        return []

    patients = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Patient":
            continue
        
        names = resource.get("name",[])
        name = names[0] if names else{}
        #name = resource.get("name", []), {})
        given_name = safe_get(name.get("given", 0))
        last_name = name.get("family")
        prefix = safe_get(name.get("prefix", 0))

        extensions = safe_get(resource, "extension", default=[])
        birth_place = extensions[0].get("valueAddress", {}) if extensions else {}

        patients.append({
            "id": resource.get("id"),
            "first_name": given_name,
            "last_name": last_name,
            "prefix": prefix,
            "gender": resource.get("gender"),
            "birth_date": resource.get("birthDate"),
            "deceased_date_time": resource.get("deceasedDateTime"),
            "ssn": extract_identifier(resource, "SS"),
            "driversLicense": extract_identifier(resource, "DL"),
            "Passport": extract_identifier(resource, "PPN"),
            "maritalStatus": safe_get(resource, "maritalStatus", "text") or safe_get_first(resource.get("maritalStatus", {}).get("coding", []), {}).get("display"),
            "race": extract_extensions(resource, "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race"),
            "ethnicity": extract_extensions(resource, "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity"),
            "birth_place": birth_place
        })
    return patients

def parse_encounter_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "entry" not in data:
        return []

    encounters = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Encounter":
            continue

        coding = get_first_coding(resource, "type")
        encounters.append({
            "id": resource.get("id"),
            "patient_ref": safe_get(resource, "subject", "reference"),
            "status": resource.get("status"),
            "code": coding.get("code"),
            "description": coding.get("display"),
            "start_date_time": safe_get(resource, "period", "start"),
            "end_date_time": safe_get(resource, "period", "end")
        })
    return encounters

def insert_conditions(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "entry" not in data:
        return []

    rows = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Condition":
            continue

        coding = get_first_coding(resource, "code")
        category_coding = get_first_coding(resource, "category") if resource.get("category") else {}

        rows.append({
            "resource_type": "Condition",
            "resource": json.dumps(resource),
            "condition_id": resource.get("id"),
            "patient_ref": safe_get(resource, "subject", "reference"),
            "encounter_ref": safe_get(resource, "encounter", "reference"),
            "code": coding.get("code"),
            "description": coding.get("display"),
            "category": category_coding.get("code"),
            "onset_date": resource.get("onsetDateTime")
        })
    return rows

def parse_observations(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "entry" not in data:
        return []

    observations = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Observation":
            continue

        coding = get_first_coding(resource, "code")
        observations.append({
            "observation_id": resource.get("id"),
            "code": coding.get("code"),
            "system": coding.get("system"),
            "description": coding.get("display"),
            "patient_ref": safe_get(resource, "subject", "reference"),
            "encounter_ref": safe_get(resource, "encounter", "reference")
        })
    return observations

def parse_diagnostic_reports(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "entry" not in data:
        return []

    reports = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "DiagnosticReport":
            continue
        diagnostic_id = resource.get("id")
        #logging.info(f"diagnostic_id: {diagnostic_id}")
        #coding = safe_get_first_coding(resource, "code")
        coding = get_first_coding(resource, "code")
        #logging.info(f"coding: {coding}")
        code = coding.get("code")
        #logging.info(f"coding: {code}")
        #results_list = [r.get("display") for r in resource.get("result", [])]
        results_list = []
        if "result" in resource:
            for r in resource["result"]:
                ref = r.get("reference")  # e.g. "Observation/12345"
                display = r.get("display")
                #logging.info(f"display: {display}")
                
                results_list.append({
                    "reference" : ref,
                    "display": display
                })
                
        #logging.info(f"result_list: {results_list}")
        system = coding.get("system")
        #logging.info(f"system: {system}")
        description = coding.get("display")
        #logging.info(f"description: {description}")
        patient_ref = safe_get(resource, "subject", "reference")
        #logging.info(f"patient_ref: {patient_ref}")
        encounter_ref =  safe_get(resource, "encounter", "reference")
        #logging.info(f"encounter:{encounter_ref}")
        effective_date_time = resource.get("effectiveDateTime")
        #logging.info(f"effective_date_time: {effective_date_time}")


        reports.append({
            "diagnostic_id": diagnostic_id,
            "code": code,
            "system": system,
            "description": description,
            "patient_ref": patient_ref,
            "encounter_ref": encounter_ref,
            "encounter_date_time": effective_date_time,
            "results": json.dumps(results_list)
        })
    return reports

'''def parse_document_reference(file_path):
    #print("inside document_reference")
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
                return None
        
        document_references=[]
        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") == "DocumentReference":
                logging.info("----insert document reference.")
                document_id = resource.get("id") 
                logging.info("------Document Reference--------")
                logging.info(f"document_id: {document_id}") 
                status = resource.get("status")
                patient_ref = safe_get(resource, "subject", "reference")
                logging.info(f"patient_ref: {patient_ref}")
                #encounter_ref = safe_get(resource, "context", "encounter")
                #logging.info(f"encounter_ref:{encounter_ref}")
                #coding = get_first_coding(resource, "code")

                 # context.encounter can be an array of references
                context = resource.get("context", {})

                encounter_refs = safe_get(resource, "context", "encounter") or []
                encounters_info = []
                for enc in encounter_refs:
                    if encounter_refs:
                        encounters_info.append({
                            "encounter_ref": enc.get("reference"),
                            "encounter_display": enc.get("display")
                        })
                
                date_time = resource.get("date")
                logging.info(f"date_time: {date_time}")

                codings = resource.get("type", {}).get("coding", [])
                code_info =[]
                #logging.info(f"coding: {codings}")
                for coding in codings:
                    code = coding.get("code")
                    #codes.append(code)
                    logging.info(f"coding: {code}")
                    
                    system = coding.get("system")
                    #systems.append(system)
                    logging.info(f"system: {system}")
                    description = coding.get("display")
                    #descriptions.append(description)
                    logging.info(f"description: {description}")

                    code_info.append({
                        "code": code,
                        "system" : system,
                        "description" : description    
                    })
                author_info = []
                authors = resource.get("author", [])
                for author in authors:
                    author_ref = author.get("reference")
                    logging.info(f"author_ref: {author_ref}")
                    author_display = author.get("display")
                    logging.info(f"author_name: {author_display}")
                    author_info.append({
                        "author_ref": author_ref,
                        "author_display" : author_display         
                    })
                
                custodian_info = []
                custodian = resource.get("custodian",{})
                custodian_reference = custodian.get("reference")
                logging.info(f"custodian_reference: {custodian_reference}")
                custodian_display = custodian.get("display")
                logging.info(f"custodian_display: {custodian_display}")
                custodian_info.append({
                    "custodian_reference" : custodian_reference,
                    "custodian_display"  : custodian_display
                })

                start_date_time = resource.get("context", {}).get("period", {}).get("start")
                logging.info(f"start_date_time: {start_date_time}")
                end_date_time = resource.get("context", {}).get("period", {}).get("end")
                logging.info(f"end_date_time: {end_date_time}")

                document_references.append({
                    "document_id": document_id,
                    "status": status,
                    "patient_ref": patient_ref,
                    "encounter_ref" : json.dumps(encounters_info),
                    "date_time" : date_time, 
                    "code_info" : json.dumps(code_info),
                    "author_info" : json.dumps(author_info),
                    "custodian" : json.dumps(custodian_info),
                    "start_date_time" : start_date_time,
                    "end_date_time" : end_date_time
                })      

        return document_references''' 

def parse_document_reference(file_path):
    
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None

        document_references = []

        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") != "DocumentReference":
                continue

            #logging.info("----insert document reference.")
            document_id = resource.get("id")
            #logging.info(f"document_id: {document_id}")

            status = resource.get("status")
            patient_ref = safe_get(resource, "subject", "reference")
            #logging.info(f"patient_ref: {patient_ref}")

            # Collect encounters
            context = resource.get("context", {})
            encounter_refs = safe_get(resource, "context", "encounter") or []
            encounters_info = [
                {
                    "encounter_ref": enc.get("reference"),
                    "encounter_display": enc.get("display")
                }
                for enc in encounter_refs
            ]

            date_time = resource.get("date")
            #logging.info(f"date_time: {date_time}")

            # Collect codings
            codings = resource.get("type", {}).get("coding", [])
            code_info = [
                {
                    "code": c.get("code"),
                    "system": c.get("system"),
                    "description": c.get("display")
                }
                for c in codings
            ]

            # Collect authors
            authors = resource.get("author", [])
            author_info = [
                {
                    "author_ref": a.get("reference"),
                    "author_display": a.get("display")
                }
                for a in authors
            ]

            # Collect custodian
            custodian = resource.get("custodian", {})
            custodian_info = [{
                "custodian_reference": custodian.get("reference"),
                "custodian_display": custodian.get("display")
            }]

            # Document period
            period = context.get("period", {})
            start_date_time = period.get("start")
            end_date_time = period.get("end")
            #logging.info(f"start_date_time: {start_date_time}")
            #logging.info(f"end_date_time: {end_date_time}")

            # Append to results
            document_references.append({
                "document_id": document_id,
                "status": status,
                "patient_ref": patient_ref,
                "encounter_ref": json.dumps(encounters_info),
                "date_time": date_time,
                "code_info": json.dumps(code_info),
                "author_info": json.dumps(author_info),
                "custodian": json.dumps(custodian_info),
                "start_date_time": start_date_time,
                "end_date_time": end_date_time
            })

        return document_references


def parse_care_plans(file_path):
    logger.debug("Starting Care Plans parsing")
    logging.info("------------------------Parse Care Plans--------------------------------")
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "entry" not in data:
        return []

    care_plans = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "CarePlan":
            continue

        ##coding = resource.get("coding")
        #logging.info(f"Care_Plan coding: {coding}")
        careteam_info = []
        for ct in resource.get("careTeam", []):
            ref = ct.get("reference")
            if ref:
                ref_id = ref.replace("urn:uuid:", "")
                careteam_resource = next(
                    (e.get("resource") for e in data.get("entry", [])
                     if e.get("resource", {}).get("id") == ref_id),
                    None
                )
                if careteam_resource:
                    careteam_info.append({
                        "careTeam_id": ref_id,
                        "name": careteam_resource.get("name"),
                        "managingOrganization": careteam_resource.get("managingOrganization"),
                        "participants": careteam_resource.get("participant", [])
                    })

        # Resolve addresses references (conditions/problems)
        addresses_info = []
        for addr in resource.get("addresses", []):
            ref = addr.get("reference")
            if ref:
                ref_id = ref.replace("urn:uuid:", "")
                condition_resource = next(
                    (e.get("resource") for e in data.get("entry", [])
                     if e.get("resource", {}).get("id") == ref_id),
                    None
                )
                if condition_resource:
                    addresses_info.append({
                        "condition_id": ref_id,
                        "code": condition_resource.get("code"),
                        "status": condition_resource.get("clinicalStatus"),
                        "verificationStatus": condition_resource.get("verificationStatus")
                    })

        logging.info(f"care_plan_id: {resource.get("id")}")
        #logging.info(f"code: {coding.get("code",[]).get("code")}")
        #logging.info(f"system: {coding.get("system")}")
        #logging.info(f"description: {coding.get("display")}")
        coding_info = []
        #category = resource.get("category", [])
        #logging.info(f"category: {category}")
        #if category is None:
        #    return coding_info

        category = resource.get("category", [])

        # Normalize so it's always a list
        category_data = resource.get("category", [])

        # normalize to always be a list
        if isinstance(category_data, dict):
            category_data = [category_data]

        for cat in category_data:
            for coding in cat.get("coding", []):
                system = coding.get("system")
                logging.info(f"care_plan category-System:  {system}")
                code = coding.get("code")
                logging.info(f"care_plan category- Code: {code}" )
                category = coding.get("display")
                logging.info(f"care_plan category- Display: {category}")
                #for cat in category:
                    #for coding in cat.get("coding", []):
                    #coding_info.append({
                if system:
                    coding_info.append(display)    
                if code: 
                    coding_info.append(code)
                if display: 
                     coding_info.append(display)
                #})                   
                            
            #else:
                #return coding_info
        logging.info(f"status: {resource.get("status")}")
        logging.info(f"intent: {resource.get("intent")}")
        logging.info(f"subject: {safe_get(resource, "subject", "reference")}")
        logging.info(f"encounter: {safe_get(resource, "encounter", "reference")}")
        logging.info(f"period_start: {safe_get(resource, "period", "start")}")
        logging.info(f"care team: {careteam_info}")
        logging.info(f"addresses: {addresses_info}")

        care_plans.append({
            "care_plan_id": resource.get("id"),
            "code": json.dumps(coding_info, default=str),
            "status": resource.get("status"),
            "intent": resource.get("intent"),
            "subject_ref": safe_get(resource, "subject", "reference"),
            "encounter_ref": safe_get(resource, "encounter", "reference"),
            "period_start": safe_get(resource, "period", "start"),
            "careteam": json.dumps(careteam_info),
            "addresses": json.dumps(addresses_info)
        })
    return care_plans

def deduplicate_supporting_info(info_list):
    """Remove duplicates based on sequence + reference."""
    seen = set()
    deduped = []
    for info in info_list:
        key = (info.get("sequence"), info.get("reference"))
        if key not in seen:
            seen.add(key)
            deduped.append(info)
    return deduped

def insert_supporting_info(claim_id, info_list):
    """Insert deduplicated supportingInfo into supporting_info table."""
    deduped_info = deduplicate_supporting_info(info_list)
    rows = [
        (   
            claim_id, 
            info.get("sequence"), 
            info.get("reference"), 
            info.get("display"), 
            info.get("type"))
        for info in deduped_info
    ]

    insert_query = """
    INSERT INTO supporting_info (claim_id, sequence, reference, display, type)
    VALUES %s
    ON CONFLICT (claim_id, sequence, reference) DO NOTHING
    """
    #execute_values(cur, insert_query, rows)

"""def parse_claims(file_path):
    claims_logger = get_parser_logger("claims_parser", "C:\\Users\\tonim\\Downloads\\output\\fhir\\claims_parsing_log")
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
                return None
        
        claims=[]
        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") == "Claim":
                claim_id = resource.get("id") 
                claim_type = resource.get("type",{})
                claim_type_codings = claim_type.get("coding",[])
                claim_type_info = []
                claim__code = None
                for coding in claim_type_codings:
                    claim_code = coding.get("code")
                    claim_system = coding.get("system")
                    claims_logger.info(f"claim_type: {claim_code}, claim_system: {claim_system}")
                    claim_type_info.append({
                        "claim_type": claim_code,
                        "claim_system": claim_system
                        })
                    
                claim_status = resource.get("status")
                claim_patient = resource.get("patient",{})
                claim_patient_ref = claim_patient.get("reference")
                claim_total = resource.get("total")
                claim_total_value = claim_total.get("value") if claim_total else None
                claim_total_currency = claim_total.get("currency") if claim_total else None
                billable_period = resource.get("billablePeriod")
                billable_start = billable_period.get("start")
                billable_end = billable_period.get("end")
                bill_created = resource.get("created")
                provider = resource.get("provider")
                provider_ref = provider.get("reference")
                provider_name = provider.get("display")
                facility = resource.get("facility") or {}
                facility_ref = facility.get("reference")
                facility_name = facility.get("display")
                priority = resource.get("priority")
                coding_data = priority.get("coding",[])
                coding_info = []
                #encounter_ref = 
                #logging.info(f"priority:  {priority}")

                for coding in coding_data:
                    code = coding.get("code")
                    logging.info(f"code: {code}")
                    system = coding.get("system")
                    logging.info(f"system: {system}")
                    coding_info.append({
                        "code": code,
                        "system": system
                                        })
                
                

                claims_logger.info(f"claim_patient_ref: {claim_patient_ref}")

                claims_logger.info(f"claim_id: {claim_id}") 
                claims_logger.info(f"claim_status: {claim_status}")

                claims_logger.info(f"claim_total: {claim_total}")
                claims_logger.info(f"claim_total_value: {claim_total_value}")
                claims_logger.info(f"claim_total_currency: {claim_total_currency}")
                claims_logger.info(f"start: {billable_start}")
                claims_logger.info(f"end: {billable_end}")
                claims_logger.info(f"bill_created: {bill_created}")
                claims_logger.info(f"provider_ref: {provider_ref}")
                claims_logger.info(f"provider_name: {provider_name}")

                 # Diagnosis---need to test, these are usually observations, or 
                diagnosis = extract_diagnoses_from_claim(resource)
                claims_logger.info(f"extracted diagnosis: {diagnosis}")
                insurance = extract_insurance_from_claim(resource)
                claims_logger.info(f"extracted insurance: {insurance}")
                #for diag in diagnosis:
                #    sequence = diag.get("diagnosis",[])

                claims.append({
                    "claim_id": claim_id,
                    "claim_status": claim_status,
                    "claim_patient_ref": claim_patient_ref,
                    "claim_type":{json.dumps(claim_type_info)},
                    "claim_total" : f"{claim_total_value} {claim_total_currency}",
                    "billable_period": f"{billable_start}-{billable_end}",
                    "bill_date": bill_created,
                    "provider": provider_ref,
                    "facility": facility_ref,
                    "priority": priority,
                    "coding_info": coding_info,
                    "claim_diagnosis": {json.dumps(diagnosis)},
                    "claim_insurance": {json.dumps(insurance)}
                })

        return claims"""
def parse_claims(file_path):
    claims_logger = get_parser_logger(
        "claims_parser", 
        "C:\\Users\\tonim\\Downloads\\output\\fhir\\claims_parsing_log"
    )
    
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None
        
        claims_list = []

        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") != "Claim":
                continue

            claim_id = resource.get("id")
            claim_status = resource.get("status")
            
            # Patient
            claim_patient_ref = resource.get("patient", {}).get("reference")
            
            # Type
            claim_type_info = []
            claim_type = resource.get("type", {})
            for coding in claim_type.get("coding", []):
                claim_type_info.append({
                    "code": coding.get("code"),
                    "system": coding.get("system")
                })

            # Total
            claim_total = resource.get("total", {})
            
            # Billable period
            billable_period = resource.get("billablePeriod", {})
            billable_start = billable_period.get("start")
            billable_end = billable_period.get("end")
            
            # Provider & Facility
            provider = resource.get("provider", {})
            provider_ref = provider.get("reference")
            provider_name = provider.get("display")

            facility = resource.get("facility", {})
            facility_ref = facility.get("reference")
            facility_name = facility.get("display")
            
            # Priority
            priority_coding = []
            priority = resource.get("priority", {})
            for coding in priority.get("coding", []):
                priority_coding.append({
                    "code": coding.get("code"),
                    "system": coding.get("system")
                })
            
            # Diagnosis
            diagnosis = extract_diagnoses_from_claim(resource)

            # Insurance
            insurance = extract_insurance_from_claim(resource)

            # Items
            items = []
            for item in resource.get("item", []):
                items.append({
                    "sequence": item.get("sequence"),
                    "productOrService": item.get("productOrService", {}).get("coding", [{}])[0].get("code"),
                    "productOrServiceText": item.get("productOrService", {}).get("text"),
                    "encounters": [enc.get("reference") for enc in item.get("encounter", [])]
                })

            # Logging
            claims_logger.info(f"Parsing claim {claim_id} for patient {claim_patient_ref}")

            # Construct claim dict
            claims_list.append({
                "claim_id": claim_id,
                "status": claim_status,
                "patient_ref": claim_patient_ref,
                "type": claim_type_info,
                "total": claim_total,
                "billable_period": {"start": billable_start, "end": billable_end},
                "created": resource.get("created"),
                "provider": {"ref": provider_ref, "name": provider_name},
                "facility": {"ref": facility_ref, "name": facility_name},
                "priority": priority_coding,
                "diagnosis": diagnosis,
                "insurance": insurance,
                "items": items,
                "resource": resource
            })

        return claims_list


"""def parse_EOB(file_path):
    EOB_logger = get_parser_logger("EOB_parser", "C:\\Users\\tonim\\Downloads\\output\\fhir\\EOB_parsing_log")
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
                return None
        
        eob=[]
        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") == "ExplanationOfBenefit":
                eob_id = resource.get("id") 
                EOB_extracted_info= []
                #print("eob_id: ", eob_id)
                EOB_info = extract_EOB_info(resource) 
                for contained in EOB_info:
                    performers_raw = contained.get("performer", [])
                    if isinstance(performers_raw, dict):
                        performers = [performers_raw.get("performer")]
                    elif isinstance(performers_raw, list):
                        performers = [p.get("performer") for p in performers_raw if isinstance(p,dict)]
                    else:
                        performers = []

                    type = contained.get("resourceType")
                    id = contained.get("id")
                    status = contained.get("status")
                    intent = contained.get("intent")
                    subject = contained.get("subject", {}).get("reference")
                    requester = contained.get("requester", {}).get("reference")
                    EOB_logger.debug("type: {type}")
                    EOB_logger.debug("id: {id}")
                    EOB_logger.debug("status: {status}")
                    
                    EOB_extracted_info.append({
                        "type": type,
                        "id": id,
                        "status": status,
                        "intent": intent,
                        "subject": subject,
                        "requester": requester,
                        "performer": performers
                    })

                EOB_logger.debug(f"EOB info: {EOB_info}")

                eob.append({
                    "eob_id": eob_id,
                    "extracted_info": {json.dumps(EOB_extracted_info)}
                })

        return eob"""  

def parse_EOB(file_path):
    EOB_logger = get_parser_logger(
        "EOB_parser", 
        "C:\\Users\\tonim\\Downloads\\output\\fhir\\EOB_parsing_log"
    )
    
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None
        
        eob_list = []

        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") != "ExplanationOfBenefit":
                continue

            # Top-level fields
            eob_id = resource.get("id")
            patient_ref = resource.get("patient", {}).get("reference")
            status = resource.get("status")
            eob_type = resource.get("type")
            use = resource.get("use")
            created = resource.get("created")

            # Insurance references (capture all insurers)
            insurance_list = resource.get("insurance", [])
            insurer_refs = [
                ins.get("provider", {}).get("reference") 
                for ins in insurance_list
                if ins.get("provider")
            ]

            # Supporting info / performers
            extracted_info = []
            supporting_info = resource.get("supportingInfo", [])
            for info in supporting_info:
                performers_raw = info.get("performer", [])
                if isinstance(performers_raw, dict):
                    performers = [performers_raw.get("actor", {}).get("reference")]
                elif isinstance(performers_raw, list):
                    performers = [
                        p.get("actor", {}).get("reference") 
                        for p in performers_raw if isinstance(p, dict)
                    ]
                else:
                    performers = []

                extracted_info.append({
                    "resourceType": info.get("resourceType"),
                    "id": info.get("id"),
                    "status": info.get("status"),
                    "intent": info.get("intent"),
                    "subject": info.get("subject", {}).get("reference"),
                    "requester": info.get("requester", {}).get("reference"),
                    "performers": performers
                })

            EOB_logger.debug(
                f"EOB id {eob_id}, patient {patient_ref}, insurers {insurer_refs}, "
                f"supporting info: {extracted_info}"
            )

            eob_list.append({
                "eob_id": eob_id,
                "patient_id": patient_ref,
                "status": status,
                "type": eob_type,
                "use": use,
                "created": created,
                "insurer_refs": insurer_refs,
                "extracted_info": extracted_info,
                "resource": resource
            })

        return eob_list

"""def parse_immunizations(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
                return None
        
        immunizations=[]
        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") == "Immunization":
                immunization_id = resource.get("id") 
                status = resource.get("status")
                patient_ref = resource.get("patient",{}).get("reference")
                encounter_ref = resource.get("encounter",{}).get("reference")
                occurrence_date_time = resource.get("occurrenceDateTime")
                primary_source = resource.get("primarySource")
                location_ref = resource.get("location", {}).get("reference")
                location_display = resource.get("location", {}).get("display")

                logger.debug(f"immunization_id: {immunization_id}")

                immunizations.append({
                    "immunization_id": immunization_id
                })

        return immunizations """

def parse_immunizations(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None
        
        immunizations = []

        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") != "Immunization":
                continue

            immunization_id = resource.get("id")
            status = resource.get("status")
            patient_ref = resource.get("patient", {}).get("reference")
            encounter_ref = resource.get("encounter", {}).get("reference")
            occurrence_date_time = resource.get("occurrenceDateTime")
            primary_source = resource.get("primarySource")
            
            location = resource.get("location", {})
            location_ref = location.get("reference")
            location_display = location.get("display")

            # Vaccine code
            vaccine_code = resource.get("vaccineCode", {})
            vaccine_codings = vaccine_code.get("coding", [])
            '''vaccine_info = [
                {"system": c.get("system"), "code": c.get("code"), "display": c.get("display")}
                for c in vaccine_codings
            ]'''
            vaccine_info = list(map(lambda c: {"system": c.get("system"), "code": c.get("code"), "display": c.get("display")}, vaccine_codings))


            # Performer(s)
            performers_raw = resource.get("performer", [])
            performers = []
            for p in performers_raw:
                actor = p.get("actor", {})
                performers.append({
                    "actor_ref": actor.get("reference"),
                    "actor_display": actor.get("display"),
                    "function": p.get("function", {}).get("text")
                })

            immunizations.append({
                "immunization_id": immunization_id,
                "status": status,
                "patient_ref": patient_ref,
                "encounter_ref": encounter_ref,
                "occurrence_date_time": occurrence_date_time,
                "primary_source": primary_source,
                "location": {"ref": location_ref, "display": location_display},
                "vaccine_info": vaccine_info,
                "performers": performers,
                "resource": resource
            })

    return immunizations


"""def parse_coverage(file_path):
    #print("inside coverage")
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None
        
        coverage = []
        for entry in data["entry"]:
            resource = entry.get("resource", {})

            # Case 1: Coverage is a top-level resource
            if resource.get("resourceType") == "Coverage":
                #print("Found top-level Coverage")
                #print("coverage_id", resource.get("id"))
                
                coverage.append({
                    "coverage_id": resource.get("id"),
                    "status": resource.get("status"),
                    "type": resource.get("type", {}).get("text"),
                })

            # Case 2: Coverage inside "contained" resources (like in EOB)
            #for contained in resource.get("contained", []):
                #if contained.get("resourceType") == "Coverage":
                    #print("Found contained Coverage")
                    #print("coverage_id: ",  contained.get("id"))
                    #coverage.append({
                    #    "coverage_id": contained.get("id"),
                    #    "status": contained.get("status"),
                    #    "type": contained.get("type", {}).get("text"),
                    #})

        return coverage"""

def parse_coverage(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None
        
        coverage_list = []

        for entry in data["entry"]:
            resource = entry.get("resource", {})

            if resource.get("resourceType") == "Coverage":
                coverage_list.append({
                    "coverage_id": resource.get("id"),
                    "status": resource.get("status"),
                    "type": resource.get("type", {}),  # keep full dict for JSONB
                    "subscriber_id": resource.get("subscriberId"),
                    "beneficiary": resource.get("beneficiary", {}).get("reference"),
                    "payor": [p.get("reference") for p in resource.get("payor", [])],
                    "period": resource.get("period"),
                    "class": resource.get("class", []),
                    "resource": resource  # store full raw FHIR JSON
                })

            # Check contained resources too
            for contained in resource.get("contained", []):
                if contained.get("resourceType") == "Coverage":
                    coverage_list.append({
                        "coverage_id": contained.get("id"),
                        "status": contained.get("status"),
                        "type": contained.get("type", {}),
                        "subscriber_id": contained.get("subscriberId"),
                        "beneficiary": contained.get("beneficiary", {}).get("reference"),
                        "payor": [p.get("reference") for p in contained.get("payor", [])],
                        "period": contained.get("period"),
                        "class": contained.get("class", []),
                        "resource": contained
                    })

        return coverage_list


def parse_devices(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        if "entry" not in data:
            return None
        
        devices = []
        for entry in data["entry"]:
            resource = entry.get("resource", {})
            if resource.get("resourceType") == "Device":
                device_id = resource.get("id") 
                status = resource.get("status")
                distinct_identifier = resource.get("distinctIdentifier")
                manufacture_date = resource.get("manufactureDate")
                expiration_date = resource.get("expirationDate")
                lot_number = resource.get("lotNumber")
                serial_number = resource.get("serialNumber")
                patient_ref = resource.get("patient", {}).get("reference")

                # collect device names
                device_names = [
                    {
                        "name": dn.get("name"),
                        "type": dn.get("type")
                    }
                    for dn in resource.get("deviceName", [])
                ]

                # collect codings
                codings = [
                    {
                        "system": c.get("system"),
                        "code": c.get("code"),
                        "display": c.get("display")
                    }
                    for c in resource.get("type", {}).get("coding", [])
                ]
                
                logging.info(f"device_id: {device_id}")
                devices.append({
                    "device_id": device_id,
                    "status": status,
                    "distinct_identifier": distinct_identifier,
                    "manufacture_date": manufacture_date,
                    "expiration_date": expiration_date,
                    "lot_number": lot_number,
                    "serial_number": serial_number,
                    "patient_ref": patient_ref,
                    "device_names": device_names,
                    "codings": codings,
                    "resource": resource  # optional: keep raw JSON for completeness
                })   
        return devices

def parse_medication_request(resource):
    return {
        "medrequest_id": resource.get("id"),
        "status": resource.get("status"),
        "intent": resource.get("intent"),
        "medication_id": resource.get("medicationReference", {}).get("reference"),
        "subject_patient_id": resource.get("subject", {}).get("reference"),
        "encounter_id": resource.get("encounter", {}).get("reference"),
        "requester_practitioner_id": resource.get("requester", {}).get("reference"),
        "requester_org_id": resource.get("requester", {}).get("organization", {}).get("reference"),
        "authored_on": resource.get("authoredOn"),
        "dosage_instruction": resource.get("dosageInstruction"),
        "dispense_request": resource.get("dispenseRequest"),
        "resource": resource
    }

def parse_supply_delivery(resource):
    return {
        "supplydelivery_id": resource.get("id"),
        "status": resource.get("status"),
        "type": resource.get("type"),
        "patient_id": resource.get("patient", {}).get("reference"),
        "supplier_org_id": resource.get("supplier", {}).get("reference"),
        "destination_org_id": resource.get("destination", {}).get("organization", {}).get("reference"),
        "occurrence": resource.get("occurrenceDateTime"),
        "supplied_item": resource.get("suppliedItem"),
        "resource": resource
    }

def parse_procedure(resource):
    return {
        "procedure_id": resource.get("id"),
        "status": resource.get("status"),
        "category": resource.get("category"),
        "code": resource.get("code"),
        "subject_patient_id": resource.get("subject", {}).get("reference"),
        "encounter_id": resource.get("encounter", {}).get("reference"),
        "performer_practitioner_id": (resource.get("performer", [{}])[0].get("actor", {}).get("reference")),
        "performer_org_id": (resource.get("performer", [{}])[0].get("onBehalfOf", {}).get("reference")),
        "reason_code": resource.get("reasonCode"),
        "reason_reference": resource.get("reasonReference"),
        "body_site": resource.get("bodySite"),
        "outcome": resource.get("outcome"),
        "follow_up": resource.get("followUp"),
        "complication": resource.get("complication"),
        "performed_period": resource.get("performedPeriod"),
        "resource": resource
    }




# clean for empty field values
def clean_nested(data):
    if isinstance(data, dict):
        if not data:  # empty dict
            return None
        return {k: clean_nested(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [clean_nested(v) for v in data]
    return data

# You can follow the same pattern for document_reference, immunizations, claims, EOB, coverage
# -------------------------
# Test with one file and save to parquet
# -------------------------
if __name__ == "__main__":

    # Create a logger
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # File handler
    file_handler = logging.FileHandler('C:\\Users\\tonim\\Downloads\\output\\fhir\\debug.log')
    file_handler.setLevel(logging.DEBUG)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    # Formatter
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Add handlers to logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # Usage
    #logger.debug("Debug message goes to file")
    #logger.info("Info message goes to both console and file")



    #test_file = r"C:\Users\tonim\Downloads\output\fhir\Patients\German382_Runte676_badeac37-4850-29ae-978e-3435d3358c85.json"
    #test_file = r"C:\Users\tonim\Downloads\output\fhir\Patients\Herman763_Sanford861_994754ee-a5d2-495c-90f4-fd4ada07bf02.json"
    test_file = r"C:\Users\tonim\Downloads\output\fhir\Patients\Floyd420_Streich926_42f4db2f-b049-c9a1-a961-7a944ea72e48.json"
    patients_output_file = r"C:\Users\tonim\Downloads\output\parquet\patients.parquet"
    encounters_output_file = r"C:\Users\tonim\Downloads\output\parquet\encounters.parquet"
    conditions_output_file = r"C:\Users\tonim\Downloads\output\parquet\conditions.parquet"
    observations_output_file = r"C:\Users\tonim\Downloads\output\parquet\observations.parquet"
    diagnostic_reports_output_file = r"C:\Users\tonim\Downloads\output\parquet\diagnostic_reports.parquet"
    care_plans_output_file = r"C:\Users\tonim\Downloads\output\parquet\care_plans_reports.parquet"
    claims_output_file = r"C:\Users\tonim\Downloads\output\parquet\claims.parquet"
    eob_output_file = r"C:\Users\tonim\Downloads\output\parquet\eob.parquet"
    coverage_output_file = r"C:\Users\tonim\Downloads\output\parquet\coverage.parquet"
    document_reference_output_file = r"C:\Users\tonim\Downloads\output\parquet\document_reference.parquet"
    immunization_output_file = r"C:\Users\tonim\Downloads\output\parquet\immunization_output.parquet"
    device_output_file = r"C:\Users\tonim\Downloads\output\parquet\immunization_output.parquet"

    #os.makedirs(os.path.dirname(output_file), exist_ok=True)

    patients_data = [clean_nested(p) for p in (parse_patient_file(test_file))]
    encounter_data = parse_encounter_file(test_file)
    condition_data = insert_conditions(test_file)
    observation_data = parse_observations(test_file)
    diagnostic_data = parse_diagnostic_reports(test_file)
    care_plans_data = parse_care_plans(test_file)
    claim_data = parse_claims(test_file)
    eob_data = parse_EOB(test_file)
    coverage_data = parse_coverage(test_file)
    document_reference_data = parse_document_reference(test_file)
    immunization_data = parse_immunizations(test_file)
    device_data = parse_devices(test_file)

if patients_data:
    df_patients = pd.DataFrame(patients_data)
    print(df_patients)
    df_patients.to_parquet(patients_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_patients)} patients to {patients_output_file}")
else:
    print("No patient records found.")

if encounter_data:
    df_encounters = pd.DataFrame(encounter_data)
    print(df_encounters)
    df_encounters.to_parquet(encounters_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_encounters)} encounters to {encounters_output_file}")
else:
    print("No encounter records found.")

if condition_data:
    df_conditions = pd.DataFrame(condition_data)
    print(df_conditions)
    df_conditions.to_parquet(conditions_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_conditions)} conditions to {conditions_output_file}")
else:
    print("No condition records found.")

if observation_data:
    df_observations = pd.DataFrame(observation_data)
    print(df_observations)
    df_observations.to_parquet(observations_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_observations)} observations to {observations_output_file}")
else:
    print("No observations records found.")

if diagnostic_data:
    df_diagnostic_reports = pd.DataFrame(diagnostic_data)
    print(df_diagnostic_reports)
    df_diagnostic_reports.to_parquet(diagnostic_reports_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_diagnostic_reports)} diagnosis to {diagnostic_reports_output_file}")
else:
    print("No diagnostic report records found.")

if immunization_data:
    df_immunizations = pd.DataFrame(immunization_data)
    print(df_immunizations)
    df_immunizations.to_parquet(immunization_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_immunizations)} diagnosis to {immunization_output_file}")
else:
    print("No immunization records found.")

if document_reference_data:
    df_document_reference = pd.DataFrame(document_reference_data)
    print(document_reference_data)
    df_document_reference.to_parquet(document_reference_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_document_reference)} diagnosis to {document_reference_output_file}")
else:
    print("No document_reference records found.")

if care_plans_data:
    df_care_plans_reports = pd.DataFrame(care_plans_data)
    print(df_care_plans_reports)
    df_care_plans_reports.to_parquet(care_plans_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_care_plans_reports)} care_plans to {care_plans_output_file}")
else:
    print("No care plans records found.")

if claim_data:
    df_claims = pd.DataFrame(claim_data)
    print(df_claims)
    df_claims.to_parquet(claims_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_claims)} claim to {claims_output_file}")
else:
    print("No claim records found.")

if eob_data:
    df_eob = pd.DataFrame(eob_data)
    print(df_eob)
    df_eob.to_parquet(eob_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_eob)} eob to {eob_output_file}")
else:
    print("No EOB records found.")

if coverage_data:
    df_coverage = pd.DataFrame(coverage_data)
    print(df_coverage)
    df_coverage.to_parquet(coverage_output_file, engine="pyarrow", index=False)
    print(f"Wrote {len(df_coverage)} coverage to {coverage_output_file}")
else:
    print("No coverage records found.")

if device_data:
    df_device = pd.DataFrame(device_data)
    logging.info("{df_device}")
    df_device.to_parquet(device_output_file, engine="pyarrow", index=False)
    logging.info(f"Wrote{df_device} device to {device_output_file}")
else:
    logging.info(f"No device records found.")


In [None]:
# Create a logger
import logging
def logger(name, log_file, level=logging.DEBUG):
    """Create a dedicated logger for a parser method."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    
    # Avoid adding multiple handlers if logger already exists
    if not logger.handlers:
        file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
        file_handler.setLevel(level)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    
    return logger

In [None]:
def clean_nested(data):
    """Recursively replace empty dicts/lists with None."""
    if isinstance(data, dict):
        if not data:
            return None
        return {k: clean_nested(v) for k, v in data.items()}
    elif isinstance(data, list):
        if not data:
            return None
        return [clean_nested(v) for v in data]
    return data

In [None]:
'''def safe_get(value, key=None, default=None):
    """Safe extraction from dicts/lists."""
    if key is None:
        return value if value is not None else default
    if isinstance(value, dict):
        return value.get(key, default)
    return default''' 

''''def safe_get(d, *keys):
    for k in keys:
        if isinstance(d, dict):
            d = d.get(k)
        else:
            return None
    return d''' 
def get_field(resource, *keys):
    """Safely navigate nested dictionaries/lists. Returns None if missing."""
    value = resource
    for key in keys:
        if isinstance(value, list):
            value = value[0] if value else None
        if isinstance(value, dict):
            value = value.get(key)
        else:
            return None
    return value

# --- Helpers ---
def clean_nested(data):
    """Recursively replace empty dicts/lists with None."""
    if isinstance(data, dict):
        if not data:
            return None
        return {k: clean_nested(v) for k, v in data.items()}
    elif isinstance(data, list):
        if not data:
            return None
        return [clean_nested(v) for v in data]
    return data



def safe_get(d, *keys, default=None):
    """Safely extract nested values from a dict."""
    for k in keys:
        if isinstance(d, dict):
            d = d.get(k, default)
        else:
            return default
    return d


def extract_identifier(resource, code):
    """Extract an identifier of a given type code (SS, DL, PPN)."""
    for ident in resource.get("identifier", []):
        type_code = safe_get(ident.get("type", {}).get("coding", [{}])[0], "code")
        if type_code == code:
            return ident.get("value")
    return None

def extract_extensions(resource, url):
    """Extract FHIR extensions matching a URL."""
    result = {}
    for ext in resource.get("extension", []) or []:
        if ext.get("url") == url:
            for inner in ext.get("extension", []) or []:
                key = inner.get("url")
                value = inner.get("valueString") or safe_get(inner, "valueCoding", "display")
                if key:
                    result[key] = value
    return result if result else None

def extract_geolocation(address):
    """Extract latitude/longitude from a FHIR address extension."""
    latitude = longitude = None
    for ext in address.get("extension", []) or []:
        if ext.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
            for geo in ext.get("extension", []) or []:
                if geo.get("url") == "latitude":
                    latitude = geo.get("valueDecimal")
                elif geo.get("url") == "longitude":
                    longitude = geo.get("valueDecimal")
    return latitude, longitude

def extract_birth_place(resource):
    """Return formatted birth_place string or None."""
    extensions = resource.get("extension", []) or []
    birth_place = next(
        (ext.get("valueAddress") for ext in extensions 
         if ext.get("url") == "http://hl7.org/fhir/StructureDefinition/patient-birthPlace"),
        None
    )
    if birth_place:
        parts = [birth_place.get("city"), birth_place.get("state"), birth_place.get("country")]
        return ", ".join([p for p in parts if p]) or None
    return None

def extract_addresses(resource):
    """Return list of addresses with geolocation if available."""
    addresses = []
    for addr in resource.get("address", []) or []:
        lat, lon = extract_geolocation(addr)
        addresses.append({
            "line": addr.get("line"),
            "city": addr.get("city"),
            "state": addr.get("state"),
            "postal_code": addr.get("postalCode"),
            "country": addr.get("country"),
            "latitude": lat,
            "longitude": lon
        })
    return addresses if addresses else None

def extract_telecoms(resource):
    """Return list of telecoms (phone/email/etc)."""
    telecoms_list = []
    for t in resource.get("telecom", []) or []:
        telecoms_list.append({
            "system": t.get("system"),
            "value": t.get("value"),
            "use": t.get("use"),
            "extension": t.get("extension")  # e.g., US Core Direct
        })
    return telecoms_list if telecoms_list else None

def extract_text_or_display(obj, text_key="text", coding_key="coding", category_key="ombCategory"):
    """
    Extracts a human-readable string from a FHIR object like maritalStatus, race, ethnicity.
    Priority: text -> coding.display -> ombCategory.display
    """
    if not isinstance(obj, dict):
        return None

    # Prefer the plain "text" field
    if text_key in obj and obj[text_key]:
        return obj[text_key]

    # Fallback to first coding.display
    if coding_key in obj and isinstance(obj[coding_key], list) and obj[coding_key]:
        return obj[coding_key][0].get("display")

    # Fallback to ombCategory.display (if race/ethnicity style)
    if category_key in obj and isinstance(obj[category_key], dict):
        return obj[category_key].get("display")

    return None


In [None]:
import logging
def insert_patient: 
    

# Create a proper logger
    Main_logger = logging.getLogger("FHIR_ETL")
    Main_logger.setLevel(logging.DEBUG)
    if not Main_logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        Main_logger.addHandler(ch)
        conn = psycopg2.connect("dbname=FHIR_staging user=postgres password=new_password host=localhost")
        cur = conn.cursor()

        # Insert each patient
        #for patient_data in patients_data:
        """
            Inserts a patient into fhir_staging.patients_fhir_raw.
            Handles JSON fields and optional values safely.
        """

    
        columns = [
            "patient_id",
            "first_name",
            "last_name",
            "prefix",
            "gender",
            "birth_date",
            "deceased_date_time",
            "ssn",
            "drivers_license",
            "passport",
            "marital_status",
            "race",
            "ethnicity",
            "birth_place",
            "resource"
        ]

    # Define which columns are JSONB
    jsonb_columns = {"race", "ethnicity", "birth_place", "resource"}


    values = []
    for col in columns:
        val = patient_data.get(col)
        if col in jsonb_columns:
            if isinstance(val, (dict, list)):
                val = json.dumps(val)
            elif isinstance(val, str):
                # Wrap strings as JSON object {"text": "..."} for JSONB
                val = json.dumps({"text": val})
            else:
                val = json.dumps({})
        values.append(val)
    
    # Build the SQL query safely using psycopg2.sql
    query = sql.SQL("""
        INSERT INTO fhir_staging.patients_fhir_raw ({fields})
        VALUES ({placeholders})
        ON CONFLICT (patient_id) DO NOTHING
        """).format(
        fields=sql.SQL(', ').join(map(sql.Identifier, columns)),
        placeholders=sql.SQL(', ').join(sql.Placeholder() * len(columns))
    )

    try:
        cur.execute(query, values)
        if logger:
            logger.debug(f"Inserted patient: {patient_data.get('patient_id')}")
    except Exception as e:
        if logger:
            logger.error(f"Failed to insert patient {patient_data.get('patient_id')}: {e}")
        else:
            print(f"Failed to insert patient {patient_data.get('patient_id')}: {e}")
        raise
   
    conn.commit()
    cur.close()
    conn.close()
    Main_logger.debug("All patients, addresses, and telecoms inserted into Postgres successfully.")
    #try:
    ''''cur.execute(""" INSERT INTO fhir_staging.patients_fhir_raw 
                (patient_id, 
                first_name, 
                last_name, 
                prefix, 
                gender, 
                birth_date, 
                deceased_date_time, 
                ssn, drivers_license, 
                passport, 
                marital_status, 
                race, 
                ethnicity, 
                birth_place, 
                resource) 
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) 
                ON CONFLICT (patient_id) DO NOTHING """, 
                ( patient_data["patient_id"], 
                    patient_data.get("first_name"),
                    patient_data.get("last_name"), 
                    patient_data.get("prefix"), 
                    patient_data.get("gender"), 
                    patient_data.get("birth_date"), 
                    patient_data.get("deceased_date_time"), 
                    patient_data.get("ssn"), 
                    patient_data.get("drivers_license"), 
                    patient_data.get("passport"), 
                    patient_data.get("marital_status"), 
                    Json(patient_data.get("race", {})), 
                    Json(patient_data.get("ethnicity", {})), 
                    patient_data.get("birth_place"), 
                    Json(patient_data.get("resource", {})) ) )'''
        #if logger:
        #    Main_logger.debug(f"Inserted patient: {patient_data.get('patient_id')}")
    #except Exception as e:
      #  if logger:
    #Main_logger.error(f"Failed to insert patient {patient_data.get('patient_id')}: {e}")
       # else:
    #print(f"Failed to insert patient {patient_data.get('patient_id')}: {e}")
        #raise
    
    columns = [
        "patient_id",
        "first_name",
        "last_name",
        "prefix",
        "gender",
        "birth_date",
        "deceased_date_time",
        "ssn",
        "drivers_license",
        "passport",
        "marital_status",
        "race",
        "ethnicity",
        "birth_place",
        "resource"
    ]

    # Define which columns are JSONB
    jsonb_columns = {"race", "ethnicity", "birth_place", "resource"}


    values = []
    for col in columns:
        val = patient_data.get(col)
        if col in jsonb_columns:
            if isinstance(val, (dict, list)):
                val = json.dumps(val)
            elif isinstance(val, str):
                # Wrap strings as JSON object {"text": "..."} for JSONB
                val = json.dumps({"text": val})
            else:
                val = json.dumps({})
        values.append(val)
    
    # Build the SQL query safely using psycopg2.sql
    query = sql.SQL("""
        INSERT INTO fhir_staging.patients_fhir_raw ({fields})
        VALUES ({placeholders})
        ON CONFLICT (patient_id) DO NOTHING
    """).format(
        fields=sql.SQL(', ').join(map(sql.Identifier, columns)),
        placeholders=sql.SQL(', ').join(sql.Placeholder() * len(columns))
    )

    try:
        cur.execute(query, values)
        if logger:
            logger.debug(f"Inserted patient: {patient_data.get('patient_id')}")
    except Exception as e:
        if logger:
            logger.error(f"Failed to insert patient {patient_data.get('patient_id')}: {e}")
        else:
            print(f"Failed to insert patient {patient_data.get('patient_id')}: {e}")
        raise
   
    conn.commit()
    cur.close()
    conn.close()
    Main_logger.debug("All patients, addresses, and telecoms inserted into Postgres successfully.")


This is the parser for patients and inserts data into a postgres database

In [3]:
###Refactor 09-03-2025 Testing with database
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import psycopg2
from psycopg2.extras import Json
from psycopg2 import sql
import logging

# --- Helpers ---
def clean_nested(data):
    """Recursively replace empty dicts/lists with None."""
    if isinstance(data, dict):
        if not data:
            return None
        return {k: clean_nested(v) for k, v in data.items()}
    elif isinstance(data, list):
        if not data:
            return None
        return [clean_nested(v) for v in data]
    return data



def safe_get(d, *keys, default=None):
    """Safely extract nested values from a dict."""
    for k in keys:
        if isinstance(d, dict):
            d = d.get(k, default)
        else:
            return default
    return d

def extract_identifier(resource, code):
    """Extract an identifier of a given type code (SS, DL, PPN)."""
    for ident in resource.get("identifier", []):
        type_code = safe_get(ident.get("type", {}).get("coding", [{}])[0], "code")
        if type_code == code:
            return ident.get("value")
    return None

def extract_extensions(resource, url):
    """Extract FHIR extensions matching a URL."""
    result = {}
    for ext in resource.get("extension", []) or []:
        if ext.get("url") == url:
            for inner in ext.get("extension", []) or []:
                key = inner.get("url")
                value = inner.get("valueString") or safe_get(inner, "valueCoding", "display")
                if key:
                    result[key] = value
    return result if result else None

def extract_geolocation(address):
    """Extract latitude/longitude from a FHIR address extension."""
    latitude = longitude = None
    for ext in address.get("extension", []) or []:
        if ext.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
            for geo in ext.get("extension", []) or []:
                if geo.get("url") == "latitude":
                    latitude = geo.get("valueDecimal")
                elif geo.get("url") == "longitude":
                    longitude = geo.get("valueDecimal")
    return latitude, longitude

def extract_birth_place(resource):
    """Return formatted birth_place string or None."""
    extensions = resource.get("extension", []) or []
    birth_place = next(
        (ext.get("valueAddress") for ext in extensions 
         if ext.get("url") == "http://hl7.org/fhir/StructureDefinition/patient-birthPlace"),
        None
    )
    if birth_place:
        parts = [birth_place.get("city"), birth_place.get("state"), birth_place.get("country")]
        return ", ".join([p for p in parts if p]) or None
    return None

def extract_addresses(resource):
    """Return list of addresses with geolocation if available."""
    addresses = []
    for addr in resource.get("address", []) or []:
        lat, lon = extract_geolocation(addr)
        addresses.append({
            "line": addr.get("line"),
            "city": addr.get("city"),
            "state": addr.get("state"),
            "postal_code": addr.get("postalCode"),
            "country": addr.get("country"),
            "latitude": lat,
            "longitude": lon
        })
    return addresses if addresses else None

def extract_telecoms(resource):
    """Return list of telecoms (phone/email/etc)."""
    telecoms_list = []
    for t in resource.get("telecom", []) or []:
        telecoms_list.append({
            "system": t.get("system"),
            "value": t.get("value"),
            "use": t.get("use"),
            "extension": t.get("extension")  # e.g., US Core Direct
        })
    return telecoms_list if telecoms_list else None

def extract_text_or_display(obj, text_key="text", coding_key="coding", category_key="ombCategory"):
    """
    Extracts a human-readable string from a FHIR object like maritalStatus, race, ethnicity.
    Priority: text -> coding.display -> ombCategory.display
    """
    if not isinstance(obj, dict):
        return None

    # Prefer the plain "text" field
    if text_key in obj and obj[text_key]:
        return obj[text_key]

    # Fallback to first coding.display
    if coding_key in obj and isinstance(obj[coding_key], list) and obj[coding_key]:
        return obj[coding_key][0].get("display")

    # Fallback to ombCategory.display (if race/ethnicity style)
    if category_key in obj and isinstance(obj[category_key], dict):
        return obj[category_key].get("display")

    return None

def extract_human_readable(obj):
    """
    Return a human-readable string from common FHIR shapes:
      - CodeableConcept: {'coding':[...], 'text': '...'}
      - Coding: {'system':..., 'code':..., 'display':...}
      - US Core race/ethnicity extension shapes (nested 'extension' arrays)
      - valueCodeableConcept / valueCoding / valueString
      - Lists of any of the above (returns first non-empty)
    Priority (rough): text / valueString -> coding[0].display -> coding[0].code -> valueCoding.display -> ombCategory.display
    """
    # quick guards
    if obj is None:
        return None
    if isinstance(obj, str):
        return obj

    # lists: return first non-None result
    if isinstance(obj, list):
        for item in obj:
            v = extract_human_readable(item)
            if v:
                return v
        return None

    # dict handling
    if isinstance(obj, dict):
        # direct fields
        if obj.get("text"):
            return obj.get("text")
        if obj.get("display"):
            return obj.get("display")
        if obj.get("valueString"):
            return obj.get("valueString")

        # valueCoding
        vc = obj.get("valueCoding") or obj.get("valueCoding")
        if isinstance(vc, dict):
            return vc.get("display") or vc.get("code")

        # valueCodeableConcept
        vcc = obj.get("valueCodeableConcept")
        if vcc is not None:
            return extract_human_readable(vcc)

        # coding list (CodeableConcept)
        coding = obj.get("coding")
        if isinstance(coding, list) and coding:
            first = coding[0]
            if isinstance(first, dict):
                return first.get("display") or first.get("code")

        # ombCategory (US Core race/ethnicity compact shape)
        omb = obj.get("ombCategory")
        if isinstance(omb, dict):
            return omb.get("display") or omb.get("code")

        # Some FHIR extensions use 'extension' arrays (e.g., US Core race/ethnicity)
        if isinstance(obj.get("extension"), list):
            # First, try the common nested pattern where an extension contains a nested 'extension' list
            for ext in obj["extension"]:
                # If ext directly has a valueString / valueCoding / valueCodeableConcept, use it
                if isinstance(ext, dict):
                    if ext.get("valueString"):
                        return ext.get("valueString")
                    if ext.get("valueCoding") and isinstance(ext.get("valueCoding"), dict):
                        return ext["valueCoding"].get("display") or ext["valueCoding"].get("code")
                    if ext.get("valueCodeableConcept"):
                        v = extract_human_readable(ext["valueCodeableConcept"])
                        if v:
                            return v
                    # If ext contains its own nested 'extension' list (the US Core pattern)
                    if isinstance(ext.get("extension"), list):
                        # try to find 'text' sub-extension first
                        for sub in ext["extension"]:
                            if sub.get("url") == "text" and sub.get("valueString"):
                                return sub.get("valueString")
                        # try to find 'ombCategory' or similar sub-extension with valueCoding
                        for sub in ext["extension"]:
                            if sub.get("url") in ("ombCategory", "race", "ethnicity") and sub.get("valueCoding"):
                                vc = sub.get("valueCoding")
                                if isinstance(vc, dict):
                                    return vc.get("display") or vc.get("code")
                        # fallback: recurse into each sub-extension
                        for sub in ext["extension"]:
                            v = extract_human_readable(sub)
                            if v:
                                return v

            # If we didn't return yet, try recursing into each top-level extension element
            for ext in obj["extension"]:
                v = extract_human_readable(ext)
                if v:
                    return v

        # fallback: sometimes keys like 'category' or other nested dicts contain display/code
        for key in ("category",):
            val = obj.get(key)
            if isinstance(val, dict):
                return val.get("display") or val.get("code")

    # no match
    return None

def parse_race(resource):
    """
    Extracts the patient's race from a FHIR Patient resource.
    
    Returns the display string or None if not found.
    """
    if "extension" not in resource:
        return None

    for ext in resource["extension"]:
        if ext.get("url") == "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race":
            for sub_ext in ext.get("extension", []):
                if sub_ext.get("url") == "ombCategory":
                    return sub_ext.get("valueCoding", {}).get("display")
    return None

def parse_ethnicity(patient_json):
    ethnicity = None
    if "extension" in patient_json:
        for ext in patient_json["extension"]:
            if ext.get("url") == "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity":
                for sub_ext in ext.get("extension", []):
                    if sub_ext.get("url") == "ombCategory":
                        ethnicity = sub_ext.get("valueCoding", {}).get("display")
    return ethnicity

import json
import psycopg2
# Create a logger
import logging
def logger(name, log_file, level=logging.DEBUG):
    """Create a dedicated logger for a parser method."""
    logger = logging.getLogger(name)
    logger.setLevel(level)
    
    # Avoid adding multiple handlers if logger already exists
    if not logger.handlers:
        file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
        file_handler.setLevel(level)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    
    return logger

def insert_patients(patients_list, conn):
    """
        Inserts patients, addresses, and telecoms into Postgres.
        Expects a list of patient dicts (already parsed).
    """
    insertPatients_logger = logger(
        "INS_patients", 
        "C:\\Users\\tonim\\Downloads\\output\\fhir\\ins_patients_log"
    )
    
    cur = conn.cursor()

    # Queries
    patient_query = """
    INSERT INTO fhir_staging.patients_fhir_raw (
        patient_id, first_name, last_name, prefix, gender, birth_date, deceased_date_time,
        ssn, drivers_license, passport, marital_status, race, ethnicity, birth_place, resource
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (patient_id) DO NOTHING
    """

    address_query = """
    INSERT INTO fhir_staging.patient_addresses (
        patient_id, line, city, state, postal_code, country, latitude, longitude
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """

    telecom_query = """
    INSERT INTO fhir_staging.patient_telecoms (
        patient_id, system, value, use, extension
    ) VALUES (%s, %s, %s, %s, %s)
    """

    for patient in patients_list:
        # Insert patient
        cur.execute(patient_query, (
            patient.get("patient_id"),
            patient.get("first_name"),
            patient.get("last_name"),
            patient.get("prefix"),
            patient.get("gender"),
            patient.get("birth_date"),
            patient.get("deceased_date_time"),
            patient.get("ssn"),
            patient.get("drivers_license"),
            patient.get("passport"),
            patient.get("marital_status"),
            patient.get("race"),
            patient.get("ethnicity"),
            patient.get("birth_place"),
            json.dumps(patient.get("resource")) if patient.get("resource") else None
        ))

        # Insert addresses
        for addr in patient.get("addresses", []):
            #print(f"address: {addr}")
            #lines = addr.get("line", [])
            addy = ""
            lines = addr.get("line")

        # Normalize lines into a list of strings
        if lines is None:
            lines = []
        elif isinstance(lines, str):
            lines = [lines]
        elif isinstance(lines, list):
            # Extract strings if the list contains dicts
            lines = [line if isinstance(line, str) else line.get("line", "") for line in lines]

        # Join with comma + space
        addy = ", ".join([line for line in lines if line])
        print(f"addy: {addy}")
        city = addr.get("city")
        state = addr.get("state")
        postal_code = addr.get("postalCode")
        country = addr.get("country")
        #latitude = addr.get("latitude")
        #longitude = addr.get("longitude")
        latitude = float(addr.get("latitude")) if addr.get("latitude") is not None else None
        longitude = float(addr.get("longitude")) if addr.get("longitude") is not None else None

        cur.execute(address_query, (
            patient.get("patient_id"),
            lines,
            city,
            state,
            postal_code,
            country,
            latitude,
            longitude
        ))

        # Insert telecoms
        for tel in patient.get("telecoms", []):
            system = tel.get("system")
            value = tel.get("value")
            use = tel.get("use")
            extension = tel.get("extension")  # this can be a list/dict
            #insertPatients_logger(f"extension: {extension}")

            # Convert extension to JSON string if it exists
            extension_json = json.dumps(extension) if extension else None
            
            cur.execute(telecom_query, (
                patient.get("patient_id"),
                system,
                value,
                use,
                extension
            ))

    conn.commit()
    cur.close()



def parse_patient_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    if "entry" not in data:
        return []

    patients = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Patient":
            continue
        
        id = resource.get("id")
        Main_logger.debug(f"id: {id}")
        names = resource.get("name", [])

        name = names[0] if names else {}
        given_name = safe_get(name.get("given", [None])[0])
        Main_logger.debug(f"given_name: {given_name}")

        last_name = name.get("family")
        Main_logger.debug(f"last_name: {last_name}")
        prefix = safe_get(name.get("prefix", [None])[0])
        Main_logger.debug(f"prefix: {prefix}")
        gender = resource.get("gender")
        Main_logger.debug(f"gender: {gender}")
        birth_date = resource.get("birthDate")
        Main_logger.debug(f"birth_date: {birth_date}")
        deceased_date_time = resource.get("deceasedDateTime")
        Main_logger.debug(f"deceased_date: {deceased_date_time}")
        ssn = extract_identifier(resource, "SS")
        Main_logger.debug(f"ssn: {ssn}")
        drivers_license = extract_identifier(resource, "DL")
        Main_logger.debug(f"drivers_license: {drivers_license}")
        passport = extract_identifier(resource, "PPN")
        Main_logger.debug(f"passport: {passport}")
    
        #ms = resource.get("maritalStatus") or {}
        #marital_status = ms.get("text") or ((ms.get("coding") or [{}])[0].get("display"))
        #Main_logger.debug(f"marital_status: {marital_status}")
        #race = extract_extensions(resource, "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race")
        #Main_logger.debug(f"race: {race}")
        #ethnicity =extract_extensions(resource, "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity")


        #Main_logger.debug(f"ethicity: {ethnicity}")

        marital_status = extract_human_readable(resource.get("maritalStatus")) \
                 or extract_human_readable(resource.get("extension"))
        
        #race = extract_human_readable(resource.get("race")) or extract_human_readable(resource.get("extension"))
        race = parse_race(resource)
        #ethnicity = extract_human_readable(resource.get("ethnicity")) or extract_human_readable(resource.get("extension"))
        ethnicity = parse_ethnicity(resource)

        #race = extract_text_or_display(resource.get("race"))
        Main_logger.debug(f"race: {race}")
        #ethnicity = extract_text_or_display(resource.get("ethnicity"))
        Main_logger.debug(f"ethnicity: {ethnicity}")
        #marital_status = extract_text_or_display(resource.get("maritalStatus"))
        Main_logger.debug(f"marital_status: {marital_status}")
            
        #extensions = safe_get(resource, "extension", default=[])
        #address = extensions.get("address",[0]).get("line",[])
        #address = resource.get("address",[])
        #telecoms = resource.get("telecom", [])
        
        birth_place_str = extract_birth_place(resource)
        Main_logger.info(f"birth_place_str: {birth_place_str}")

        addresses = extract_addresses(resource)
        telecoms = extract_telecoms(resource)
        

        patients.append({
            "patient_id": resource.get("id"),
            "first_name": given_name,
            "last_name": last_name,
            "prefix": prefix,
            "gender": resource.get("gender"),
            "birth_date": resource.get("birthDate"),
            "deceased_date_time": resource.get("deceasedDateTime"),
            "ssn": extract_identifier(resource, "SS"),
            "drivers_license": extract_identifier(resource, "DL"),
            "passport": extract_identifier(resource, "PPN"),
            "marital_status": marital_status,
            "race": race,
            "ethnicity": ethnicity,
            "birth_place": birth_place_str,
            "addresses": addresses,
            "telecoms": telecoms,
            "resource": json.dumps(resource) if resource else None
        })

    return patients
if __name__ == "__main__":

    Main_logger = logger(
        "Main", 
        "C:\\Users\\tonim\\Downloads\\output\\fhir\\Main_log"
    )
    #Main_logger = logging.getLogger("FHIR_ETL")
    #Main_logger.setLevel(logging.DEBUG)

    '''if not Main_logger.handlers:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        Main_logger.addHandler(ch)'''

    test_patient_data = {
    "entry": [
        {
            "fullUrl": "urn:uuid:4ee53233-844d-50b5-32c4-eff7de5fbfdd",
            "resource": {
                "resourceType": "Patient",
                "id": "4ee53233-844d-50b5-32c4-eff7de5fbfdd",
                "meta": {
                    "profile": ["http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient"]
                },
                "text": {
                    "status": "generated",
                    "div": "<div xmlns=\"http://www.w3.org/1999/xhtml\">Generated by <a href=\"https://github.com/synthetichealth/synthea\">Synthea</a>.Version identifier: 3a65f56. Person seed: -7800982323046788640 Population seed: 1755716738658</div>"
                },
                "extension": [
                    {
                        "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race",
                        "extension": [
                            {
                                "url": "ombCategory",
                                "valueCoding": {
                                    "system": "urn:oid:2.16.840.1.113883.6.238",
                                    "code": "2054-5",
                                    "display": "Black or African American"
                                }
                            },
                            {
                                "url": "text",
                                "valueString": "Black or African American"
                            }
                        ]
                    },
                    {
                        "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity",
                        "extension": [
                            {
                                "url": "ombCategory",
                                "valueCoding": {
                                    "system": "urn:oid:2.16.840.1.113883.6.238",
                                    "code": "2186-5",
                                    "display": "Not Hispanic or Latino"
                                }
                            },
                            {
                                "url": "text",
                                "valueString": "Not Hispanic or Latino"
                            }
                        ]
                    },
                    {
                        "url": "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName",
                        "valueString": "Candice681 Marks830"
                    },
                    {
                        "url": "http://hl7.org/fhir/us/core/StructureDefinition/us-core-birthsex",
                        "valueCode": "F"
                    },
                    {
                        "url": "http://hl7.org/fhir/StructureDefinition/patient-birthPlace",
                        "valueAddress": {
                            "city": "Seekonk",
                            "state": "Massachusetts",
                            "country": "US"
                        }
                    },
                    {
                        "url": "http://synthetichealth.github.io/synthea/disability-adjusted-life-years",
                        "valueDecimal": 21.40718108461066
                    },
                    {
                        "url": "http://synthetichealth.github.io/synthea/quality-adjusted-life-years",
                        "valueDecimal": 48.592818915389344
                    }
                ],
                "identifier": [
                    {
                        "system": "https://github.com/synthetichealth/synthea",
                        "value": "4ee53233-844d-50b5-32c4-eff7de5fbfdd"
                    },
                    {
                        "type": {
                            "coding": [
                                {
                                    "system": "http://terminology.hl7.org/CodeSystem/v2-0203",
                                    "code": "MR",
                                    "display": "Medical Record Number"
                                }
                            ],
                            "text": "Medical Record Number"
                        },
                        "system": "http://hospital.smarthealthit.org",
                        "value": "4ee53233-844d-50b5-32c4-eff7de5fbfdd"
                    },
                    {
                        "type": {
                            "coding": [
                                {
                                    "system": "http://terminology.hl7.org/CodeSystem/v2-0203",
                                    "code": "SS",
                                    "display": "Social Security Number"
                                }
                            ],
                            "text": "Social Security Number"
                        },
                        "system": "http://hl7.org/fhir/sid/us-ssn",
                        "value": "999-81-1696"
                    },
                    {
                        "type": {
                            "coding": [
                                {
                                    "system": "http://terminology.hl7.org/CodeSystem/v2-0203",
                                    "code": "DL",
                                    "display": "Driver's license number"
                                }
                            ],
                            "text": "Driver's license number"
                        },
                        "system": "urn:oid:2.16.840.1.113883.4.3.25",
                        "value": "S99934087"
                    },
                    {
                        "type": {
                            "coding": [
                                {
                                    "system": "http://terminology.hl7.org/CodeSystem/v2-0203",
                                    "code": "PPN",
                                    "display": "Passport Number"
                                }
                            ],
                            "text": "Passport Number"
                        },
                        "system": "http://hl7.org/fhir/sid/passport-USA",
                        "value": "X17415579X"
                    }
                ],
                "name": [
                    {
                        "use": "official",
                        "family": "Sipes176",
                        "given": ["Armida530"],
                        "prefix": ["Mrs."]
                    },
                    {
                        "use": "maiden",
                        "family": "McKenzie376",
                        "given": ["Armida530"],
                        "prefix": ["Mrs."]
                    }
                ],
                "telecom": [
                    {
                        "system": "phone",
                        "value": "555-147-8220",
                        "use": "home"
                    }
                ],
                "gender": "female",
                "birthDate": "1954-04-18",
                "address": [
                    {
                        "extension": [
                            {
                                "url": "http://hl7.org/fhir/StructureDefinition/geolocation",
                                "extension": [
                                    {"url": "latitude", "valueDecimal": 41.91817130004856},
                                    {"url": "longitude", "valueDecimal": -70.8768795282484}
                                ]
                            }
                        ],
                        "line": ["838 Vandervort Loaf Apt 89"],
                        "city": "Middleborough",
                        "state": "MA",
                        "postalCode": "00000",
                        "country": "US"
                    }
                ],
                "maritalStatus": {
                    "coding": [
                        {
                            "system": "http://terminology.hl7.org/CodeSystem/v3-MaritalStatus",
                            "code": "M",
                            "display": "Married"
                        }
                    ],
                    "text": "Married"
                },
                "multipleBirthBoolean": False,
                "communication": [
                    {
                        "language": {
                            "coding": [
                                {
                                    "system": "urn:ietf:bcp:47",
                                    "code": "en-US",
                                    "display": "English (United States)"
                                }
                            ],
                            "text": "English (United States)"
                        }
                    }
                ]
            },
            "request": {
                "method": "POST",
                "url": "Patient"
            }
        }
    ]
}
with open("C:\\Users\\tonim\\Downloads\\output\\test_patient.json", "w", encoding="utf-8") as f:
    json.dump(test_patient_data, f, indent=2)
test_file = r"C:\Users\tonim\Downloads\output\test_patient.json"
#test_file = r"C:\Users\tonim\Downloads\output\fhir\Patients\Floyd420_Streich926_42f4db2f-b049-c9a1-a961-7a944ea72e48.json"
#patients_output_file = r"C:\Users\tonim\Downloads\output\parquet\patients.parquet"
# 
conn = psycopg2.connect("dbname=FHIR_staging user=postgres password=new_password host=localhost")   
patients_data = [clean_nested(p) for p in (parse_patient_file(test_file))]

insert_patients(patients_data, conn)

if not patients_data:
        Main_logger.debug("No patient records found.")
else:
        Main_logger.debug(f"Parsed {len(patients_data)} patients.")

#conn = psycopg2.connect("dbname=FHIR_staging user=youruser password=new_password host=localhost")








addy: 838 Vandervort Loaf Apt 89


In [None]:
import json
import logging

# --- Helpers ---
def clean_nested(data):
    """Recursively replace empty dicts/lists with None."""
    if isinstance(data, dict):
        if not data:
            return None
        return {k: clean_nested(v) for k, v in data.items()}
    elif isinstance(data, list):
        if not data:
            return None
        return [clean_nested(v) for v in data]
    return data



def safe_get(d, *keys, default=None):
    """Safely extract nested values from a dict."""
    for k in keys:
        if isinstance(d, dict):
            d = d.get(k, default)
        else:
            return default
    return d

def extract_identifier(resource, code):
    """Extract an identifier of a given type code (SS, DL, PPN)."""
    for ident in resource.get("identifier", []):
        type_code = safe_get(ident.get("type", {}).get("coding", [{}])[0], "code")
        if type_code == code:
            return ident.get("value")
    return None

def extract_extensions(resource, url):
    """Extract FHIR extensions matching a URL."""
    result = {}
    for ext in resource.get("extension", []) or []:
        if ext.get("url") == url:
            for inner in ext.get("extension", []) or []:
                key = inner.get("url")
                value = inner.get("valueString") or safe_get(inner, "valueCoding", "display")
                if key:
                    result[key] = value
    return result if result else None

def extract_geolocation(address):
    """Extract latitude/longitude from a FHIR address extension."""
    latitude = longitude = None
    for ext in address.get("extension", []) or []:
        if ext.get("url") == "http://hl7.org/fhir/StructureDefinition/geolocation":
            for geo in ext.get("extension", []) or []:
                if geo.get("url") == "latitude":
                    latitude = geo.get("valueDecimal")
                elif geo.get("url") == "longitude":
                    longitude = geo.get("valueDecimal")
    return latitude, longitude


#logging.basicConfig(level=logging.INFO)
def insert_practitioners(practitioner_list, conn):
    """
        Inserts patients, addresses, and telecoms into Postgres.
        Expects a list of patient dicts (already parsed).
    """
    insertPatients_logger = logger(
        "INS_patients", 
        "C:\\Users\\tonim\\Downloads\\output\\fhir\\ins_practitioner_log"
    )
    
    cur = conn.cursor()

    # Queries
    practitioner_query = """
    INSERT INTO fhir_staging.practitioners_fhir_raw (
        practitioner_id, first_name, last_name, prefix, gender, birth_date, 
        resource
    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (patient_id) DO NOTHING
    """

    address_query = """
    INSERT INTO fhir_staging.practioner_addresses (
        practitioner_id, line, city, state, postal_code, country, latitude, longitude
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """

    telecom_query = """
    INSERT INTO fhir_staging.practitioner_telecoms (
        patient_id, system, value, use, extension
    ) VALUES (%s, %s, %s, %s, %s)
    """

for practitioner in practioner_list:
        # Insert patient
        cur.execute(practitioner_query, (
            patient.get("patient_id"),
            patient.get("first_name"),
            patient.get("last_name"),
            patient.get("prefix"),
            patient.get("gender"),
            patient.get("birth_date"),
            patient.get("deceased_date_time"),
            patient.get("ssn"),
            patient.get("drivers_license"),
            patient.get("passport"),
            patient.get("marital_status"),
            patient.get("race"),
            patient.get("ethnicity"),
            patient.get("birth_place"),
            json.dumps(patient.get("resource")) if patient.get("resource") else None
        ))



def parse_practitioner_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

        if "entry" not in data:
            return []

    practitioners = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Practitioner":
            continue
        id = resource.get("id")
        names = resource.get("name", [])
        name = names[0] if names else {}
        given_name = safe_get(name.get("given", [None])[0])
        last_name = name.get("family")
        prefix = safe_get(name.get("prefix", [None])[0])
        gender = resource.get("gender"),
        extensions = safe_get(resource, "extension", default=[])
        identifiers = safe_get(resource, "identifier", default=[])
       
        address = resource.get("address", [])

        telecom = resource.get("telecom", [])
        practitioners.append({
            "id" : id,
            "first_name" : given_name,
            "last_name" : last_name,
            "prefix" : prefix,
            "gender" : gender,
            "extensions" : extensions,
            "identifiers" : identifiers,
            "address" : address,
            "telecom": telecom
        })
        
    return practitioners

if __name__ == "__main__":

    Main_logger = logger(
        "Main", 
        "C:\\Users\\tonim\\Downloads\\output\\fhir\\Main_log"
    )
    #with open("C:\\Users\\tonim\\Downloads\\output\\test_patient.json", "w", encoding="utf-8") as f:
    #json.dump(test_patient_data, f, indent=2)
    
    test_file = r"C:\Users\tonim\Downloads\output\fhir\Practitioner\practitionerInformation1755716738658.json"

    practitioner_data = [clean_nested(p) for p in (parse_practitioner_file(test_file))]

    if not practitioner_data:
        Main_logger.debug("No practitioner records found.")
    else:
        Main_logger.debug(f"Parsed {len(practitioner_data)} practitioners.")


In [None]:
def parse_encounter_file(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    if "entry" not in data:
        return []

    encounters = []
    for entry in data["entry"]:
        resource = entry.get("resource", {})
        if resource.get("resourceType") != "Encounter":
            continue

        coding = get_first_coding(resource, "type")
        encounters.append({
            "id": resource.get("id"),
            "patient_ref": safe_get(resource, "subject", "reference"),
            "status": resource.get("status"),
            "code": coding.get("code"),
            "description": coding.get("display"),
            "start_date_time": safe_get(resource, "period", "start"),
            "end_date_time": safe_get(resource, "period", "end")
        })
    return encounters

In [None]:
#test code
import json
import logging

logging.basicConfig(level=logging.INFO)

test_data = {
    "entry": [
        {
            "resource": {
                "resourceType": "DiagnosticReport",
                "id": "dr-123",
                "code": {
                    "coding": [
                        {
                            "system": "http://loinc.org",
                            "code": "12345-6",
                            "display": "CBC Panel"
                        }
                    ]
                },
                "subject": {"reference": "Patient/p-1"},
                "encounter": {"reference": "Encounter/e-1"},
                "effectiveDateTime": "2025-08-28T14:00:00Z",
                "result": [
                    {"reference": "Observation/o-1", "display": "Hemoglobin"},
                    {"reference": "Observation/o-2", "display": "Platelet count"}
                ]
            }
        }
    ]
}

# Save to file for testing
with open("test_diagnostic.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=2)

# Run your function
parsed = parse_diagnostic_reports("test_diagnostic.json")
print(json.dumps(parsed, indent=2))
