In [None]:
#----Patient ETL
import psycopg2
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
#from tqdm import tqdm
from datetime import datetime, timezone
#import datetime  # module
#from datetime import datetime as dt  # class, aliased to avoid conflict
#from datetime import datetime as tz
#import datetime
import json

# Path to your service account JSON key
key_path = "/Users/toniventura/keys/bq_key.json" 

# Create credentials and BigQuery client
#credentials = service_account.Credentials.from_service_account_file(key_path)
credentials = service_account.Credentials.from_service_account_file(key_path)

# Postgres config
PG_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "database": "fhir",
    "user": "toniventura",
    "password": "fhir_project"
}
#records =[]

# BigQuery config
#BQ_PROJECT = "your-gcp-project"
BQ_PROJECT = "fhir-synthea-data"
BQ_DATASET = "fhir_curated"
#client = bigquery.Client(project=BQ_PROJECT)
client = bigquery.Client(project="fhir-synthea-data", credentials=credentials)
#client = bigquery.Client(project=BQ_PROJECT, credentials=credentials)
dataset_ref = bigquery.Dataset(f"{BQ_PROJECT}.{BQ_DATASET}")

# Helper: fetch staged data
def fetch_staged_data(table, batch_size=10000):
    try:
        conn = psycopg2.connect(**PG_CONFIG)
        cur = conn.cursor()
        #cur.execute(f"SELECT * FROM fhir_staging.{table}")
        cur.execute("SELECT * FROM fhir_staging.patients_fhir_raw LIMIT 5;")
        while True:
            rows = cur.fetchmany(batch_size)
            print(f"rows: {rows}")
            if not rows:
                break
            yield rows
        cur.close()
        conn.close()
    except Exception as e:
        print(f"Postgres connection or query failed: {e}")
        


# Helper: insert dataframe into BigQuery
def insert_to_bq(df, table_name):
    table_id = f"{BQ_PROJECT}.{BQ_DATASET}.{table_name}"
    job = client.load_table_from_dataframe(df, table_id)
    job.result()  # wait for completion

# Example: Transform & load Patients
def transform_patients(rows):
    #print("transforming data")
    records = []
    print(f"rows: {len(rows)}")
    for r in rows:
        print("inside loop")
        rid, resource = r[1], r[2] # adjust index if needed
        #print(f"rid: {rid}")
        #print(f"resource: {resource}")
        #birth_date = resource.get("birthdate")
        records.append({
            "patient_id": rid,
            "first_name": resource.get("name", [{}])[0].get("given", [""])[0],
            "last_name": resource.get("name", [{}])[0].get("family", ""),
            #"birth_date": datetime.date.fromisoformat(resource.get("birthDate"))
                #if resource.get("birthDate") else None,
            "birth_date": datetime.fromisoformat(resource.get("birthDate"))
                if resource.get("birthDate") else None,
            "gender": resource.get("gender"),
            #"load_timestamp" : datetime.datetime.utcnow()
            "load_timestamp": datetime.now(timezone.utc)
        })
    return pd.DataFrame(records)

# Main ETL loop
def etl_patients():
    try:
        for batch in tqdm(fetch_staged_data("patients_fhir_raw")):
            df = transform_patients(batch)
            if not df.empty:
                insert_to_bq(df, "patients")
                print("***Inserting***")
    except Exception as e:
        print(f"Error in patients ETL: {e}")

if __name__ == "__main__":
    etl_patients()




In [None]:
#---Practitioner ETL
import psycopg2
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
from datetime import datetime, timezone
import json
import pprint

# Path to your service account JSON key
key_path = "/Users/toniventura/keys/bq_key.json"

# Create credentials and BigQuery client
credentials = service_account.Credentials.from_service_account_file(key_path)
BQ_PROJECT = "fhir-synthea-data"
BQ_DATASET = "fhir_curated"
client = bigquery.Client(project=BQ_PROJECT, credentials=credentials)

# Postgres config
PG_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "database": "fhir",
    "user": "toniventura",
    "password": "fhir_project"
}

# Helper: fetch staged data
def fetch_staged_data(table, batch_size=10000):
    try:
        conn = psycopg2.connect(**PG_CONFIG)
        cur = conn.cursor()
        cur.execute(f"SELECT * FROM fhir_staging.{table} LIMIT 5;")
        while True:
            rows = cur.fetchmany(batch_size)
            if not rows:
                break
            yield rows
        cur.close()
        conn.close()
    except Exception as e:
        print(f"Postgres connection or query failed: {e}")

# Helper: insert dataframe into BigQuery
def insert_to_bq(df, table_name):
    table_id = f"{BQ_PROJECT}.{BQ_DATASET}.{table_name}"
    job = client.load_table_from_dataframe(df, table_id)
    job.result()

# Transform Practitioners
def transform_practitioners(rows):
    records = []
    for r in rows:
        rid, resource = r[1], r[2]  # adjust index if needed

        #Initialize variables so they exist even if not found
        npi = None
        license_number = None
        other_ids = []

    for ident in resource.get("identifier", []):
        # Debugging: show the whole identifier object
        pprint.pprint(ident)

        system = ident.get("system")
        value = ident.get("value")

        print(f"system: {system}")
        print(f"value: {value}")

        if system == "http://hl7.org/fhir/sid/us-npi":
            npi = value
        elif system == "http://example.org/license-number":
            license_number = value
        else:
            other_ids.append(value)


        print("---- Results ----")
        print(f"NPI: {npi}")
        print(f"License Number: {license_number}")
        print(f"Other IDs: {other_ids}")
        name_info = resource.get("name", [{}])[0]

        records.append({
            "practitioner_id": rid,
            "first_name": name_info.get("given", [""])[0],
            "last_name": name_info.get("family", ""),
            "prefix": name_info.get("prefix", [""])[0] if name_info.get("prefix") else None,
            "gender": resource.get("gender"),
            #"birth_date": datetime.fromisoformat(resource.get("birthDate")) if resource.get("birthDate") else None,
            "npi": npi,
            "license_number": license_number,
            "primary_email": next((t.get("value") for t in resource.get("telecom", []) if t.get("system") == "email"), None),
            "primary_phone": next((t.get("value") for t in resource.get("telecom", []) if t.get("system") == "phone"), None),
            "load_timestamp": datetime.now(timezone.utc)
        })
    return pd.DataFrame(records)

# Main ETL loop
def etl_practitioners():
    try:
        for batch in fetch_staged_data("practitioners_fhir_raw"):
            df = transform_practitioners(batch)
            if not df.empty:
                insert_to_bq(df, "practitioners")
                print("***Inserted batch***")
    except Exception as e:
        print(f"Error in practitioners ETL: {e}")

if __name__ == "__main__":
    etl_practitioners()


{'system': 'http://hl7.org/fhir/sid/us-npi', 'value': '9999928192'}
system: http://hl7.org/fhir/sid/us-npi
value: 9999928192
---- Results ----
NPI: 9999928192
License Number: None
Other IDs: []
***Inserted batch***


In [15]:
#---Practitioner Roles ETL
import psycopg2
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
from datetime import datetime, timezone
import json
import pprint

# Path to your service account JSON key
key_path = "/Users/toniventura/keys/bq_key.json"

# Create credentials and BigQuery client
credentials = service_account.Credentials.from_service_account_file(key_path)
BQ_PROJECT = "fhir-synthea-data"
BQ_DATASET = "fhir_curated"
client = bigquery.Client(project=BQ_PROJECT, credentials=credentials)

# Postgres config
PG_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "database": "fhir",
    "user": "toniventura",
    "password": "fhir_project"
}

def insert_to_bq(df, table_name):
    table_id = f"{BQ_PROJECT}.{BQ_DATASET}.{table_name}"
    job = client.load_table_from_dataframe(df, table_id)
    job.result()

# Transform Practitioners
def transform_practitioner_roles(rows):
    records = []
    for r in rows:
        rid, resource = r[1], r[2]

        # Initialize columns
        specialty_text = resource.get("specialty", {})[0].get("text")
        #specialty_code = resource.get("specialty", {})[0].get("code",[{}][0].get("coding",[{}][0].get("code")))
        specialty_code = resource.get("specialty", [{}])[0].get("coding", [{}])[0].get("code")
        role_text = resource.get("code", [{}])[0].get("text")
        role_code = resource.get("code", [{}])[0].get("coding", [{}])[0].get("code")
        
        print(specialty_code)
        print(specialty_text)
        print(role_text)
        print(role_code)

        

        records.append({
            "practitioner_role_id": rid,
            "practitioner_npi": resource.get("practitioner", {}).get("identifier").get("value"),
            "organization_id": resource.get("organization", {}).get("identifier").get("value"),
            "specialty_code": specialty_code,
            "specialty_text": specialty_text,
            "role_text" : role_text, 
            "role_code": role_code, 
            #                 if resource.get("telecom") and resource["telecom"][0].get("system")=="email" else None,
            #"role_text": resource.get("telecom", [{}])[0].get("value") 
                              #if resource.get("telecom") and resource["telecom"][0].get("system")=="phone" else None,
            "load_timestamp": datetime.now(timezone.utc)
        })

    return pd.DataFrame(records)


# Main ETL loop
def etl_practitioner_roles():
    try:
        for batch in fetch_staged_data("practitioner_roles_fhir_raw"):
            df = transform_practitioner_roles(batch)
            if not df.empty:
                insert_to_bq(df, "practitioner_roles")
                print("***Inserted batch***")
    except Exception as e:
        print(f"Error in practitioner roles ETL: {e}")

if __name__ == "__main__":
    etl_practitioner_roles()


208D00000X
General Practice Physician
General Practice Physician
208D00000X
208D00000X
General Practice Physician
General Practice Physician
208D00000X
208D00000X
General Practice Physician
General Practice Physician
208D00000X
208D00000X
General Practice Physician
General Practice Physician
208D00000X
208D00000X
General Practice Physician
General Practice Physician
208D00000X
***Inserted batch***


In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
from datetime import datetime



# Path to your service account JSON key
key_path = "/Users/toniventura/keys/bq_key.json"  # 

# Create credentials and BigQuery client
credentials = service_account.Credentials.from_service_account_file(key_path)
BQ_PROJECT = "fhir-synthea-data"
BQ_DATASET = "fhir_curated"
client = bigquery.Client(project=BQ_PROJECT, credentials=credentials)

# Create dataset if it doesn't exist
dataset_ref = bigquery.Dataset(f"{BQ_PROJECT}.{BQ_DATASET}")
try:
    client.get_dataset(dataset_ref)  # Check if dataset exists
    print(f"Dataset '{BQ_DATASET}' already exists.")
except:
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"  # Choose your region
    client.create_dataset(dataset)
    print(f"Dataset '{BQ_DATASET}' created.")

# 4️Define table schema
table_id = f"{BQ_PROJECT}.{BQ_DATASET}.patients"
schema = [
    bigquery.SchemaField("patient_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("first_name", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("last_name", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("birth_date", "DATE", mode="NULLABLE"),
    bigquery.SchemaField("gender", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("load_timestamp", "TIMESTAMP", mode="REQUIRED")
]

# Create table if it doesn't exist
try:
    client.get_table(table_id)
    print(f"Table 'patients' already exists.")
except:
    table = bigquery.Table(table_id, schema=schema)
    client.create_table(table)
    print(f"Table 'patients' created.")

# Insert a sample row
rows_to_insert = [
    {
        "patient_id": "p001",
        "first_name": "John",
        "last_name": "Doe",
        "birth_date": "1980-01-01",
        "gender": "M",
        "load_timestamp": datetime.utcnow().isoformat()
    }
]

errors = client.insert_rows_json(table_id, rows_to_insert)
if errors:
    print("Encountered errors while inserting rows:", errors)
else:
    print("Sample row inserted successfully.")

# 7️⃣ Query the table to verify
query = f"SELECT * FROM `{table_id}` LIMIT 5"
df = client.query(query).to_dataframe()
print(df)
