In [12]:
import psycopg2
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
#from tqdm import tqdm
from datetime import datetime, timezone
#import datetime  # module
#from datetime import datetime as dt  # class, aliased to avoid conflict
#from datetime import datetime as tz
#import datetime
import json

# Path to your service account JSON key
key_path = "/Users/toniventura/keys/bq_key.json" 

# Create credentials and BigQuery client
#credentials = service_account.Credentials.from_service_account_file(key_path)
credentials = service_account.Credentials.from_service_account_file(key_path)

# Postgres config
PG_CONFIG = {
    "host": "localhost",
    "port": 5432,
    "database": "fhir",
    "user": "toniventura",
    "password": "fhir_project"
}
#records =[]

# BigQuery config
#BQ_PROJECT = "your-gcp-project"
BQ_PROJECT = "fhir-synthea-data"
BQ_DATASET = "fhir_curated"
#client = bigquery.Client(project=BQ_PROJECT)
client = bigquery.Client(project="fhir-synthea-data", credentials=credentials)
#client = bigquery.Client(project=BQ_PROJECT, credentials=credentials)
dataset_ref = bigquery.Dataset(f"{BQ_PROJECT}.{BQ_DATASET}")

# Helper: fetch staged data
def fetch_staged_data(table, batch_size=10000):
    try:
        conn = psycopg2.connect(**PG_CONFIG)
        cur = conn.cursor()
        #cur.execute(f"SELECT * FROM fhir_staging.{table}")
        cur.execute("SELECT * FROM fhir_staging.patients_fhir_raw LIMIT 5;")
        while True:
            rows = cur.fetchmany(batch_size)
            print(f"rows: {rows}")
            if not rows:
                break
            yield rows
        cur.close()
        conn.close()
    except Exception as e:
        print(f"Postgres connection or query failed: {e}")
        


# Helper: insert dataframe into BigQuery
def insert_to_bq(df, table_name):
    table_id = f"{BQ_PROJECT}.{BQ_DATASET}.{table_name}"
    job = client.load_table_from_dataframe(df, table_id)
    job.result()  # wait for completion

# Example: Transform & load Patients
def transform_patients(rows):
    #print("transforming data")
    records = []
    print(f"rows: {len(rows)}")
    for r in rows:
        print("inside loop")
        rid, resource = r[1], r[2] # adjust index if needed
        #print(f"rid: {rid}")
        #print(f"resource: {resource}")
        #birth_date = resource.get("birthdate")
        records.append({
            "patient_id": rid,
            "first_name": resource.get("name", [{}])[0].get("given", [""])[0],
            "last_name": resource.get("name", [{}])[0].get("family", ""),
            #"birth_date": datetime.date.fromisoformat(resource.get("birthDate"))
                #if resource.get("birthDate") else None,
            "birth_date": datetime.fromisoformat(resource.get("birthDate"))
                if resource.get("birthDate") else None,
            "gender": resource.get("gender"),
            #"load_timestamp" : datetime.datetime.utcnow()
            "load_timestamp": datetime.now(timezone.utc)
        })
    return pd.DataFrame(records)

# Main ETL loop
def etl_patients():
    try:
        for batch in tqdm(fetch_staged_data("patients_fhir_raw")):
            df = transform_patients(batch)
            if not df.empty:
                insert_to_bq(df, "patients")
                print("***Inserting***")
    except Exception as e:
        print(f"Error in patients ETL: {e}")

if __name__ == "__main__":
    etl_patients()




0it [00:00, ?it/s]

rows: [(1, 'fd31932d-adf2-5412-7e68-96fd10ab541e', {'id': 'fd31932d-adf2-5412-7e68-96fd10ab541e', 'meta': {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient']}, 'name': [{'use': 'official', 'given': ['Aaron697', 'Reginald96'], 'family': 'Bednar518'}], 'text': {'div': '<div xmlns="http://www.w3.org/1999/xhtml">Generated by <a href="https://github.com/synthetichealth/synthea">Synthea</a>.Version identifier: 3a65f56\n .   Person seed: -4497557701425379834  Population seed: 1755716738658</div>', 'status': 'generated'}, 'gender': 'male', 'address': [{'city': 'Newton', 'line': ['662 Ryan Mission'], 'state': 'MA', 'country': 'US', 'extension': [{'url': 'http://hl7.org/fhir/StructureDefinition/geolocation', 'extension': [{'url': 'latitude', 'valueDecimal': 42.40281601687627}, {'url': 'longitude', 'valueDecimal': -71.21917372631331}]}], 'postalCode': '02472'}], 'telecom': [{'use': 'home', 'value': '555-235-2967', 'system': 'phone'}], 'birthDate': '2020-12-29', 'extens

1it [00:03,  3.53s/it]

***Inserting***
rows: []





In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
from datetime import datetime



# Path to your service account JSON key
key_path = "/Users/toniventura/keys/bq_key.json"  # 

# Create credentials and BigQuery client
credentials = service_account.Credentials.from_service_account_file(key_path)
BQ_PROJECT = "fhir-synthea-data"
BQ_DATASET = "fhir_curated"
client = bigquery.Client(project=BQ_PROJECT, credentials=credentials)

# Create dataset if it doesn't exist
dataset_ref = bigquery.Dataset(f"{BQ_PROJECT}.{BQ_DATASET}")
try:
    client.get_dataset(dataset_ref)  # Check if dataset exists
    print(f"Dataset '{BQ_DATASET}' already exists.")
except:
    dataset = bigquery.Dataset(dataset_ref)
    dataset.location = "US"  # Choose your region
    client.create_dataset(dataset)
    print(f"Dataset '{BQ_DATASET}' created.")

# 4️⃣ Define table schema
table_id = f"{BQ_PROJECT}.{BQ_DATASET}.patients"
schema = [
    bigquery.SchemaField("patient_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("first_name", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("last_name", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("birth_date", "DATE", mode="NULLABLE"),
    bigquery.SchemaField("gender", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("load_timestamp", "TIMESTAMP", mode="REQUIRED")
]

# 5️⃣ Create table if it doesn't exist
try:
    client.get_table(table_id)
    print(f"Table 'patients' already exists.")
except:
    table = bigquery.Table(table_id, schema=schema)
    client.create_table(table)
    print(f"Table 'patients' created.")

# 6️⃣ Insert a sample row
rows_to_insert = [
    {
        "patient_id": "p001",
        "first_name": "John",
        "last_name": "Doe",
        "birth_date": "1980-01-01",
        "gender": "M",
        "load_timestamp": datetime.utcnow().isoformat()
    }
]

errors = client.insert_rows_json(table_id, rows_to_insert)
if errors:
    print("Encountered errors while inserting rows:", errors)
else:
    print("Sample row inserted successfully.")

# 7️⃣ Query the table to verify
query = f"SELECT * FROM `{table_id}` LIMIT 5"
df = client.query(query).to_dataframe()
print(df)
