In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import text
import time
from io import StringIO
from google.cloud import storage
import datetime
from google.cloud import bigquery
import pytz
from pandas_gbq import to_gbq

In [2]:
#setup big query 
client = bigquery.Client()

In [14]:
#function to load dataframe to bigquery
def load_dataframe_to_bigquery(df, full_table_id, project_id, if_exists='replace'):
    """
    Load a DataFrame to a Google BigQuery table and return a status message.

    Parameters:
    df (DataFrame): The pandas DataFrame to load.
    dataset_id (str): The dataset ID in BigQuery.
    table_id (str): The table ID within the dataset.
    project_id (str): Your Google Cloud project ID.
    if_exists (str): Action to take if the table already exists. Options are 'fail', 'replace', 'append'.

    Returns:
    str: Success or error message.
    """
    try:
        # Load the DataFrame to the BigQuery table
        to_gbq(df, full_table_id, project_id=project_id, if_exists=if_exists, progress_bar=True)

        # Optionally, check the number of rows in the BigQuery table
        client = bigquery.Client(project=project_id)
        query = f"SELECT COUNT(*) as total_rows FROM `{full_table_id}`"
        query_job = client.query(query)
        results = query_job.result()

        for row in results:
            total_rows = row.total_rows

        return f"Load successful. Total rows in the table now: {total_rows}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

In [36]:
#load cpt_hcpcs.json into dataframe
# Set service account key
service_account_key_path = 'different-state-hospital-price-fd662d2f48c2.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path

# Specify JSON file path
bucket_path = 'gs://different-state-hospital-prices/cpt_hcpcs.json'

# Read JSON file into DataFrame
df_cpt_hcpcs = pd.read_json(bucket_path, lines=True)

print(df_cpt_hcpcs.info())
df_cpt_hcpcs.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               object
 1   short_description  object
 2   long_description   object
dtypes: object(3)
memory usage: 75.3+ MB
None


Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [37]:
#cpt_hcpcs transformation
df_cpt_hcpcs = df_cpt_hcpcs.astype({
    'code': 'string',
    'short_description': 'string',
    'long_description': 'string'
})
df_cpt_hcpcs = df_cpt_hcpcs.fillna('None')

df_cpt_hcpcs = df_cpt_hcpcs.drop_duplicates()

print(df_cpt_hcpcs.info())
df_cpt_hcpcs.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               string
 1   short_description  string
 2   long_description   string
dtypes: string(3)
memory usage: 75.3 MB
None


Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [38]:
#load hospitals.json into dataframe
bucket_path = 'gs://different-state-hospital-prices/hospitals.json'
df_hospitals = pd.read_json(bucket_path, lines=True)

print(df_hospitals.info())
df_hospitals.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   npi_number      1400 non-null   object 
 1   name            1400 non-null   object 
 2   url             1400 non-null   object 
 3   street_address  1351 non-null   object 
 4   city            1379 non-null   object 
 5   state           1377 non-null   object 
 6   zip_code        1357 non-null   object 
 7   publish_date    610 non-null    float64
dtypes: float64(1), object(7)
memory usage: 87.6+ KB
None


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code,publish_date
0,1003139775.0,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,,1609459000000.0
1,1003260480.0,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209,
2,1003281452.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,1609459000000.0
3,1003362997.0,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510,1609459000000.0
4,1003389206.0,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020,


In [39]:
#hospitals transformation
#drop the uncessary column
df_hospitals = df_hospitals.drop(columns = ['publish_date'])

#Unify npi_number
df_hospitals['npi_number'] = df_hospitals['npi_number'].astype(str)
df_hospitals['npi_number'] = df_hospitals['npi_number'].str.replace(r'\D', '', regex=True)
df_hospitals['npi_number'] = df_hospitals['npi_number'].apply(lambda x: x[:10] if len(x) >= 10 else None).astype('string')

#unify zip_code
df_hospitals['zip_code'] = df_hospitals['zip_code'].astype('string')
df_hospitals['zip_code'] = df_hospitals['zip_code'].str.replace('.0','')

#Reference data for valid states
valid_states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# Validates standard 5-digit or ZIP+4 formats
zip_code_pattern = r'^\d{5}(-\d{4})?$'

#Validates standard npi number
npi_number_pattern = r'\d{10}$'

# Filter DataFrame based on valid states and zip code pattern
df_hospitals = df_hospitals[df_hospitals['state'].isin(valid_states) & df_hospitals['zip_code'].str.match(zip_code_pattern) & df_hospitals['npi_number'].str.match(npi_number_pattern)]

# covert other types using astype
df_hospitals = df_hospitals.astype({
    'name': 'string',
    'url': 'string',
    'street_address':'string',
    'city':'string',
    'state':'string',
})

df_hospitals = df_hospitals.fillna('None')

df_hospitals = df_hospitals.drop_duplicates(subset=['npi_number'])
print(df_hospitals.info())
df_hospitals.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1226 entries, 1 to 1399
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   npi_number      1226 non-null   string
 1   name            1226 non-null   string
 2   url             1226 non-null   string
 3   street_address  1226 non-null   string
 4   city            1226 non-null   string
 5   state           1226 non-null   string
 6   zip_code        1226 non-null   string
dtypes: string(7)
memory usage: 76.6 KB
None


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code
1,1003260480,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209
2,1003281452,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011
3,1003362997,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510
4,1003389206,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020
5,1003811290,Providence Health,http://www.yourprovidencehealth.com/Content/Up...,2435 Forest Drive,Columbia,SC,29204


In [8]:
#load prices.json into dataframe
bucket_path = 'gs://different-state-hospital-prices/prices.json'

df_prices = pd.DataFrame()

# Create a JSON reader
json_reader = pd.read_json(bucket_path, lines=True, chunksize=100000)

# Process each chunk
start_time_total = time.time()

for i, chunk in enumerate(json_reader):
    # Start time for current loop
    start_time_loop = time.time()
    
    # Clean 'npi_number' column
    chunk['npi_number'] = chunk['npi_number'].astype(str)
    chunk['npi_number'] = chunk['npi_number'].str.replace(r'\D', '')
    chunk['npi_number'] = chunk['npi_number'].apply(lambda x: x[:10] if len(x) >= 10 else None).astype('string')

    # change dtype
    chunk = chunk.astype({
        'code': 'string',
        'payer': 'string'
    })
    
    # Append to dataframe
    df_prices = pd.concat([df_prices, chunk], ignore_index=True)
    
    # End time for current loop
    end_time_loop = time.time()
    
    # Print time taken for current loop
    print(f"Loop {i+1} time taken: {end_time_loop - start_time_loop:.2f} seconds")
    
    # Print memory usage
    print(f"Loop {i+1} memory usage: {df_prices.memory_usage().sum() / 1024 / 1024:.2f} MB")

df_prices = df_prices[df_prices['npi_number'].str.match(npi_number_pattern)]
    
df_prices = df_prices.fillna('none')

df_prices = df_prices.drop_duplicates()

df_prices['price'] = pd.to_numeric(df_prices['price'], errors='coerce')
# End time
end_time_total = time.time()

# Calculate total time taken
total_time = end_time_total - start_time_total

print(f"Total time taken: {total_time} seconds")
print(df_prices.head())
print(df_prices.info())

Loop 1 time taken: 0.06 seconds
Loop 1 memory usage: 3.05 MB
Loop 2 time taken: 0.10 seconds
Loop 2 memory usage: 6.10 MB
Loop 3 time taken: 0.17 seconds
Loop 3 memory usage: 9.16 MB
Loop 4 time taken: 0.19 seconds
Loop 4 memory usage: 12.21 MB
Loop 5 time taken: 0.18 seconds
Loop 5 memory usage: 15.26 MB
Loop 6 time taken: 0.21 seconds
Loop 6 memory usage: 18.31 MB
Loop 7 time taken: 0.22 seconds
Loop 7 memory usage: 21.36 MB
Loop 8 time taken: 0.24 seconds
Loop 8 memory usage: 24.41 MB
Loop 9 time taken: 0.27 seconds
Loop 9 memory usage: 27.47 MB
Loop 10 time taken: 0.29 seconds
Loop 10 memory usage: 30.52 MB
Loop 11 time taken: 0.31 seconds
Loop 11 memory usage: 33.57 MB
Loop 12 time taken: 0.31 seconds
Loop 12 memory usage: 36.62 MB
Loop 13 time taken: 0.34 seconds
Loop 13 memory usage: 39.67 MB
Loop 14 time taken: 0.38 seconds
Loop 14 memory usage: 42.72 MB
Loop 15 time taken: 0.38 seconds
Loop 15 memory usage: 45.78 MB
Loop 16 time taken: 0.39 seconds
Loop 16 memory usage: 48.83 

Loop 128 time taken: 4.79 seconds
Loop 128 memory usage: 390.63 MB
Loop 129 time taken: 3.12 seconds
Loop 129 memory usage: 393.68 MB
Loop 130 time taken: 3.42 seconds
Loop 130 memory usage: 396.73 MB
Loop 131 time taken: 3.41 seconds
Loop 131 memory usage: 399.78 MB
Loop 132 time taken: 3.36 seconds
Loop 132 memory usage: 402.83 MB
Loop 133 time taken: 3.88 seconds
Loop 133 memory usage: 405.88 MB
Loop 134 time taken: 3.19 seconds
Loop 134 memory usage: 408.94 MB
Loop 135 time taken: 3.94 seconds
Loop 135 memory usage: 411.99 MB
Loop 136 time taken: 3.53 seconds
Loop 136 memory usage: 415.04 MB
Loop 137 time taken: 4.12 seconds
Loop 137 memory usage: 418.09 MB
Loop 138 time taken: 3.91 seconds
Loop 138 memory usage: 421.14 MB
Loop 139 time taken: 3.39 seconds
Loop 139 memory usage: 424.19 MB
Loop 140 time taken: 3.81 seconds
Loop 140 memory usage: 427.25 MB
Loop 141 time taken: 3.57 seconds
Loop 141 memory usage: 430.30 MB
Loop 142 time taken: 3.89 seconds
Loop 142 memory usage: 433.3

Loop 251 time taken: 8.62 seconds
Loop 251 memory usage: 765.99 MB
Loop 252 time taken: 8.60 seconds
Loop 252 memory usage: 769.04 MB
Loop 253 time taken: 8.74 seconds
Loop 253 memory usage: 772.09 MB
Loop 254 time taken: 8.60 seconds
Loop 254 memory usage: 775.15 MB
Loop 255 time taken: 8.64 seconds
Loop 255 memory usage: 778.20 MB
Loop 256 time taken: 8.55 seconds
Loop 256 memory usage: 781.25 MB
Loop 257 time taken: 9.10 seconds
Loop 257 memory usage: 784.30 MB
Loop 258 time taken: 9.40 seconds
Loop 258 memory usage: 787.35 MB
Loop 259 time taken: 9.33 seconds
Loop 259 memory usage: 790.41 MB
Loop 260 time taken: 10.02 seconds
Loop 260 memory usage: 793.46 MB
Loop 261 time taken: 9.81 seconds
Loop 261 memory usage: 796.51 MB
Loop 262 time taken: 9.69 seconds
Loop 262 memory usage: 799.56 MB
Loop 263 time taken: 9.88 seconds
Loop 263 memory usage: 802.61 MB
Loop 264 time taken: 9.99 seconds
Loop 264 memory usage: 805.66 MB
Loop 265 time taken: 9.80 seconds
Loop 265 memory usage: 808.

Loop 373 time taken: 8.55 seconds
Loop 373 memory usage: 1138.31 MB
Loop 374 time taken: 8.50 seconds
Loop 374 memory usage: 1141.36 MB
Loop 375 time taken: 8.82 seconds
Loop 375 memory usage: 1144.41 MB
Loop 376 time taken: 8.95 seconds
Loop 376 memory usage: 1147.46 MB
Loop 377 time taken: 8.97 seconds
Loop 377 memory usage: 1150.51 MB
Loop 378 time taken: 9.00 seconds
Loop 378 memory usage: 1153.56 MB
Loop 379 time taken: 9.35 seconds
Loop 379 memory usage: 1156.62 MB
Loop 380 time taken: 9.31 seconds
Loop 380 memory usage: 1159.67 MB
Loop 381 time taken: 9.02 seconds
Loop 381 memory usage: 1162.72 MB
Loop 382 time taken: 9.15 seconds
Loop 382 memory usage: 1165.77 MB
Loop 383 time taken: 9.40 seconds
Loop 383 memory usage: 1168.82 MB
Loop 384 time taken: 9.44 seconds
Loop 384 memory usage: 1171.88 MB
Loop 385 time taken: 10.34 seconds
Loop 385 memory usage: 1174.93 MB
Loop 386 time taken: 9.93 seconds
Loop 386 memory usage: 1177.98 MB
Loop 387 time taken: 8.90 seconds
Loop 387 memo

Loop 493 time taken: 18.37 seconds
Loop 493 memory usage: 1504.52 MB
Loop 494 time taken: 17.44 seconds
Loop 494 memory usage: 1507.57 MB
Loop 495 time taken: 17.28 seconds
Loop 495 memory usage: 1510.62 MB
Loop 496 time taken: 18.01 seconds
Loop 496 memory usage: 1513.67 MB
Loop 497 time taken: 17.33 seconds
Loop 497 memory usage: 1516.72 MB
Loop 498 time taken: 15.89 seconds
Loop 498 memory usage: 1519.78 MB
Loop 499 time taken: 15.60 seconds
Loop 499 memory usage: 1522.83 MB
Loop 500 time taken: 14.83 seconds
Loop 500 memory usage: 1525.88 MB
Loop 501 time taken: 17.38 seconds
Loop 501 memory usage: 1528.93 MB
Loop 502 time taken: 18.05 seconds
Loop 502 memory usage: 1531.98 MB
Loop 503 time taken: 19.38 seconds
Loop 503 memory usage: 1535.03 MB
Loop 504 time taken: 18.78 seconds
Loop 504 memory usage: 1538.09 MB
Loop 505 time taken: 19.64 seconds
Loop 505 memory usage: 1541.14 MB
Loop 506 time taken: 18.90 seconds
Loop 506 memory usage: 1544.19 MB
Loop 507 time taken: 17.47 seconds

Loop 612 time taken: 17.17 seconds
Loop 612 memory usage: 1867.68 MB
Loop 613 time taken: 18.79 seconds
Loop 613 memory usage: 1870.73 MB
Loop 614 time taken: 18.86 seconds
Loop 614 memory usage: 1873.78 MB
Loop 615 time taken: 20.79 seconds
Loop 615 memory usage: 1876.83 MB
Loop 616 time taken: 19.68 seconds
Loop 616 memory usage: 1879.88 MB
Loop 617 time taken: 20.26 seconds
Loop 617 memory usage: 1882.93 MB
Loop 618 time taken: 20.44 seconds
Loop 618 memory usage: 1885.99 MB
Loop 619 time taken: 20.16 seconds
Loop 619 memory usage: 1889.04 MB
Loop 620 time taken: 17.61 seconds
Loop 620 memory usage: 1892.09 MB
Loop 621 time taken: 17.60 seconds
Loop 621 memory usage: 1895.14 MB
Loop 622 time taken: 18.05 seconds
Loop 622 memory usage: 1898.19 MB
Loop 623 time taken: 20.41 seconds
Loop 623 memory usage: 1901.25 MB
Loop 624 time taken: 20.26 seconds
Loop 624 memory usage: 1904.30 MB
Loop 625 time taken: 21.07 seconds
Loop 625 memory usage: 1907.35 MB
Loop 626 time taken: 20.50 seconds

In [40]:
#parper for load
# Adding surrogate keys for each dimension table
df_cpt_hcpcs['cpt_id'] = np.arange(1, len(df_cpt_hcpcs) + 1)
df_hospitals['hospital_id'] = np.arange(1, len(df_hospitals) + 1)

# Mapping foreign keys in the fact table
df_prices['cpt_id'] = df_prices['code'].map(df_cpt_hcpcs.set_index('code')['cpt_id'])
df_prices['hospital_id'] = df_prices['npi_number'].map(df_hospitals.set_index('npi_number')['hospital_id'])

# Creating a unique list of payers and mapping payer_id
unique_payers = pd.DataFrame(df_prices['payer'].unique(), columns=['payer'])
unique_payers['payer_id'] = np.arange(1, len(unique_payers) + 1)
df_prices['payer_id'] = df_prices['payer'].map(unique_payers.set_index('payer')['payer_id'])

# Prepare final DataFrames for the new schema
dim_cpt_hcpcs = df_cpt_hcpcs[['cpt_id', 'code', 'short_description', 'long_description']]
dim_hospitals = df_hospitals[['hospital_id', 'npi_number', 'name', 'url', 'street_address', 'city', 'state', 'zip_code']]
dim_payer = unique_payers[['payer_id', 'payer']]
fact_prices = df_prices[['cpt_id', 'hospital_id', 'payer_id', 'price']]

print(dim_cpt_hcpcs.info())
print(dim_hospitals.info())
print(dim_payer.info())
print(fact_prices.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 4 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   cpt_id             int32 
 1   code               string
 2   short_description  string
 3   long_description   string
dtypes: int32(1), string(3)
memory usage: 87.8 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 1226 entries, 1 to 1399
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   hospital_id     1226 non-null   int32 
 1   npi_number      1226 non-null   string
 2   name            1226 non-null   string
 3   url             1226 non-null   string
 4   street_address  1226 non-null   string
 5   city            1226 non-null   string
 6   state           1226 non-null   string
 7   zip_code        1226 non-null   string
dtypes: int32(1), string(7)
memory usage: 81.4 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6

In [15]:
# Load DimCptHcpcs to the BigQuery table
result = load_dataframe_to_bigquery(dim_cpt_hcpcs, 'different_states_hospital_price.DimCptHcpcs', 'different-state-hospital-price')
print(result)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Load successful. Total rows in the table now: 3287818


In [16]:
# Load DimHospitals to the BigQuery table
result = load_dataframe_to_bigquery(dim_hospitals, 'different_states_hospital_price.DimHospitals', 'different-state-hospital-price', if_exists='replace')
print(result)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Load successful. Total rows in the table now: 1226


In [19]:
# Load DimPayer to the BigQuery table
result = load_dataframe_to_bigquery(dim_payer, 'different_states_hospital_price.DimPayer', 'different-state-hospital-price', if_exists='replace')
print(result)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Load successful. Total rows in the table now: 6894


In [31]:
# Load FactPrices to the BigQuery table
result = load_dataframe_to_bigquery(fact_prices, 'different_states_hospital_price.FactPrices', 'different-state-hospital-price', if_exists='replace')
print(result)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Load successful. Total rows in the table now: 72095169
