In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import sqlalchemy
from sqlalchemy import create_engine
import time
from tqdm.notebook import tqdm
from io import StringIO
from google.cloud import storage

In [2]:
#setup redshift conncetion
REDSHIFT_ENDPOINT = 'juntian-cluster.cl4tv7kd0trb.us-east-1.redshift.amazonaws.com:5439/dev'
REDSHIFT_USER = 'awsuser'
REDSHIFT_PASS = 'arn:aws:secretsmanager:us-east-1:381492032483:secret:redshift!different-states-hospital-price-awsuser-FrEDZG'
REDSHIFT_PORT = '5439'
REDSHIFT_DB = 'different-states-hospital-price'

# Create the connection
connection = f"postgresql+psycopg2://{REDSHIFT_USER}:{REDSHIFT_PASS}@{REDSHIFT_ENDPOINT}:{REDSHIFT_PORT}/{REDSHIFT_DB}"
        
#Create engine
engine = create_engine(connection)

In [3]:
#load cpt_hcpcs.json into dataframe
# Set service account key
service_account_key_path = 'different-state-hospital-price-fd662d2f48c2.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path

# Specify JSON file path
bucket_path = 'gs://different-state-hospital-prices/cpt_hcpcs.json'

# Read JSON file into DataFrame
cpt_hcpcs_df = pd.read_json(bucket_path, lines=True)

print(cpt_hcpcs_df.info())
cpt_hcpcs_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               object
 1   short_description  object
 2   long_description   object
dtypes: object(3)
memory usage: 75.3+ MB
None


Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [4]:
#cpt_hcpcs transformation
df_cpt_hcpcs = df_cpt_hcpcs.astype({
    'code': 'string',
    'short_description': 'string',
    'long_description': 'string'
})
df_cpt_hcpcs = df_cpt_hcpcs.fillna('None')

df_cpt_hcpcs = df_cpt_hcpcs.drop_duplicates()

print(df_cpt_hcpcs.info())
df_cpt_hcpcs.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               string
 1   short_description  string
 2   long_description   string
dtypes: string(3)
memory usage: 75.3 MB
None


Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [5]:
#load hospitals.json into dataframe
bucket_path = 'gs://different-state-hospital-prices/hospitals.json'
df_hospitals = pd.read_json(bucket_path, lines=True)

print(df_hospitals.info())
df_hospitals.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   npi_number      1400 non-null   object 
 1   name            1400 non-null   object 
 2   url             1400 non-null   object 
 3   street_address  1351 non-null   object 
 4   city            1379 non-null   object 
 5   state           1377 non-null   object 
 6   zip_code        1357 non-null   object 
 7   publish_date    610 non-null    float64
dtypes: float64(1), object(7)
memory usage: 87.6+ KB
None


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code,publish_date
0,1003139775.0,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,,1609459000000.0
1,1003260480.0,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209,
2,1003281452.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,1609459000000.0
3,1003362997.0,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510,1609459000000.0
4,1003389206.0,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020,


In [6]:
#hospitals transformation
#drop the uncessary column
hospitals_df = hospitals_df.drop(['publish_date'],axis='columns')

#Unify npi_number
hospitals_df['npi_number'] = hospitals_df['npi_number'].astype(str)
hospitals_df['npi_number'] = hospitals_df['npi_number'].str.replace(r'\D', '', regex=True)
hospitals_df['npi_number'] = hospitals_df['npi_number'].apply(lambda x: x[:10] if len(x) >= 10 else None).astype('string')

#unify zip_code
hospitals_df['zip_code'] = hospitals_df['zip_code'].astype(str)
hospitals_df['zip_code'] = hospitals_df['zip_code'].str.replace('.0','')

# covert other types using astype
hospitals_df = hospitals_df.astype({
    'name': 'string',
    'url': 'string',
    'street_address':'string',
    'city':'string',
    'state':'string',
})

hospitals_df = hospitals_df.fillna('None')

hospitals_df = hospitals_df.drop_duplicates()
print(hospitals_df.info())
hospitals_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1378 entries, 0 to 1399
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   npi_number      1378 non-null   string
 1   name            1378 non-null   string
 2   url             1378 non-null   string
 3   street_address  1378 non-null   string
 4   city            1378 non-null   string
 5   state           1378 non-null   string
 6   zip_code        1378 non-null   object
dtypes: object(1), string(6)
memory usage: 86.1+ KB
None


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code
0,1003139775,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,
1,1003260480,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209
2,1003281452,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011
3,1003362997,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510
4,1003389206,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020


In [8]:
#load prices.json into dataframe
bucket_path = 'gs://different-state-hospital-prices/prices.json'

df_prices = pd.DataFrame()

# Create a JSON reader
json_reader = pd.read_json(bucket_path, lines=True, chunksize=10000)

# Process each chunk
for chunk in json_reader:
    # Clean 'npi_number' column
    chunk['npi_number'] = chunk['npi_number'].astype(str)
    chunk['npi_number'] = chunk['npi_number'].str.replace(r'\D', '', regex=True)
    chunk['npi_number'] = chunk['npi_number'].apply(lambda x: x[:10] if len(x) >= 10 else None).astype('string')

    # change dtype
    chunk = chunk.astype({
        'code': 'string',
        'payer': 'string'
    })
    
    # Append to dataframe
    df_prices = pd.concat([df_prices, chunk], ignore_index=True)
    
    print(len(df_prices))
    
df_prices = df_prices.fillna('none')

df_prices = df_prices.drop_duplicates()

print(df_prices.head())
print(df_prices.info())

5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
345000
350000
355000
360000
365000
370000
375000
380000
385000
390000
395000
400000
405000
410000
415000
420000
425000
430000
435000
440000
445000


KeyboardInterrupt: 

In [None]:
#parper for load
cpt_hcpcs_df['cpt__hcpcs_id'] = np.arange(1, len(cpt_hcpcs_df) + 1)
hospitals_df['hospital_id'] = np.arange(1, len(hospitals_df) + 1)
prices_df['fact_id'] = np.arange(1, len(prices_df) + 1)

# Map foreign keys
code_to_cpt_id = dict(zip(df_cpt_hcpcs['code'], df_cpt_hcpcs['cpt_id']))
npi_to_hospital_id = dict(zip(df_hospitals['npi_number'], df_hospitals['hospital_id']))

df_prices['cpt_id'] = df_prices['code'].map(code_to_cpt_id)
df_prices['hospital_id'] = df_prices['npi_number'].map(npi_to_hospital_id)

# Create a new DataFrame for payers
df_payer = pd.DataFrame({
    'payer': pd.unique(df_prices['payer']),
    'payer_id': np.arange(1, len(pd.unique(df_prices['payer'])) + 1)
})
payer_to_payer_id = dict(zip(df_payer['payer'], df_payer['payer_id']))
df_prices['payer_id'] = df_prices['payer'].map(payer_to_payer_id)

# Final DataFrames for the new schema
dim_cpt_hcpcs = df_cpt_hcpcs[['cpt_id', 'code', 'short_description', 'long_description']]
dim_hospitals = df_hospitals[['hospital_id', 'npi_number', 'name', 'url', 'street_address', 'city', 'state', 'zip_code']]
dim_payer = df_payer[['payer_id', 'payer']]
fact_prices = df_prices[['fact_id', 'cpt_id', 'hospital_id', 'payer_id', 'price']]

print(dim_cpt_hcpcs.info())
print(dim_hospitals.info())
print(dim_payer.info())
print(fact_prices.info())