In [1]:
import pandas as pd
import numpy as np
import requests
import json
import os
import sqlalchemy
import time
from tqdm.notebook import tqdm
from io import StringIO
from google.cloud import storage

In [2]:
#date transformation function
def process_dates(df, date_column):
    """Expand an epoch-millisecond column into string date columns.

    The column ``date_column`` is parsed as milliseconds since the Unix epoch
    (unparseable values become missing), then four companion columns are
    added: ``<date_column>_year``, ``_quarter``, ``_month`` and ``_day``.
    Finally ``date_column`` itself is rewritten as ``YYYY-MM-DD`` text.

    Every produced column has pandas ``string`` dtype, and missing dates are
    represented by the literal text ``'None'`` in all five columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``date_column``. It is modified in place.
    date_column : str
        Name of the column containing epoch-millisecond timestamps.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Notes
    -----
    Prints the first parsed values as a quick visual check.
    """
    # Convert the date column to datetime
    df[date_column] = pd.to_datetime(df[date_column], unit='ms', errors='coerce')
    print(df[date_column].head())

    parsed = df[date_column]

    # Splitting the date into multiple units.
    # Going through nullable Int64 avoids "2021.0"-style text when the column
    # contains NaT. After astype('string') a missing value is pd.NA, not the
    # text '<NA>', so it has to be filled with fillna rather than replace.
    for part in ('year', 'quarter', 'month', 'day'):
        df[f'{date_column}_{part}'] = (
            getattr(parsed.dt, part)
            .astype('Int64')
            .astype('string')
            .fillna('None')
        )

    # Format the date column
    df[date_column] = parsed.dt.strftime('%Y-%m-%d').astype('string').fillna('None')

    return df

In [3]:
#load cpt_hcpcs.json into dataframe
# Point the Google client libraries at the service-account key file.
# NOTE(review): the key file name is hard-coded and resolved relative to the
# working directory; consider supplying it through the environment instead.
service_account_key_path = 'different-state-hospital-price-fd662d2f48c2.json'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path

# Specify JSON file path
bucket_path = 'gs://different-state-hospital-prices/cpt_hcpcs.json'

# Read the newline-delimited JSON file into a DataFrame
cpt_hcpcs_df = pd.read_json(bucket_path, lines=True)

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line.
cpt_hcpcs_df.info()
cpt_hcpcs_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               object
 1   short_description  object
 2   long_description   object
dtypes: object(3)
memory usage: 75.3+ MB
None


Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [4]:
#cpt_hcpcs transformation
# Cast every column to the pandas string dtype, replace missing values with
# the text 'None', then drop exact duplicate rows.
cpt_hcpcs_df = cpt_hcpcs_df.astype({
    'code': 'string',
    'short_description': 'string',
    'long_description': 'string'
})
cpt_hcpcs_df = cpt_hcpcs_df.fillna('None')

cpt_hcpcs_df = cpt_hcpcs_df.drop_duplicates()

# info() prints its own report and returns None; print(df.info()) would add
# a stray "None" line to the output.
cpt_hcpcs_df.info()
cpt_hcpcs_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               string
 1   short_description  string
 2   long_description   string
dtypes: string(3)
memory usage: 75.3 MB
None


Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [5]:
#load hospitals.json into dataframe
# Read the newline-delimited hospitals file from the bucket.
bucket_path = 'gs://different-state-hospital-prices/hospitals.json'
hospitals_df = pd.read_json(bucket_path, lines=True)

# info() prints its own report and returns None; print(df.info()) would add
# a stray "None" line to the output.
hospitals_df.info()
hospitals_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   npi_number      1400 non-null   object 
 1   name            1400 non-null   object 
 2   url             1400 non-null   object 
 3   street_address  1351 non-null   object 
 4   city            1379 non-null   object 
 5   state           1377 non-null   object 
 6   zip_code        1357 non-null   object 
 7   publish_date    610 non-null    float64
dtypes: float64(1), object(7)
memory usage: 87.6+ KB
None


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code,publish_date
0,1003139775.0,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,,1609459000000.0
1,1003260480.0,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209,
2,1003281452.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,1609459000000.0
3,1003362997.0,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510,1609459000000.0
4,1003389206.0,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020,


In [6]:
#hospitals transformation
# Convert the NPI number to a 10-digit string (missing when no 10 digits found).
# Values may arrive as floats (e.g. 1003139775.0), so the trailing ".0" is
# removed BEFORE stripping non-digits; otherwise the fractional zero is kept
# as a digit and a 9-digit value such as 123456789.0 would be accepted as the
# bogus 10-digit NPI "1234567890".
npi_digits = (
    hospitals_df['npi_number']
    .astype(str)
    .str.replace(r'\.0+$', '', regex=True)
    .str.replace(r'\D', '', regex=True)
)
hospitals_df['npi_number'] = (
    npi_digits.str[:10]
    .where(npi_digits.str.len() >= 10)  # too-short values become missing
    .astype('string')
)

# Zip codes: the nullable 'string' dtype keeps missing values as NA (so the
# fillna below can label them) instead of turning them into the text
# 'nan'/'None'. The pattern is an explicit, anchored regex: only a trailing
# float artefact ".0" is removed. A bare '.0' pattern depends on the pandas
# version's default for `regex` and, as a regex, would match any character
# followed by 0 (e.g. the "20" in "35209").
hospitals_df['zip_code'] = (
    hospitals_df['zip_code']
    .astype('string')
    .str.replace(r'\.0$', '', regex=True)
)

# Convert the remaining text columns using astype
hospitals_df = hospitals_df.astype({
    'name': 'string',
    'url': 'string',
    'street_address':'string',
    'city':'string',
    'state':'string',
})

hospitals_df = process_dates(hospitals_df,'publish_date')

# Label every remaining missing value with the text 'None'
hospitals_df = hospitals_df.fillna('None')

hospitals_df = hospitals_df.drop_duplicates()

# info() prints its own report and returns None; print(df.info()) would add
# a stray "None" line to the output.
hospitals_df.info()
hospitals_df.head()

0   2021-01-01
1          NaT
2   2021-01-01
3   2021-01-01
4          NaT
Name: publish_date, dtype: datetime64[ns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   npi_number            1400 non-null   string
 1   name                  1400 non-null   string
 2   url                   1400 non-null   string
 3   street_address        1400 non-null   string
 4   city                  1400 non-null   string
 5   state                 1400 non-null   string
 6   zip_code              1400 non-null   string
 7   publish_date          1400 non-null   string
 8   publish_date_year     1400 non-null   string
 9   publish_date_quarter  1400 non-null   string
 10  publish_date_month    1400 non-null   string
 11  publish_date_day      1400 non-null   string
dtypes: string(12)
memory usage: 131.4 KB
None


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code,publish_date,publish_date_year,publish_date_quarter,publish_date_month,publish_date_day
0,1003139775,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,,2021-01-01,2021.0,1.0,1.0,1.0
1,1003260480,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209,Unknown,,,,
2,1003281452,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01,2021.0,1.0,1.0,1.0
3,1003362997,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510,2021-01-01,2021.0,1.0,1.0,1.0
4,1003389206,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020,Unknown,,,,


In [7]:
#load prices.json into dataframe
# Read the newline-delimited prices file from the bucket.
bucket_path = 'gs://different-state-hospital-prices/prices.json'
prices_df = pd.read_json(bucket_path, lines=True)

# info() prints its own report and returns None; print(df.info()) would add
# a stray "None" line to the output.
prices_df.info()
prices_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72724852 entries, 0 to 72724851
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   code        object 
 1   npi_number  object 
 2   payer       object 
 3   price       float64
dtypes: float64(1), object(3)
memory usage: 2.2+ GB
None


Unnamed: 0,code,npi_number,payer,price
0,00000A,1053358010.0,CASH,75047.0
1,00000A,1336186394.0,CASH,75047.0
2,00001U,1003139775.0,CASH,457.23
3,00001U,1053824292.0,CASH,972.0
4,00001U,1417901406.0,CASH,296.0


In [8]:
# Snapshot of the prices frame as loaded; DataFrame.copy() is deep by default,
# so this holds a second full copy of the data in memory.
test_df = prices_df.copy()

In [12]:
# Convert the NPI number to a 10-digit string (missing when no 10 digits found).
# Values may arrive as floats (e.g. 1053358010.0), so the trailing ".0" is
# removed BEFORE stripping non-digits; otherwise the fractional zero is kept
# as a digit and a 9-digit value such as 123456789.0 would be accepted as the
# bogus 10-digit NPI "1234567890".
npi_digits = (
    prices_df['npi_number']
    .astype(str)
    .str.replace(r'\.0+$', '', regex=True)
    .str.replace(r'\D', '', regex=True)
)
# .str slicing plus where() replaces a per-row Python lambda, which is costly
# on a frame of this size; too-short values become missing.
prices_df['npi_number'] = (
    npi_digits.str[:10]
    .where(npi_digits.str.len() >= 10)
    .astype('string')
)
del npi_digits  # release the large intermediate column

prices_df = prices_df.astype({
    'code':'string',
    'payer':'string'})

# info() prints its own report and returns None; print(df.info()) would add
# a stray "None" line to the output.
prices_df.info()
prices_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72724852 entries, 0 to 72724851
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   code        string 
 1   npi_number  string 
 2   payer       string 
 3   price       float64
dtypes: float64(1), string(3)
memory usage: 2.2 GB
None


Unnamed: 0,code,npi_number,payer,price
0,00000A,1053358010,CASH,75047.0
1,00000A,1336186394,CASH,75047.0
2,00001U,1003139775,CASH,457.23
3,00001U,1053824292,CASH,972.0
4,00001U,1417901406,CASH,296.0
