In [1]:
#import libaries
import pandas as pd
import numpy as np
import requests
import json
import os
import sqlalchemy
import time
from tqdm.notebook import tqdm
from io import StringIO
from google.cloud import storage

In [6]:
#extract data function
def extract_data(database_name, table):
    engine = sqlalchemy.create_engine(f'mysql+pymysql://root@localhost/{database_name}')
    query = f'SELECT * FROM {table};'
    chunk_size = 100000
    counter = 0
    all_chunks = []
    #read by chunk
    for chunk in pd.read_sql_query(query, engine, chunksize=chunk_size):
        start_time = time.time()
        print(f'Processing Chunk {counter}...')
        all_chunks.append(chunk)
        counter += 1
        end_time = time.time()
        total_time = end_time - start_time
        print(f'Chunk {counter} processed in {total_time:.2f} seconds')
    if all_chunks:
        df = pd.concat(all_chunks, ignore_index=True)
    else:
        df = pd.DataFrame()
    return df

In [7]:
#function to store data to the google cloud
def store_to_google_bucket(df, df_name):
    try:
        service_account_key_path = 'different-state-hospital-price-fd662d2f48c2.json'
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path
        # Initialize the GCP storage client
        client = storage.Client()
        json_data = df.to_json(orient='records', lines=True)
        bucket_name = 'different-state-hospital-prices'
        destination_blob_name = f'{df_name}.json'
        # Get the bucket object
        bucket = client.get_bucket(bucket_name)
        # Create a new blob and upload the JSON data
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_string(json_data, content_type='application/json')
        print(f'Data successfully uploaded to {destination_blob_name}')
    except Exception as e:
        print(f'An error occurred: {e}')

In [8]:
cpt_hcpcs_df = extract_data('hospital-price-transparency', 'cpt_hcpcs')
cpt_hcpcs_df.head()

Processing Chunk 0...
Chunk 1 processed in 0.00 seconds
Processing Chunk 1...
Chunk 2 processed in 0.00 seconds
Processing Chunk 2...
Chunk 3 processed in 0.00 seconds
Processing Chunk 3...
Chunk 4 processed in 0.00 seconds
Processing Chunk 4...
Chunk 5 processed in 0.00 seconds
Processing Chunk 5...
Chunk 6 processed in 0.00 seconds
Processing Chunk 6...
Chunk 7 processed in 0.00 seconds
Processing Chunk 7...
Chunk 8 processed in 0.00 seconds
Processing Chunk 8...
Chunk 9 processed in 0.00 seconds
Processing Chunk 9...
Chunk 10 processed in 0.00 seconds
Processing Chunk 10...
Chunk 11 processed in 0.00 seconds
Processing Chunk 11...
Chunk 12 processed in 0.00 seconds
Processing Chunk 12...
Chunk 13 processed in 0.00 seconds
Processing Chunk 13...
Chunk 14 processed in 0.00 seconds
Processing Chunk 14...
Chunk 15 processed in 0.00 seconds
Processing Chunk 15...
Chunk 16 processed in 0.00 seconds
Processing Chunk 16...
Chunk 17 processed in 0.00 seconds
Processing Chunk 17...
Chunk 18 p

Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [9]:
store_to_google_bucket(cpt_hcpcs_df, 'cpt_hcpcs')

Data successfully uploaded to cpt_hcpcs.json


In [10]:
hospitals_df = extract_data('hospital-price-transparency', 'hospitals')
hospitals_df.head()

Processing Chunk 0...
Chunk 1 processed in 0.00 seconds


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code,publish_date
0,1003139775.0,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,,2021-01-01
1,1003260480.0,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209,
2,1003281452.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01
3,1003362997.0,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510,2021-01-01
4,1003389206.0,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020,


In [11]:
store_to_google_bucket(hospitals_df, 'hospitals')

Data successfully uploaded to hospitals.json


In [12]:
prices_df = extract_data('hospital-price-transparency', 'prices')
prices_df.head()

Processing Chunk 0...
Chunk 1 processed in 0.00 seconds
Processing Chunk 1...
Chunk 2 processed in 0.00 seconds
Processing Chunk 2...
Chunk 3 processed in 0.00 seconds
Processing Chunk 3...
Chunk 4 processed in 0.00 seconds
Processing Chunk 4...
Chunk 5 processed in 0.00 seconds
Processing Chunk 5...
Chunk 6 processed in 0.00 seconds
Processing Chunk 6...
Chunk 7 processed in 0.00 seconds
Processing Chunk 7...
Chunk 8 processed in 0.00 seconds
Processing Chunk 8...
Chunk 9 processed in 0.00 seconds
Processing Chunk 9...
Chunk 10 processed in 0.00 seconds
Processing Chunk 10...
Chunk 11 processed in 0.00 seconds
Processing Chunk 11...
Chunk 12 processed in 0.00 seconds
Processing Chunk 12...
Chunk 13 processed in 0.00 seconds
Processing Chunk 13...
Chunk 14 processed in 0.00 seconds
Processing Chunk 14...
Chunk 15 processed in 0.00 seconds
Processing Chunk 15...
Chunk 16 processed in 0.00 seconds
Processing Chunk 16...
Chunk 17 processed in 0.00 seconds
Processing Chunk 17...
Chunk 18 p

Processing Chunk 142...
Chunk 143 processed in 0.00 seconds
Processing Chunk 143...
Chunk 144 processed in 0.00 seconds
Processing Chunk 144...
Chunk 145 processed in 0.00 seconds
Processing Chunk 145...
Chunk 146 processed in 0.00 seconds
Processing Chunk 146...
Chunk 147 processed in 0.00 seconds
Processing Chunk 147...
Chunk 148 processed in 0.00 seconds
Processing Chunk 148...
Chunk 149 processed in 0.00 seconds
Processing Chunk 149...
Chunk 150 processed in 0.00 seconds
Processing Chunk 150...
Chunk 151 processed in 0.00 seconds
Processing Chunk 151...
Chunk 152 processed in 0.00 seconds
Processing Chunk 152...
Chunk 153 processed in 0.00 seconds
Processing Chunk 153...
Chunk 154 processed in 0.00 seconds
Processing Chunk 154...
Chunk 155 processed in 0.00 seconds
Processing Chunk 155...
Chunk 156 processed in 0.00 seconds
Processing Chunk 156...
Chunk 157 processed in 0.00 seconds
Processing Chunk 157...
Chunk 158 processed in 0.00 seconds
Processing Chunk 158...
Chunk 159 proces

Processing Chunk 279...
Chunk 280 processed in 0.00 seconds
Processing Chunk 280...
Chunk 281 processed in 0.00 seconds
Processing Chunk 281...
Chunk 282 processed in 0.00 seconds
Processing Chunk 282...
Chunk 283 processed in 0.00 seconds
Processing Chunk 283...
Chunk 284 processed in 0.00 seconds
Processing Chunk 284...
Chunk 285 processed in 0.00 seconds
Processing Chunk 285...
Chunk 286 processed in 0.00 seconds
Processing Chunk 286...
Chunk 287 processed in 0.00 seconds
Processing Chunk 287...
Chunk 288 processed in 0.00 seconds
Processing Chunk 288...
Chunk 289 processed in 0.00 seconds
Processing Chunk 289...
Chunk 290 processed in 0.00 seconds
Processing Chunk 290...
Chunk 291 processed in 0.00 seconds
Processing Chunk 291...
Chunk 292 processed in 0.00 seconds
Processing Chunk 292...
Chunk 293 processed in 0.00 seconds
Processing Chunk 293...
Chunk 294 processed in 0.00 seconds
Processing Chunk 294...
Chunk 295 processed in 0.00 seconds
Processing Chunk 295...
Chunk 296 proces

Processing Chunk 416...
Chunk 417 processed in 0.00 seconds
Processing Chunk 417...
Chunk 418 processed in 0.00 seconds
Processing Chunk 418...
Chunk 419 processed in 0.00 seconds
Processing Chunk 419...
Chunk 420 processed in 0.00 seconds
Processing Chunk 420...
Chunk 421 processed in 0.00 seconds
Processing Chunk 421...
Chunk 422 processed in 0.00 seconds
Processing Chunk 422...
Chunk 423 processed in 0.00 seconds
Processing Chunk 423...
Chunk 424 processed in 0.00 seconds
Processing Chunk 424...
Chunk 425 processed in 0.00 seconds
Processing Chunk 425...
Chunk 426 processed in 0.00 seconds
Processing Chunk 426...
Chunk 427 processed in 0.00 seconds
Processing Chunk 427...
Chunk 428 processed in 0.00 seconds
Processing Chunk 428...
Chunk 429 processed in 0.00 seconds
Processing Chunk 429...
Chunk 430 processed in 0.00 seconds
Processing Chunk 430...
Chunk 431 processed in 0.00 seconds
Processing Chunk 431...
Chunk 432 processed in 0.00 seconds
Processing Chunk 432...
Chunk 433 proces

Processing Chunk 553...
Chunk 554 processed in 0.00 seconds
Processing Chunk 554...
Chunk 555 processed in 0.00 seconds
Processing Chunk 555...
Chunk 556 processed in 0.00 seconds
Processing Chunk 556...
Chunk 557 processed in 0.00 seconds
Processing Chunk 557...
Chunk 558 processed in 0.00 seconds
Processing Chunk 558...
Chunk 559 processed in 0.00 seconds
Processing Chunk 559...
Chunk 560 processed in 0.00 seconds
Processing Chunk 560...
Chunk 561 processed in 0.00 seconds
Processing Chunk 561...
Chunk 562 processed in 0.00 seconds
Processing Chunk 562...
Chunk 563 processed in 0.00 seconds
Processing Chunk 563...
Chunk 564 processed in 0.00 seconds
Processing Chunk 564...
Chunk 565 processed in 0.00 seconds
Processing Chunk 565...
Chunk 566 processed in 0.00 seconds
Processing Chunk 566...
Chunk 567 processed in 0.00 seconds
Processing Chunk 567...
Chunk 568 processed in 0.00 seconds
Processing Chunk 568...
Chunk 569 processed in 0.00 seconds
Processing Chunk 569...
Chunk 570 proces

Processing Chunk 690...
Chunk 691 processed in 0.00 seconds
Processing Chunk 691...
Chunk 692 processed in 0.00 seconds
Processing Chunk 692...
Chunk 693 processed in 0.00 seconds
Processing Chunk 693...
Chunk 694 processed in 0.00 seconds
Processing Chunk 694...
Chunk 695 processed in 0.00 seconds
Processing Chunk 695...
Chunk 696 processed in 0.00 seconds
Processing Chunk 696...
Chunk 697 processed in 0.00 seconds
Processing Chunk 697...
Chunk 698 processed in 0.00 seconds
Processing Chunk 698...
Chunk 699 processed in 0.00 seconds
Processing Chunk 699...
Chunk 700 processed in 0.00 seconds
Processing Chunk 700...
Chunk 701 processed in 0.00 seconds
Processing Chunk 701...
Chunk 702 processed in 0.00 seconds
Processing Chunk 702...
Chunk 703 processed in 0.00 seconds
Processing Chunk 703...
Chunk 704 processed in 0.00 seconds
Processing Chunk 704...
Chunk 705 processed in 0.00 seconds
Processing Chunk 705...
Chunk 706 processed in 0.00 seconds
Processing Chunk 706...
Chunk 707 proces

Unnamed: 0,code,npi_number,payer,price
0,00000A,1053358010.0,CASH,75047.0
1,00000A,1336186394.0,CASH,75047.0
2,00001U,1003139775.0,CASH,457.23
3,00001U,1053824292.0,CASH,972.0
4,00001U,1417901406.0,CASH,296.0


In [13]:
store_to_google_bucket(prices_df, 'prices')

Data successfully uploaded to prices.json
