In [1]:
#import libaries
import pandas as pd
import numpy as np
import requests
import json
import os
import sqlalchemy
import time
from tqdm.notebook import tqdm
from io import StringIO
from google.cloud import storage

In [2]:
#extract data function
def extract_data(database_name, table):
    engine = sqlalchemy.create_engine(f'mysql+pymysql://root@localhost/{database_name}')
    query = f'SELECT * FROM {table};'
    chunk_size = 100000
    counter = 0
    all_chunks = []
    #read by chunk
    for chunk in pd.read_sql_query(query, engine, chunksize=chunk_size):
        start_time = time.time()
        print(f'Processing Chunk {counter}...')
        all_chunks.append(chunk)
        counter += 1
        end_time = time.time()
        total_time = end_time - start_time
        print(f'Chunk {counter} processed in {total_time:.2f} seconds')
    if all_chunks:
        df = pd.concat(all_chunks, ignore_index=True)
    else:
        df = pd.DataFrame()
    return df

In [3]:
#function to store data to the google cloud
def store_to_google_bucket(df, df_name):
    try:
        service_account_key_path = 'different-state-hospital-price-fd662d2f48c2.json'
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path
        # Initialize the GCP storage client
        client = storage.Client()
        json_data = df.to_json(orient='records', lines=True)
        bucket_name = 'different-state-hospital-prices'
        destination_blob_name = f'{df_name}.json'
        # Get the bucket object
        bucket = client.get_bucket(bucket_name)
        # Create a new blob and upload the JSON data
        blob = bucket.blob(destination_blob_name)
        blob.upload_from_string(json_data, content_type='application/json')
        print(f'Data successfully uploaded to {destination_blob_name}')
    except Exception as e:
        print(f'An error occurred: {e}')

In [4]:
cpt_hcpcs_df = extract_data('hospital-price-transparency', 'cpt_hcpcs')
print(cpt_hcpcs_df.shape)
cpt_hcpcs_df.head()

Processing Chunk 0...
Chunk 1 processed in 0.00 seconds
Processing Chunk 1...
Chunk 2 processed in 0.00 seconds
Processing Chunk 2...
Chunk 3 processed in 0.00 seconds
Processing Chunk 3...
Chunk 4 processed in 0.00 seconds
Processing Chunk 4...
Chunk 5 processed in 0.00 seconds
Processing Chunk 5...
Chunk 6 processed in 0.00 seconds
Processing Chunk 6...
Chunk 7 processed in 0.00 seconds
Processing Chunk 7...
Chunk 8 processed in 0.00 seconds
Processing Chunk 8...
Chunk 9 processed in 0.00 seconds
Processing Chunk 9...
Chunk 10 processed in 0.00 seconds
Processing Chunk 10...
Chunk 11 processed in 0.00 seconds
Processing Chunk 11...
Chunk 12 processed in 0.00 seconds
Processing Chunk 12...
Chunk 13 processed in 0.00 seconds
Processing Chunk 13...
Chunk 14 processed in 0.00 seconds
Processing Chunk 14...
Chunk 15 processed in 0.00 seconds
Processing Chunk 15...
Chunk 16 processed in 0.00 seconds
Processing Chunk 16...
Chunk 17 processed in 0.00 seconds
Processing Chunk 17...
Chunk 18 p

Unnamed: 0,code,short_description,long_description
0,00000A,DVC REVASC 6X20MM 200CM,
1,00001U,RBC DNA HEA 35 AG PLA,
2,"00001U,1",RBC DNA HEA 35 AG PLA,
3,00013,PT INDIVIDUAL GYM,
4,0001A,HC ADM PFIZER SARSCOV2 30MCG/0.3ML 1ST,


In [5]:
store_to_google_bucket(cpt_hcpcs_df, 'cpt_hcpcs')

Data successfully uploaded to cpt_hcpcs.json


In [6]:
hospitals_df = extract_data('hospital-price-transparency', 'hospitals')
print(hospitals_df.shape)
hospitals_df.head()

Processing Chunk 0...
Chunk 1 processed in 0.00 seconds
(1400, 8)


Unnamed: 0,npi_number,name,url,street_address,city,state,zip_code,publish_date
0,1003139775.0,HCA Virginia,https://hcavirginia.com/about/legal/pricing-tr...,901 E. Cary St Suite 210,Richmond,VA,,2021-01-01
1,1003260480.0,Brookwood Baptist Medical Center,https://www.brookwoodbaptisthealth.com/docs/gl...,2010 Brookwood Medical Center Dr.,Birmingham,AL,35209,
2,1003281452.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01
3,1003362997.0,CHI Health St. Elizabeth,https://www.chihealth.com/content/dam/chi-heal...,555 S. 70Th St.,Lincoln,NE,68510,2021-01-01
4,1003389206.0,Merrill pioneer hospital,https://www.avera.org/app/files/public/79147/m...,"1100 S 10th Ave, Ste 100",Rock Rapids,IA,51246-2020,


In [7]:
store_to_google_bucket(hospitals_df, 'hospitals')

Data successfully uploaded to hospitals.json


In [None]:
prices_df = extract_data('hospital-price-transparency', 'prices')
print(prices_df.shape)
prices_df.head()

In [None]:
store_to_google_bucket(prices_df, 'prices')