In [75]:
#import libaries
import pandas as pd
import numpy as np
import requests
import json
import os
from tqdm.notebook import tqdm
from io import StringIO
from google.cloud import storage

In [80]:
def download_with_progress(url):
    response = requests.get(url, stream=True)
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    data = []
    for data_chunk in response.iter_content(block_size):
        progress_bar.update(len(data_chunk))
        data.append(data_chunk)
    progress_bar.close()
    return b"".join(data)

In [81]:
#extract data function
def extract_data_dolthub(repo, branch, table):
    url = f"https://www.dolthub.com/csv/dolthub/{repo}/{branch}/{table}"
    csv_data = download_with_progress(url)
    df_raw = pd.read_csv(StringIO(csv_data.decode('utf-8')))
    print(df_raw.info())
    print(df_raw.shape)
    return df_raw

In [82]:
#function to store data to the google cloud
def store_to_google_bucket(df, df_name):
    service_account_key_path = 'different-state-hospital-price-fd662d2f48c2.json'
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = service_account_key_path
    # Initialize the GCP storage client
    client = storage.Client()
    json_data = df.to_json(orient='records')
    bucket_name = 'different-state-hospital-prices'
    destination_blob_name = f'{df_name}.json'
    # Get the bucket object
    bucket = client.get_bucket(bucket_name)
    # Create a new blob and upload the JSON data
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(json_data, content_type='application/json')
    print(f'Data successfully uploaded to {destination_blob_name}')

In [83]:
cpt_hcpcs_df = extract_data_dolthub('hospital-price-transparency', 'master', 'cpt_hcpcs')

0.00iB [00:00, ?iB/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3287818 entries, 0 to 3287817
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   code               object
 1   short_description  object
 2   long_description   object
dtypes: object(3)
memory usage: 75.3+ MB
None
(3287818, 3)


  df_raw = pd.read_csv(StringIO(csv_data.decode('utf-8')))


In [85]:
store_to_google_bucket(cpt_hcpcs_df, 'cpt_hcpcs')

Data successfully uploaded to cpt_hcpcs.json


In [None]:
prices_df = extract_data_dolthub('hospital-price-transparency', 'master', 'prices')

In [None]:
store_to_google_bucket(prices_df, 'prices')

In [None]:
hospitals_df = extract_data_dolthub('hospital-price-transparency', 'master', 'hospitals')

In [None]:
store_to_google_bucket(hospitals_df, 'hospitals')