## Index
[Importing modules](#Importing-required-modules)\
[Importing data](#Importing-file-and-unzipping)\
[Cleaning](#Cleaning)

### Importing required modules

* `requests` used for importing the downloaded zip file
* `tqdm` for progress bars, file size, elapsed time and download speeds
* `zipfile` for opening the zip file
* `concurrent.futures` for multiprocessing
* `pandas` as datatype

In [2]:
import requests
from tqdm.auto import tqdm
import zipfile
import concurrent.futures
import pandas as pd
from io import StringIO

### Importing file and unzipping

In [3]:
#Download data to hospitalPriceData.zip in working directory
hospitalPriceData_zip_url = 'https://www.dolthub.com/csv/dolthub/hospital-price-transparency-v3/iptu80riko4il5qij5asr8nalodeo9ut?include_bom=0'

#session to efficiently use data streaming
session = requests.Session()
response = session.get(hospitalPriceData_zip_url, stream= True)
total_size_in_bytes = int(response.headers.get('content-length', 0))
block_size = 1024**2 #1 MiB blockst
progress_bar = tqdm(total= total_size_in_bytes, unit= 'B', unit_scale= True)
with open("hospitalPriceData.zip", "wb") as file:
    for data in response.iter_content(block_size):
        progress_bar.update(len(data))
        file.write(data)
progress_bar.close()

#2.31 GB, TOOK 1hr 15min WITH 675mb ETHERNET (though dolthub was the bottleneck)

2.31GB [1:15:14, 511kB/s] 


In [10]:
#%%time

#Extract files from hospitalPriceData.zip
zf = zipfile.ZipFile('hospitalPriceData.zip')
with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(zf.extractall(), zf.infolist())

#3min 18s wall time normal
#3min 7s wall time multithreaded

CPU times: user 1min 19s, sys: 35.9 s, total: 1min 55s
Wall time: 3min 7s


### Cleaning

In [6]:
hospitals_df = pd.read_csv('hospitals.csv', low_memory=False)
print(hospitals_df.head())

  cms_certification_num                                   name  \
0                010001        SOUTHEAST HEALTH MEDICAL CENTER   
1                010005  MARSHALL MEDICAL CENTERS SOUTH CAMPUS   
2                010006           NORTH ALABAMA MEDICAL CENTER   
3                010007               MIZELL MEMORIAL HOSPITAL   
4                010008            CRENSHAW COMMUNITY HOSPITAL   

                      address      city state   zip5  beds  phone_number  \
0      1108 ROSS CLARK CIRCLE    DOTHAN    AL  36301   420    3347938701   
1  2505 U S HIGHWAY 431 NORTH      BOAZ    AL  35957   240    2565938310   
2         1701 VETERANS DRIVE  FLORENCE    AL  35630   338    2567688400   
3               702 N MAIN ST       OPP    AL  36467    99    3344933541   
4         101 HOSPITAL CIRCLE   LUVERNE    AL  36049    65    3343353374   

                                        homepage_url  \
0  https://www.southeasthealth.org/southeast-heal...   
1                   https://www.ma

In [3]:
#%%time

prices_df = pd.read_csv('prices.csv', low_memory=True)
print(prices_df.head())
#11 min low_memory= True

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


   cms_certification_num                         payer code  \
0                  10001  AMERIGROUP MEDICAID [350002]  100   
1                  10001       BLUE ADVANTAGE [308003]  100   
2                  10001    BLUE CROSS OF ALA [200001]  100   
3                  10001           CARESOURCE [100121]  100   
4                  10001  CONNECTICUT GENERAL [100009]  100   

  internal_revenue_code units        description inpatient_outpatient  \
0                 MS100   NaN  SEIZURES WITH MCC          UNSPECIFIED   
1                 MS100   NaN  SEIZURES WITH MCC          UNSPECIFIED   
2                 MS100   NaN  SEIZURES WITH MCC          UNSPECIFIED   
3                 MS100   NaN  SEIZURES WITH MCC          UNSPECIFIED   
4                 MS100   NaN  SEIZURES WITH MCC          UNSPECIFIED   

      price code_disambiguator  
0  26744.41               NONE  
1  26744.41               NONE  
2  26744.41               NONE  
3  26744.41               NONE  
4  26744.41      

In [9]:
print(hospitals_df.shape)
#5992 hospitals
print(prices_df.shape)
#296210747 priced items

#avg of 49434.3703271 priced items per hospital

(5992, 11)
(296210747, 9)


In [2]:
%%time

def extract(file_name, memory_status):
    prices_df = pd.read_csv(file_name, low_memory= memory_status)

with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(extract('prices.csv', False))

print(prices_df.head())