In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from more_pyspark import *

spark = SparkSession.builder.appName('Ops').getOrCreate()

22/12/04 16:09:18 WARN Utils: Your hostname, nn1448lr222 resolves to a loopback address: 127.0.1.1; using 172.18.130.159 instead (on interface eth0)
22/12/04 16:09:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/04 16:09:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [96]:
crosswalk = spark.read.csv("./data/CMS_Facility_to_FIPS_Crosswalk.csv",inferSchema=True,header=True)
hospital = spark.read.csv("./data/Timely_and_Effective_Care-Hospital.csv",inferSchema=True,header=True)
poverty = spark.read.csv("./data/PovertyEstimates.csv",inferSchema=True,header=True)

In [3]:
pprint_schema(hospital)

StructType([StructField('Facility ID', DoubleType(), True),
            StructField('Facility Name', StringType(), True),
            StructField('Address', StringType(), True),
            StructField('City', StringType(), True),
            StructField('State', StringType(), True),
            StructField('ZIP Code', IntegerType(), True),
            StructField('County Name', StringType(), True),
            StructField('Phone Number', StringType(), True),
            StructField('Condition', StringType(), True),
            StructField('Measure ID', StringType(), True),
            StructField('Measure Name', StringType(), True),
            StructField('Score', StringType(), True),
            StructField('Sample', StringType(), True),
            StructField('Footnote', StringType(), True),
            StructField('Start Date', StringType(), True),
            StructField('End Date', StringType(), True)])


In [6]:
hospital.take(1) >> to_pandas

Unnamed: 0,Facility ID,Facility Name,Address,City,State,ZIP Code,County Name,Phone Number,Condition,Measure ID,Measure Name,Score,Sample,Footnote,Start Date,End Date
0,10001.0,SOUTHEAST HEALTH MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,HOUSTON,(334) 793-8701,Emergency Department,EDV,Emergency department volume,high,,,01/01/2020,12/31/2020


In [7]:
from pyspark.sql.functions import array, explode, struct, lit, col
from composable import pipeable

@pipeable
def spread(val_col, var_col, group_by_col, df):
    return  (df
             .groupBy(group_by_col)
             .pivot(val_col)
             .sum(var_col))

In [74]:
import numpy as np

hospital_clean = (hospital
                  .select("Facility ID",'State','County Name','Condition','Measure Name','Score')
                  .where(col('Condition') == 'Emergency Department')
                  .where(~col('Measure Name').rlike('Psychiatric|CT'))
                  .select("Facility ID",'State','County Name','Measure Name','Score')
                  .withColumn('Score', when(col('Score') == 'Not Available', np.nan).otherwise(col('Score')))
                  .withColumn('Score', case_when((col('Score') == 'nan', np.nan),
                                                 (col('Score') == 'low', 10001),
                                                 (col('Score') == 'medium', 10002),
                                                 (col('Score') == 'high', 10003),
                                                 (col('Score') == 'very high', 10004),
                                                 else_ = col('Score')
                                                 )
                             )
                  .withColumn('Score', col('Score').cast('float'))
                  .withColumn('Facility ID', col('Facility ID').cast('int'))
                 )

hospital_clean = ((hospital_clean
                   >> spread('Measure Name','Score',hospital_clean.columns[:3])
                  )
                  .withColumnRenamed('Average (median) time patients spent in the emergency department before leaving from the visit A lower number of minutes is better','Avg Median Time in ER')
                  .withColumn('Emergency department volume', case_when((col('Emergency department volume') == 10001, 'low'),
                                                                       (col('Emergency department volume') == 10002, 'medium'),
                                                                       (col('Emergency department volume') == 10003, 'high'),
                                                                       (col('Emergency department volume') == 10004, 'very high'),
                                                                       else_ = np.nan
                                                                       )
                             )
                  .withColumnRenamed('Facility ID','Facility_ID')
                 )

hospital_clean.sample(fraction=0.001).collect() >> to_pandas

Unnamed: 0,Facility_ID,State,County Name,Avg Median Time in ER,Emergency department volume,Left before being seen
0,50761,CA,LOS ANGELES,136.0,medium,1.0
1,520008,WI,WAUKESHA,141.0,high,1.0
2,100313,FL,GULF,96.0,low,0.0
3,440161,TN,DAVIDSON,152.0,very high,1.0
4,50424,CA,SAN DIEGO,,,
5,450422,TX,DALLAS,107.0,low,2.0
6,260020,MO,ST. LOUIS,179.0,very high,1.0
7,310022,NJ,CAMDEN,140.0,very high,2.0


In [75]:
hospital_clean.columns[:3]

['Facility_ID', 'State', 'County Name']

In [76]:
pprint_schema(hospital_clean)

StructType([StructField('Facility_ID', IntegerType(), True),
            StructField('State', StringType(), True),
            StructField('County Name', StringType(), True),
            StructField('Avg Median Time in ER', DoubleType(), True),
            StructField('Emergency department volume', StringType(), False),
            StructField('Left before being seen', DoubleType(), True)])


In [79]:
crosswalk.sample(fraction=0.001).collect() >> to_pandas

Unnamed: 0,State,Facility_ID,County Name,FIPS
0,AL,14015,CRENSHAW,1041
1,IL,140018,COOK,17031
2,ME,204006,PENOBSCOT,23019
3,MS,251329,GREENE,28041
4,PR,400117,ARECIBO,72013
5,SD,431324,JERAULD,46073
6,VT,471301,ORANGE,50017
7,WA,501338,KING,53033


In [101]:
crosswalk = (crosswalk.withColumnRenamed('Facility ID','Facility_ID'))

pprint_schema(crosswalk)

StructType([StructField('State', StringType(), True),
            StructField('Facility_ID', IntegerType(), True),
            StructField('County Name', StringType(), True),
            StructField('FIPS', IntegerType(), True)])


In [102]:
#Joining hospital data with their respective FIPs
hospital_w_FIPS = (hospital_clean
                   .join(crosswalk,
                         hospital_clean.Facility_ID == crosswalk.Facility_ID,
                         how='left'
                        )
                  )

hospital_w_FIPS.sample(fraction=0.001).collect() >> to_pandas

Unnamed: 0,Facility_ID,State,County Name,Avg Median Time in ER,Emergency department volume,Left before being seen,FIPS
0,240101,MN,BECKER,144.0,low,2.0,27005
1,450403,TX,COLLIN,129.0,medium,0.0,48085
2,460041,UT,DAVIS,136.0,medium,0.0,49011
3,431319,SD,HAAKON,,,,46055
4,170009,KS,LEAVENWORTH,119.0,low,1.0,20103


In [5]:
pprint_schema(poverty)

StructType([StructField('FIPStxt', IntegerType(), True),
            StructField('Stabr', StringType(), True),
            StructField('Area_name', StringType(), True),
            StructField('Attribute', StringType(), True),
            StructField('Value', DoubleType(), True)])


In [103]:
poverty.sample(fraction=0.0001).collect() >> to_pandas

Unnamed: 0,FIPStxt,Stabr,Area_name,Attribute,Value
0,25021,MA,Norfolk County,POV017_2020,6773.0
1,37021,NC,Buncombe County,Rural-urban_Continuum_Code_2003,2.0
2,37065,NC,Edgecombe County,CI90UBALL_2020,14135.0
3,40000,OK,Oklahoma,CI90LB017P_2020,17.8
4,47175,TN,Van Buren County,CI90UB517_2020,301.0


In [104]:
poverty_clean = (poverty
                 .withColumnRenamed('FIPStxt','FIPS')
                 .where(col('Attribute') == 'PCTPOVALL_2020')
                 )

poverty_clean = ((poverty_clean
                  >> spread('Attribute','Value',poverty_clean.columns[:3])
                 )
                 .select('FIPS','PCTPOVALL_2020')
                )

poverty_clean.sample(fraction=0.01).collect() >> to_pandas

Unnamed: 0,FIPS,PCTPOVALL_2020
0,51013,6.0
1,49005,9.2
2,49055,11.2
3,29059,24.1
4,27165,9.7
5,48083,16.0
6,25023,7.2
7,8087,10.5
8,27117,9.2
9,19173,10.5


In [109]:
#Joining hospital_w_FIPS with poverty
hospital_w_FIPS_n_poverty = (hospital_w_FIPS
                             .join(poverty_clean, 
                                   hospital_w_FIPS.FIPS == poverty_clean.FIPS,
                                   how='left'
                                  )
                            )

hospital_w_FIPS_n_poverty.collect() >> to_pandas

Unnamed: 0,Facility_ID,State,County Name,Avg Median Time in ER,Emergency department volume,Left before being seen,FIPS,PCTPOVALL_2020
0,50278.0,CA,LOS ANGELES,153.0,very high,2.0,6037.0,13.2
1,10157.0,AL,COLBERT,112.0,low,1.0,1033.0,14.4
2,231308.0,MI,ALGER,130.0,low,0.0,26003.0,12.1
3,241368.0,MN,STEARNS,119.0,low,0.0,27145.0,10.6
4,241334.0,MN,NICOLLET,112.0,low,1.0,27103.0,7.2
...,...,...,...,...,...,...,...,...
4701,440130.0,TN,OBION,112.0,low,1.0,47131.0,15.1
4702,451373.0,TX,WARD,112.0,,,48475.0,9.8
4703,490043.0,VA,LOUDOUN,173.0,very high,0.0,51107.0,3.2
4704,420072.0,SC,HAMPTON,127.0,low,2.0,45049.0,19.6


In [123]:
AVG_Time_in_ER_and_Poverty = (hospital_w_FIPS_n_poverty
                              .groupBy('PCTPOVALL_2020')
                              .agg(mean(col('Avg Median Time in ER')).alias('AVG_of_Avg_Median_Time_in_ER'))
                              .where(~isnan(col('AVG_of_Avg_Median_Time_in_ER')))
                             )
AVG_Time_in_ER_and_Poverty.collect() >> to_pandas

Unnamed: 0,PCTPOVALL_2020,AVG_of_Avg_Median_Time_in_ER
0,26.7,108.000000
1,26.4,126.000000
2,23.8,107.000000
3,5.4,155.333333
4,3.5,168.000000
...,...,...
77,5.8,147.000000
78,25.6,130.000000
79,20.6,132.800000
80,16.0,144.384615


In [124]:
AVG_Left_before_being_seen_and_Poverty = (hospital_w_FIPS_n_poverty
                                          .groupBy('PCTPOVALL_2020')
                                          .agg(mean(col('Left before being seen')).alias('AVG_Left_before_being_seen'))
                                          .where(~isnan(col('AVG_Left_before_being_seen')))
                                         )

AVG_Left_before_being_seen_and_Poverty.collect() >> to_pandas

Unnamed: 0,PCTPOVALL_2020,AVG_Left_before_being_seen
0,26.7,1.666667
1,26.4,2.0
2,5.4,1.666667
3,3.5,1.0
4,23.9,4.285714
5,26.6,2.0
6,35.3,5.0
7,3.7,1.0
8,26.8,1.0
9,5.7,0.833333


In [129]:
CTN_Emergency_department_volume_and_Poverty = (hospital_w_FIPS_n_poverty
                                               .groupBy('Emergency department volume')
                                               .count()
                                              )
CTN_Emergency_department_volume_and_Poverty.collect() >> to_pandas

Unnamed: 0,Emergency department volume,count
0,low,1704
1,high,515
2,medium,960
3,,1078
4,very high,449
