In [None]:
import os

# Directory holding every input and output file used by this notebook.
# Defaults to the original location; set the MILESTONE1_DATA_DIR environment
# variable to run the notebook against a different copy of the data.
# os.path.join(..., '') guarantees the trailing separator that the
# `BASE_DIR + 'file.csv'` concatenations in later cells rely on.
BASE_DIR = os.path.join(
    os.environ.get('MILESTONE1_DATA_DIR')
    or '/home/thanuja/Dropbox/coursera/Milestone1/data/',
    '')

In [None]:
#pyspark initialization

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from scipy.stats import pearsonr
import warnings
import altair as alt
import numpy as np

# Silence all Python warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Local Spark session (master "local[*]") with 8g of driver memory.
# getOrCreate() reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('org queries')
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# SparkContext handle of the session above.
sc = spark.sparkContext

In [None]:
#2019 payments file
general_payments_df = (
    spark.read
    .option("header", True)
    .csv(BASE_DIR + 'OP_DTL_GNRL_PGYR2019_P06302021.csv')
)

# Keep only payments whose recipient is a physician, cast the amount (read as
# a string from the CSV) to double, and total it per physician profile.
hcp_payments_df = (
    general_payments_df
    .where(F.col('Covered_Recipient_Type') == 'Covered Recipient Physician')
    .select(
        'Physician_Profile_ID',
        'Physician_First_Name',
        'Physician_Middle_Name',
        'Physician_Last_Name',
        'Recipient_Primary_Business_Street_Address_Line1',
        'Recipient_State',
        'Recipient_City',
        'Recipient_Zip_Code',
        'Total_Amount_of_Payment_USDollars',
    )
    .withColumn(
        "Total_Amount_of_Payment_USDollars",
        F.col("Total_Amount_of_Payment_USDollars").cast('double'),
    )
    .groupBy('Physician_Profile_ID')
    .agg(F.sum('Total_Amount_of_Payment_USDollars').alias("sum_payment"))
)

#mapping file (Physician_Profile_ID <-> NPI); it has no header row, so the
#column names and types are supplied explicitly, in file order
ppi_npi_matches_fields = [
    ("FirstName", StringType()),
    ("LastName", StringType()),
    ("NPI", StringType()),
    ("Physician_Profile_ID", StringType()),
    ("Score", FloatType()),
    ("NatAddr", StringType()),
    ("AddrScore", FloatType()),
    ("SupplAddr", StringType()),
    ("NatStateCity", StringType()),
    ("StateCityScore", FloatType()),
    ("SupplStateCity", StringType()),
    ("NatTaxonomy", StringType()),
    ("TaxonomyScore", FloatType()),
    ("SupplTaxonomy", StringType()),
    ("NatMiddleName", StringType()),
    ("MiddleNameScore", FloatType()),
    ("SupplMiddleName", StringType()),
]
ppi_npi_matches_df_schema = StructType(
    [StructField(field_name, field_type, True)
     for field_name, field_type in ppi_npi_matches_fields]
)

ppi_npi_matches_df = spark.read.csv(
    BASE_DIR + 'data_processing/matched_out/hcp_matches.csv',
    header=False,
    schema=ppi_npi_matches_df_schema,
)

# Default (inner) join: keeps only physicians that appear in both the payment
# totals and the mapping file, attaching the NPI to each payment total.
hcp_payments_df = hcp_payments_df.join(ppi_npi_matches_df, on='Physician_Profile_ID')

#We obtain the organization for each physician from PhysicianComparePhoneNumberFile.csv
#The header names in that file start with a space; map them to clean names.
npi_column_renames = {
    " NPI": "NPI",
    " org_pac_id": "org_pac_id",
    ' phn_numbr': 'phn_numbr',
    " frst_nm": "f_name",
    " lst_nm": "l_name",
    " mid_nm": "m_name",
    " adr_ln_1": "adr_ln_1",
    " adr_ln_2": "adr_ln_2",
    " pri_spec": "pri_spec",
    " st": "st",
    " cty": "cty",
    " zip": "zip",
}
npi_df = (
    spark.read
    .option("header", True)
    .csv(BASE_DIR + 'PhysicianComparePhoneNumberFile.csv')
)
for raw_name, clean_name in npi_column_renames.items():
    npi_df = npi_df.withColumnRenamed(raw_name, clean_name)

#joining the organization with payments and physician information
hcp_cms_org_payments_df = hcp_payments_df.join(npi_df, on='NPI')
hcp_cms_org_payments_df.show(truncate=False)
hcp_cms_org_payments_df.columns


In [None]:
#keep only the organization id and the per-physician payment total from the
#merged frame, dropping rows where either value is missing
hcp_cms_org_payments_df = (
    hcp_cms_org_payments_df
    .select('org_pac_id', 'sum_payment')
    .dropna()
)
hcp_cms_org_payments_df.show()

# Payments are rolled up to organization level because the CMS performance
# ratings are only available per organization.
# NOTE(review): count_of_doctors counts joined rows per organization, not
# distinct physicians - confirm each physician appears once per organization.
doctor_count = F.count('org_pac_id').alias('count_of_doctors')
org_payment_total = F.sum('sum_payment').alias('payment_amount')

# payment_normalized is the average payment per counted doctor, so that
# organizations with more doctors receiving payments remain comparable to
# smaller ones.
cms_org_payments_df = (
    hcp_cms_org_payments_df
    .groupBy('org_pac_id')
    .agg(doctor_count, org_payment_total)
    .withColumn("payment_normalized", F.col("payment_amount") / F.col("count_of_doctors"))
)

In [None]:
# performance ratings file
cms_org_ratings_df = spark.read.options(header='True').csv(BASE_DIR + 'grp_public_reporting.csv')

#remove cols not used in our analysis
remove_cols = [' ACO_ID_1',' ACO_nm_1',' ACO_ID_2',' ACO_nm_2', ' ACO_ID_3', ' ACO_nm_3',' attestation_value']
cms_org_ratings_df = cms_org_ratings_df.drop(*remove_cols)

# The header names in this file start with a space; map each one to its clean
# name exactly once (the original chain renamed ' star_value' twice, the second
# call being a no-op). ' org_PAC_ID' is also lower-cased so it matches the
# join key used by the payments frame.
ratings_column_renames = {
    ' prf_rate': 'prf_rate',
    ' measure_cd': 'measure_cd',
    ' patient_count': 'patient_count',
    ' star_value': 'star_value',
    ' five_star_benchmark': 'five_star_benchmark',
    ' measure_title': 'measure_title',
    ' org_PAC_ID': 'org_pac_id',
}
for raw_name, clean_name in ratings_column_renames.items():
    cms_org_ratings_df = cms_org_ratings_df.withColumnRenamed(raw_name, clean_name)

float_cols = ['prf_rate','patient_count','star_value','five_star_benchmark']

#drop a row only when every one of the score columns is null
cms_org_ratings_df = cms_org_ratings_df.dropna(how='all', subset=float_cols)

#the CSV is read as strings; cast the score columns to float
for col_name in float_cols:
    cms_org_ratings_df = cms_org_ratings_df.withColumn(col_name, F.col(col_name).cast('float'))
cms_org_ratings_df.show()
print(cms_org_ratings_df.columns)

In [None]:
# Combine performance ratings and payment totals at organization level.
# Default (inner) join on org_pac_id: only organizations present in both
# frames are kept.
cms_org_ratings_payments_df = cms_org_ratings_df.join(
    cms_org_payments_df,
    on='org_pac_id',
)
cms_org_ratings_payments_df.show(truncate=False)

In [None]:
# Inspect the column names and data types of the joined ratings/payments frame
# before it is converted to pandas.
cms_org_ratings_payments_df.dtypes

In [None]:
#convert the pyspark df to a pandas df so altair can plot it
cms_org_ratings_payments_pddf = cms_org_ratings_payments_df.toPandas()

#altair raises an error for datasets with more than 5000 rows; lift that limit
alt.data_transformers.disable_max_rows()

#payments are plotted on a logarithmic scale to account for outliers
payment_axis = alt.X('payment_normalized:Q', scale=alt.Scale(type='log'))
rating_axis = alt.Y('prf_rate:Q')

cms_payments_ratings_chart = (
    alt.Chart(cms_org_ratings_payments_pddf)
    .mark_point()
    .encode(x=payment_axis, y=rating_axis)
)
cms_payments_ratings_chart

In [None]:
# One payment-vs-rating panel per measure code, laid out two panels per row.
cms_payments_ratings_chart.facet('measure_cd:N', columns=2)

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame also holds string columns (ids, measure codes, titles); pandas >= 2.0
# no longer drops those silently inside corr() and raises instead, so the
# numeric columns are selected explicitly. This form works on older pandas too.
float_cols_corr = (
    cms_org_ratings_payments_pddf
    .select_dtypes(include='number')
    .corr()
    .reset_index()
)
float_cols_corr

In [None]:
# Correlation matrix of ratings and payments, computed separately for each
# measure code.
corrs_by_measurecode_df = (
    cms_org_ratings_payments_pddf
    .groupby('measure_cd')[['prf_rate', 'star_value', 'payment_amount', 'payment_normalized']]
    .corr()
)
corrs_by_measurecode_df

In [None]:
#https://stackoverflow.com/questions/25571882/pandas-columns-correlation-with-statistical-significance
def _pearson_p_value(x, y):
    """Return only the p-value (second element) of scipy's pearsonr result."""
    return pearsonr(x, y)[1]


def _significance_stars(p_value):
    """Return one '*' for each of the thresholds 0.01, 0.05 and 0.1 that p_value does not exceed."""
    return ''.join(['*' for threshold in [0.01, 0.05, 0.1] if p_value <= threshold])


# Per-measure-code p-values for the same column pairs as the correlation table.
cms_org_ratings_by_code_df = cms_org_ratings_payments_pddf.groupby('measure_cd')[
    ['prf_rate', 'star_value', 'payment_amount', 'payment_normalized']
]
pvals_by_measurecode_df = cms_org_ratings_by_code_df.corr(method=_pearson_p_value)
pvals_as_asteriks = pvals_by_measurecode_df.applymap(_significance_stars)

# Render the correlations as text rounded to 5 decimals and append the
# significance stars to each cell.
corrs_by_measurecode_df = corrs_by_measurecode_df.round(5).astype(str)
(corrs_by_measurecode_df + pvals_as_asteriks).reset_index()

In [None]:
# getting measure_title for interesting measures based on above analysis
# Measure codes singled out above; the trailing comment gives each measure's title.
measures_of_interest = [
    "MIPS_GRP_436_overall",      # Radiation Consideration for Adult CT: Utilization of Dose Lowering Techniques
    "MIPS_GRP_288_overall",      # Dementia: Education and Support of Caregivers for Patients with Dementia
    "MIPS_GRP_277_overall",      # Sleep Apnea: Severity Assessment at Initial Diagnosis
    "QCDR_GRP_ACEP31_overall",   # Appropriate Foley catheter use in the emergency department
    "MIPS_GRP_176_overall",      # Rheumatoid Arthritis (RA): Tuberculosis Screening
    "MIPS_GRP_291_overall",      # Parkinson's Disease: Cognitive Impairment or Dysfunction Assessment for Patients with Parkinson's Disease
    "QCDR_GRP_GIQIC10_overall",  # Appropriate management of anticoagulation in the peri-procedural period rate - EGD
    "MIPS_GRP_282_overall",      # Dementia: Functional Status Assessment
    "MIPS_GRP_293_overall",      # Parkinson's Disease: Rehabilitative Therapy Options
]

# Show the joined rows for each measure, in the order listed above.
for measure_code in measures_of_interest:
    cms_org_ratings_payments_df.filter(F.col('measure_cd') == measure_code).show(truncate=False)

# Search measure titles containing "Elderly". The original called a bare
# `col(...)`, which is never imported (only `functions as F` is), so it raised
# NameError; F.col is the name actually in scope.
cms_org_ratings_payments_df.where(F.col('measure_title').like("%Elderly%")).show()
#MIPS_GRP_238_overall

In [None]:
 # Write the merged ratings/payments table to CSV with a header row and
 # without the pandas index column.
 ratings_csv_path = BASE_DIR + 'data_processing/ratings/cms_org_payments_ratings.csv'
 cms_org_ratings_payments_pddf.to_csv(ratings_csv_path, header=True, index=False)