In [None]:
import os

# Root folder holding the input CSV files. Set the MILESTONE1_DATA_DIR environment variable
# to run the notebook on another machine; the original location stays the default.
# os.path.join(..., '') guarantees the trailing separator that the
# `BASE_DIR + 'file.csv'` concatenations in this notebook rely on.
BASE_DIR = os.path.join(
    os.environ.get('MILESTONE1_DATA_DIR', '/home/thanuja/Dropbox/coursera/Milestone1/data/'),
    ''
)

In [None]:
#pyspark initialization

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from scipy.stats import pearsonr
import warnings
import altair as alt
import numpy as np
from pyspark.sql.functions import udf

# Silence library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Local Spark session (master "local[*]") with spark.driver.memory set to 8g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('org queries')
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

In [None]:
# 2019 payments file, read with its header row as column names.
general_payments_df = (
    spark.read
    .option("header", True)
    .csv(BASE_DIR + 'OP_DTL_GNRL_PGYR2019_P06302021.csv')
)

# Keep only the rows whose recipient is a physician, and only the columns listed here.
physician_payment_columns = [
    'Physician_Profile_ID',
    'Physician_First_Name',
    'Physician_Middle_Name',
    'Physician_Last_Name',
    'Recipient_Primary_Business_Street_Address_Line1',
    'Recipient_State',
    'Recipient_City',
    'Recipient_Zip_Code',
    'Total_Amount_of_Payment_USDollars',
    'Nature_of_Payment_or_Transfer_of_Value',
]
is_physician_row = F.col('Covered_Recipient_Type') == 'Covered Recipient Physician'
hcp_payments_df = general_payments_df.where(is_physician_row).select(*physician_payment_columns)
'''
hcp_payment_types_df = hcp_payments_df.groupBy(
    #'Nature_of_Payment_or_Transfer_of_Value'
    ['Physician_Profile_ID', 'Nature_of_Payment_or_Transfer_of_Value']
).agg(
    F.sum('Total_Amount_of_Payment_USDollars').alias("sum_payment"),
    F.avg('Total_Amount_of_Payment_USDollars').alias("avg_payment"),
)
hcp_payment_types_df.show(truncate=False)
'''

# The CSV is read as strings, so the payment amount is cast to double before summing.
payment_col = 'Total_Amount_of_Payment_USDollars'
hcp_payments_df = hcp_payments_df.withColumn(payment_col, F.col(payment_col).cast('double'))

# One row per (physician, nature of payment) holding the summed amount.
hcp_payments_df = hcp_payments_df.groupBy(
    'Physician_Profile_ID', 'Nature_of_Payment_or_Transfer_of_Value'
).agg(F.sum(payment_col).alias("sum_payment"))

def shortenedName(payment_type):
    """Return a short label for a nature-of-payment description.

    The three speaker/faculty compensation categories (accredited or certified
    program, non-accredited and noncertified program, and a venue other than a
    continuing education program) all collapse to 'Speaking'. Any value without
    a short form, including None, is returned unchanged.
    """
    short_labels = {
        'Compensation for services other than consulting':
            'Other',
        'Compensation for serving as faculty or as a speaker for an accredited or certified continuing education program':
            'Speaking',
        'Compensation for serving as faculty or as a speaker for a non-accredited and noncertified continuing education program':
            'Speaking',
        'Compensation for services other than consulting, including serving as faculty or as a speaker at a venue other than a continuing education program':
            'Speaking',
        'Space rental or facility fees (teaching hospital only)':
            'Space rental',
        'Current or prospective ownership or investment interest':
            'Ownership interest',
        'Charitable Contribution':
            'Charity',
    }
    return short_labels.get(payment_type, payment_type)

# Spark UDF wrapper around shortenedName (returns a string column).
shortenedNameUdf = F.udf(shortenedName, StringType())

# Replace the long nature-of-payment descriptions with their short labels.
hcp_payments_df = hcp_payments_df.withColumn(
    'Nature_of_Payment_or_Transfer_of_Value',
    shortenedNameUdf(F.col('Nature_of_Payment_or_Transfer_of_Value'))
)

# Several original categories collapse into the same short label (e.g. 'Speaking'), so one
# physician can now have more than one row per label. Count distinct physicians and divide
# the label total by that count: a plain count()/avg() over rows would overstate the number
# of physicians and understate the payment per physician.
physician_count = F.countDistinct('Physician_Profile_ID')
hcp_payments_summary_df = hcp_payments_df.groupBy(
    'Nature_of_Payment_or_Transfer_of_Value'
).agg(
    F.sum('sum_payment').alias("sum_payment"),
    (F.sum('sum_payment') / physician_count).alias("avg_payment"),
    physician_count.alias('num_physicians')
)
hcp_payments_summary_df.show(truncate=False)

# Mapping file: (column name, Spark type) pairs in file order. Every field is nullable.
ppi_npi_match_fields = [
    ("FirstName", StringType),
    ("LastName", StringType),
    ("NPI", StringType),
    ("Physician_Profile_ID", StringType),
    ("Score", FloatType),
    ("NatAddr", StringType),
    ("AddrScore", FloatType),
    ("SupplAddr", StringType),
    ("NatStateCity", StringType),
    ("StateCityScore", FloatType),
    ("SupplStateCity", StringType),
    ("NatTaxonomy", StringType),
    ("TaxonomyScore", FloatType),
    ("SupplTaxonomy", StringType),
    ("NatMiddleName", StringType),
    ("MiddleNameScore", FloatType),
    ("SupplMiddleName", StringType),
]
ppi_npi_matches_df_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in ppi_npi_match_fields
])

# The mapping CSV has no header row, so the schema above supplies the column names.
ppi_npi_matches_df = spark.read.csv(
    BASE_DIR + 'data_processing/filtered_out/filtered_hcp_matches.csv',
    header=False,
    schema=ppi_npi_matches_df_schema,
)

# Join payments with the mapping file on Physician_Profile_ID (default join type), which
# keeps only physicians present in both and attaches the NPI to each payment row.
hcp_payments_df = hcp_payments_df.join(ppi_npi_matches_df, on='Physician_Profile_ID')



# The organization for each physician comes from PhysicianComparePhoneNumberFile.csv.
# The header names renamed here start with a space; each raw name is mapped to the
# name used in the rest of this notebook.
npi_column_renames = {
    " NPI": "NPI",
    " org_pac_id": "org_pac_id",
    ' phn_numbr': 'phn_numbr',
    " frst_nm": "f_name",
    " lst_nm": "l_name",
    " mid_nm": "m_name",
    " adr_ln_1": "adr_ln_1",
    " adr_ln_2": "adr_ln_2",
    " pri_spec": "pri_spec",
    " st": "st",
    " cty": "cty",
    " zip": "zip",
}
npi_df = spark.read.option("header", True).csv(BASE_DIR + 'PhysicianComparePhoneNumberFile.csv')
for raw_name, clean_name in npi_column_renames.items():
    npi_df = npi_df.withColumnRenamed(raw_name, clean_name)

# Attach the organization columns to the payment and physician rows by NPI.
hcp_cms_payments_df = hcp_payments_df.join(npi_df, on='NPI')
hcp_cms_payments_df.show(truncate=False)
hcp_cms_payments_df.count()

In [None]:
# From the merged frame keep only the physician id, the payment category and the summed
# amount, then discard every row in which any of the three is null.
hcp_cms_payments_df = hcp_cms_payments_df.select(
    'NPI',
    'Nature_of_Payment_or_Transfer_of_Value',
    'sum_payment',
).dropna()
hcp_cms_payments_df.show()

# Roll up to one row per (NPI, payment category): row count and total amount.
# NOTE(review): if the mapping file or the phone-number file holds several rows per
# physician, the upstream joins repeat each payment row and this sum is inflated — confirm.
payment_keys = ['NPI', 'Nature_of_Payment_or_Transfer_of_Value']
cms_payments_df = hcp_cms_payments_df.groupBy(*payment_keys).agg(
    F.count('NPI').alias('count_of_payments'),
    F.sum('sum_payment').alias('payment_amount'),
)

In [None]:
# performance ratings file
cms_ratings_df = spark.read.options(header='True').csv(BASE_DIR + 'ec_public_reporting.csv')

#remove cols not used in our analysis
remove_cols = [' APM_affl_1',' APM_affl_2',' APM_affl_3',' APM_affl_4', ' attestation_value']

cms_ratings_df = cms_ratings_df.drop(*remove_cols)

#rename columns to remove the leading space in the header names
renamed_cols = ['prf_rate', 'measure_cd', 'invs_msr', 'patient_count',
                'star_value', 'five_star_benchmark', 'measure_title']
for col_name in renamed_cols:
    cms_ratings_df = cms_ratings_df.withColumnRenamed(' ' + col_name, col_name)

float_cols = ['prf_rate','patient_count','star_value','five_star_benchmark']

#cast float_cols to float BEFORE the null check: values that are not valid numbers only
#become null once they are cast, so checking first would let such rows through
for col_name in float_cols:
    cms_ratings_df = cms_ratings_df.withColumn(col_name, F.col(col_name).cast('float'))

#drop rows if all values in score cols have null values
cms_ratings_df = cms_ratings_df.dropna(how='all', subset=float_cols)

# Invert inverse measures (invs_msr == 'Y') so a higher prf_rate is consistently better.
# This is done with native column expressions instead of a Python UDF (the UDF version
# failed with EOFError); nulls propagate on their own: a null invs_msr keeps prf_rate
# unchanged and a null prf_rate stays null.
# NOTE(review): `100 - prf_rate` is the formula from the disabled UDF and assumes
# prf_rate is a 0-100 percentage for every inverse measure — confirm.
cms_ratings_df = cms_ratings_df.withColumn(
    'prf_rate',
    F.when(F.trim(F.col('invs_msr')) == 'Y', F.lit(100.0) - F.col('prf_rate'))
     .otherwise(F.col('prf_rate'))
     .cast('float')
)

cms_ratings_df.show(truncate=False)
print(cms_ratings_df.columns)

In [None]:
# Merge performance ratings with payments at physician level: rows pair up on NPI, and
# with the default join type an NPI missing from either side is dropped.
cms_ratings_payments_df = cms_ratings_df.join(cms_payments_df, on='NPI')
cms_ratings_payments_df.show(truncate=False)

In [None]:
# Display the (column name, type) pairs of the joined ratings/payments frame.
cms_ratings_payments_df.dtypes

In [None]:
#top_10_mcds= ['PI_GRP_EP_1','PI_GRP_PEA_1','MIPS_GRP_130_overall','MIPS_GRP_128_overall','MIPS_GRP_238_overall','PI_GRP_HIE_1','MIPS_GRP_226_overall','MIPS_GRP_238_2+','MIPS_GRP_111_overall','MIPS_GRP_226_screenedForUse']
#cms_org_ratings_payments_df = cms_org_ratings_payments_df.filter(cms_org_ratings_payments_df.measure_cd.isin(top_10_mcds))
#cms_org_ratings_payments_df.show(truncate=False)

In [None]:
# Constants read by get_adjusted_rating: `c` is the weight given to the overall mean and
# `m` is that mean, i.e. the average prf_rate over every row of the joined frame.
c = 5
overall_prf_rate_row = cms_ratings_payments_df.agg(F.avg('prf_rate').alias("agg")).first()
m = overall_prf_rate_row['agg']

def get_adjusted_rating(avg_rating, num_reviews, prior_weight=None, prior_mean=None):
    """Shrink a rating towards the overall mean in proportion to how few reviews back it.

    Computes (prior_weight * prior_mean + avg_rating * num_reviews) / (prior_weight + num_reviews).

    Args:
        avg_rating: the observed rating, or None when it is missing.
        num_reviews: the number of observations behind `avg_rating`, or None when missing.
        prior_weight: weight of the prior mean; defaults to the notebook-level `c`.
        prior_mean: the prior mean; defaults to the notebook-level `m`.

    Returns:
        The adjusted rating as a float, or None when either input is None or the
        denominator is zero. Returning None makes the Spark UDF emit null for such rows
        instead of failing the whole job with a TypeError / ZeroDivisionError.
    """
    # Spark hands null column values to a Python UDF as None.
    if avg_rating is None or num_reviews is None:
        return None
    if prior_weight is None:
        prior_weight = c
    if prior_mean is None:
        prior_mean = m
    denominator = prior_weight + num_reviews
    if denominator == 0:
        return None
    return float((prior_weight * prior_mean + avg_rating * num_reviews) / denominator)

# Wrap get_adjusted_rating as a float-returning Spark UDF and store its result, computed
# from prf_rate and patient_count, in the new adj_prf_rate column.
ratings_adjusted_udf = F.udf(get_adjusted_rating, FloatType())
cms_ratings_payments_df = cms_ratings_payments_df.withColumn(
    "adj_prf_rate",
    ratings_adjusted_udf(F.col('prf_rate'), F.col('patient_count'))
)
cms_ratings_payments_df.show()
cms_ratings_payments_df.dtypes

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

print(hcp_payments_summary_df.columns)

# Pull the per-category summary into pandas and drop rows containing nulls.
summary_pddf = hcp_payments_summary_df.toPandas().dropna()


def _payment_summary_bar(value_shorthand, axis_title):
    """Horizontal bar chart of one summary column per payment category.

    The x axis is logarithmic and the categories are ordered by descending x value.
    """
    return alt.Chart(summary_pddf).mark_bar().encode(
        y=alt.Y("Nature_of_Payment_or_Transfer_of_Value:N", sort='-x', title=None),
        x=alt.X(value_shorthand, scale=alt.Scale(type='log'), title=axis_title),
    ).properties(height=200, width=125)


chart1 = _payment_summary_bar('sum_payment:Q', 'Total Payments')
chart2 = _payment_summary_bar('avg_payment:Q', 'Payment per Physician')
chart3 = _payment_summary_bar('num_physicians:Q', '# Physicians')

# Show the three charts side by side.
(chart1 | chart2 | chart3).configure_axis(labelLimit=200)

In [None]:
#convert pyspark df to pandas df to use for our visualizations
cms_ratings_payments_pddf = cms_ratings_payments_df.toPandas()
#disable the altair error when dataset rows is > 5000
alt.data_transformers.disable_max_rows()

# Payment range shown on the x axis; rows outside it are dropped from the plot data.
min_plot_payment = 10
max_plot_payment = 2000000

# At most 1000 rows per payment category (https://stackoverflow.com/a/54722093).
# random_state is fixed so that re-running the notebook draws the same sample and
# therefore reproduces the same figure.
plot_df = cms_ratings_payments_pddf.groupby(
    'Nature_of_Payment_or_Transfer_of_Value',
    group_keys=False,
    as_index=False
).apply(lambda x: x.sample(min(len(x), 1000), random_state=42))

plot_df = plot_df[
    (plot_df['payment_amount'] > min_plot_payment)
    & (plot_df['payment_amount'] < max_plot_payment)
]

#payments are in logarithmic scale to account for outliers
cms_payments_ratings_chart = alt.Chart(plot_df).mark_point(
    opacity=0.5
).encode(
    x=alt.X('payment_amount:Q', title=['Physician', 'Payments (USD)'],
            scale=alt.Scale(type='log', domain=[min_plot_payment, max_plot_payment])),
    y=alt.Y('adj_prf_rate:Q', title='Performance Rating'),
    color=alt.Color('Nature_of_Payment_or_Transfer_of_Value:N', legend=None),
    facet=alt.Facet('Nature_of_Payment_or_Transfer_of_Value:N', columns=6,
                    header=alt.Header(title=None,labelFontWeight="bold",labelColor="maroon"))
)
(cms_payments_ratings_chart).properties(
    width=110,
    height=130
)

In [None]:
# Number of non-null prf_rate rows per measure code; keep the ten largest.
prf_rate_rows_per_measure = cms_ratings_payments_pddf.groupby('measure_cd')['prf_rate'].count()
top_10 = prf_rate_rows_per_measure.nlargest(10)

# The ten measure codes themselves, as a plain list.
top_10_cds = top_10.index.tolist()
top_10_cds

In [None]:
# Shorten the long measure titles for the facet headers; titles without a short form
# are kept as they are (map() yields NaN for them, fillna() restores the original).
plot_pddf = cms_ratings_payments_pddf.copy()
plot_pddf['measure_title'] = plot_pddf['measure_title'].map({
    'Documentation of Current Medications in the Medical Record': 'Medication Documentation',
    'Pneumococcal Vaccination Status for Older Adults': 'Pneumococcal Vaccination',
    'Preventive Care and Screening: Body Mass Index (BMI) Screening and Follow-Up Plan': 'Body Mass Index Screening',
    'Preventive Care and Screening: Tobacco Use: Screening and Cessation Intervention': 'Tobacco Screening/Cessation',
    'Provide Patients Electronic Access to Their Health Information': 'Patient Electronic Records',
    'Support Electronic Referral Loops By Sending Health Information': 'Electronic Record Transfer',
    'Use of High-Risk Medications in the Elderly': 'High-Risk Medications'
}).fillna(plot_pddf['measure_title'])

# Only the ten most frequently reported measure codes are plotted.
plot_pddf = plot_pddf[plot_pddf['measure_cd'].isin(top_10_cds)]

# Payment range shown on the x axis; rows outside it are dropped from the plot data.
min_plot_payment = 10
max_plot_payment = 2000000

# At most 1000 rows per measure title (https://stackoverflow.com/a/54722093).
# random_state is fixed so that re-running the notebook draws the same sample and
# therefore reproduces the same figure.
plot_pddf = plot_pddf.groupby(
    'measure_title',
    group_keys=False,
    as_index=False
).apply(lambda x: x.sample(min(len(x), 1000), random_state=42))
plot_pddf = plot_pddf[
    (plot_pddf['payment_amount'] > min_plot_payment)
    & (plot_pddf['payment_amount'] < max_plot_payment)
]

alt.Chart(plot_pddf).mark_circle().encode(
    x=alt.X('payment_amount:Q', title=['Physician', 'Payments (USD)'],
            scale=alt.Scale(type='log', domain=[min_plot_payment, max_plot_payment])),
    y=alt.Y('adj_prf_rate:Q', scale=alt.Scale(domain=[0, 100]), title='Performance Rating'),
    color=alt.Color('measure_title:N', legend=None),
    facet=alt.Facet('measure_title:N', columns=4,
                    header=alt.Header(title=None,labelFontWeight="bold",labelColor="maroon"))
).properties(
    width=120,
    height=125,
)

In [None]:
# Pairwise correlations between the numeric columns only. The frame also holds string
# columns (e.g. measure_cd, measure_title), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them, so they are excluded explicitly.
float_cols_corr = (
    cms_ratings_payments_pddf
    .select_dtypes(include='number')
    .corr()
    .reset_index()
)
float_cols_corr

In [None]:
# One correlation matrix per measure code over the rating and payment columns.
rating_payment_columns = ['adj_prf_rate', 'prf_rate', 'star_value', 'payment_amount']
ratings_grouped_by_measure = cms_ratings_payments_pddf.groupby('measure_cd')
corrs_by_measurecode_df = ratings_grouped_by_measure[rating_payment_columns].corr()
corrs_by_measurecode_df

In [None]:
# The 100 measure codes with the most non-null prf_rate rows, as a plain list.
prf_rate_rows_by_code = cms_ratings_payments_pddf.groupby('measure_cd')['prf_rate'].count()
top_100 = prf_rate_rows_by_code.nlargest(100).index.tolist()
top_100

In [None]:
#https://stackoverflow.com/questions/25571882/pandas-columns-correlation-with-statistical-significance
# Correlations and p-values are both computed here, from the same rows (the top-100
# measure codes) and the same columns, so the two tables line up cell for cell when they
# are concatenated. Adding tables built from different column sets or different measure
# codes leaves NaN in every cell that exists in only one of them. Nothing in this cell
# overwrites its own inputs, so it can be re-run safely.
corr_pval_columns = ['adj_prf_rate', 'prf_rate', 'star_value', 'payment_amount']
filtered_ratings_df = cms_ratings_payments_pddf[cms_ratings_payments_pddf['measure_cd'].isin(top_100)]
cms_ratings_by_code_df = filtered_ratings_df.groupby('measure_cd')[corr_pval_columns]


def _pearson_pvalue(x, y):
    """p-value of the Pearson correlation of x and y; NaN with fewer than 2 paired values."""
    # pearsonr raises on inputs shorter than 2, which can happen for sparse measure codes.
    if len(x) < 2:
        return np.nan
    return pearsonr(x, y)[1]


def _significance_stars(p_value):
    """One '*' for each of the thresholds 0.01, 0.05 and 0.1 that p_value does not exceed."""
    # NaN compares False against every threshold and therefore yields ''.
    return ''.join('*' for threshold in (0.01, 0.05, 0.1) if p_value <= threshold)


pvals_by_measurecode_df = cms_ratings_by_code_df.corr(method=_pearson_pvalue)
# Column-wise Series.map applies the function to every cell.
pvals_as_asteriks = pvals_by_measurecode_df.apply(lambda column: column.map(_significance_stars))

# Rounded correlations rendered as text, with the significance stars appended.
top_corrs_as_text_df = cms_ratings_by_code_df.corr().round(5).astype(str)
(top_corrs_as_text_df + pvals_as_asteriks).reset_index().to_csv('/home/thanuja/Dropbox/coursera/Milestone1/physician_ratings_corrs.csv')