In [None]:
import os

# Root directory holding every input file this notebook reads (CSVs and the formula PNG).
# Set the MILESTONE1_DATA_DIR environment variable to point at your own copy of the data;
# the original author's local path is kept as the fallback so existing setups keep working.
# Joining with '' guarantees the trailing separator that the `BASE_DIR + 'file.csv'`
# concatenations in later cells rely on.
BASE_DIR = os.path.join(
    os.environ.get('MILESTONE1_DATA_DIR', '/home/thanuja/Dropbox/coursera/Milestone1/data/'),
    ''
)

In [None]:
import pandas as pd
from IPython.display import Image

#pyspark initialization

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from scipy.stats import pearsonr
from pyspark.sql.functions import udf
import warnings
import altair as alt
import numpy as np

# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Create (or reuse) the Spark session with a local[*] master and 8g of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('org queries')
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Load the Google ratings CSV (first row is the header).
google_ratings_csv = BASE_DIR + 'google_org_ratings.csv'
google_org_ratings_df = spark.read.options(header='True').csv(google_ratings_csv)

# No schema is passed to the reader, so cast the numeric rating columns to float in place.
float_cols = ['num_reviews','avg_rating','max_rating']
for col_name in float_cols:
    as_float = F.col(col_name).cast('float')
    google_org_ratings_df = google_org_ratings_df.withColumn(col_name, as_float)

In [None]:
# Preview the first 10 rows (returned to the driver as a list of Row objects).
google_org_ratings_df.take(10)

In [None]:
# Bayesian-average ("adjusted") rating computed in the next cell:
#   C = confidence in the prior; it is added to the review count in the denominator
#       (the code sets c = 5)
#   m = prior mean: the mean of avg_rating over all organizations
#   rating = (C*m + curr_rating * curr_num_reviews) / (C + curr_num_reviews)

# Render the formula image stored alongside the data files.
bayesian_formula_img = BASE_DIR + "bayesian_formula.png"
Image(filename=bayesian_formula_img, width=500, height=500)


In [None]:
# Weight given to the prior mean in the adjusted-rating formula.
c = 5

# Prior mean: average of the avg_rating column, pulled back to the driver as a scalar.
prior_mean_row = google_org_ratings_df.agg(F.avg('avg_rating').alias("agg")).first()
m = prior_mean_row['agg']

def get_adjusted_rating(avg_rating, num_reviews, prior_weight=None, prior_mean=None):
    """Return the Bayesian-average rating for one organization.

    Computes ``(C*m + avg_rating*num_reviews) / (C + num_reviews)`` where ``C`` is the
    prior weight and ``m`` the prior mean.

    Args:
        avg_rating: The organization's raw average rating, or None.
        num_reviews: Number of reviews behind ``avg_rating``, or None.
        prior_weight: Weight ``C`` of the prior. Defaults to the notebook-level ``c``.
        prior_mean: Prior mean ``m``. Defaults to the notebook-level ``m``.

    Returns:
        The adjusted rating as a float, or None when an input is missing or the
        total weight is zero (so a Spark UDF wrapping this function yields null
        instead of failing the whole job).
    """
    # Null cells reach a Python UDF as None; arithmetic on None would raise TypeError.
    if avg_rating is None or num_reviews is None:
        return None

    # Fall back to the notebook globals only when the caller did not supply a prior.
    if prior_weight is None:
        prior_weight = c
    if prior_mean is None:
        prior_mean = m
    if prior_mean is None:
        # The global mean itself is undefined (e.g. every avg_rating was null).
        return None

    total_weight = prior_weight + num_reviews
    if total_weight == 0:
        return None

    adjusted = (prior_weight * prior_mean + avg_rating * num_reviews) / total_weight
    return adjusted

# Wrap the Python function as a Spark UDF that returns a float column.
ratings_adjusted_udf = F.udf(get_adjusted_rating, returnType=FloatType())

# Add the adjusted rating computed from each row's avg_rating and num_reviews.
adjusted_ratings_col = ratings_adjusted_udf(F.col('avg_rating'), F.col('num_reviews'))
google_org_ratings_df = google_org_ratings_df.withColumn("adjusted_ratings", adjusted_ratings_col)

google_org_ratings_df.show()
google_org_ratings_df.dtypes

In [None]:
# (column name, Spark type) pairs for cms_org_payments_ratings.csv, in schema order.
_cms_columns = [
    ("org_pac_id", StringType()),
    ("org_nm", StringType()),
    ("measure_cd", StringType()),
    ("measure_title", StringType()),
    ("invs_msr", StringType()),
    ("prf_rate", FloatType()),
    ("patient_count", FloatType()),
    ("star_value", FloatType()),
    ("five_star_benchmark", FloatType()),
    ("collection_type", StringType()),
    ("CCXP_ind", StringType()),
    ("count_of_doctors", FloatType()),
    ("payment_amount", FloatType()),
    ("payment_normalized", FloatType()),
]
# Every field is nullable.
cms_org_ratings_payments_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _cms_columns]
)

cms_ratings_csv = BASE_DIR + 'data_processing/ratings/cms_org_payments_ratings.csv'
cms_org_ratings_payments_df = (
    spark.read
    .options(header='True')
    .csv(cms_ratings_csv, schema=cms_org_ratings_payments_schema)
)
cms_org_ratings_payments_df.dtypes

# Collapse to one row per (org_pac_id, org_nm), averaging each numeric column
# and keeping the original column names.
_averaged_columns = ["prf_rate", "star_value", "count_of_doctors", "payment_amount", "payment_normalized"]
cms_org_ratings_payments_df = (
    cms_org_ratings_payments_df
    .groupBy("org_pac_id", "org_nm")
    .agg(*[F.avg(column).alias(column) for column in _averaged_columns])
)

cms_org_ratings_payments_df.show()

In [None]:
# Inner-join the aggregated CMS data with the Google ratings on the organization id;
# only organizations present in both frames survive.
cms_google_org_ratings_payments_df = cms_org_ratings_payments_df.join(
    google_org_ratings_df, on='org_pac_id', how='inner'
)
# Debug helpers (uncomment to inspect the join or look for duplicated org_pac_id values):
#cms_google_org_ratings_payments_df.show(truncate=False)
#cms_google_org_ratings_payments_df.groupBy("org_pac_id").count().where("count > 1").drop("count").show()

In [None]:
#convert pyspark df to pandas df to use for our visualizations
cms_google_org_ratings_payments_pddf = cms_google_org_ratings_payments_df.toPandas()

# Keep only payments strictly inside the plotted range (10, 1,000,000).
# Filtering BEFORE taking the log avoids log10 of zero/negative payments, and
# .copy() lets us add a column without a SettingWithCopyWarning.
_payments = cms_google_org_ratings_payments_pddf['payment_normalized']
plot_pddf = cms_google_org_ratings_payments_pddf[(_payments > 10) & (_payments < 1000000)].copy()
plot_pddf['payment_log'] = np.log10(plot_pddf['payment_normalized'])

# Plot at most 2000 points. Capping n at the frame length prevents the ValueError
# pandas raises when asked for more rows than exist; the fixed random_state makes
# the figure reproducible across Restart & Run All.
_chart_sample = plot_pddf.sample(n=min(2000, len(plot_pddf)), random_state=42)

#payments are in logarithmic scale to account for outliers
payments_mips_ratings_chart = alt.Chart(_chart_sample).mark_point().encode(
    x=alt.X('payment_normalized:Q', title='Payment per Physician (USD)',
            scale=alt.Scale(type='log', domain=[10, 1000000]),
            axis=alt.Axis(titleColor='orange')),
    y=alt.Y('prf_rate:Q', title='Average MIPS Performance Rating',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleColor='green'))
).properties(width=200,height=200)
payments_mips_ratings_chart

In [None]:
# Range of the normalized payments.
print(cms_google_org_ratings_payments_pddf['payment_normalized'].max())
print(cms_google_org_ratings_payments_pddf['payment_normalized'].min())

# Spearman rank correlation between the numeric columns. The frame also holds
# string columns (org_pac_id, org_nm), which make DataFrame.corr raise on pandas >= 2.0,
# so restrict to numeric dtypes first. Kept as the LAST expression so the notebook
# actually renders the matrix (mid-cell expression results are discarded).
cms_google_org_ratings_payments_pddf.select_dtypes(include='number').corr(method="spearman")

In [None]:
#disable the altair error when dataset rows is > 5000
alt.data_transformers.disable_max_rows()

# NOTE: lin_scale and log_scale are not referenced by the chart below; they are
# kept defined exactly as before in case other cells rely on them.
lin_scale = alt.Scale(
    domain=[2,6],
    range=['white', 'blue'],
    type='linear'
)

log_scale = alt.Scale(
    domain=[1, 1000000],
    range=['white', 'blue'],
    type='log'
)

plot_df = cms_google_org_ratings_payments_pddf

# Shared payment domain for the size, color and opacity encodings.
_payment_domain = [0.1, 1000000]

# Plot at most 5000 points; capping n avoids the ValueError raised when the frame
# has fewer rows, and the fixed random_state keeps the figure reproducible.
_chart_sample = plot_df.sample(n=min(5000, len(plot_df)), random_state=42)

#payments are in logarithmic scale to account for outliers
prfrate_ratings_chart = alt.Chart(
    _chart_sample,
    title='Performance Rating vs. Google Rating vs. Payment per Physician'
).mark_circle().encode(
    x=alt.X('adjusted_ratings:Q', title='Google Rating (adj.)', scale=alt.Scale(domain=[1.5, 5.0])),
    y=alt.Y('prf_rate:Q', title='Performance Rating'),
    size=alt.Size('payment_normalized:Q', title='Payment per Physician',
                  scale=alt.Scale(type='log', domain=_payment_domain, range=[0.1, 150])),
    color=alt.Color('payment_normalized:Q',
                    title='Payment per Physician',
                    scale=alt.Scale(domain=_payment_domain, range=['white', 'blue'], type='log')),
    opacity=alt.Opacity('payment_normalized:Q',
                        scale=alt.Scale(domain=_payment_domain, range=[0.1, 1], type='log'))
).properties(width=600,height=600)
prfrate_ratings_chart

In [None]:
# Rows with any missing value are dropped before plotting.
_starval_pddf = cms_google_org_ratings_payments_pddf.dropna()

# Plot at most 5000 points; capping n avoids the ValueError raised when fewer rows
# remain after dropna(), and the fixed random_state keeps the figure reproducible.
_starval_sample = _starval_pddf.sample(n=min(5000, len(_starval_pddf)), random_state=42)

# Star value against normalized payment, with payments on a log axis.
starval_payments_chart = alt.Chart(_starval_sample).mark_point().encode(
    x=alt.X('star_value:Q'),
    y=alt.Y('payment_normalized:Q',scale=alt.Scale(type='log'))
)
starval_payments_chart

In [None]:
plot_pddf = cms_google_org_ratings_payments_pddf.dropna()

# Apply every filter BEFORE sampling (as the payments-vs-MIPS panel does), so the
# sample is drawn only from rows that will actually be plotted.
plot_pddf = plot_pddf[
    (plot_pddf['adjusted_ratings'] > 2.0)
    & (plot_pddf['payment_normalized'] > 10)
    & (plot_pddf['payment_normalized'] < 1000000)
]

# At most 2000 points; capping n avoids the ValueError raised when fewer rows
# remain, and the fixed random_state keeps the figure reproducible.
plot_pddf = plot_pddf.sample(n=min(2000, len(plot_pddf)), random_state=42)

# Normalized payment (log axis) against the adjusted Google rating.
payments_google_ratings_chart =  alt.Chart(plot_pddf).mark_point().encode(
    x=alt.X('payment_normalized:Q', title='Payment per Physician (USD)',
            scale=alt.Scale(type='log', domain=[10, 1000000]),
            axis=alt.Axis(titleColor='orange')),
    y=alt.Y('adjusted_ratings:Q', title='Google Rating (adj.)',
            scale=alt.Scale(domain=[2, 5]),
            axis=alt.Axis(titleColor='blue'))
).properties(width=200,height=200)
payments_google_ratings_chart

In [None]:
plot_pddf = cms_google_org_ratings_payments_pddf.dropna()
plot_pddf = plot_pddf[plot_pddf['adjusted_ratings'] > 2.0]

# At most 2000 points; capping n avoids the ValueError raised when fewer rows
# remain, and the fixed random_state keeps the figure reproducible.
plot_pddf = plot_pddf.sample(n=min(2000, len(plot_pddf)), random_state=42)

# Average MIPS performance rating against the adjusted Google rating.
mips_google_ratings_chart = alt.Chart(plot_pddf).mark_point().encode(
    x=alt.X('prf_rate:Q', title='Average MIPS Performance Rating',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleColor='green')),
    y=alt.Y('adjusted_ratings:Q', title='Google Rating (adj.)',
            scale=alt.Scale(domain=[2, 5]),
            axis=alt.Axis(titleColor='blue'))
).properties(width=200,height=200)

# Show the three pairwise panels side by side (only the last expression of a cell is rendered).
payments_mips_ratings_chart | payments_google_ratings_chart | mips_google_ratings_chart

In [None]:
# Same analysis restricted to measures whose title starts with 'Preventive Care and Screening'.

# Spark-side work uses its own name; plot_pddf is only assigned once the data is a pandas frame.
_preventive_sdf = spark.read.options(header='True').csv(BASE_DIR + 'data_processing/ratings/cms_org_payments_ratings.csv',schema=cms_org_ratings_payments_schema)
_preventive_sdf = _preventive_sdf.filter(_preventive_sdf.measure_title.startswith('Preventive Care and Screening'))

# One row per (org_pac_id, org_nm), averaging each numeric column.
_preventive_sdf = _preventive_sdf.groupBy("org_pac_id", "org_nm")\
   .agg(F.avg("prf_rate").alias("prf_rate"),
        F.avg("star_value").alias("star_value"),
        F.avg("count_of_doctors").alias("count_of_doctors"),
        F.avg("payment_amount").alias("payment_amount"),
        F.avg("payment_normalized").alias("payment_normalized"))

# Attach the Google ratings (inner join on the organization id).
_preventive_sdf = _preventive_sdf.join(on='org_pac_id', other=google_org_ratings_df)

plot_pddf = _preventive_sdf.toPandas().dropna()
plot_pddf = plot_pddf[plot_pddf['adjusted_ratings'] > 2.0]

# At most 2000 points; capping n avoids the ValueError raised when fewer rows
# remain after filtering, and the fixed random_state keeps the figure reproducible.
plot_pddf = plot_pddf.sample(n=min(2000, len(plot_pddf)), random_state=42)

preventative_mips_google_ratings_chart = alt.Chart(
    plot_pddf
).mark_point().encode(
    x=alt.X('prf_rate:Q', title='Average Preventative MIPS Rating',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(titleColor='green')),
    y=alt.Y('adjusted_ratings:Q', title='Google Rating (adj.)',
            scale=alt.Scale(domain=[2, 5]),
            axis=alt.Axis(titleColor='blue'))
).properties(width=200,height=200)
preventative_mips_google_ratings_chart

In [None]:
# Mean payment across the joined organizations: normalized first, then raw amount.
mean_payments_norm, mean_payments = (
    cms_google_org_ratings_payments_pddf[payment_col].mean()
    for payment_col in ("payment_normalized", "payment_amount")
)
print(mean_payments_norm)
print(mean_payments)

In [None]:
# Median payment across the joined organizations: normalized first, then raw amount.
median_payments_norm, median_payments = (
    cms_google_org_ratings_payments_pddf[payment_col].median()
    for payment_col in ("payment_normalized", "payment_amount")
)
print(median_payments_norm)
print(median_payments)

In [None]:
# Split organizations around the median payment (strict comparisons, so rows equal
# to the median fall in neither group). The *_pn frames use the normalized payment,
# the others the raw payment amount. Each group's row count is printed.
above_median_pn = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_normalized"].gt(median_payments_norm)
]
print(above_median_pn.shape[0])

above_median = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_amount"].gt(median_payments)
]
print(above_median.shape[0])

below_median_pn = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_normalized"].lt(median_payments_norm)
]
print(below_median_pn.shape[0])

below_median = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_amount"].lt(median_payments)
]
print(below_median.shape[0])

In [None]:
# Split organizations around the mean payment (strict comparisons, so rows equal
# to the mean fall in neither group). The *_pn frames use the normalized payment,
# the others the raw payment amount. Each group's row count is printed.
above_mean_pn = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_normalized"].gt(mean_payments_norm)
]
print(above_mean_pn.shape[0])

above_mean = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_amount"].gt(mean_payments)
]
print(above_mean.shape[0])

below_mean_pn = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_normalized"].lt(mean_payments_norm)
]
print(below_mean_pn.shape[0])

below_mean = cms_google_org_ratings_payments_pddf.loc[
    cms_google_org_ratings_payments_pddf["payment_amount"].lt(mean_payments)
]
print(below_mean.shape[0])

In [None]:
# Mean adjusted Google rating per payment group, printed in this order:
# above/below median (normalized, raw), then above/below mean (normalized, raw).
for payment_group in (
    above_median_pn, above_median,
    below_median_pn, below_median,
    above_mean_pn, above_mean,
    below_mean_pn, below_mean,
):
    print(payment_group['adjusted_ratings'].mean())

In [None]:
# Median adjusted Google rating per payment group, printed in this order:
# above/below median (normalized, raw), then above/below mean (normalized, raw).
for payment_group in (
    above_median_pn, above_median,
    below_median_pn, below_median,
    above_mean_pn, above_mean,
    below_mean_pn, below_mean,
):
    print(payment_group['adjusted_ratings'].median())

In [None]:
#2019 payments file
general_payments_df = (
    spark.read
    .option("header", True)
    .csv(BASE_DIR + 'OP_DTL_GNRL_PGYR2019_P06302021.csv')
)

# Keep only payments made to physicians, with the identifying and address columns.
_physician_columns = [
    'Physician_Profile_ID',
    'Physician_First_Name',
    'Physician_Middle_Name',
    'Physician_Last_Name',
    'Recipient_Primary_Business_Street_Address_Line1',
    'Recipient_State',
    'Recipient_City',
    'Recipient_Zip_Code',
    'Total_Amount_of_Payment_USDollars',
]
_is_physician = F.col('Covered_Recipient_Type') == 'Covered Recipient Physician'
hcp_payments_df = general_payments_df.filter(_is_physician).select(*_physician_columns)

# The CSV is read without a schema, so cast the payment column to double before summing.
_payment_as_double = F.col("Total_Amount_of_Payment_USDollars").cast('double')
hcp_payments_df = hcp_payments_df.withColumn("Total_Amount_of_Payment_USDollars", _payment_as_double)

# Total payment per physician profile.
hcp_payments_df = (
    hcp_payments_df
    .groupBy('Physician_Profile_ID')
    .agg(F.sum('Total_Amount_of_Payment_USDollars').alias("sum_payment"))
)

#mapping file
# (column name, Spark type) pairs for the header-less mapping CSV, in schema order.
_match_columns = [
    ("FirstName", StringType()),
    ("LastName", StringType()),
    ("NPI", StringType()),
    ("Physician_Profile_ID", StringType()),
    ("Score", FloatType()),
    ("NatAddr", StringType()),
    ("AddrScore", FloatType()),
    ("SupplAddr", StringType()),
    ("NatStateCity", StringType()),
    ("StateCityScore", FloatType()),
    ("SupplStateCity", StringType()),
    ("NatTaxonomy", StringType()),
    ("TaxonomyScore", FloatType()),
    ("SupplTaxonomy", StringType()),
    ("NatMiddleName", StringType()),
    ("MiddleNameScore", FloatType()),
    ("SupplMiddleName", StringType()),
]
# Every field is nullable.
ppi_npi_matches_df_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _match_columns]
)

ppi_npi_matches_df = spark.read.csv(
    BASE_DIR + 'data_processing/filtered_out/filtered_hcp_matches.csv',
    header=False,
    schema=ppi_npi_matches_df_schema,
)

# Inner-join payments with the mapping file: only physician profiles that appear
# in both the payment totals and the mapping are kept.
hcp_payments_df = hcp_payments_df.join(ppi_npi_matches_df, on='Physician_Profile_ID')



#We obtain the organization for each physician from PhysicianComparePhoneNumberFile.csv

# (header as it appears in the file, name used in this notebook). The file's
# headers carry a leading space, which the renames strip; the three name columns
# are also shortened.
_npi_renames = [
    (" NPI", "NPI"),
    (" org_pac_id", "org_pac_id"),
    (' phn_numbr', 'phn_numbr'),
    (" frst_nm", "f_name"),
    (" lst_nm", "l_name"),
    (" mid_nm", "m_name"),
    (" adr_ln_1", "adr_ln_1"),
    (" adr_ln_2", "adr_ln_2"),
    (" pri_spec", "pri_spec"),
    (" st", "st"),
    (" cty", "cty"),
    (" zip", "zip"),
]

npi_df = spark.read.option("header", True).csv(BASE_DIR + 'PhysicianComparePhoneNumberFile.csv')
for _old_name, _new_name in _npi_renames:
    npi_df = npi_df.withColumnRenamed(_old_name, _new_name)

#joining the organization with payments and physician information
hcp_cms_org_payments_df = hcp_payments_df.join(npi_df, on='NPI')
hcp_cms_org_payments_df.show(truncate=False)
hcp_cms_org_payments_df.columns

In [None]:
# Keep only the organization id and its state from the merged frame, drop rows
# missing either value, and de-duplicate. The de-duplicated result is assigned
# (not just displayed) so the later toPandas() call does not pull one row per
# physician to the driver.
hcp_cms_org_payments_df = (
    hcp_cms_org_payments_df
    .select(F.col('org_pac_id'), F.col('st'))
    .dropna()
    .dropDuplicates()
)
hcp_cms_org_payments_df.show()

In [None]:
# Preview the first 10 organizations whose normalized payment is above the mean.
above_mean_pn.iloc[:10]

In [None]:
# Organization ids of the above-mean (normalized payment) group, as a plain Python list.
org_outlier_lst = above_mean_pn.loc[:, "org_pac_id"].to_list()
print(len(org_outlier_lst))

In [None]:
# Restrict the Spark org/state frame to the outlier organization ids.
_is_outlier_org = hcp_cms_org_payments_df['org_pac_id'].isin(org_outlier_lst)
outlier_info_df = hcp_cms_org_payments_df.filter(_is_outlier_org)

# Bring the (small) filtered result to pandas and remove repeated rows.
outlier_info_pdf = outlier_info_df.toPandas()
outlier_df = outlier_info_pdf.drop_duplicates()


In [None]:
# Attach the payment/rating columns to each outlier organization's state row.
# The left side is rebuilt from outlier_info_pdf rather than from outlier_df itself,
# so re-running this cell yields the same frame instead of merging an already
# merged frame (which would produce *_x / *_y suffixed columns).
outlier_df = pd.merge(
    outlier_info_pdf.drop_duplicates(),
    above_mean_pn,
    on='org_pac_id',
    how='inner',
)
outlier_df.head(10)

In [None]:
# Top 20 organizations by normalized payment.
# - outlier_df has one row per (organization, state); keep a single row per
#   organization so mark_bar does not stack several bars for the same org.
# - head(20) is taken from the SORTED frame (previously it was taken from the
#   unsorted outlier_df, so the chart showed 20 arbitrary organizations).
outlier_plot_df = (
    outlier_df
    .drop_duplicates(subset='org_pac_id')
    .sort_values(by='payment_normalized', ascending=False)
    .head(20)
)

# Color encodes the adjusted Google rating from pink (2) to green (4.5).
scale = alt.Scale(
    domain=[2,4.5],
    range=['pink', 'green'],
    type='linear'
)

alt.Chart(outlier_plot_df, title='Highest Paid Organizations').mark_bar().encode(
    x=alt.X('payment_normalized:Q', title='Payment per Physician'),
    y=alt.Y("org_nm:N", sort='-x', title=None),
    color=alt.Color('adjusted_ratings:Q', scale=scale, title='Google Rating (adj.)'),
).configure_axis(labelLimit=400).properties(height=200)

In [None]:
# Random sample of up to 20 organizations whose normalized payment is below the MEAN.
# Capping n avoids the ValueError raised when fewer than 20 rows exist, and the
# fixed random_state keeps the figure reproducible.
below_mean_pn_50 = below_mean_pn.sample(n=min(20, len(below_mean_pn)), random_state=42)

# Color encodes the adjusted Google rating from pink (1) to green (5).
scale = alt.Scale(
    domain=[1,5],
    range=['pink', 'green'],
    type='linear'
)

# The title states "below mean": the group is split on the mean, not the median,
# so it is not the bottom 50% of organizations.
alt.Chart(below_mean_pn_50, title='Organizations by Payment (Sample Below Mean Payment)').mark_bar().encode(
    x=alt.X('payment_normalized:Q', title='Payment per Physician'),
    y=alt.Y("org_nm:N", sort='-x', title=None),
    color=alt.Color('adjusted_ratings:Q', scale=scale, title='Google Rating (adj.)'),
).configure_axis(labelLimit=400).properties(height=300)