In [65]:
BASE_DIR = '/home/thanuja/Dropbox/coursera/Milestone1/data/'

In [66]:
#pyspark intitialization

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from scipy.stats import pearsonr
from itertools import chain
import warnings
import altair as alt
import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('org queries') \
    .config("spark.driver.memory", "8g")\
    .getOrCreate() 

sc = spark.sparkContext

In [67]:
#2019 payments file
general_payments_df = spark.read.option("header",True)\
    .csv(BASE_DIR + 'OP_DTL_GNRL_PGYR2019_P06302021.csv')

physicians_suppl_df = spark.read.option("header",True)\
    .csv(BASE_DIR + 'OP_PH_PRFL_SPLMTL_P06302021.csv')


In [68]:
general_payments_df.columns

['Change_Type',
 'Covered_Recipient_Type',
 'Teaching_Hospital_CCN',
 'Teaching_Hospital_ID',
 'Teaching_Hospital_Name',
 'Physician_Profile_ID',
 'Physician_First_Name',
 'Physician_Middle_Name',
 'Physician_Last_Name',
 'Physician_Name_Suffix',
 'Recipient_Primary_Business_Street_Address_Line1',
 'Recipient_Primary_Business_Street_Address_Line2',
 'Recipient_City',
 'Recipient_State',
 'Recipient_Zip_Code',
 'Recipient_Country',
 'Recipient_Province',
 'Recipient_Postal_Code',
 'Physician_Primary_Type',
 'Physician_Specialty',
 'Physician_License_State_code1',
 'Physician_License_State_code2',
 'Physician_License_State_code3',
 'Physician_License_State_code4',
 'Physician_License_State_code5',
 'Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name',
 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID',
 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
 'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State',
 'Applicable_Manufacturer_or_Applica

In [69]:
physicians_suppl_df.columns

['Physician_Profile_ID',
 'Associated_Physician_Profile_ID_1',
 'Associated_Physician_Profile_ID_2',
 'Physician_Profile_First_Name',
 'Physician_Profile_Middle_Name',
 'Physician_Profile_Last_Name',
 'Physician_Profile_Suffix',
 'Physician_Profile_Alternate_First_Name',
 'Physician_Profile_Alternate_Middle_Name',
 'Physician_Profile_Alternate_Last_Name',
 'Physician_Profile_Alternate_Suffix',
 'Physician_Profile_Address_Line_1',
 'Physician_Profile_Address_Line_2',
 'Physician_Profile_City',
 'Physician_Profile_State',
 'Physician_Profile_Zipcode',
 'Physician_Profile_Country_Name',
 'Physician_Profile_Province_Name',
 'Physician_Profile_Primary_Specialty',
 'Physician_Profile_OPS_Taxonomy_1',
 'Physician_Profile_OPS_Taxonomy_2',
 'Physician_Profile_OPS_Taxonomy_3',
 'Physician_Profile_OPS_Taxonomy_4',
 'Physician_Profile_OPS_Taxonomy_5',
 'Physician_Profile_License_State_Code_1',
 'Physician_Profile_License_State_Code_2',
 'Physician_Profile_License_State_Code_3',
 'Physician_Profile

In [70]:
physicians_payments_2019_df = general_payments_df.where(
    F.col('Covered_Recipient_Type') == 'Covered Recipient Physician')\
    .select(F.col('Physician_Profile_ID'),
           F.col('Physician_First_Name'),
           F.col('Physician_Middle_Name'),
           F.col('Physician_Last_Name'),
           F.col('Recipient_State'),
           F.col('Recipient_City'),
           F.col('Physician_Specialty'),
           F.col('Total_Amount_of_Payment_USDollars'),
           F.col('Date_of_Payment'),
           F.col('Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name'),
           F.col('Nature_of_Payment_or_Transfer_of_Value'))



In [71]:
top200_payments_physicians=physicians_payments_2019_df.groupby("Physician_Profile_ID","Physician_First_Name","Physician_Middle_Name","Physician_Last_Name","Recipient_State","Recipient_City").agg(sum("Total_Amount_of_Payment_USDollars").alias("payments_total"),count("Physician_Profile_ID").alias("num_of_payments")).sort(desc("payments_total"))
top200_payments_physicians = top200_payments_physicians.limit(5)
top200_payments_physicians.show(truncate=False)

top200_payments_physicians_pddf = top200_payments_physicians.toPandas()
print(top200_payments_physicians_pddf["payments_total"].max())
print(top200_payments_physicians_pddf["payments_total"].min())

+--------------------+--------------------+---------------------+-------------------+---------------+--------------+-------------------+---------------+
|Physician_Profile_ID|Physician_First_Name|Physician_Middle_Name|Physician_Last_Name|Recipient_State|Recipient_City|payments_total     |num_of_payments|
+--------------------+--------------------+---------------------+-------------------+---------------+--------------+-------------------+---------------+
|258909              |ANDREW              |JEREMY               |COOPER             |FL             |CLEARWATER    |5.00217156E7       |66             |
|204133              |STEVEN              |null                 |BOLLING            |MI             |ANN ARBOR     |3.718393121000001E7|25             |
|44706               |HANSEN              |null                 |YUAN               |FL             |Naples        |3.1047764E7        |1              |
|288926              |STEPHEN             |S                    |BURKHART         

In [72]:
top200_ppi = top200_payments_physicians.select('Physician_Profile_ID').rdd.flatMap(lambda x: x).collect()
top200_lnames = top200_payments_physicians.select('Physician_Last_Name').rdd.flatMap(lambda x: x).collect()

In [73]:
top200_payments_breakdown = physicians_payments_2019_df.filter((physicians_payments_2019_df.Physician_Profile_ID).isin(top200_ppi))
top200_payments_breakdown.count()

268

In [74]:
top200_payments_breakdown_pddf = top200_payments_breakdown.toPandas()

In [42]:
top200_payments_breakdown_pddf.sample(5)

Unnamed: 0,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Recipient_State,Recipient_City,Physician_Specialty,Total_Amount_of_Payment_USDollars,Date_of_Payment,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Nature_of_Payment_or_Transfer_of_Value
201,204133,STEVEN,F,BOLLING,MI,ANN ARBOR,Allopathic & Osteopathic Physicians|Thoracic S...,173.84,08/30/2019,"W. L. Gore & Associates, Inc.",Travel and Lodging
129,288926,STEPHEN,S,BURKHART,TX,SAN ANTONIO,Allopathic & Osteopathic Physicians|Orthopaedi...,54.73,11/01/2019,"Arthrex, Inc.",Food and Beverage
173,1166415,WILLIAM,JAY,BINDER,CA,BEVERLY HILLS,Allopathic & Osteopathic Physicians|Plastic Su...,26790.0,03/13/2019,Allergan Inc.,Royalty or License
10,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,12712.5,11/13/2019,"Medical Device Business Services, Inc.",Consulting Fee
122,288926,STEPHEN,S,BURKHART,TX,SAN ANTONIO,Allopathic & Osteopathic Physicians|Orthopaedi...,28.53,11/01/2019,"Arthrex, Inc.",Food and Beverage


In [83]:
physicans = pd.DataFrame({'Physician_Last_Name':top200_lnames})
selection = alt.selection_multi(fields=['Physician_Last_Name'])
color = alt.condition(selection, alt.Color('Physician_Last_Name:N'), alt.value('lightgray'))
physicans_selector = alt.Chart(physicans).mark_rect().encode(y='Physician_Last_Name', color=color).add_selection(selection)
timeline_top200_payments_chart = alt.Chart(top200_payments_breakdown_pddf).mark_line().encode(
    x=alt.X('Date_of_Payment:T'),
    y=alt.Y("Total_Amount_of_Payment_USDollars:Q"),
    color="Physician_Last_Name:N"
).add_selection(
    selection
).transform_filter(
    selection
)
physicans_selector|timeline_top200_payments_chart

In [80]:
top200_payments_breakdown_pddf.head(10)

Unnamed: 0,Physician_Profile_ID,Physician_First_Name,Physician_Middle_Name,Physician_Last_Name,Recipient_State,Recipient_City,Physician_Specialty,Total_Amount_of_Payment_USDollars,Date_of_Payment,Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name,Nature_of_Payment_or_Transfer_of_Value
0,258909,ANDREW,,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,14.53,06/04/2019,Arthrosurface Incorporated,Food and Beverage
1,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,49921760.55,10/01/2019,"DePuy Synthes Products, Inc.",Compensation for services other than consultin...
2,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,62.86,03/07/2019,DePuy Synthes Sales Inc.,Food and Beverage
3,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,105.0,03/23/2019,DePuy Synthes Sales Inc.,Food and Beverage
4,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,67.53,01/29/2019,DePuy Synthes Sales Inc.,Food and Beverage
5,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,2250.0,09/27/2019,"Medical Device Business Services, Inc.",Consulting Fee
6,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,2250.0,10/15/2019,"Medical Device Business Services, Inc.",Consulting Fee
7,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,2700.0,02/21/2019,"Medical Device Business Services, Inc.",Consulting Fee
8,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,900.0,10/31/2019,"Medical Device Business Services, Inc.",Consulting Fee
9,258909,ANDREW,JEREMY,COOPER,FL,CLEARWATER,Allopathic & Osteopathic Physicians|Orthopaedi...,2250.0,11/14/2019,"Medical Device Business Services, Inc.",Consulting Fee


In [75]:
input_dropdown = alt.binding_select(options=['COOPER', 'BOLLING', 'YUAN', 'BURKHART', 'BINDER'], name='Physician')
selection = alt.selection_single(fields=['Physician_Last_Name'], bind=input_dropdown)

alt.Chart(top200_payments_breakdown_pddf).mark_line().encode(
    x=alt.X('Date_of_Payment:T'),
    y=alt.Y("Total_Amount_of_Payment_USDollars:Q"),
    color="Physician_Last_Name:N"
).add_selection(
    selection
).transform_filter(
    selection
)

In [60]:
print(top200_lnames)

['COOPER', 'BOLLING', 'YUAN', 'BURKHART', 'BINDER']
