In [None]:
import os

import altair as alt
import numpy as np
import pandas as pd
# Notebook-wide pandas display settings: allow up to 200 rows, and never
# truncate the text inside a cell or the list of columns.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [None]:
# Root directory of every input file read by this notebook.
# Set the MILESTONE1_DATA_DIR environment variable to point at another copy of
# the data; when it is unset the original local path is used unchanged.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# file paths are built by plain string concatenation (BASE_DIR + 'name.csv').
BASE_DIR = os.path.join(
    os.environ.get('MILESTONE1_DATA_DIR', '/home/thanuja/Dropbox/coursera/Milestone1/data/'),
    '',
)

In [None]:
# Load the votes-by-ZIP table and display it.
votes_df = pd.read_csv(
    filepath_or_buffer=BASE_DIR + 'data_processing/vote_share/votes_by_zip.csv',
)
votes_df

In [None]:
# Physician payments, 2019 general-payments file: read only the columns used below.
payment_cols = ['Physician_Profile_ID', 'Recipient_State', 'Recipient_City', 'Recipient_Zip_Code', 'Total_Amount_of_Payment_USDollars']

general_payments_df = (
    pd.read_csv(
        BASE_DIR + 'OP_DTL_GNRL_PGYR2019_P06302021.csv',
        usecols=payment_cols,
        # Read the zip column as text. If pandas infers a numeric (or mixed)
        # dtype, the .str accessor below either raises or yields NaN for the
        # numeric cells, and those rows are then silently dropped.
        dtype={'Recipient_Zip_Code': str},
        low_memory=True,
    )
    # Keep the first five characters of the zip and parse them as a number.
    # errors='coerce' turns anything unparseable into NaN (removed by dropna
    # below) instead of aborting the whole load.
    .assign(
        ZIP=lambda df: pd.to_numeric(
            df['Recipient_Zip_Code'].str.slice(stop=5), errors='coerce'
        )
    )
    # NOTE: this drops rows with a missing value in ANY column, not only ZIP
    # (e.g. rows without a city), exactly as the original cell did.
    .dropna()
    .astype({'ZIP': np.int32})
    .drop(columns=['Recipient_Zip_Code'])
    # Shorter column names, so the frame can be merged on 'State' / 'City' later.
    .rename(columns={'Recipient_City': 'City', 'Recipient_State': 'State'})
    # Upper-case city names so they match the upper-cased cost-of-living table.
    .assign(City=lambda df: df['City'].str.upper())
)
general_payments_df

In [None]:
# col = cost of living.
# Read the cost-of-living table; the code below uses its 'City', 'State' and
# 'Cost of Living Index' columns.
col_df = pd.read_csv(BASE_DIR + 'advisorsmith_cost_of_living_index.csv')
col_df = col_df.rename(columns={'Cost of Living Index': 'city_col'})
col_df['City'] = col_df['City'].str.upper()

# State-level index = mean of that state's city indices.
# The column is selected explicitly: on pandas >= 2.0 a bare .mean() also tries
# to average the text column 'City' and raises TypeError.
col_state_df = (
    col_df
    .groupby('State', as_index=False)['city_col']
    .mean()
    .rename(columns={'city_col': 'state_col'})
)

general_payments_adjusted_df = (
    general_payments_df
    # City-level index first (left join: payments without a match keep NaN) ...
    .merge(col_df, on=['State', 'City'], how='left')
    .drop(columns=['City'])
    # ... then the state-level index as a fallback.
    .merge(col_state_df, on=['State'], how='left')
    .drop(columns=['State'])
    # Prefer the city index, fall back to the state index, and use 1.0 (no
    # adjustment) when neither is available. Dividing by 100 presumably turns
    # an index with baseline 100 into a multiplier -- TODO confirm against the
    # data source.
    .assign(
        col=lambda df: df['city_col']
        .combine_first(df['state_col'])
        .div(100.0)
        .fillna(1.0)
    )
    .drop(columns=['state_col', 'city_col'])
    # Payment adjusted for cost of living.
    .assign(
        adjusted_payment=lambda df: df['Total_Amount_of_Payment_USDollars'] / df['col']
    )
)

print(general_payments_adjusted_df.dtypes)
# Spot check: the rows for ZIP 601.
ft = general_payments_adjusted_df[general_payments_adjusted_df['ZIP']==601]
ft

In [None]:
# Total cost-of-living-adjusted payment per physician within each ZIP.
payment_per_physician_by_zip_df = (
    general_payments_adjusted_df
    .groupby(['Physician_Profile_ID', 'ZIP'])['adjusted_payment']
    .sum()
    .reset_index()
)

# Per-ZIP summary: number of physicians and their mean adjusted payment.
# ZIP becomes the index of zip_payments_df.
# String aggregation names replace the callables len / np.mean, which newer
# pandas versions warn about; the results are the same.
zip_payments_df = payment_per_physician_by_zip_df.groupby('ZIP').agg(
    num_physicians=pd.NamedAgg(column='Physician_Profile_ID', aggfunc='size'),
    avg_payment=pd.NamedAgg(column='adjusted_payment', aggfunc='mean'),
)

# log10 of the average payment; averages <= 0 become -inf / NaN and therefore
# fall outside every bin used below.
zip_payments_df['lg_payments'] = np.log10(zip_payments_df['avg_payment'])

# 20 equally spaced edges on [0, 6] -> 19 bins.
bins = np.linspace(start=0, stop=6, num=20)
print('bins', bins)
histogram = pd.cut(zip_payments_df['lg_payments'], bins=bins).value_counts()
print(histogram)

# Hand-entered values, kept verbatim. NOTE(review): the two lists have
# different lengths (17 vs 16) and neither is used in this cell.
cut_offs = [2.526,2.211,2.842,3.158,1.895,3.474,1.579,3.789,1.263,4.105,4.421,4.737,5.053,0.947,5.368,5.684,0.632]
zip_counts = [2806,2779,2105,1896,1596,1263,890,885,311,125,47,26,16,7,3,2]
# TODO: `label_cut_offs` was left as an unfinished assignment (a SyntaxError
# that prevented this whole cell from running); define it here once its
# intended value is known.

# Histogram of log10(average payment). At most 5000 ZIPs are sampled; the
# guard avoids the ValueError that sample(5000) raises on a smaller frame,
# and the fixed seed makes the figure reproducible.
alt.Chart(
    zip_payments_df.sample(n=min(5000, len(zip_payments_df)), random_state=0)
).mark_bar().encode(
    alt.X("lg_payments:Q",
          bin=alt.Bin(extent=[0, 6], maxbins=20, step=0.2)),
    y='count()'
)

In [None]:
alt.data_transformers.enable(max_rows=None)

# Attach the vote table to the per-physician, per-ZIP adjusted payments.
# The original cell only copied payment_per_physician_by_zip_df, which has no
# 'DEMOCRAT' column, so the chart below could not resolve its x field.
# NOTE(review): this is an inner join, so ZIPs absent from votes_df are dropped;
# votes_df is assumed to carry 'ZIP' and 'DEMOCRAT' columns -- confirm.
vote_payments_adj_df = payment_per_physician_by_zip_df.merge(votes_df, on='ZIP')

# Adjusted payment per physician vs. Democrat vote share, on a sample of at
# most 1000 rows (guarded so a smaller frame does not make sample() raise).
vote_payments_chart = alt.Chart(
    vote_payments_adj_df.sample(n=min(1000, len(vote_payments_adj_df)))
).mark_circle(size=10).encode(
    x='DEMOCRAT',
    y=alt.Y('adjusted_payment', scale=alt.Scale(type='log'))
)
# The chart is not the last expression of the cell, so it has to be displayed
# explicitly (display is the IPython built-in available in notebooks).
display(vote_payments_chart)

# Summary statistics; previously only the mean was printed and the
# median / max / min expressions were evaluated and discarded.
print(vote_payments_adj_df['adjusted_payment'].agg(['mean', 'median', 'max', 'min']))

# Largest adjusted payments first.
vote_payments_adj_df = vote_payments_adj_df.sort_values(by ='adjusted_payment',ascending=False)
vote_payments_adj_df.head(50)

In [None]:
alt.data_transformers.enable(max_rows=None)
# Histogram of the per-ZIP average adjusted payment.
# zip_payments_df is built with the columns num_physicians, avg_payment and
# lg_payments; it has no 'adjusted_payment' column, so the x field must be
# 'avg_payment'.
alt.Chart(zip_payments_df).mark_bar().encode(
   alt.X("avg_payment:Q", bin = True),
   y = 'count()'
)

In [None]:
# Join the (unadjusted) payment rows to the vote table on ZIP; only ZIPs that
# appear in both frames are kept.
vote_payments_df = pd.merge(general_payments_df, votes_df, on='ZIP')
vote_payments_df

In [None]:
alt.data_transformers.enable(max_rows=None)
# Scatter of payment amount (log scale) against Democrat vote share, drawn from
# a 1% random sample of vote_payments_df; each point is one row of that frame.
(
    alt.Chart(vote_payments_df.sample(frac=0.01))
    .mark_circle(size=10)
    .encode(
        alt.X('DEMOCRAT'),
        alt.Y('Total_Amount_of_Payment_USDollars', scale=alt.Scale(type='log')),
    )
)

In [None]:
# Re-read the payments file, keeping only the columns needed for the
# state-level analysis.
state_cols = ['Recipient_State', 'Physician_Profile_ID', 'Total_Amount_of_Payment_USDollars']
state_payments_df = pd.read_csv(
    filepath_or_buffer=BASE_DIR + 'OP_DTL_GNRL_PGYR2019_P06302021.csv',
    low_memory=True,
    usecols=state_cols,
)
state_payments_df

In [None]:
# Total payment per physician within each state.
grouped_state_payments_df = (
    state_payments_df
    .groupby(['Recipient_State', 'Physician_Profile_ID'], as_index=False)['Total_Amount_of_Payment_USDollars']
    .sum()
)

# Average of those per-physician totals for each state.
# Only the payment column is averaged: a bare .mean() would also average
# Physician_Profile_ID, which is an identifier and yields a meaningless column.
state_average_payments_df = (
    grouped_state_payments_df
    .groupby('Recipient_State', as_index=False)['Total_Amount_of_Payment_USDollars']
    .mean()
)
state_average_payments_df

In [None]:
# Load the votes-by-state table.
state_vote_share_df = pd.read_csv(
    filepath_or_buffer=BASE_DIR + 'data_processing/vote_share/votes_by_state.csv',
)
# Join it to the per-state average physician payment on the state column;
# only states present in both frames are kept.
state_payment_votes_df = pd.merge(
    state_average_payments_df,
    state_vote_share_df,
    on='Recipient_State',
)
state_payment_votes_df

In [None]:
# Average payment per physician by state, coloured by Democrat vote share.
# Vote shares are clamped to [lower_bound, upper_bound] so the colour scale
# spans only that range.
lower_bound = 0.45
upper_bound = 0.55

scale = alt.Scale(
    type='linear',
    domain=[lower_bound, upper_bound],
    range=['darkred', 'darkblue'],
)

# Work on a copy so state_payment_votes_df keeps the unclamped shares.
clipped_df = state_payment_votes_df.assign(
    DEMOCRAT=state_payment_votes_df['DEMOCRAT'].clip(lower=lower_bound, upper=upper_bound)
)

(
    alt.Chart(clipped_df)
    .mark_bar()
    .encode(
        x=alt.X('Recipient_State:N', sort='y'),
        y='Total_Amount_of_Payment_USDollars:Q',
        color=alt.Color('DEMOCRAT', scale=scale),
    )
    .properties(width=700)
)