In [1]:
import pandas as pd
import scipy.stats as stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Q1

In [2]:
# Read only four columns of the NPI file: the NPI itself, the provider's
# legal last name and first name, and the first license-state code.
df1 = pd.read_csv(
    "npidata_pfile_20050523-20220109.csv",
    usecols=[
        'NPI',
        'Provider License Number State Code_1',
        'Provider Last Name (Legal Name)',
        'Provider First Name',
    ],
)

In [3]:
# Preview the first five rows of the columns loaded into df1.
df1.head(n=5)

Unnamed: 0,NPI,Provider Last Name (Legal Name),Provider First Name,Provider License Number State Code_1
0,1679576722,WIEBE,DAVID,NE
1,1588667638,PILCHER,WILLIAM,FL
2,1497758544,,,NC
3,1306849450,,,
4,1215930367,GRESSOT,LAURENT,TX


In [4]:
# Look up a single provider by NPI.
# The NPI column is compared as text so the lookup matches whether pandas
# parsed the column as integers or as strings; the previous query() compared
# the column against a quoted string, which only matches when the column's
# dtype happens to agree with the engine's coercion rules.
df1[df1['NPI'].astype(str) == '1972507325']


Unnamed: 0,NPI,Provider Last Name (Legal Name),Provider First Name,Provider License Number State Code_1
19012,1972507325,SMITH,MARY,MA


In [5]:
# All rows whose first name is MARY and whose legal last name is SMITH.
first_is_mary = df1['Provider First Name'].eq('MARY')
last_is_smith = df1['Provider Last Name (Legal Name)'].eq('SMITH')
df1.loc[first_is_mary & last_is_smith]

Unnamed: 0,NPI,Provider Last Name (Legal Name),Provider First Name,Provider License Number State Code_1
19012,1972507325,SMITH,MARY,MA
63863,1356340913,SMITH,MARY,NH
127401,1629063177,SMITH,MARY,OH
129957,1316932403,SMITH,MARY,OH
145965,1386630051,SMITH,MARY,NE
202197,1093796815,SMITH,MARY,NM
264752,1548247653,SMITH,MARY,NY
284624,1417936618,SMITH,MARY,GA
315728,1609848050,SMITH,MARY,MD
398599,1841259819,SMITH,MARY,IL


#### HI,MI,MN,MS,NY,OK,SD,TN

# Q3
### low risk/reward categories: Obstetrics & Gynecology(207V00000X) and Pediatrics(208000000X)
### high risk/reward categories: Surgery(208600000X) and Orthopaedic Surgery(207X00000X)

In [6]:
# Read only the columns used for the gender-by-specialty comparison:
# NPI, practice-location state, gender code and first taxonomy code.
df3 = pd.read_csv(
    "npidata_pfile_20050523-20220109.csv",
    usecols=[
        'NPI',
        'Provider Business Practice Location Address State Name',
        'Provider Gender Code',
        'Healthcare Provider Taxonomy Code_1',
    ],
)

In [7]:
# Preview the first five rows of the columns loaded into df3.
df3.head(n=5)

Unnamed: 0,NPI,Provider Business Practice Location Address State Name,Provider Gender Code,Healthcare Provider Taxonomy Code_1
0,1679576722,NE,M,207X00000X
1,1588667638,FL,M,207RC0000X
2,1497758544,NC,,251G00000X
3,1306849450,,,
4,1215930367,TX,M,174400000X


In [8]:
#df3.isnull().sum()

In [9]:
#df3 = df3.dropna()

In [10]:
# Two-letter codes of the eight states the analysis is restricted to.
states = [
    'HI', 'MI', 'MN', 'MS',
    'NY', 'OK', 'SD', 'TN',
]

In [11]:
# Keep only providers whose practice-location state is one of `states`.
practice_state = df3['Provider Business Practice Location Address State Name']
df3_state = df3.loc[practice_state.isin(states)]


In [12]:
# Preview the first five rows that survived the state filter.
df3_state.head(n=5)

Unnamed: 0,NPI,Provider Business Practice Location Address State Name,Provider Gender Code,Healthcare Provider Taxonomy Code_1
7,1841293990,NY,F,231H00000X
11,1487657532,OK,F,207V00000X
12,1396748448,NY,F,363A00000X
17,1841293891,TN,M,208600000X
18,1750384707,MN,M,174400000X


In [13]:
# Taxonomy codes of the four specialties compared in Q3.
categories = [
    '207V00000X',  # Obstetrics & Gynecology (low risk/reward)
    '208000000X',  # Pediatrics (low risk/reward)
    '208600000X',  # Surgery (high risk/reward)
    '207X00000X',  # Orthopaedic Surgery (high risk/reward)
]

In [14]:
# Keep only providers whose first taxonomy code is one of the four `categories`.
primary_taxonomy = df3_state['Healthcare Provider Taxonomy Code_1']
df3_risk = df3_state.loc[primary_taxonomy.isin(categories)]

In [15]:
# Preview the first five rows that survived the specialty filter.
df3_risk.head(n=5)

Unnamed: 0,NPI,Provider Business Practice Location Address State Name,Provider Gender Code,Healthcare Provider Taxonomy Code_1
11,1487657532,OK,F,207V00000X
17,1841293891,TN,M,208600000X
83,1922001957,TN,M,207V00000X
108,1700889730,TN,M,207X00000X
135,1760485817,TN,M,207V00000X


In [16]:
# Boolean masks over df3_risk, one per cell of the gender x risk-tier table.
gender_code = df3_risk['Provider Gender Code']
taxonomy_code = df3_risk['Healthcare Provider Taxonomy Code_1']

# High risk/reward tier: Surgery and Orthopaedic Surgery.
in_high_tier = taxonomy_code.isin(['208600000X', '207X00000X'])
# Low risk/reward tier: Obstetrics & Gynecology and Pediatrics.
in_low_tier = taxonomy_code.isin(['207V00000X', '208000000X'])

is_female = gender_code.eq('F')
is_male = gender_code.eq('M')

df3_FH = is_female & in_high_tier  # female, high risk
df3_FL = is_female & in_low_tier   # female, low risk
df3_MH = is_male & in_high_tier    # male, high risk
df3_ML = is_male & in_low_tier     # male, low risk



In [17]:
# Display the male / low-risk boolean mask (one True/False per df3_risk row).
df3_ML

11         False
17         False
83          True
108        False
135         True
171        False
225        False
282        False
414        False
444         True
552         True
617        False
619         True
627        False
644         True
688        False
704        False
777         True
906         True
1055        True
1078       False
1098        True
1180       False
1299       False
1325       False
1331        True
1375        True
1440       False
1610       False
1759       False
           ...  
7084273    False
7085516    False
7085829    False
7091169    False
7093112    False
7093183    False
7099739    False
7100842    False
7101429    False
7105902    False
7106697     True
7111529    False
7121587    False
7123028    False
7124802    False
7124965    False
7126173    False
7129542    False
7130060    False
7130815    False
7131413    False
7134804    False
7136841    False
7138220    False
7138567    False
7138727    False
7140122     True
7140270    Fal

In [18]:
# 2x2 contingency table of provider counts: rows are gender, columns are
# risk tier. Each count is the number of True values in the matching mask.
gr = np.array([
    [df3_FH.sum(), df3_FL.sum()],
    [df3_MH.sum(), df3_ML.sum()],
])
df3_f = pd.DataFrame(
    gr,
    index=['Females', 'Males'],
    columns=['High Risk', 'Low Risk'],
)
df3_f

Unnamed: 0,High Risk,Low Risk
Females,1624,11839
Males,8247,7192


We use this table to find the p-value.

In [19]:
# Two-sided Fisher's exact test on the 2x2 gender-by-risk table.
# The first returned value is the odds ratio, the second the p-value.
oddscratio, pvalue = stats.fisher_exact(table=df3_f, alternative='two-sided')

In [20]:
# Show the p-value returned by Fisher's exact test.
pvalue

0.0

Fisher's exact test yielded a p-value that is reported as 0.0, i.e. a value too small to be represented in floating point. Because this p-value is far below the conventional significance level α = 0.05, we reject the null hypothesis that gender is independent of specialty risk category. Therefore, for these data, there is sufficient evidence that a doctor's gender is associated with their choice of specialty. From the contingency table above (table 3) we can see that a much greater proportion of male doctors (8247 of 15439, about 53%) than of female doctors (1624 of 13463, about 12%) practise in the high-risk, high-reward specialties. This supports our original hypothesis that male doctors are more likely than their female peers to choose practices that are associated with higher risk for a higher reward.



# Q4

The U.S population by state data source: https://worldpopulationreview.com/states


In [49]:
# Load the state population table (source linked above) and preview it.
df4_pop = pd.read_csv(filepath_or_buffer="Pop by States.csv")
df4_pop.head(n=5)

Unnamed: 0,States,State,Pop
0,CA,California,39.664128
1,TX,Texas,30.097526
2,FL,Florida,22.177997
3,NY,New York,19.223191
4,PA,Pennsylvania,12.80519


In [50]:
# Read only the columns used for the per-state count: NPI, practice-location
# state, entity type code and first taxonomy code; then preview them.
df4 = pd.read_csv(
    "npidata_pfile_20050523-20220109.csv",
    usecols=[
        'NPI',
        'Provider Business Practice Location Address State Name',
        'Entity Type Code',
        'Healthcare Provider Taxonomy Code_1',
    ],
)
df4.head(n=5)

Unnamed: 0,NPI,Entity Type Code,Provider Business Practice Location Address State Name,Healthcare Provider Taxonomy Code_1
0,1679576722,1.0,NE,207X00000X
1,1588667638,1.0,FL,207RC0000X
2,1497758544,2.0,NC,251G00000X
3,1306849450,,,
4,1215930367,1.0,TX,174400000X


In [51]:
# Records with Entity Type Code 2 whose first taxonomy code is 261QM1200X
# (presumably the MRI taxonomy, going by the variable name; verify).
is_entity_type_2 = df4['Entity Type Code'].eq(2)
has_mri_taxonomy = df4['Healthcare Provider Taxonomy Code_1'].eq('261QM1200X')
df4_mri = df4.loc[is_entity_type_2 & has_mri_taxonomy]
df4_mri.head(n=5)

Unnamed: 0,NPI,Entity Type Code,Provider Business Practice Location Address State Name,Healthcare Provider Taxonomy Code_1
86,1659374684,2.0,OH,261QM1200X
137,1295738243,2.0,GA,261QM1200X
155,1275536229,2.0,TX,261QM1200X
176,1881697837,2.0,WI,261QM1200X
196,1922001965,2.0,TX,261QM1200X


In [52]:
# Number of matching records per practice-location state, as a two-column
# frame: 'States' (state code) and 'counts', most frequent state first.
df4_count = (
    df4_mri['Provider Business Practice Location Address State Name']
    .value_counts()
    .rename_axis('States')
    .reset_index(name='counts')
)
df4_count

Unnamed: 0,States,counts
0,FL,300
1,TX,254
2,CA,121
3,IL,91
4,NJ,65
5,NY,47
6,MI,47
7,MA,44
8,OH,43
9,GA,31


Density per state = number of matching records in the state (`counts`) / state population (`Pop`).

In [53]:
# Save the per-state counts. index=True is the default, so the row index is
# written as the first, unnamed column of the CSV.
df4_count.to_csv("n_MRI.csv", index=True)

In [56]:
# Build the density table in this cell so it is defined before it is shown.
# Previously this cell only displayed `df_density`, which is created by the
# merge and density cells further down (In [54], In [55]); on a fresh kernel
# run top to bottom that raised NameError.
# Join the per-state counts to the population table on the state code, then
# divide counts by population to get the density column.
df_density = (
    df4_count
    .merge(df4_pop, on='States')
    .assign(density=lambda merged: merged['counts'].div(merged['Pop']))
)
df_density

Unnamed: 0,States,counts,State,Pop,density
0,FL,300,Florida,22.177997,13.52692
1,TX,254,Texas,30.097526,8.439232
2,CA,121,California,39.664128,3.050615
3,IL,91,Illinois,12.518071,7.269491
4,NJ,65,New Jersey,8.870685,7.327506
5,NY,47,New York,19.223191,2.444963
6,MI,47,Michigan,9.995212,4.702251
7,MA,44,Massachusetts,6.922107,6.356446
8,OH,43,Ohio,11.727377,3.666634
9,GA,31,Georgia,10.936299,2.834597


In [54]:
# Attach each state's population to its count. This is an inner join (the
# default), so a state code missing from either table is dropped.
df_density = pd.merge(df4_count, df4_pop, how='inner', on='States')

In [55]:
# Density = count divided by population, row by row.
# NOTE(review): Pop looks like it is in millions (e.g. 39.66 for CA), which
# would make this a count per million residents; confirm the unit.
df_density['density'] = df_density['counts'] / df_density['Pop']

In [57]:
# Save the density table. index=True is the default, so the row index is
# written as the first, unnamed column of the CSV.
df_density.to_csv("density_MRI.csv", index=True)