In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df_proc = pd.read_csv('data/procedures.csv')

In [3]:
df_phy = pd.read_csv('data/physicians.csv')

In [4]:
df_phy[['Cardiologist','Diagnostic Radiologist','Internal Medicine','Family Practice','Orthopedic Surgeon']]=pd.get_dummies(df_phy.specialty)[['Cardiology','Diagnostic Radiology','Internal Medicine','Family Practice','Orthopedic Surgery']]
df_phy.columns = ['physician_id','specialty','Cardiologist','Diagnostic Radiologist','Internal Medicine','Family Practice','Orthopedic Surgeon']

In [5]:
merged = pd.merge(df_proc, df_phy, left_on='physician_id',right_on='physician_id',how='outer')

In [6]:
len(merged.physician_id.unique())

36025

We have 36025 doctors. But so many of them just appear once/few times. Lets take a look at the top 1000.

In [7]:
merged.physician_id.value_counts().head(1000)

716      161
21467    160
15761    134
28869    131
13793    131
29737    128
16629    126
24658    119
31313    118
26070    118
11309    115
18910    114
1592     114
33658    113
1274     113
32145    113
25452    111
28285    108
8053     108
10835    107
248      106
31097    106
13601    106
3258     105
22049    104
34640    104
6591     102
16017    102
32393    102
23418    101
        ... 
3246      53
33024     53
14346     53
35531     53
6066      53
13777     53
12343     53
4304      53
31475     53
22889     52
31221     52
13346     52
12473     52
1834      52
30158     52
32006     52
11151     52
12013     52
5587      52
34522     52
33359     52
16000     52
30187     52
19808     52
7009      52
34304     52
29331     52
360       52
21142     52
5423      52
Name: physician_id, Length: 1000, dtype: int64

Here, we can see that the doctors with id 716 and 21467 have the highest number of rows(161 and 160 respectively), i.e they've performed the most diverse set of procedures. Also the top 1000 contains doctors performed from 160ish procedures upto 52. Going all the way to top 10000 gets us to upto 20ish procedures. That's around 9000 doctors who have performed 20-50 procedures. Not bad.

In [8]:
print(len(merged[merged.physician_id==716]['procedure_code'].unique()))
print(len(merged[merged.physician_id==21467]['procedure_code'].unique()))

118
160


118 and 160 unique procedures performed by each.

In [9]:
lst=[]
for i in merged.physician_id.unique():
    lst.append(len(merged[merged.physician_id==i]['procedure_code'].unique()))

In [10]:
d={}
for i,val in enumerate(lst):
    if i not in d.keys():
        d[i]=val

In [11]:
print(d[716])
print(d[21467])

118
160


We have the physician_id and their number of unique procedures in a dictionary.

Doctor => procedures

In [12]:
samp_doc_id = [1,2,3,4,5]
samp_proc_id = [212,313,414,515,616,717,818,919]

In [13]:
#for the sake of this example,lets say that we have
# 1 -> 212,313
# 2 -> 212
# 3 -> 313,414,818
# 4 -> 717
# 5 -> 515,616,919


In [14]:
import numpy as np

In [15]:
adj_dict = {
            1:{212,313},
            2:{212},
            3:{313,414,818},
            4:{717},
            5:{515,616,919}
            }

In [16]:
#        212,313,414,515,616,717,818,919
#    1    1   1   0   0    0   0  0   0
#    2
#    3           ETC
#    4
#    5

In [17]:
adj_mat = np.array([[1,1,0,0,0,0,0,0],[1,0,0,0,0,0,0,0],[0,1,1,0,0,0,1,0],[0,0,0,0,0,1,0,0],[0,0,0,1,1,0,0,1]]) 

In [18]:
adj_mat

array([[1, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 1]])

In [19]:
import networkx as nx
import nxpd
import graphviz
import matplotlib.pyplot as plt
from matplotlib import collections as mc
from collections import deque
from itertools import product
%matplotlib inline

In [20]:
G = nx.from_dict_of_lists(adj_dict)

In [21]:
top_50 = merged.physician_id.value_counts().head(50)

In [22]:
top_50.index.values

array([  716, 21467, 15761, 28869, 13793, 29737, 16629, 24658, 31313,
       26070, 11309, 18910,  1592, 33658,  1274, 32145, 25452, 28285,
        8053, 10835,   248, 31097, 13601,  3258, 22049, 34640,  6591,
       16017, 32393, 23418,  2180, 13359,  5208, 19741, 31562, 10958,
       10204,  7036, 11914,  3714,  6730,  2820, 30438, 31993, 20846,
       14628,  4968,  4269, 30033, 20868], dtype=int64)

In [23]:
d1={}
for i in top_50.index.values:
    if i not in d1.keys():
        d1[str(i)]=merged[merged.physician_id==i]['procedure_code'].unique()

In [24]:
G = nx.from_dict_of_lists(d1)

In [25]:
nx.write_gml(G, 'top_50.gml')

In [26]:
from collections import Counter

In [27]:
Counter([1,1,1,2,3])

Counter({1: 3, 2: 1, 3: 1})

In [28]:
kist=[]
for i in top_50.index.values:
    kist.append(merged[merged.physician_id==i]['specialty'].iloc[0])

In [29]:
Counter(kist)

Counter({'Cardiology': 2,
         'Diagnostic Radiology': 27,
         'Family Practice': 1,
         'Hematology/Oncology': 4,
         'Medical Oncology': 1,
         'Unknown': 15})

In [30]:
from collections import defaultdict

In [37]:
test_df = pd.DataFrame(columns=['prof','procs'])
for i in top_50.index.values:
        test_df = test_df.append({'prof': merged[merged.physician_id==i]['specialty'].iloc[0], 'procs':merged[merged.physician_id==i]['procedure_code'].unique()},ignore_index=True)    

In [38]:
test_df

Unnamed: 0,prof,procs
0,Diagnostic Radiology,"[19281, 20610, 32555, 70210, 70220, 70450, 704..."
1,Family Practice,"[20552, 20553, 20610, 51701, 51798, 70130, 702..."
2,Diagnostic Radiology,"[70450, 70486, 70491, 70551, 70553, 71010, 710..."
3,Unknown,"[49083, 70450, 70470, 70486, 70491, 70496, 704..."
4,Diagnostic Radiology,"[32555, 49083, 70450, 70470, 70480, 70486, 704..."
5,Hematology/Oncology,"[36415, 38220, 38221, 70491, 71020, 71250, 712..."
6,Unknown,"[10022, 20610, 22523, 22524, 32555, 36569, 490..."
7,Diagnostic Radiology,"[70450, 70486, 70491, 70551, 70553, 71010, 710..."
8,Diagnostic Radiology,"[10022, 70450, 70470, 70480, 70486, 70490, 704..."
9,Diagnostic Radiology,"[19083, 70450, 70470, 70486, 70551, 71010, 710..."


In [46]:
d=defaultdict()
for i in top_50.index.values:
        d[merged[merged.physician_id==i]['specialty'].iloc[0]].append(merged[merged.physician_id==i]['procedure_code'].unique())

KeyError: 'Diagnostic Radiology'

In [47]:
#what we need {'physician_id':{'specialty': {1,2,3,4,5}}}

In [49]:
d2={}
for i in top_50.index.values:
    if i not in d2.keys():
        d2[str(i)]=dict(merged[merged.physician_id==i]['specialty'].iloc[0])
        d2[str(i)][merged[merged.physician_id==i]['specialty'].iloc[0]] = merged[merged.physician_id==i]['procedure_code'].unique()

ValueError: dictionary update sequence element #0 has length 1; 2 is required