In [2]:
import json
import os
import pickle
from collections import defaultdict
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from Bio import Entrez
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

Example of a location-based "Find a Cancer Doctor" search URL on cancer.net (searching around 1233 York Ave, New York, NY 10065):

https://www.cancer.net/find-cancer-doctor?Distance=3&Gender=All&Languages=All&longitude=-73.956633&latitude=40.7634567&user_latitude=40.763275199999995&user_longitude=-73.9567522&user_address=1233+York+Ave%2C+New+York%2C+NY+10065%2C+USA&searchString=&search_term=1233+York+Ave%2C+New+York%2C+NY+10065%2C+USA&search_type=location&url=

### Find cancer centers

In [126]:
# cancer.net page that lists the NCI-designated cancer centers
search_url = (
    'https://www.cancer.net/navigating-cancer-care/cancer-basics/'
    'cancer-care-team/find-nci-designated-cancer-center'
)

In [129]:
# Download the cancer-center listing page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(search_url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')
# Get the list of cancer centers: every <h4 class="field-content"> element
centers = soup.find_all('h4', {'class': 'field-content'})

In [132]:
# Number of center headings matched on the listing page
len(centers)

65

In [138]:
# Plain-text name of every scraped center heading, in page order
center_list = [heading.get_text() for heading in centers]

In [136]:
# <div> wrappers that hold each center's website link
urls = soup.find_all(
    'div',
    attrs={'class': 'views-field views-field-field-cps-url'},
)

In [150]:
# href of every <a> found inside the link wrappers, in page order
url_list = [
    anchor.get('href')
    for link_block in urls
    for anchor in link_block.find_all('a')
]

In [157]:
# Pair each center name with its link, order alphabetically by name,
# and save the table to the current working directory.
center_df = (
    pd.DataFrame({'center_name': center_list, 'center_link': url_list})
    .sort_values(by=['center_name'])
)
center_df.to_csv('cancernet_center.csv', index=False)

In [154]:
# Sanity check: number of rows in the assembled center table
print(len(center_df))

65


In [30]:
# Project root folder. Set the ONCOMATCH_DIR environment variable to run the
# notebook on another machine; the default is the original hard-coded location.
base_dir = os.environ.get('ONCOMATCH_DIR', '/Volumes/Yuchen_Drive/Insight/OncoMatch')
# NOTE(review): this replaces the `center_df` scraped above (saved as
# 'cancernet_center.csv') with the contents of a different file,
# 'data/nci_center.csv' - confirm which table is the intended input.
center_df = pd.read_csv(os.path.join(base_dir, 'data/nci_center.csv'))

### Combined function

In [34]:
def _first_text(node, tag, css_class):
    """Return the text of the first `tag` under `node` with class `css_class`, or None if absent."""
    match = node.find(tag, {'class': css_class})
    return None if match is None else match.get_text()


def _paragraph_texts(node, css_class):
    """Return the text of every <p> inside the <div class=css_class> elements under `node`."""
    return [
        paragraph.get_text()
        for container in node.find_all('div', {'class': css_class})
        for paragraph in container.find_all('p')
    ]


def _parse_doctor(doc):
    """Convert one search-result <li> into a flat record, using None/[] for missing fields."""
    full_name = _first_text(doc, 'a', 'fao-result-name-link')
    if full_name is None:
        name, degree = None, None
    else:
        # Split on the first comma only: "<name>,<degrees>". When there is no
        # comma the whole text is the name and the degree is empty (slicing at
        # find() == -1 would drop the last character of the name).
        name, _, degree = full_name.partition(',')
    return {
        'name': name,
        'degree': degree,
        'phone': _first_text(doc, 'p', 'fao-result-phone'),
        'center_name2': _first_text(doc, 'p', 'fao-result-address-name'),
        # NOTE(review): street lines are joined without a separator, as before
        # (e.g. "3400 Civic Center Blvd3rd Fl, ..."); consider joining with ', '.
        'address': ''.join(
            street.get_text()
            for street in doc.find_all('p', {'class': 'fao-result-address-street'})
        ),
        'city_state': _first_text(doc, 'p', 'fao-result-address-city_state'),
        'speciality': _paragraph_texts(doc, 'fao-result-specialties'),
        'certificate': _paragraph_texts(doc, 'fao-result-certs'),
    }


def extract_oncologist_info(center_name):
    """Scrape the cancer.net "find-cancer-doctor" name search for one center.

    The center name is shortened (the " Comprehensive Cancer Center" /
    " Cancer Center" suffix is removed, and any name containing "Dana-Farber"
    is reduced to "Dana-Farber"), URL-encoded and used as the search term.

    Parameters
    ----------
    center_name : str
        Center name as listed in the center table.

    Returns
    -------
    pandas.DataFrame or None
        One row per search result with the columns name, degree, phone,
        center_name2, address, city_state, speciality (list), certificate
        (list) and center_name (the `center_name` argument). Fields missing
        from a result are None (or an empty list), so rows never shift
        between doctors. Returns None when the request fails with an HTTP
        error status or when the page contains exactly one result item.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the request times out.
    """
    base_url = 'https://www.cancer.net/find-cancer-doctor?Distance=3&Gender=All&Languages=All&longitude=&latitude=&user_latitude=&user_longitude=&user_address=&searchString={}&search_term={}&search_type=name&url='
    search_term = center_name.replace(' Comprehensive Cancer Center', "")
    search_term = search_term.replace(' Cancer Center', "")
    if 'Dana-Farber' in search_term:
        search_term = 'Dana-Farber'
    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&', '/' or '#' that would otherwise corrupt the query string.
    search_term = quote_plus(search_term.strip())
    search_url = base_url.format(search_term, search_term)

    # Get the html of the search results page; the timeout prevents one
    # stalled request from blocking a whole multi-center scrape.
    response = requests.get(search_url, timeout=30)
    if not response.ok:
        print('Request for {} failed with HTTP status {}'.format(center_name, response.status_code))
        return None
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the list of doctors
    doctors = soup.find_all('li', {'class': 'fao-result'})
    # NOTE(review): a single result item is treated as "nothing found" -
    # presumably the empty-result page renders one placeholder item; confirm,
    # because a center with exactly one real doctor would be dropped here.
    if len(doctors) == 1:
        print('No doctors have been found for {}'.format(center_name))
        return None
    if len(doctors) > 1:
        print('{} doctors have been found for {}'.format(len(doctors), center_name))

    # Build one record per doctor so every column has the same length.
    columns = ['name', 'degree', 'phone', 'center_name2', 'address',
               'city_state', 'speciality', 'certificate']
    data_df = pd.DataFrame([_parse_doctor(doc) for doc in doctors], columns=columns)
    data_df['center_name'] = center_name
    return data_df

In [35]:
# Try the scraper on the first center in the table
center_name = center_df['center_name'].iloc[0]
data_df = extract_oncologist_info(center_name)

13 doctors have been found for Abramson Cancer Center 


In [36]:
# Try the scraper on the center at position 30 of the table
center_name = center_df['center_name'].iloc[30]
data_df = extract_oncologist_info(center_name)

176 doctors have been found for Memorial Sloan-Kettering Cancer Center


In [37]:
def merge_oncologist_info(center_list):
    """Scrape every center in `center_list` and stack the results.

    Parameters
    ----------
    center_list : iterable of str
        Center names, each passed to `extract_oncologist_info`.

    Returns
    -------
    pandas.DataFrame
        The per-center frames concatenated in input order. Each frame keeps
        its own row index, so index values repeat across centers. Centers for
        which `extract_oncologist_info` returns None are skipped; if nothing
        is collected at all, an empty DataFrame is returned.
    """
    data_list = []
    for center_name in center_list:
        data_df = extract_oncologist_info(center_name)
        if data_df is not None:
            data_list.append(data_df)
    if not data_list:
        # pd.concat raises ValueError when given an empty list
        return pd.DataFrame()
    return pd.concat(data_list)

In [38]:
# Scrape every center in the table plus three institutions added by hand.
# NOTE(review): this reads 'nci_center.csv' directly under base_dir, while the
# earlier load used 'data/nci_center.csv' - confirm which path is intended.
center_df = pd.read_csv(os.path.join(base_dir, 'nci_center.csv'))
extra_centers = ['Johns Hopkins', 'Cleveland Clinic', 'Massachusetts General Hospital']
center_list = center_df['center_name'].tolist() + extra_centers
data_df = merge_oncologist_info(center_list)

13 doctors have been found for Abramson Cancer Center 
18 doctors have been found for Albert Einstein Cancer Center
No doctors have been found for Alvin J. Siteman Cancer Center
50 doctors have been found for Arizona Cancer Center
16 doctors have been found for Case Comprehensive Cancer Center
2 doctors have been found for Chao Family Comprehensive Cancer Center
57 doctors have been found for City of Hope Comprehensive Cancer Center
No doctors have been found for Cold Spring Harbor Laboratory Cancer Center
No doctors have been found for Dan L Duncan Comprehensive Cancer Center
124 doctors have been found for Dana-Farber/Harvard Cancer Center
No doctors have been found for David H. Koch Institute for Integrative Cancer Research at MIT
12 doctors have been found for Duke Cancer Institute
30 doctors have been found for Fox Chase Cancer Center
No doctors have been found for Fred Hutchinson/University of Washington Cancer Consortium
No doctors have been found for Fred and Pamela Buffett Can

In [39]:
# Preview the first rows of the merged scrape results
data_df.head()

Unnamed: 0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name
0,John H. Glick,"MD, FASCO",(215) 662-6334,University of Pennsylvania-Abramson Cancer Center,"3400 Civic Center Blvd3rd Fl, Ste 3-300S","Philadelphia, PA 19104-5127, US","[Breast Cancer, Cancer Prevention]","[Internal Medicine, Medical Oncology]",Abramson Cancer Center
1,Arthur M. Feldman,MD,(215) 662-9801,University of Pennsylvania-Abramson Cancer Center,Penn Presbyterian Medcl Ctr51 N 39th St MAB St...,"Philadelphia, PA 19104, US","[Breast Cancer, Geriatrics Oncology]","[Internal Medicine, Medical Oncology]",Abramson Cancer Center
2,David M. Mintzer,MD,Search for Phone Number,Abramson Cancer Center at Pennsylvania Hospital,230 W Washington Sq Fl 2,"Philadelphia, PA 19106-3500, US","[Breast Cancer, Lung Cancer, Palliative Care/E...","[Hematology, Hospice and Palliative Medicine, ...",Abramson Cancer Center
3,David J. Vaughn,MD,(215) 349-8140,University of Pennsylvania-Abramson Cancer Center,PCAM South 10-1143400 Civic Center Blvd,"Philadelphia, PA 19104, US","[Bladder Cancer, Prostate Cancer, Testicular C...",[Medical Oncology],Abramson Cancer Center
4,Charles John Schneider,"MD, FACP",Search for Phone Number,"Hospital of the University of Pennsylvania, Ab...",Pereleman Center for Advanced Medicine3400 Civ...,"Philadelphia, PA 19104, US","[Clinical Research, Developmental Therapeutics...",[Medical Oncology],Abramson Cancer Center


In [40]:
# (rows, columns) of the merged scrape results, before de-duplication
data_df.shape

(1501, 9)

In [42]:
# Count distinct doctor names (the same name can appear under several centers)
n_unique_names = data_df['name'].nunique()
print('{} oncologists have been found in cancernet'.format(n_unique_names))

1470 oncologists have been found in cancernet


In [43]:
# Example of a name returned by more than one center search
data_df[data_df['name'] == 'William Wong']

Unnamed: 0,name,degree,phone,center_name2,address,city_state,speciality,certificate,center_name
24,William Wong,MD,Search for Phone Number,MAYO CLINIC Arizona,13400 E Shea Blvd,"Scottsdale, AZ 85259-5404, US","[Breast Cancer, Clinical Research]","[Internal Medicine, Pharmacy, Radiation Oncology]",Arizona Cancer Center
85,William Wong,MD,Search for Phone Number,MAYO CLINIC Arizona,13400 E Shea Blvd,"Scottsdale, AZ 85259-5404, US","[Breast Cancer, Clinical Research]","[Internal Medicine, Pharmacy, Radiation Oncology]",Mayo Clinic Cancer Center


In [45]:
# Keep only the first row for each doctor name. `~` is the boolean-negation
# operator for masks; unary minus on booleans is rejected by NumPy.
# NOTE(review): de-duplicating on name alone would also merge two different
# doctors who share a name - confirm that is acceptable.
data_df = data_df[~data_df.name.duplicated()]
print(data_df.shape)

(1470, 9)


In [52]:
# Save the de-duplicated table; the `with` block closes (and flushes) the file
# handle, which a bare open() inside the call never does explicitly.
with open(os.path.join(base_dir, "data/Oncologist_info_1470.pkl"), "wb") as pkl_file:
    pickle.dump(data_df, pkl_file)

In [53]:
# Reload the saved table. pickle.load can execute arbitrary code, so only use
# it on files produced by this project. The `with` block closes the handle.
with open(os.path.join(base_dir, "data/Oncologist_info_1470.pkl"), "rb") as pkl_file:
    data_df = pickle.load(pkl_file)

In [55]:
# Export the table as CSV without the row index. The list-valued columns
# (speciality, certificate) are written as their text representation.
data_df.to_csv(os.path.join(base_dir, "data/Oncologist_info_1470.csv"), index=False)

In [14]:
# Reload the CSV with the doctor name as the index. speciality/certificate
# come back as strings such as "['Breast Cancer', ...]", not lists.
# NOTE(review): the preview below shows an `article_num` column that the export
# cell above does not write - presumably the file was updated elsewhere; confirm.
data_df=pd.read_csv(os.path.join(base_dir, "data/Oncologist_info_1470.csv"),index_col='name')

In [15]:
# (rows, columns) of the reloaded table; `name` is the index, not a column
data_df.shape

(1470, 9)

In [16]:
# Preview the reloaded table
data_df.head()

Unnamed: 0_level_0,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
John H. Glick,"MD, FASCO",(215) 662-6334,University of Pennsylvania-Abramson Cancer Center,"3400 Civic Center Blvd3rd Fl, Ste 3-300S","Philadelphia, PA 19104-5127, US","['Breast Cancer', 'Cancer Prevention']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,7
Arthur M. Feldman,MD,(215) 662-9801,University of Pennsylvania-Abramson Cancer Center,Penn Presbyterian Medcl Ctr51 N 39th St MAB St...,"Philadelphia, PA 19104, US","['Breast Cancer', 'Geriatrics Oncology']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,0
David M. Mintzer,MD,Search for Phone Number,Abramson Cancer Center at Pennsylvania Hospital,230 W Washington Sq Fl 2,"Philadelphia, PA 19106-3500, US","['Breast Cancer', 'Lung Cancer', 'Palliative C...","['Hematology', 'Hospice and Palliative Medicin...",Abramson Cancer Center,13
David J. Vaughn,MD,(215) 349-8140,University of Pennsylvania-Abramson Cancer Center,PCAM South 10-1143400 Civic Center Blvd,"Philadelphia, PA 19104, US","['Bladder Cancer', 'Prostate Cancer', 'Testicu...",['Medical Oncology'],Abramson Cancer Center,86
Charles John Schneider,"MD, FACP",Search for Phone Number,"Hospital of the University of Pennsylvania, Ab...",Pereleman Center for Advanced Medicine3400 Civ...,"Philadelphia, PA 19104, US","['Clinical Research', 'Developmental Therapeut...",['Medical Oncology'],Abramson Cancer Center,0


In [12]:
# Load the doctor -> PMID table.
# The previous `converters={"Condition": 'pmid'}` argument was dropped:
# converter values must be callables, so the string was either ignored (no
# "Condition" column in the file) or would raise TypeError (column present).
# NOTE(review): if a column should be parsed on load (e.g. a list stored as
# text), pass a callable such as converters={'pmid': ast.literal_eval}.
doctor2pmid_1153_dict_df = pd.read_csv(os.path.join(base_dir, 'data/doctor2pmid_1153_dict_df.csv'))

In [18]:
# Keep only the doctors that appear in the doctor -> PMID table, in that
# table's order (data_df is indexed by doctor name).
matched_names = doctor2pmid_1153_dict_df['name'].to_numpy()
data_df_1153 = data_df.loc[matched_names]

In [19]:
# Preview the subset of doctors selected via the doctor -> PMID table
data_df_1153.head()

Unnamed: 0_level_0,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
John H. Glick,"MD, FASCO",(215) 662-6334,University of Pennsylvania-Abramson Cancer Center,"3400 Civic Center Blvd3rd Fl, Ste 3-300S","Philadelphia, PA 19104-5127, US","['Breast Cancer', 'Cancer Prevention']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,7
David M. Mintzer,MD,Search for Phone Number,Abramson Cancer Center at Pennsylvania Hospital,230 W Washington Sq Fl 2,"Philadelphia, PA 19106-3500, US","['Breast Cancer', 'Lung Cancer', 'Palliative C...","['Hematology', 'Hospice and Palliative Medicin...",Abramson Cancer Center,13
David J. Vaughn,MD,(215) 349-8140,University of Pennsylvania-Abramson Cancer Center,PCAM South 10-1143400 Civic Center Blvd,"Philadelphia, PA 19104, US","['Bladder Cancer', 'Prostate Cancer', 'Testicu...",['Medical Oncology'],Abramson Cancer Center,86
Marcia S. Brose,"MD, PhD, FASCO",(215) 615-0741,University of Pennsylvania-Abramson Cancer Center,5 Silverstein3400 Spruce St,"Philadelphia, PA 19104, US","['Head and Neck Cancer', 'Clinical Research', ...","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,50
Robert G. Maki,"MD, PhD, FACP, FASCO",800-789-7366,Abramson Cancer Center,Perelman Center for Advanced Medicine3400 Civi...,"Philadelphia, Pennsylvania 19104","['Bone Cancer', 'Sarcoma', 'Clinical Trials/Bi...","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,162


In [24]:
# Number of selected oncologists per center, as a two-column table.
# rename_axis + reset_index(name=...) name both columns explicitly, so the
# result does not depend on how value_counts() labels its index and values
# (that labelling changed in pandas 2.0 and made the old rename mislabel them).
(
    data_df_1153['center_name']
    .value_counts()
    .rename_axis('center_name')
    .reset_index(name='counts')
)

Unnamed: 0,center_name,counts
0,The University of Texas MD Anderson Cancer Center,179
1,Memorial Sloan-Kettering Cancer Center,141
2,Dana-Farber/Harvard Cancer Center,104
3,Mayo Clinic Cancer Center,70
4,Massachusetts General Hospital,49
5,City of Hope Comprehensive Cancer Center,43
6,Cleveland Clinic,42
7,Yale Cancer Center,40
8,Johns Hopkins,34
9,The University of Chicago Comprehensive Cancer...,33


In [26]:
# Export the selected doctors; the index (doctor name) is written as the
# first column under the header 'name'.
data_df_1153.to_csv(os.path.join(base_dir, "data/Oncologist_info_1153.csv"),index=True,index_label='name')

In [27]:
# Read the exported file back, again indexed by doctor name, to check the round trip
onco_df=pd.read_csv(os.path.join(base_dir, "data/Oncologist_info_1153.csv"), index_col='name')

In [28]:
# (rows, columns) of the table read back from disk
onco_df.shape

(1153, 9)

In [29]:
# Preview the table read back from disk
onco_df.head()

Unnamed: 0_level_0,degree,phone,center_name2,address,city_state,speciality,certificate,center_name,article_num
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
John H. Glick,"MD, FASCO",(215) 662-6334,University of Pennsylvania-Abramson Cancer Center,"3400 Civic Center Blvd3rd Fl, Ste 3-300S","Philadelphia, PA 19104-5127, US","['Breast Cancer', 'Cancer Prevention']","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,7
David M. Mintzer,MD,Search for Phone Number,Abramson Cancer Center at Pennsylvania Hospital,230 W Washington Sq Fl 2,"Philadelphia, PA 19106-3500, US","['Breast Cancer', 'Lung Cancer', 'Palliative C...","['Hematology', 'Hospice and Palliative Medicin...",Abramson Cancer Center,13
David J. Vaughn,MD,(215) 349-8140,University of Pennsylvania-Abramson Cancer Center,PCAM South 10-1143400 Civic Center Blvd,"Philadelphia, PA 19104, US","['Bladder Cancer', 'Prostate Cancer', 'Testicu...",['Medical Oncology'],Abramson Cancer Center,86
Marcia S. Brose,"MD, PhD, FASCO",(215) 615-0741,University of Pennsylvania-Abramson Cancer Center,5 Silverstein3400 Spruce St,"Philadelphia, PA 19104, US","['Head and Neck Cancer', 'Clinical Research', ...","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,50
Robert G. Maki,"MD, PhD, FACP, FASCO",800-789-7366,Abramson Cancer Center,Perelman Center for Advanced Medicine3400 Civi...,"Philadelphia, Pennsylvania 19104","['Bone Cancer', 'Sarcoma', 'Clinical Trials/Bi...","['Internal Medicine', 'Medical Oncology']",Abramson Cancer Center,162
