In [None]:
import time as time
import pandas as pd
import random
import os

from selenium import webdriver 

In [None]:
# CPSO "Find a Doctor" search page; the scraping loop reloads it for every postal code.
url = 'https://www.cpso.on.ca/Public-Information-Services/Find-a-Doctor?search=general'

# Load the list of all 3-letter postal codes (FSAs) in Ontario.
# Only the first column of this CSV is read by the scraping loop.
postalCodes = pd.read_csv('./res/ON Postal Code.csv')

In [None]:
# Gets the profile url for future use
def getProfile(element):
    """Return the ``href`` attribute of the ``<a>`` tag located inside ``element``.

    ``element`` must offer ``find_element_by_tag_name`` (a Selenium element
    in this notebook). Whatever that lookup raises is passed on unchanged.
    """
    anchor = element.find_element_by_tag_name('a')
    return anchor.get_attribute('href')


# Gets the name from the page
def getName(element):
    """Return the text of the ``<h3>`` tag located inside ``element``.

    ``element`` must offer ``find_element_by_tag_name`` (a Selenium element
    in this notebook). Whatever that lookup raises is passed on unchanged.
    """
    heading = element.find_element_by_tag_name('h3')
    return heading.text


# Gets the location info
def getLocation(element):
    """Return the text of the ``<p>`` tag located inside ``element``.

    ``element`` must offer ``find_element_by_tag_name`` (a Selenium element
    in this notebook). Whatever that lookup raises is passed on unchanged.
    """
    paragraph = element.find_element_by_tag_name('p')
    return paragraph.text


# Gets the specialty if it is listed, else makes it blank
def getSpecialty(element):
    """Return the text of the ``<p>`` inside the last ``<div>`` of ``element``.

    The lookup is best-effort: if there is no ``<div>``, or the last one has
    no ``<p>``, an empty string is returned instead of raising.

    Parameters:
        element: object offering ``find_elements_by_tag_name`` (a Selenium
            element in this notebook).

    Returns:
        The specialty text, or ``""`` when it cannot be read.
    """
    try:
        lastDiv = element.find_elements_by_tag_name('div')[-1]
        return lastDiv.find_element_by_tag_name('p').text
    except Exception:
        # Covers the IndexError from an empty <div> list as well as the
        # driver's own lookup errors. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt / SystemExit propagate so a long scrape can
        # still be stopped by the user.
        return ""


# Append the results to the file
def appendToFile(filePath, df):
    """Append ``df`` to the CSV at ``filePath``, writing the header row only once.

    The header is written when the file does not exist yet or exists but is
    empty; otherwise only the data rows are appended. The parent directory is
    created if it is missing, so the first write does not fail.

    Parameters:
        filePath: path of the CSV file to append to.
        df: pandas DataFrame whose rows are appended (index is not written).
    """
    parentDir = os.path.dirname(filePath)
    if parentDir:
        os.makedirs(parentDir, exist_ok=True)

    # A zero-byte file has no header yet, so it is treated like a new file.
    needsHeader = not os.path.isfile(filePath) or os.path.getsize(filePath) == 0

    df.to_csv(filePath, mode='a', header=needsHeader, index=False)
    
# Gets all the elements that contain the data
def getScrapePageInfo():
    """Scrape every ``<article>`` on the page currently shown by ``browser``.

    Returns:
        A pandas DataFrame with one row per ``<article>`` and the columns
        ``Name``, ``Link``, ``Location`` and ``Specialty``, in that order.
    """
    articleList = browser.find_elements_by_tag_name('article')
    profileList = [getProfile(article) for article in articleList]
    nameList = [getName(article) for article in articleList]
    locationList = [getLocation(article) for article in articleList]
    specialtyList = [getSpecialty(article) for article in articleList]

    # Save the data in a DataFrame. The columns are given as an ordered list:
    # a set has no defined order, so chunks appended without a header could
    # end up under the wrong column, and recent pandas releases reject a set
    # for ``columns`` outright.
    return pd.DataFrame(
        {
            'Name': nameList,
            'Link': profileList,
            'Location': locationList,
            'Specialty': specialtyList,
        },
        columns=['Name', 'Link', 'Location', 'Specialty'],
    )
    
    
# Loop through the pages after search and scrape info
def scrapePages(filePath='./csv/ON Physician.csv'):
    """Scrape every result page of the current search and append it to a CSV.

    Reads the page count from the result-count row, then for each page number
    scrapes the visible page, appends it to ``filePath`` and clicks a link in
    the pager to move on.

    Parameters:
        filePath: CSV file the scraped rows are appended to. Defaults to the
            path this notebook has always used.

    Raises:
        Whatever the page-count lookup or ``int()`` raises when the count
        cannot be found or parsed (the caller treats this as "no results").
    """
    # Gets the page limit: last space-separated token of the count text,
    # with its final character dropped.
    countRow = browser.find_element_by_css_selector('.row.doctor-search-count')
    pageLimit = countRow.find_element_by_css_selector('.medium-4.columns.text-align--right').text
    pageLimit = int(pageLimit.split(' ')[-1][:-1])

    # Scroll through the pages
    for i in range(1, pageLimit + 1):
        # Scrape the page currently displayed and save it straight away
        appendToFile(filePath, getScrapePageInfo())

        # NOTE(review): the link clicked after iteration i is the one labelled
        # i (or 'Next 5' for i = 6, 11, ...), not i + 1. Confirm against the
        # live pager that page 1 is not scraped twice and the last page is
        # not skipped.
        try:
            pageList = browser.find_element_by_css_selector('.doctor-search-paging')
            # The leading '.' keeps the XPath relative to pageList; a bare
            # '//' would search the whole document.
            if i > 1 and (i - 1) % 5 == 0:
                nextPage = pageList.find_element_by_xpath(".//a[contains(text(),'Next 5')]")
            else:
                nextPage = pageList.find_element_by_xpath(".//a[text() = '{}']".format(i))
            nextPage.click()
        except Exception:
            # Best-effort paging: a missing pager or link is reported, not
            # fatal. KeyboardInterrupt is no longer swallowed here.
            print('No Pages')
        # Give the next page time to load
        time.sleep(3)

In [None]:
# Build the Chrome options: the only switch passed is '-incognito'.
option = webdriver.ChromeOptions()
option.add_argument('-incognito')

# Start the Chrome session. The scraping functions and the postal-code loop
# all reach it through the module-level name `browser`. The driver binary is
# looked up at '../chromedriver'.
# NOTE(review): `executable_path=` and `chrome_options=` are the older Selenium
# keyword names - confirm the installed Selenium version still accepts them.
browser = webdriver.Chrome(executable_path='../chromedriver', chrome_options=option)

In [None]:
# Loop through each postal code (first column of the postal-code table)
for postalCode in postalCodes.iloc[:, 0]:

    # Go to the advanced search page on CPSO
    browser.get(url)

    # Wait anywhere from 0.5-2 seconds for the page to load
    time.sleep(random.uniform(0.5, 2))

    # Grab the postal code search input
    inputForm = browser.find_element_by_id('p_lt_ctl04_pageplaceholder_p_lt_ctl02_AllDoctorsSearch_txtPostalCode')

    # Send the current postal code
    inputForm.send_keys(postalCode)

    # Grab the submit button
    submitButton = browser.find_element_by_id('p_lt_ctl04_pageplaceholder_p_lt_ctl02_AllDoctorsSearch_btnSubmit1')

    # Press submit
    submitButton.click()

    # Wait for the page to load
    time.sleep(random.uniform(0.5, 2))

    # Try to scrape the page if there are doctors in that FSA.
    # `except Exception` keeps the scrape best-effort per postal code but,
    # unlike a bare `except:`, lets Ctrl-C (KeyboardInterrupt) stop the run.
    try:
        scrapePages()
    except Exception:
        print('No doctors in FSA')