In [32]:
import requests
from bs4 import BeautifulSoup
import csv

In [33]:
# Query method to get the detail page of a table row
# Return response data from get request
def get_table(index):
    # Modify HERE
    page_size = 20
    base_url = "https://www.edr.hk/ajax/search-all?draw=6&columns%5B0%5D%5Bdata%5D=&columns%5B0%5D%5Bname%5D=&columns%5B0%5D%5Bsearchable%5D=true&columns%5B0%5D%5Borderable%5D=false&columns%5B0%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B0%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B1%5D%5Bdata%5D=&columns%5B1%5D%5Bname%5D=&columns%5B1%5D%5Bsearchable%5D=true&columns%5B1%5D%5Borderable%5D=false&columns%5B1%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B1%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B2%5D%5Bdata%5D=&columns%5B2%5D%5Bname%5D=&columns%5B2%5D%5Bsearchable%5D=true&columns%5B2%5D%5Borderable%5D=false&columns%5B2%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B2%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B3%5D%5Bdata%5D=&columns%5B3%5D%5Bname%5D=&columns%5B3%5D%5Bsearchable%5D=true&columns%5B3%5D%5Borderable%5D=false&columns%5B3%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B3%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B4%5D%5Bdata%5D=&columns%5B4%5D%5Bname%5D=&columns%5B4%5D%5Bsearchable%5D=true&columns%5B4%5D%5Borderable%5D=false&columns%5B4%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B4%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B5%5D%5Bdata%5D=address_chi&columns%5B5%5D%5Bname%5D=&columns%5B5%5D%5Bsearchable%5D=true&columns%5B5%5D%5Borderable%5D=false&columns%5B5%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B5%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B6%5D%5Bdata%5D=district&columns%5B6%5D%5Bname%5D=&columns%5B6%5D%5Bsearchable%5D=true&columns%5B6%5D%5Borderable%5D=false&columns%5B6%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B6%5D%5Bsearch%5D%5Bregex%5D=false&columns%5B7%5D%5Bdata%5D=phone&columns%5B7%5D%5Bname%5D=&columns%5B7%5D%5Bsearchable%5D=true&columns%5B7%5D%5Borderable%5D=false&columns%5B7%5D%5Bsearch%5D%5Bvalue%5D=&columns%5B7%5D%5Bsearch%5D%5Bregex%5D=false&search%5Bvalue%5D=&search%5Bregex%5D=false&_=1679641375878"
    url_variance = "&start=%s&length=%s" % (index * page_size, page_size)
    res = requests.get(base_url + url_variance)
    return res.json()

In [34]:
# Return True to stop scraping
def get_table_end_condition(index, data):
    # Modify HERE
    return len(data['data']) == 0

In [35]:
def scrape_detail(index, data, html):
    soup = BeautifulSoup(html, 'html.parser')
    
    # Extract doctor name
    name_element = soup.find('h1')
    doctor_name = name_element.text.strip()

    # Extract clinic address
    address_element = soup.find('div', {'class': 'profile-box-general'})
    clinic_address = address_element.find_all('p')[0].text.strip()

    # Extract phone number
    phone_element = soup.find('h2', text='診症電話')
    if phone_element is not None:
        phone_number = phone_element.find_next('p').text.strip()

    # Extract medical services offered
    services_element = soup.find('h2', text='醫療服務包括')
    if services_element is not None:
        services_list = services_element.find_next('ul')
        medical_services = [service.text.strip() for service in services_list.find_all('li')]

    # Extract other information
    other_info_element = soup.find('div', {'class': 'other-info'})
    if other_info_element is not None:
        other_info = [p.text.strip() for p in other_info_element.find_all('p')]

    # Extract professional qualifications
    profile_cert_list = soup.find('ul', class_='profile-cert-list')
    certifications = profile_cert_list.find_all('li')
    if certifications is not None:
        professional_qualifications = [cert.text.strip() for cert in certifications]

    return {
        '中文姓名': data['name_chi'] if 'name_chi' in data else None,
        '英文姓名': data['name_eng'] if 'name_eng' in data else None,
        '姓別': data['gender'],
        '專科': data['speciality_name_chi'] if 'speciality_name_chi' in data else None,
        '電話': data['phone'] if 'phone' in data else None,
        '地區': data['district']   if 'district' in data else None,
        '地址': data['address_chi'] if 'address_chi' in data else None,
        '醫療服務': medical_services if 'medical_services' in locals() else None,
        '其他資料': other_info if 'other_info' in locals() else None,
        '專業資格': professional_qualifications if 'professional_qualifications' in locals() else None,
        '執業': data['practice'] if 'practice' in data else None,
    }

In [36]:
def get_detail(index, data, out):
    # Modify HERE
    for row in data['data']:
        base_url = "https://www.edr.hk/"
        url = base_url + row['url']
        res = requests.get(url)
        res_data = scrape_detail(index, row, res.text)
        
        fieldnames = list(res_data.keys())
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writerow(res_data)

    return True

In [37]:
def main(out):
    spamwriter = csv.writer(out, quoting=csv.QUOTE_MINIMAL)
    index = 0
    while True:
        print(index)
        res_data = get_table(index)
        if get_table_end_condition(index, res_data):
            break
        get_detail(index, res_data, out)
        index += 1

with open('output.csv', 'w', encoding='utf8') as out:
    main(out)

0
1
2


AttributeError: 'NoneType' object has no attribute 'find_all'