In [150]:
# As usual ...
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [151]:
# URL template of the A-to-Z listing pages; the '{}' slot receives the letter.
CENTRE_LIST_BASE_URL = 'https://www.toronto.ca/data/children/dmc/a2z/a2z{}.html'


def build_centre_list_url(alphabet):
  """Return the listing-page URL for ``alphabet``.

  Args:
    alphabet: Value substituted into the template (the driver loop passes a
      single lowercase letter).

  Returns:
    str: ``CENTRE_LIST_BASE_URL`` with its placeholder filled in.
  """
  list_url = str.format(CENTRE_LIST_BASE_URL, alphabet)
  return list_url

In [152]:
# URL template of a single centre's detail page; the '{}' slot receives the
# page path relative to the 'dmc' directory.
CENTRE_DETAIL_BASE_URL = 'https://www.toronto.ca/data/children/dmc/{}'

def build_centre_detail_url(relative_link):
  """Turn a relative href from a listing page into an absolute detail URL.

  Listing pages are expected to emit hrefs prefixed with './/'. Only leading
  '.' and '/' characters are removed, so an href that arrives without that
  prefix is kept intact instead of losing its first three characters.

  Args:
    relative_link: The ``href`` value, e.g. './/page.html'.

  Returns:
    str: ``CENTRE_DETAIL_BASE_URL`` with the cleaned path filled in.
  """
  # lstrip() removes any run of the listed characters, which covers './/',
  # '../', './' and '/' while leaving an unprefixed path untouched.
  # NOTE(review): a path whose own first character is '.' or '/' would lose
  # it too - not expected for these pages, confirm if the site changes.
  relative_link = relative_link.lstrip('./')
  return CENTRE_DETAIL_BASE_URL.format(relative_link)

In [153]:
# Settings shared by every HTTP request and HTML parse in this notebook.
ENCODING = 'utf-8'        # encoding forced onto each response before decoding
FEATURES = 'html.parser'  # parser name handed to BeautifulSoup
HEADERS = {
    # Browser-like User-Agent sent with each request.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [154]:
import requests
from bs4 import BeautifulSoup

def get_requests(url, timeout=30):
  """Download ``url`` and return its body parsed as a BeautifulSoup document.

  Args:
    url: Absolute URL to fetch.
    timeout: Seconds to wait for the server, forwarded to ``requests.get``.
      Without a timeout a stalled connection would block the scrape forever.

  Returns:
    BeautifulSoup: The response text parsed with the ``FEATURES`` parser.
    HTTP error statuses are not raised here; an error page is parsed like
    any other page and callers detect it by the missing elements.

  Raises:
    requests.exceptions.RequestException: On connection failure or timeout.
  """
  response = requests.get(url, headers=HEADERS, timeout=timeout)

  # Force the decoding; otherwise accented names are garbled
  # (for example, Ysraël becomes Ysra√´l).
  response.encoding = ENCODING

  return BeautifulSoup(response.text, FEATURES)

In [155]:
def scrape_central_name_and_link(row):
  """Extract a centre's name and detail-page URL from one listing-table row.

  Args:
    row: A parsed table row (any object exposing BeautifulSoup's ``find``).

  Returns:
    dict: ``{'name': <link text>, 'link': <absolute detail URL>}``, or ``{}``
    when the row has no ``<td>`` cell or its first cell has no ``<a href>``.
  """
  first_cell = row.find('td')

  # fast fail: no cell found (presumably header rows - confirm against page)
  if first_cell is None:
    return {}

  # find() yields None when nothing matches. The previous
  # find_all(...)[0] raised IndexError on a link-less cell, so the guard
  # below could never trigger.
  link_tag = first_cell.find('a', href=True)

  # fast fail: no link tag
  if link_tag is None:
    return {}

  return {
    'name': link_tag.text,
    # store the absolute link for easier access later
    'link': build_centre_detail_url(link_tag['href']),
  }

In [156]:
def scrape_all_centres_name_and_link(alphabet):
  """Collect the name/link record of every centre listed under one letter.

  Args:
    alphabet: Letter identifying the listing page to download.

  Returns:
    list[dict]: One record per table row that produced a result; an empty
    list when the page contains no ``<table>``.
  """
  listing_url = build_centre_list_url(alphabet)
  page = get_requests(listing_url)

  listing_table = page.find('table')
  if not listing_table:
    print(f'\tNo table found. Skip this URL: {listing_url}')
    return []

  # Rows that yield nothing come back as an empty dict and are filtered out.
  records = (scrape_central_name_and_link(table_row)
             for table_row in listing_table.find_all('tr'))
  return [record for record in records if record]

In [157]:
import re # regular expression

def clean_text(original_text):
  """Normalise scraped text: whitespace, hidden characters and punctuation.

  Whitespace is converted to plain spaces *before* non-printable characters
  are dropped. Newlines, tabs and non-breaking spaces are themselves
  non-printable, so removing them first glued neighbouring words together
  (e.g. 'Main St\\nToronto' became 'Main StToronto').

  Args:
    original_text: Raw text taken from an HTML element.

  Returns:
    str: The cleaned text with single spaces, a space after ',', a space
    before '(', no spaces around '/', and no leading/trailing whitespace.
  """
  # Step 1: Turn every whitespace character (newline, tab, NBSP, ...) into ' '.
  new_text = re.sub(r'\s', ' ', original_text)
  # Step 2: Remove the remaining non-printable characters (zero-width, control).
  new_text = ''.join(char for char in new_text if char.isprintable())
  # Step 3: Collapse runs of spaces, including ones created by step 2.
  new_text = re.sub(r' +', ' ', new_text)
  # Step 4: Add space after ','
  new_text = re.sub(r'(,)(\S)', r'\1 \2', new_text)
  # Step 5: Add space before '('
  new_text = re.sub(r'(\S)(\()', r'\1 \2', new_text)
  # Step 6: Remove space before '/'
  new_text = re.sub(r'\s+/', '/', new_text)
  # Step 7: Remove space after '/'
  new_text = re.sub(r'/\s+', '/', new_text)

  return new_text.strip()

In [158]:
WARD = 'Ward:'                # label that introduces the ward name
CROSS_STREET_REF_BEGIN = '('  # the cross-street reference starts at this char

def extract_address(centre, original_text):
  """Split an address paragraph into address, cross streets and ward.

  The text is parsed from the right: everything after 'Ward:' is the ward,
  then everything from the first '(' is the cross-street reference, and what
  remains is the street address.

  Args:
    centre: Record dict, updated in place. ``'address'`` is always set;
      ``'ward'`` and ``'cross_street_reference'`` only when present in the text.
    original_text: Raw text of the address paragraph.

  Returns:
    None
  """
  temp_string = clean_text(original_text)

  # ward
  ward_index = temp_string.find(WARD)
  # str.find() reports "not found" as -1. Comparing against 1 let the
  # not-found case through, storing a bogus ward and cutting the last
  # character off the address.
  if ward_index != -1:
    ward = temp_string[ward_index + len(WARD):].strip()
    print(f'\tWard:\t{ward}')
    centre['ward'] = ward

    temp_string = temp_string[:ward_index].strip()

  # cross_street_reference
  cross_street_ref_begin_index = temp_string.find(CROSS_STREET_REF_BEGIN)
  if cross_street_ref_begin_index != -1:
    # drop the parentheses and normalise 'A & B' to 'A/B'
    cross_street_ref = clean_text(temp_string[cross_street_ref_begin_index:]
                                  .replace("(", '')
                                  .replace(')', '')
                                  .replace('&', '/'))
    print(f'\tCross Street Reference:\t{cross_street_ref}')
    centre['cross_street_reference'] = cross_street_ref

    temp_string = temp_string[:cross_street_ref_begin_index].strip()

  # address: whatever is left once ward and cross streets are removed
  address = temp_string

  centre['address'] = address

In [159]:
PHONE = 'Phone:'         # label that introduces the phone part
PHONE_EXTENSION = 'ext'  # marker that separates number and extension
OR_EMAIL = 'or email:'   # label that introduces the email part

def extract_contact(centre, original_text):
  """Parse contact name, phone, extension and email from a contact line.

  Expected shape: 'Phone: [<name>,] <number> [ext <n>] [or email: <address>]'.

  Args:
    centre: Record dict, updated in place. The keys ``'contact_name'``,
      ``'phone'``, ``'phone_extension'`` and ``'email'`` are always set
      (to '' when the corresponding part is missing).
    original_text: Raw text of the contact list item.

  Returns:
    None
  """
  phone_index = original_text.find(PHONE)
  email_index = original_text.find(OR_EMAIL)

  contact_name = phone = phone_extension = email = ''

  if phone_index != -1:
    phone_start = phone_index + len(PHONE)
    # The phone part ends where the email part begins. When there is no email
    # marker after it, take everything to the end of the text: slicing up to
    # find()'s -1 silently dropped the last character of the number.
    if email_index >= phone_start:
      phone_end = email_index
    else:
      phone_end = len(original_text)
    phone = original_text[phone_start:phone_end].strip()

    split = phone.split(',')
    # more elements means a contact name is provided
    if len(split) > 1:
      contact_name = split[0].strip()
      phone = split[1].strip()

    # check if email is provided
    if email_index != -1:
      # emails don't need uppercase
      email = original_text[email_index + len(OR_EMAIL):].strip().lower()

    phone_extension_index = phone.find(PHONE_EXTENSION)
    if phone_extension_index != -1:
      phone_extension = phone[phone_extension_index + len(PHONE_EXTENSION):].strip()
      phone = phone[:phone_extension_index].strip()

  centre['contact_name'] = contact_name
  centre['phone'] = phone
  centre['phone_extension'] = phone_extension
  centre['email'] = email

In [160]:
# Mail-provider domains that must never be reported as a centre's own website.
EMAIL_DOMAIN_EXCLUSION = {'gmail.com', 'hotmail.com', 'rogers.com'}
EMAIL_AT_SIGN = '@'

def extract_website(centre, original_text):
  """Parse the website field and store a bare, lowercase domain/path.

  The 'Website:' label, URL scheme and 'www.' are removed. If the field
  actually holds an email address, it is used as the centre's email when
  none is known yet, and its domain becomes the website unless it belongs
  to a public mail provider.

  Args:
    centre: Record dict, updated in place. ``'website'`` is always set;
      ``'email'`` is set when it was missing/empty and the field is an email.
    original_text: Raw text of the website list item.

  Returns:
    None
  """
  # format url, remove unnecessary prefix
  temp_string = (clean_text(original_text)
                 .replace('Website:', '')
                 .replace('https://', '')
                 .replace('http://', '')
                 .replace('www.', '')
                 .lower()   # websites don't need uppercase
                 .strip())  # dropping 'Website:' leaves the space that followed it

  # email can be input unexpectedly in the website field
  email_sign_index = temp_string.find(EMAIL_AT_SIGN)

  website = ''
  # check if the specific character was found
  if email_sign_index == -1:
    website = temp_string

  else:
    print(f'\tThis should be an email address: {temp_string}')

    # take this value as the email if none is known; .get() because the
    # 'email' key does not exist when no contact line was parsed
    if centre.get('email', '') == '':
      print(f'\tEmail:\t{temp_string}')
      centre['email'] = temp_string

    # slice the string from @ to the end
    # for example, someone@somecompany.com -> somecompany.com
    temp_string = temp_string[email_sign_index + 1:]

    # prevent an email provider from being considered a centre's website
    if temp_string not in EMAIL_DOMAIN_EXCLUSION:
      print(f'\tWebsite:\t{temp_string}, extracted from email')
      website = temp_string

  # remove the redundant '/' at the end (cosmetic reason)
  if website.endswith('/'):
    website = website[:-1]

  centre['website'] = website

In [161]:
def scrape_all_centres_detail(centre):
  """Download one centre's detail page and enrich ``centre`` in place.

  Args:
    centre: Record dict holding at least ``'link'``; address, contact and
      website fields are added by the ``extract_*`` helpers when the
      matching page elements exist.

  Returns:
    ``{}`` when the page has no content box, otherwise ``None``.
  """
  page = get_requests(centre['link'])

  box = page.find('div', class_='csd_opcrit_content_box')
  if not box:
    return {}

  # The first paragraph of the box carries the address text.
  address_tag = box.find('p')
  if address_tag:
    extract_address(centre, address_tag.text)

  item_list = box.find('ul')
  if not item_list:
    return None

  # Contact details live in the list item flagged with the 'nudge' class.
  contact_tag = item_list.find('li', class_='nudge')
  if contact_tag:
    extract_contact(centre, contact_tag.text)

  # The second list item is treated as the website entry.
  items = item_list.find_all('li')
  if len(items) > 1:
    extract_website(centre, items[1].text)


In [162]:
import string
from time import sleep

LOWERCASE_ALPHABET = string.ascii_lowercase
SLEEP_TIME_IN_SECOND = 0.1  # pause between two consecutive requests
centres = []                # one record dict per centre, filled by both passes

# Pass 1: walk the listing pages letter by letter to collect name + link.
for alphabet in LOWERCASE_ALPHABET:
  print(f'Start scraping centre list for alphabet {alphabet}.')
  alphabet_centres = scrape_all_centres_name_and_link(alphabet)
  centres.extend(alphabet_centres)
  print(f'\tFinish scraping {len(alphabet_centres)} centres with alphabet {alphabet}. Sleep for a short moment.')

  # incorporate a sleep in between requests as required
  sleep(SLEEP_TIME_IN_SECOND)

# Pass 2: visit every detail page and enrich the records in place.
for index, centre in enumerate(centres):
  # Double quotes for the key: reusing the f-string's own quote character
  # inside the braces is a SyntaxError before Python 3.12.
  print(f'({index:04d}) Start scraping centre detail for centre {centre["name"]}.')
  scrape_all_centres_detail(centre)
  print('\tFinish scraping. Sleep for a short moment.')

  # incorporate a sleep in between requests as required
  sleep(SLEEP_TIME_IN_SECOND)

Start scraping centre list for alphabet a.
	Finish scraping 60 centres with alphabet a. Sleep for a short moment.
(0000) Start scraping centre detail for centre A & A Daycare Centre.
	Ward:	Scarborough-Agincourt
	Cross Street Reference:	Midland/Finch E.
	Finish scraping. Sleep for a short moment.
(0001) Start scraping centre detail for centre Abacus Montessori Learning Centre.
	Ward:	Beaches-East York
	Finish scraping. Sleep for a short moment.
(0002) Start scraping centre detail for centre Abiona Centre For Infant And Early Mental Health Elc Broadview Campus.
	Ward:	Toronto-Danforth
	Cross Street Reference:	Broadview/Mortimer
	Finish scraping. Sleep for a short moment.
(0003) Start scraping centre detail for centre Abiona Centre For Infant And Early Mental Health Elc Humewood Campus.
	Ward:	Toronto-St. Paul's
	Cross Street Reference:	Christie/St. Clair
	Finish scraping. Sleep for a short moment.
(0004) Start scraping centre detail for centre Aboriginal Head Start - Epnigishmok.
	Ward:

In [163]:
def main():
  """Print the scraped centres and export them to a CSV file.

  Reads the module-level ``centres`` list and writes
  'toronto-az-child-care-centres.csv' into the current working directory.

  Returns:
    None
  """
  df = pd.DataFrame(centres)
  print(df)

  # Export the DataFrame to a CSV file
  output_file_name = 'toronto-az-child-care-centres.csv'
  output_columns = ['name', 'address', 'cross_street_reference', 'ward', 'contact_name', 'phone', 'phone_extension', 'email', 'website']

  # Optional fields are only set when found on a page, so a column can be
  # absent from the frame altogether; to_csv(columns=...) would then raise
  # KeyError. reindex() creates any missing column (empty) and fixes the order.
  df.reindex(columns=output_columns).to_csv(output_file_name, index=False)
  print(f'DataFrame successfully exported to {output_file_name}.')


In [164]:
# Run the export only when this code is executed as the top-level program
# (which includes a notebook kernel), not when it is imported as a module.
if __name__ == '__main__':
  main()

                                                 name  \
0                                A & A Daycare Centre   
1                   Abacus Montessori Learning Centre   
2   Abiona Centre For Infant And Early Mental Heal...   
3   Abiona Centre For Infant And Early Mental Heal...   
4                 Aboriginal Head Start - Epnigishmok   
5            Absorbent Minds (Op1756925 Ontario Inc.)   
6                                   Absorbent Minds 2   
7                     Absorbent Minds Montessori East   
8              Absorbent Years Childcare (Montessori)   
9                       Absorbent Years Childcare Inc   
10           After Four Children's Enrichment Program   
11  Agape Christian Montessori School (2236800 Ont...   
12                        Agile Innovators Montessori   
13                   Agincourt Montessori School Inc.   
14  Agincourt Temple Day Care(Governing Council Of...   
15                       Air-O-Down Child Care Centre   
16         Albion Early Learnin