# Import Libraries

In [233]:
# Libraries
import pandas as pd
import numpy as np
import requests 
import re
import joblib

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# BeautifulSoup
from bs4 import BeautifulSoup as bs

# Web Scrape

In [205]:
# Launch Browser 
# Setup the Selenium WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

In [228]:
# Initialize DataFrame
# Specify column names to use with dataframe
columns = ['School Name', 'Full Address', 'Street Address', 'City', 'State', 'Zip Code', 'Country', 'School Type', 'Website', 'Description']

# Initialize DataFrame
df = pd.DataFrame(columns=columns)

print(df)

Empty DataFrame
Columns: [School Name, Full Address, Street Address, City, State, Zip Code, Country, School Type, Website, Description]
Index: []


In [229]:
# Scrape data from each state/province page 
# Create list of States & Provinces in the USA & Canada to loop over
state_province = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'ON', 'PE', 'QC', 'SK']
province = ['AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'ON', 'PE', 'QC', 'SK']

for state in state_province:

    if state in province: 
        co_code = 'CA'

    else:
        co_code = 'US'

    # Create link for each state/province 
    url = f'https://www.adventisteducation.org/schools?stateProvince={state}%3B{co_code}'

    # Open page
    driver.get(url)

    # Wait for the element to become clickable
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//*[contains(@class, 'Schools_searchResults__')]"))
    )
    # Once the element is clickable, get the outer HTML
    html_content = element.get_attribute("outerHTML")

    # Use BeautifulSoup to parse the HTML content
    soup = bs(html_content, "html.parser")



    # Find each 'a' anchor tag matching the regex search below to find info for each school
    result_divs = soup.find_all('a', class_ = re.compile('SearchResult_root__'))

    # Loop over each result to extract name of school
    for div in result_divs:


        # School Name
        # Find name of school 
        name_heading = div.find_all('h2', class_ = re.compile('SearchResult_heading__'))
        name_pattern = r">.*?</"

        # Use regex to find the pattern in the name_heading and extract the matching text
        name = re.search(name_pattern, str(name_heading))
        
        # Extract all matches in a single group
        name = name.group()
        
        # Use string slicing to remove the ends that helped return the result
        name = name[1:-2]


        # School Address
        # Find & Collect each full address
        address_heading = div.find_all('div', class_ = re.compile('SearchResult_address__'))
        address_pattern  = r">.*?</"
        address_full = re.search(address_pattern, str(address_heading))
        address_full = address_full.group()

        # Trim result address string of regex remnants
        address_full = address_full[1:-2]


        try:
            # Extract different parts of address information
            address_detail_pattern  = re.compile(r"(.*(\s.*)*,)\s(.*,)\s(\w*,)\s(\d+-*\d*,|.{3}\s.{3},)\s(\w*)")  # regex expression
            address_detail = address_detail_pattern.search(address_full)   # search full address for patterns

            if address_detail != None:
                # Save each section of results to a different variable
                address_street, address_road_type, address_city, address_state, address_zip, address_country = address_detail.groups()  # extract each group from pattern

                # Use string slicing to remove the ends that helped return the result
                address_street = address_street[:-1]
                address_city = address_city[:-1]
                address_state = address_state[:-1]
                address_zip = address_zip[:-1]

            else: 
                # If there is no match, set default value
                address_street, address_road_type, address_city, address_state, address_zip, address_country = None

        
        except Exception as e:
            # Set a default value if any exceptions occur during regex search
            print(f"An error occurred: {e}")
            address_street, address_road_type, address_city, address_state, address_zip, address_country = None
        


        # School Type
        # Find school type
        school_heading = div.find_all('dd')

        # Extract school type
        school_pattern  = re.compile(r">(.*)</")  # regex expression
        school_type = re.search(school_pattern, str(school_heading))  # search for pattern
        school_type = school_type.group()   # extract result from group 

        school_type = school_type[1:-2]


        # School Website
        # Find link
        link_pattern  = re.compile('href="/schools/(.){6}"')
        link_partial = re.search(link_pattern, str(div))
        link_partial = link_partial.group()

        link_partial = link_partial[6:-1]

        link_root = 'https://www.adventisteducation.org'

        link_full = link_root + link_partial


        # School Description
        # Find description
        desc_full = div.find_all('div', class_ = re.compile('SearchResult_intro__'))
        
        # Check if there is a description or not
        if desc_full:

            # Convert matching result to a string and trim off excess
            desc_full = str(desc_full)
            desc_full = desc_full[40:-7]


        new_row = pd.DataFrame([{'School Name': name, 'Full Address': address_full, 'Street Address': address_street, 'City': address_city, 'State': address_state, 'Zip Code': address_zip, 'Country': address_country, 'School Type': school_type, 'Website': link_full, 'Description': desc_full}])
        #print(new_row)

        df = pd.concat([df, new_row], ignore_index=True)

display(df)




Unnamed: 0,School Name,Full Address,Street Address,City,State,Zip Code,Country,School Type,Website,Description
0,Bethany Christian Academy,"1765 Highland Ave, Montgomery, AL, 36107-2657,...",1765 Highland Ave,Montgomery,AL,36107-2657,USA,PK-08,https://www.adventisteducation.org/schools/ANTP25,Bethany Christian Academy is a Christ-centered...
1,Bethany SDA Child Development Center,"714 Cedar St, Montgomery, AL, 36106-1002, USA",714 Cedar St,Montgomery,AL,36106-1002,USA,ECP,https://www.adventisteducation.org/schools/ANTP27,[]
2,Big Cove Christian Academy,"6354 Highway 431 S, Owens Cross Roads, AL, 357...",6354 Highway 431 S,Owens Cross Roads,AL,35763-9210,USA,PK-08,https://www.adventisteducation.org/schools/ANTG16,[]
3,Ephesus Academy Child Development Center,"829 McMillon Ave SW, Birmingham, AL, 35211-173...",829 McMillon Ave SW,Birmingham,AL,35211-1730,USA,ECP,https://www.adventisteducation.org/schools/ANTP2A,[]
4,Ephesus Junior Academy,"829 McMillon Ave SW, Birmingham, AL, 35211-173...",829 McMillon Ave SW,Birmingham,AL,35211-1730,USA,PK-10,https://www.adventisteducation.org/schools/ANTP45,[]
...,...,...,...,...,...,...,...,...,...,...
885,Windsor Adventist Elementary School,"5350 Haig Ave, Windsor, ON, N8T 1K8, Canada",5350 Haig Ave,Windsor,ON,N8T 1K8,Canada,PK-08,https://www.adventisteducation.org/schools/AN6M90,[]
886,Greaves Adventist Academy,"2330 West Hill Ave, Montreal, QC, H4B 2S4, Canada",2330 West Hill Ave,Montreal,QC,H4B 2S4,Canada,PK-12,https://www.adventisteducation.org/schools/AN6P15,[]
887,Sartigan Adventist Academy,"645 7e Rue Sartigan, Saint-Georges, QC, G5Y 5B...",645 7e Rue Sartigan,Saint-Georges,QC,G5Y 5B8,Canada,PK-08,https://www.adventisteducation.org/schools/AN6PAS,[]
888,Curtis-Horne Christian School,"3718 Hill Ave, Regina, SK, S4S 0X5, Canada",3718 Hill Ave,Regina,SK,S4S 0X5,Canada,K-09,https://www.adventisteducation.org/schools/AN6B20,[]


In [231]:
# Remember to close the browser
driver.quit()

# Export Results to File

In [230]:
# Export dataframe to a CSV file

# Specify file name and include datetime index column for CSV file 
df.to_csv('Data/2024_NASDA_Education_processed.csv', index=True)

# Serialize Results to File

Use joblib to serialize data and variables.

In [235]:
# serialize results to a file: future

# create dictionary with variables to save for later
export = {
    'df': df
    }

# save dictionary as a file
joblib.dump(export, '2024_NASDA_Education_export')

# import joblib file with serialized variables
loaded_data = joblib.load('2024_NASDA_Education_export')

# display keys to use to save to new/active variables
loaded_data.keys()

dict_keys(['df'])

# Summary

In this project, the goal of consolidating the different types and locations of Adventist schools across North America was achieved via the official Adventist Education website. 

The consolidated information will be made into a dynamic map using Tableau so that people considering moving in North America can find, *in one place*, what Adventist Education options are available where they might be relocating to.