# Import Libraries

In [21]:
# Libraries
import pandas as pd
import numpy as np
import requests 
import re

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# BeautifulSoup
from bs4 import BeautifulSoup as bs

# Functions

In [67]:
# Setup the Selenium WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

In [200]:
# Specify column names to use with dataframe
columns = ['School Name', 'Full Address', 'Street Address', 'City', 'State', 'Zip Code', 'Country', 'School Type', 'Website', 'Description']

# Initialize DataFrame
df = pd.DataFrame(columns=columns)

print(df)

Empty DataFrame
Columns: [School Name, Full Address, Street Address, City, State, Zip Code, Country, School Type, Website, Description]
Index: []


In [201]:
# Create list of States & Provinces in the USA & Canada to loop over
#state_province = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'ON', 'PE', 'QC', 'SK']
state_province = ['AL']
province = ['AB', 'BC', 'MB', 'NB', 'NL', 'NS', 'ON', 'PE', 'QC', 'SK']

for state in state_province:

    if state in province: 
        co_code = 'CA'

    else:
        co_code = 'US'

    # Create link for each state/province 
    url = f'https://www.adventisteducation.org/schools?stateProvince={state}%3B{co_code}'

    # Open page
    driver.get(url)

    # Wait for the element to become clickable
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//*[contains(@class, 'Schools_searchResults__')]"))
    )
    # Once the element is clickable, get the outer HTML
    html_content = element.get_attribute("outerHTML")

    # Use BeautifulSoup to parse the HTML content
    soup = bs(html_content, "html.parser")



    # Find each 'a' anchor tag matching the regex search below to find info for each school
    result_divs = soup.find_all('a', class_ = re.compile('SearchResult_root__'))

    # Loop over each result to extract name of school
    for div in result_divs:


        # School Name
        # Find name of school 
        name_heading = div.find_all('h2', class_ = re.compile('SearchResult_heading__'))
        name_pattern = r">.*?</"

        # Use regex to find the pattern in the name_heading and extract the matching text
        name = re.search(name_pattern, str(name_heading))
        
        # Extract all matches in a single group
        name = name.group()
        
        # Use string slicing to remove the ends that helped return the result
        name = name[1:-2]


        # School Address
        # Find & Collect each full address
        address_heading = div.find_all('div', class_ = re.compile('SearchResult_address__'))
        address_pattern  = r">.*?</"
        address_full = re.search(address_pattern, str(address_heading))
        address_full = address_full.group()

        # Trim result address string of regex remnants
        address_full = address_full[1:-2]

        # Extract different parts of address information
        address_detail_pattern  = re.compile(r"(\d+(\s.*)*,)\s(.*,)\s(\w*,)\s(\d+-*\d*,|.{3}\s.{3},)\s(\w*)")  # regex expression
        address_detail = address_detail_pattern.search(address_full)   # search full address for patterns

        # '6354 Highway 431 S, Owens Cross Roads, AL, 35763-9210, USA'

        if address_detail != None:
            address_street, address_road_type, address_city, address_state, address_zip, address_country = address_detail.groups()  # extract each group from pattern

            # Use string slicing to remove the ends that helped return the result
            address_street = address_street[:-1]
            address_road_type = address_road_type[1:]
            address_city = address_city[:-1]
            address_state = address_state[:-1]
            address_zip = address_zip[:-1]

        
        # School Type
        # Find school type
        school_heading = div.find_all('dd')

        # Extract school type
        school_pattern  = re.compile(r">(.*)</")  # regex expression
        school_type = re.search(school_pattern, str(school_heading))  # search for pattern
        school_type = school_type.group()   # extract result from group 

        school_type = school_type[1:-2]


        # School Website
        # Find link
        link_pattern  = re.compile('href="/schools/(.){6}"')
        link_partial = re.search(link_pattern, str(div))
        link_partial = link_partial.group()

        link_partial = link_partial[6:-1]

        link_root = 'https://www.adventisteducation.org'

        link_full = link_root + link_partial


        # School Description
        # Find description
        desc_full = div.find_all('div', class_ = re.compile('SearchResult_intro__'))
        
        # Check if there is a description or not
        if desc_full:

            # Convert matching result to a string and trim off excess
            desc_full = str(desc_full)
            desc_full = desc_full[40:-7]


        new_row = pd.DataFrame([{'School Name': name, 'Full Address': address_full, 'Street Address': address_street, 'City': address_city, 'State': address_state, 'Zip Code': address_zip, 'Country': address_country, 'School Type': school_type, 'Website': link_full, 'Description': desc_full}])
        #print(new_row)

        df = pd.concat([df, new_row], ignore_index=True)

display(df)




Unnamed: 0,School Name,Full Address,Street Address,City,State,Zip Code,Country,School Type,Website,Description
0,Bethany Christian Academy,"1765 Highland Ave, Montgomery, AL, 36107-2657,...",1765 Highland Ave,Montgomery,AL,36107-2657,USA,PK-08,https://www.adventisteducation.org/schools/ANTP25,Bethany Christian Academy is a Christ-centered...
1,Bethany SDA Child Development Center,"714 Cedar St, Montgomery, AL, 36106-1002, USA",714 Cedar St,Montgomery,AL,36106-1002,USA,ECP,https://www.adventisteducation.org/schools/ANTP27,[]
2,Big Cove Christian Academy,"6354 Highway 431 S, Owens Cross Roads, AL, 357...",6354 Highway 431 S,Owens Cross Roads,AL,35763-9210,USA,PK-08,https://www.adventisteducation.org/schools/ANTG16,[]
3,Ephesus Academy Child Development Center,"829 McMillon Ave SW, Birmingham, AL, 35211-173...",829 McMillon Ave SW,Birmingham,AL,35211-1730,USA,ECP,https://www.adventisteducation.org/schools/ANTP2A,[]
4,Ephesus Junior Academy,"829 McMillon Ave SW, Birmingham, AL, 35211-173...",829 McMillon Ave SW,Birmingham,AL,35211-1730,USA,PK-10,https://www.adventisteducation.org/schools/ANTP45,[]
5,Floral Crest School,"1228 County Road 89, Bryant, AL, 35958-5332, USA",1228 County Road 89,Bryant,AL,35958-5332,USA,PK-08,https://www.adventisteducation.org/schools/ANTG36,[]
6,Hoover Christian School,"2113 Old Rocky Ridge Rd, Hoover, AL, 35216-610...",2113 Old Rocky Ridge Rd,Hoover,AL,35216-6101,USA,PK-10,https://www.adventisteducation.org/schools/ANTG20,"Hoover Christian School, a Seventh-day Adventi..."
7,Mobile Junior Academy,"1900 Cody Rd S, Mobile, AL, 36695-3007, USA",1900 Cody Rd S,Mobile,AL,36695-3007,USA,PK-08,https://www.adventisteducation.org/schools/ANTG52,[]
8,New Life Early Learning Center,"3912 Pulaski Pike NW # B, Huntsville, AL, 3581...",3912 Pulaski Pike NW # B,Huntsville,AL,35810-2659,USA,ECP,https://www.adventisteducation.org/schools/ANTP2F,[]
9,Oakwood Adventist Academy,"7000 Adventist Blvd, Huntsville, AL, 35896-000...",7000 Adventist Blvd,Huntsville,AL,35896-0001,USA,PK-12,https://www.adventisteducation.org/schools/ANTP31,Oakwood Adventist Academy is a Seventh-day Adv...


In [199]:
df['Full Address'][2]

'6354 Highway 431 S, Owens Cross Roads, AL, 35763-9210, USA'

In [65]:
# Remember to close the browser
driver.quit()

In [None]:
'''
# Open page
url = 'https://www.adventisteducation.org/schools?stateProvince=AK%3BUS'  # Replace this with the URL of the page you want to scrape
driver.get(url)

# Wait for the element to become clickable
element = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//*[contains(@class, 'Schools_searchResults__')]"))
)
# Once the element is clickable, get the outer HTML
html_content = element.get_attribute("outerHTML")

# Use BeautifulSoup to parse the HTML content
soup = bs(html_content, "html.parser")
#pretty_html = soup.prettify()
#print(pretty_html)
'''

In [191]:
'''
# Find each 'a' anchor tag matching the regex search below to find info for each school
result_divs = soup.find_all('a', class_ = re.compile('SearchResult_root__'))

# Loop over each result to extract name of school
for div in result_divs:


    # School Name
    # Find name of school 
    name_heading = div.find_all('h2', class_ = re.compile('SearchResult_heading__'))
    name_pattern = r">.*?</"

    # Use regex to find the pattern in the name_heading and extract the matching text
    name = re.search(name_pattern, str(name_heading))
    
    # Extract all matches in a single group
    name = name.group()
    
    # Use string slicing to remove the ends that helped return the result
    name = name[1:-2]

    #print(name)


    # School Address
    # Find & Collect each full address
    address_heading = div.find_all('div', class_ = re.compile('SearchResult_address__'))
    address_pattern  = r">.*?</"
    address_full = re.search(address_pattern, str(address_heading))
    address_full = address_full.group()

    # Trim result address string of regex remnants
    address_full = address_full[1:-2]
    #print(address_full)

    # Extract different parts of address information
    address_detail_pattern  = re.compile(r"(\d+(\s\w*)*,)\s(\w*,)\s(\w*,)\s(\d+-*\d*,)\s(\w*)")  # regex expression
    address_detail = address_detail_pattern.search(address_full)   # search full address for patterns
    address_street, address_road_type, address_city, address_state, address_zip, address_country = address_detail.groups()  # extract each group from pattern

    # Use string slicing to remove the ends that helped return the result
    address_street = address_street[:-1]
    address_road_type = address_road_type[1:]
    address_city = address_city[:-1]
    address_state = address_state[:-1]
    address_zip = address_zip[:-1]

    #print(address_street)
    #print(address_road_type)
    #print(address_city)
    #print(address_state)
    #print(address_zip)
    #print(address_country)

    
    # School Type
    # Find school type
    school_heading = div.find_all('dd')

    # Extract school type
    school_pattern  = re.compile(r">(.*)</")  # regex expression
    school_type = re.search(school_pattern, str(school_heading))  # search for pattern
    school_type = school_type.group()   # extract result from group 

    school_type = school_type[1:-2]

    #print(school_type)


    # School Website
    # Find link
    link_pattern  = re.compile('href="/schools/(.){6}"')
    link_partial = re.search(link_pattern, str(div))
    link_partial = link_partial.group()

    link_partial = link_partial[6:-1]

    #print(link_partial)

    link_root = 'https://www.adventisteducation.org'

    link_full = link_root + link_partial

    #print(link_full)


    # School Description
    # Find description
    desc_full = div.find_all('div', class_ = re.compile('SearchResult_intro__'))
    
    # Check if there is a description or not
    if desc_full:

        # Convert matching result to a string and trim off excess
        desc_full = str(desc_full)
        desc_full = desc_full[40:-7]
        
        #print(desc_full)


    new_row = pd.DataFrame([{'School Name': name, 'Full Address': address_full, 'Street Address': address_street, 'City': address_city, 'State': address_state, 'Zip Code': address_zip, 'Country': address_country, 'School Type': school_type, 'Website': link_full, 'Description': desc_full}])
    #print(new_row)

    df = pd.concat([df, new_row], ignore_index=True)

display(df)
'''

Unnamed: 0,School Name,Full Address,Street Address,City,State,Zip Code,Country,School Type,Website,Description
0,Amazing Grace Academy,"2238 S Inner Springer Loop, Palmer, AK, 99645,...",2238 S Inner Springer Loop,Palmer,AK,99645,USA,PK-12,https://www.adventisteducation.org/schools/ANI465,Amazing Grace Academy is a Seventh-day Adventi...
1,Anchorage Junior Academy,"5511 Omalley Rd, Anchorage, AK, 99507-6856, USA",5511 Omalley Rd,Anchorage,AK,99507-6856,USA,PK-08,https://www.adventisteducation.org/schools/ANI415,AJA exists to introduce each student to Jesus ...
2,Dillingham Adventist School,"446 Windmill Hill Rd, Dillingham, AK, 99576, USA",446 Windmill Hill Rd,Dillingham,AK,99576,USA,PK-08,https://www.adventisteducation.org/schools/ANI435,[]
3,Golden Heart Christian School,"1809 Farmers Loop Rd, Fairbanks, AK, 99709-655...",1809 Farmers Loop Rd,Fairbanks,AK,99709-6553,USA,PK-08,https://www.adventisteducation.org/schools/ANI440,[]
4,Juneau Adventist Christian School,"4890 Glacier Hwy, Juneau, AK, 99801-9512, USA",4890 Glacier Hwy,Juneau,AK,99801-9512,USA,PK-08,https://www.adventisteducation.org/schools/ANI445,[]
5,Sitka Adventist School,"1613 Halibut Point Rd, Sitka, AK, 99835-7010, USA",1613 Halibut Point Rd,Sitka,AK,99835-7010,USA,PK-08,https://www.adventisteducation.org/schools/ANI480,[]


In [None]:
'''
result_divs = soup.find_all('a', class_ = re.compile('SearchResult_root__'))
#result_divs = soup.find_all('a', class_ = 'SearchResult_root__th6zA')


#print(result_divs)

for div in result_divs:
    #print(div)

    name_heading = div.find_all('h2', class_ = re.compile('SearchResult_heading__'))

    #pattern = r">.*?</"

    #name = re.match(pattern, name_heading)

    print(name_heading)
#address = result_divs.find_all('div', class_ = re.compile('SearchResult_address__'))
#school_type = result_divs.find_all('dd')

#print(name_heading, address, school_type)
'''

[<h2 class="SearchResult_heading__6Ypll">Amazing Grace Academy</h2>]
[<h2 class="SearchResult_heading__6Ypll">Anchorage Junior Academy</h2>]
[<h2 class="SearchResult_heading__6Ypll">Dillingham Adventist School</h2>]
[<h2 class="SearchResult_heading__6Ypll">Golden Heart Christian School</h2>]
[<h2 class="SearchResult_heading__6Ypll">Juneau Adventist Christian School</h2>]
[<h2 class="SearchResult_heading__6Ypll">Sitka Adventist School</h2>]


"\n#address = result_divs.find_all('div', class_ = re.compile('SearchResult_address__'))\n#school_type = result_divs.find_all('dd')\n\n#print(name_heading, address, school_type)\n"

In [15]:
'''
# Send a GET request to the webpage
#url = 'https://www.adventisteducation.org/schools'  # Replace this with the URL of the page you want to scrape
url = 'https://www.adventisteducation.org/schools?stateProvince=AL%3BUS'  # Replace this with the URL of the page you want to scrape
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = bs(response.text, 'html.parser')

    # Find elements by their HTML tags or classes
    # For example, to find all <a> tags (links) in the page
    links = soup.find_all('a')

    # Process the scraped data
    for link in links:
        # Print the link text and its URL
        print(f"Link Text: {link.text}, URL: {link['href']}")

else:
    print("Failed to retrieve the webpage")
'''


Need to install Selenium to execute JS and pull results with Python. 

In [9]:
'''
s = soup.find('div', id='Schools_searchResults__xyz')
content = s.find_all('p')

print(content)
'''

[<p>This Python Tutorial is very well suited for Beginners, and also for experienced programmers with other programming languages like C++ and Java. This specially designed free Python tutorial will help you learn Python Programming Language in the most efficient way, with topics from basics to advanced (like Web-scraping, Django, Deep-Learning, etc.) with examples. </p>, <p>Python is a high-level, general-purpose, and very popular programming language. Python programming language (latest Python 3) is being used in web development, Machine Learning applications, along with all cutting-edge technology in Software Industry.

  Python language is being used by almost all tech-giant companies like – Google, Amazon, Facebook, Instagram, Dropbox, Uber… etc.</p>, <p>The biggest strength of Python is huge collection of standard library which can be used for the following:</p>, <p>Python is currently the most widely used multi-purpose, high-level programming language, which allows programming i

In [10]:
'''
leftbar = s.find('ul', class_='leftBarList')

content = leftbar.find_all('li')

print(content)
'''

[<li style="background-color: var(--leftbar-explore-section-color) !important;"><a class="experience-section" href="https://www.geeksforgeeks.org/courses/coding-for-everyone?itm_source=geeksforgeeks&amp;itm_medium=leftbar&amp;itm_campaign=courses">Coding for Everyone</a></li>, <li class="currentpage"><a href="https://www.geeksforgeeks.org/python-programming-language/?ref=lbp">Python Tutorial</a></li>, <li><a href="https://www.geeksforgeeks.org/introduction-to-python/?ref=lbp">Introduction To Python</a></li>, <li><a href="https://www.geeksforgeeks.org/python-language-advantages-applications/?ref=lbp">Python Language advantages and applications</a></li>, <li><a href="https://www.geeksforgeeks.org/download-and-install-python-3-latest-version/?ref=lbp">Download and Install Python 3 Latest Version</a></li>, <li><a href="https://www.geeksforgeeks.org/python-3-basics/?ref=lbp">Python 3 basics</a></li>, <li><a href="https://www.geeksforgeeks.org/python-keywords/?ref=lbp">Python Keywords</a></l

In [11]:
'''
lines = s.find_all('p')

for line in lines:
    print(line.text)
'''

This Python Tutorial is very well suited for Beginners, and also for experienced programmers with other programming languages like C++ and Java. This specially designed free Python tutorial will help you learn Python Programming Language in the most efficient way, with topics from basics to advanced (like Web-scraping, Django, Deep-Learning, etc.) with examples. 
Python is a high-level, general-purpose, and very popular programming language. Python programming language (latest Python 3) is being used in web development, Machine Learning applications, along with all cutting-edge technology in Software Industry.

  Python language is being used by almost all tech-giant companies like – Google, Amazon, Facebook, Instagram, Dropbox, Uber… etc.
The biggest strength of Python is huge collection of standard library which can be used for the following:
Python is currently the most widely used multi-purpose, high-level programming language, which allows programming in Object-Oriented and Proced

In [12]:
'''
lines = leftbar.find_all('li')

for line in lines:
    print(line.text)
'''

Coding for Everyone
Python Tutorial
Introduction To Python
Python Language advantages and applications
Download and Install Python 3 Latest Version
Python 3 basics
Python Keywords
Namespaces and Scope in Python
Statement, Indentation and Comment in Python
Assigning Values to Variables
Taking input in Python
Taking input from console in Python
Taking multiple inputs from user in Python
Python | Output using print() function
How to print without newline in Python?
Python end parameter in print()
Python | sep parameter in print()
Python | Output Formatting
Python Operators
Ternary Operator in Python
Division Operators in Python
Operator Overloading in Python
Any All in Python
Operator Functions in Python | Set 1
Operator Functions in Python | Set 2
Difference between == and is operator in Python
Python Membership and Identity Operators
Python Data Types
Python | Set 3 (Strings, Lists, Tuples, Iterations)
Python String
Python string length | len() function to find string length
String Slic

In [13]:
'''
for link in soup.find_all('a'):
    print(link.get('href'))
'''

#main
https://www.geeksforgeeks.org/
https://www.geeksforgeeks.org/complete-guide-to-arrays-data-structure/?ref=outind
https://www.geeksforgeeks.org/introduction-to-matrix-or-grid-data-structure-and-algorithms-tutorial/?ref=outind
https://www.geeksforgeeks.org/complete-guide-to-string-data-structure/?ref=outind
https://www.geeksforgeeks.org/singly-linked-list-definition-meaning-dsa/?ref=outind
https://www.geeksforgeeks.org/doubly-linked-list/?ref=outind
https://www.geeksforgeeks.org/circular-linked-list/?ref=outind
https://www.geeksforgeeks.org/insertion-in-doubly-circular-linked-list/?ref=outind
https://www.geeksforgeeks.org/introduction-to-linked-list-data-structure-and-algorithm-tutorial/?ref=outind
https://www.geeksforgeeks.org/introduction-to-stack-data-structure-and-algorithm-tutorials/?ref=outind
https://www.geeksforgeeks.org/introduction-to-queue-data-structure-and-algorithm-tutorials/?ref=outind
https://www.geeksforgeeks.org/generic-treesn-array-trees/?ref=outind
https://www.g

In [15]:
'''
images_list = []

images = soup.select('img')

for image in images:
    src = image.get('src')
    alt = image.get('alt')
    images_list.append({'src': src, 'alt': alt})

for image in images_list:
    print(image)
'''

{'src': 'https://media.geeksforgeeks.org/gfg-gg-logo.svg', 'alt': 'geeksforgeeks'}
{'src': 'https://media.geeksforgeeks.org/auth-dashboard-uploads/Group-23726.png', 'alt': 'Related Articles'}
{'src': 'https://media.geeksforgeeks.org/wp-content/cdn-uploads/20230305181456/Python4.png', 'alt': 'Python Tutorial'}
{'src': 'https://media.geeksforgeeks.org/auth-dashboard-uploads/gfgFooterLogo.png', 'alt': 'geeksforgeeks-footer-logo'}
{'src': 'https://media.geeksforgeeks.org/auth-dashboard-uploads/googleplay.png', 'alt': 'GFG App on Play Store'}
{'src': 'https://media.geeksforgeeks.org/auth-dashboard-uploads/appstore.png', 'alt': 'GFG App on App Store'}
{'src': '', 'alt': 'Lightbox'}


In [None]:
'''
URL = 'https://www.geeksforgeeks.org/page/1/'

req = requests.get(URL)
soup = bs(req.text, 'html.parser')

titles = soup.find_all('div', attrs = {'class', 'head'})

print(titles.text)
'''