In [85]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
from IPython import display

In [86]:
def scrape_and_save_content(soup):
    if soup.title and soup.title.string:
        page_title = soup.title.string.strip()
    else:
        print('Non extractable page')
        return
    
    filenames = [f"programpages2023/{page_title}2023.txt",
                 f"programpages2024/{page_title}2024.txt"]
    
    ids = ['year-panel-2023', 'year-panel-2024']

    #Checking if the page is a program page
    if soup.find('div', id= ids[0]) or soup.find('div', id= ids[1]):

        #This tag contains values of a form. So we are removing them for a clean data. 
        option_tags = soup.find_all('option')
        for option_tag in option_tags:
            option_tag.extract()

        #Writing program pages into text files for both 2023 and 2024
        for i in range(2):
            try:        
                with open(filenames[i], 'w', encoding='utf-8') as file:
                    file.write(soup.find('div', id= ids[i]).get_text(strip=True, separator='\n'))
            except FileNotFoundError:
                print("File not found. Please check the file path or ensure that the file exists.")
            except IOError as e:
                print(f"An error occurred: {e}")
                return ''
            except Exception as e:
                print(f"An error occurred: {e}")
                return ''
            
    #Writing non-program pages.
    else:
        filename = f"otherpages/{page_title}.txt"
        try:        
            final_text = '\n'.join([text.text for text in soup.find_all('p')])
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(final_text)
        except FileNotFoundError:
            print("File not found. Please check the file path or ensure that the file exists.")
        except IOError as e:
            print(f"An error occurred: {e}")
            return ''
        except Exception as e:
            print(f"An error occurred: {e}")
            return ''



    return ''

In [88]:
def scrape_page(url, depth=1, all_links=None):
    display.clear_output(wait=True)
    if url in all_links.keys():
        print('Already Visited')
    else:
        # Make a GET request to the URL
        try: 
            response = requests.get(url)
        except requests.exceptions.RequestException as e:
            response = None
        # Check if the request was successful
        if response:
            # Parse the HTML content of the page
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                print(f'Error : {e}')
                return
            #print(f"Depth {depth}: {url}")
            #print('-' * 50)

            #Scraping and saving the content of the links into a txt file.
            scrape_and_save_content(soup)

            #Find all the links in the url
            all_links[url] = soup.find_all('a', href=True)
            print(f'Link count: {len(all_links)}')
        else:
            print(f"Failed to retrieve {url}")

        # Recursively scrape each linked page, limiting the depth
    if depth < 3 and url in all_links.keys():
        for link in all_links[url]:
            absolute_url = link['href']
            if not absolute_url.startswith('http'):
                absolute_url = 'https://www.georgebrown.ca' + absolute_url
                print(absolute_url)
            if 'georgebrown' in absolute_url: #Not letting it go out of college website.
                # Recursively scrape the linked page with increased depth
                scrape_page(absolute_url, depth= depth + 1, all_links= all_links)


In [89]:
#start_url = 'https://www.georgebrown.ca'
start_url = 'https://www.georgebrown.ca/program-finder?year=2024'
#start_url = 'https://www.georgebrown.ca/programs/applied-ai-solutions-development-program-postgraduate-t431'
all_links = {}
os.makedirs('otherpages', exist_ok= True)
os.makedirs('programpages2023', exist_ok= True)
os.makedirs('programpages2024', exist_ok= True)
scrape_page(start_url, all_links=all_links)

Link count: 3731
https://www.georgebrown.ca/programs/course-outlines?searchOnLoad=true&programVersionCode=S118%2CS118


# TESTING PURPOSES

In [77]:
url = 'https://www.georgebrown.ca/programs/applied-ai-solutions-development-program-postgraduate-t431'

#url = 'https://www.georgebrown.ca/apply'

response = requests.get(url)

soup = BeautifulSoup(response.text, 'html.parser')

def normal_page_scraper(soup):
    final_text = '\n'.join([text.text for text in soup.find_all('p')])
    filename = 
    try:        
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(final_text)
    except FileNotFoundError:
        print("File not found. Please check the file path or ensure that the file exists.")
    except IOError as e:
        print(f"An error occurred: {e}")
        return ''
    with open('before.txt', 'w', encoding='utf-8') as file:
        file.write(final_text)

normal_page_scraper(soup)
    

In [16]:
soup

const showDialogTwoThousandTwentyThree = document.getElementById('show-dialog-TwoThousandTwentyThree');
const contentDialogTwoThousandTwentyThree = document.getElementById('content-dialog-TwoThousandTwentyThree');
const cancelDialogTwoThousandTwentyThree = contentDialogTwoThousandTwentyThree.querySelector('#cancel-dialog-TwoThousandTwentyThree');
showDialogTwoThousandTwentyThree.addEventListener('click', () => {
contentDialogTwoThousandTwentyThree.showModal();
});
cancelDialogTwoThousandTwentyThree.addEventListener('click', (event) => {
event.preventDefault();
contentDialogTwoThousandTwentyThree.close();
});
</script></section> </footer></div>