In [8]:
import requests
from bs4 import BeautifulSoup
import time

# URL of the webpage to scrape
url = "https://catalog.tulane.edu/programs/?optionlessH#filter=.filter_1"

# Function to scrape the page with retries
def scrape_page(url, max_retries=3, retry_delay=5):
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.content
        except (requests.exceptions.RequestException, ConnectionError) as e:
            print(f"Error retrieving page, retrying in {retry_delay} seconds: {e}")
            retries += 1
            time.sleep(retry_delay)
    raise Exception("Failed to retrieve page after multiple attempts")

# Scrape the page
html_content = scrape_page(url)

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(html_content, "html.parser")

# Find all the major titles
major_titles = soup.find_all("span", class_="title")

# Find the school and major type information
major_info = soup.find_all("span", class_="keyword")

# Loop through each major title and extract the information
for i in range(len(major_titles)):
    # Get the major name
    major_name = major_titles[i].text.strip()

    # Get the school and major type
    if 2*i < len(major_info):
        school = major_info[2*i].text.strip()
        major_type = major_info[2*i+1].text.strip()
    else:
        school = "N/A"
        major_type = "N/A"

    # Construct the links
    major_links = soup.find_all("a", class_="item-container")
    


    if i < len(major_links):
        major_href = major_links[i]["href"]
        parts = major_href.split('/')
        print("Part:", parts)
        if len(parts) >= 3:
            home_link = f"https://catalog.tulane.edu/{'/'.join(parts[1:-1])}/{parts[-1]}/"
            req_link = home_link + "#requirementstext"
        else:
            home_link = "N/A"
            req_link = "N/A"
    else:
        home_link = "N/A"
        req_link = "N/A"

'''
    print(f"Major: {major_name}")
    print(f"School: {school}")
    print(f"Major Type: {major_type}")
    print(f"Home Link: {home_link}")
    print(f"Requirements Link: {req_link}")
    print()

'''


for title in major_titles:
    # Get the major name
    major_name = title.text.strip()
    
    # Find the 'a' tag that wraps this 'span' with class "title"
    major_link_tag = title.find_parent("a")
    major_href = major_link_tag['href'] if major_link_tag and 'href' in major_link_tag.attrs else "N/A"

    # If a valid href is found, construct the full URL
    if major_href != "N/A":
        parts = major_href.split('/')
        home_link = f"https://catalog.tulane.edu/{'/'.join(parts[1:-1])}/{parts[-1]}/"
        req_link = home_link + "#requirementstext"
    else:
        home_link = "N/A"
        req_link = "N/A"
    
    # Print the information
    print(f"Major: {major_name}")
    print(f"Home Link: {home_link}")
    print(f"Requirements Link: {req_link}")
    print()

Major: Accounting Fundamentals Certificate
Home Link: https://catalog.tulane.edu/professional-advancement/business-leadership-studies/accounting-fundamentals-certificate//
Requirements Link: https://catalog.tulane.edu/professional-advancement/business-leadership-studies/accounting-fundamentals-certificate//#requirementstext

Major: Accounting Minor (Freeman School of Business)
Home Link: https://catalog.tulane.edu/business/accounting/accounting-minor//
Requirements Link: https://catalog.tulane.edu/business/accounting/accounting-minor//#requirementstext

Major: Accounting, MACCT
Home Link: https://catalog.tulane.edu/business/accounting/accounting-mac//
Requirements Link: https://catalog.tulane.edu/business/accounting/accounting-mac//#requirementstext

Major: Admiralty, LMA
Home Link: https://catalog.tulane.edu/law/master-laws/admiralty-lma//
Requirements Link: https://catalog.tulane.edu/law/master-laws/admiralty-lma//#requirementstext

Major: Advanced Emergency Management Certificate (G