<a href="https://colab.research.google.com/github/zaynacheema/Web-Crawler-Texas-Pension-Fund/blob/Dev/Notebooks/Web_Crawler_UTSYSTEM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This notebook crawls the UT System website, starting from the homepage, to locate Board Meeting webcast videos hosted
# on Mediasite for the Pension Fund project. It navigates from the homepage to the "Regents" page and then to the
# "Meetings" page, follows links whose text starts with "Webcast for", and prints the Mediasite links found on those pages.


In [None]:
#requirements
!pip install bs4

In [None]:
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def fetch_and_parse(url, timeout=10):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single unresponsive
            page would stall the whole crawl.

    Returns:
        A ``BeautifulSoup`` object for the page, or ``None`` when the request
        failed (connection error, timeout, or an HTTP error status). The
        failure is printed instead of raised so the caller can keep crawling.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        content = response.content
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return BeautifulSoup(content, 'html.parser')

def find_webcast_links(start_url):
    """Crawl breadth-first from *start_url*, following "Webcast for ..." links.

    Every anchor whose visible text starts with "webcast for"
    (case-insensitive) is resolved against the page it was found on. Each
    distinct URL found this way is handed once to ``extract_mediasite_links``
    and is also queued so that webcast links on that page are followed too.

    Args:
        start_url: Absolute URL of the page where the crawl begins.

    Returns:
        None. Results are reported by ``extract_mediasite_links``.
    """
    # Every URL that has ever been queued. Checking this at discovery time
    # (rather than only when a page is dequeued) keeps a link that appears in
    # several anchors or on several pages from being processed repeatedly.
    seen = {start_url}
    queue = deque([start_url])

    while queue:
        url = queue.popleft()

        soup = fetch_and_parse(url)
        if not soup:
            continue

        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:
                continue
            link_text = link.get_text().strip()
            if not link_text.lower().startswith('webcast for'):
                continue

            full_url = urljoin(url, href)
            # Skip non-HTTP schemes (mailto:, javascript:, ...) and repeats.
            if not full_url.startswith('http') or full_url in seen:
                continue
            seen.add(full_url)
            queue.append(full_url)

            # After finding a Webcast link, fetch and extract Mediasite links
            extract_mediasite_links(full_url)

def extract_mediasite_links(webcast_url):
    """Print and return the Mediasite links found on a webcast page.

    Args:
        webcast_url: Absolute URL of the webcast page to scan.

    Returns:
        A list of absolute Mediasite URLs in page order, without duplicates.
        The list is empty when the page could not be fetched or contains no
        such links.
    """
    #fetch and parse the Webcasts page
    soup = fetch_and_parse(webcast_url)
    if not soup:
        return []

    #extract Mediasite links
    mediasite_links = []
    already_reported = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        # Compare in lower case so differently-cased hrefs ("Mediasite") are not missed.
        if not href or 'mediasite' not in href.lower():
            continue
        # Resolve relative hrefs so every reported link is usable on its own.
        full_url = urljoin(webcast_url, href)
        if full_url in already_reported:
            continue
        already_reported.add(full_url)
        mediasite_links.append(full_url)
        print("Mediasite link found:", full_url)
    return mediasite_links

def _find_link_by_text(soup, base_url, text):
    """Return the absolute URL of the first usable anchor labelled *text*.

    Args:
        soup: Parsed page to search.
        base_url: URL of that page, used to resolve relative hrefs.
        text: Lower-case label to match against each anchor's stripped,
            lower-cased visible text.

    Returns:
        The resolved URL, or ``None`` if no anchor with that label and an
        ``href`` exists.
    """
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href are skipped: urljoin(base_url, None) would
        # return base_url itself and send the crawl back to the same page.
        if href and link.get_text().strip().lower() == text:
            return urljoin(base_url, href)
    return None


def navigate_and_crawl(homepage_url):
    """Walk homepage -> "Regents" -> "Meetings" and crawl for webcast links.

    Args:
        homepage_url: Absolute URL of the site's homepage.

    Returns:
        None. Stops early if a page cannot be fetched, and prints a message
        if the "Regents" or "Meetings" link cannot be found.
    """
    soup = fetch_and_parse(homepage_url)
    if not soup:
        return

    #navigate to the "Regents" page
    regents_page_url = _find_link_by_text(soup, homepage_url, 'regents')
    if not regents_page_url:
        print("Regents page not found.")
        return

    #navigate to the "Meetings" page from the "Regents" page
    soup = fetch_and_parse(regents_page_url)
    if not soup:
        return

    meetings_page_url = _find_link_by_text(soup, regents_page_url, 'meetings')
    if not meetings_page_url:
        print("Meetings page not found.")
        return

    #start the search for Webcast links
    find_webcast_links(meetings_page_url)

#start from the homepage/source URL
homepage_url = 'https://www.utsystem.edu'

# Run the crawl only when this code is executed directly (a notebook cell or
# a script), so importing the functions elsewhere does not trigger network
# requests.
if __name__ == "__main__":
    navigate_and_crawl(homepage_url)
