<a href="https://colab.research.google.com/github/zaynacheema/Web-Crawler-Texas-Pension-Fund/blob/Dev/Notebooks/Web_Crawler_TRS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#install necessary libraries and packages
!pip install duckduckgo-search requests-html nest-asyncio
!pip install requests-html

In [None]:
# This cell searches DuckDuckGo (via the DDGS client) for a query -- here, TRS board meeting webcasts --
# and collects the URLs of the top results. Each result page is then fetched and JavaScript-rendered with
# AsyncHTMLSession, and the rendered HTML is parsed with BeautifulSoup to find links containing "mediasite".
# The search itself is synchronous; the page fetches run concurrently. Finally, it prints and returns
# any Mediasite links found in the top search results.

In [None]:
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
import nest_asyncio
import asyncio
from pprint import pprint
import random
import time

#to allow asyncio to work in Colab
nest_asyncio.apply()

def search_duckduckgo(query, max_results=5):
    """Run a DuckDuckGo text search and return the result URLs.

    Args:
        query: Search string passed to ``DDGS.text``.
        max_results: Upper bound on results requested from the search.

    Returns:
        A list with the ``'href'`` value of each result, in the order
        the search returned them. Each URL is also printed.
    """
    with DDGS() as client:
        hits = list(client.text(query, max_results=max_results))

    print(f"Search results for: {query}")

    found_urls = []
    for hit in hits:
        target = hit['href']
        print(f"Result URL: {target}")
        found_urls.append(target)
    return found_urls

async def fetch_mediasite_links(url):
    """Fetch *url*, render its JavaScript, and collect Mediasite links.

    Every ``<a>`` tag in the rendered HTML is inspected, and any ``href``
    containing the substring ``'mediasite'`` is kept and printed.

    This is best-effort: any exception raised while fetching, rendering,
    or parsing is printed and an empty list is returned instead of raising.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of matching ``href`` strings exactly as they appear in the
        page (possibly empty).
    """
    session = AsyncHTMLSession()
    try:
        response = await session.get(url)
        # Render the page's JavaScript before parsing the HTML.
        await response.html.arender()
        soup = BeautifulSoup(response.html.html, 'html.parser')
        mediasite_links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and 'mediasite' in href:
                mediasite_links.append(href)
                print('Mediasite link found:', href)
        if not mediasite_links:
            print(f"No Mediasite links found in {url}")
        return mediasite_links
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []
    finally:
        # Always release the session (and the headless browser arender()
        # may have started); otherwise each call leaks one. Cleanup
        # failures are reported but must not override the result above.
        try:
            await session.close()
        except Exception as close_err:
            print(f"Error closing session for {url}: {close_err}")

async def main(urls):
    """Scan all *urls* concurrently and return one flat list of links.

    Args:
        urls: Iterable of page addresses to pass to
            ``fetch_mediasite_links``.

    Returns:
        The per-page link lists concatenated in the same order as *urls*.
    """
    per_page = await asyncio.gather(*(fetch_mediasite_links(u) for u in urls))

    # Flatten the list of per-page lists while keeping their order.
    combined = []
    for page_links in per_page:
        combined.extend(page_links)
    return combined

def search_and_crawl(query, max_results=3):
    """Search for *query* and scan each result page for Mediasite links.

    Args:
        query: Search string handed to ``search_duckduckgo``.
        max_results: Maximum number of search results to scan.

    Returns:
        A flat list of Mediasite links found across the result pages, or
        an empty list when the search returns no URLs.
    """
    candidate_urls = search_duckduckgo(query, max_results)

    if candidate_urls:
        print(f"Fetching Mediasite links from top {len(candidate_urls)} results...")
        # Drive the async crawl to completion from this synchronous function.
        return asyncio.run(main(candidate_urls))

    print(f"No results found for query: {query}")
    return []

#usage: search for TRS board meeting webcasts, crawl the top results,
#and pretty-print every Mediasite link that was found
query = 'TRS board meeting webcasts'
result = search_and_crawl(query)
pprint(result)

In [None]:
#This cell is a simplified, single-page variant of the code above: it skips the search step and scans one known page,
#in this case the board meeting webcasts page of TRS (the Teacher Retirement System of Texas).
#It goes through the rendered page, finds every link whose URL contains "mediasite" (videos hosted on Mediasite Player), and returns those URLs.

In [None]:
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import nest_asyncio
import asyncio

#to allow asyncio to work in Jupyter environments like Colab
nest_asyncio.apply()

async def fetch_mediasite_links(url):
    """Fetch *url*, render its JavaScript, and collect Mediasite links.

    Every ``<a>`` tag in the rendered HTML is inspected, and any ``href``
    containing the substring ``'mediasite'`` is kept and printed.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of matching ``href`` strings exactly as they appear in the
        page (possibly empty).

    Raises:
        Any exception raised while fetching, rendering, or parsing the
        page propagates to the caller; the session is still closed first.
    """
    session = AsyncHTMLSession()
    try:
        #make request to the page
        response = await session.get(url)

        #render the JavaScript
        await response.html.arender()

        #parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(response.html.html, 'html.parser')

        #extract Mediasite links
        mediasite_links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and 'mediasite' in href:
                mediasite_links.append(href)
                print('Mediasite link found:', href)

        if not mediasite_links:
            print("No Mediasite links found.")
        return mediasite_links
    finally:
        #always release the session (and the headless browser arender()
        #may have started); otherwise every call leaks one
        await session.close()

#thin async entry point: scans a single page for Mediasite links
async def main(url):
    """Await ``fetch_mediasite_links`` for *url* and return its result."""
    links = await fetch_mediasite_links(url)
    return links

#execute: scan the TRS board meeting webcasts page for Mediasite links
url = 'https://www.trs.texas.gov/Pages/board_meeting_webcasts.aspx'
#NOTE: top-level await only works inside an IPython/Jupyter (e.g. Colab) cell;
#in a plain Python script this line is a syntax error
result = await main(url)
