In [None]:
#install necessary libraries and packages
!pip install duckduckgo-search requests-html nest-asyncio
!pip install requests-html

Collecting duckduckgo-search
  Downloading duckduckgo_search-6.2.11-py3-none-any.whl.metadata (24 kB)
Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting primp>=0.6.1 (from duckduckgo-search)
  Downloading primp-0.6.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.1-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests-html)
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting bs4 (from requests-html)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting w3lib (from requests-html)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting appdir

In [None]:
# This cell searches DuckDuckGo for a query (here, "TRS board meeting webcasts") using the DDGS class from the
# duckduckgo_search library and collects the result URLs. It then fetches and renders each page with AsyncHTMLSession
# and parses the rendered HTML with BeautifulSoup to find Mediasite video links. The pages are fetched concurrently
# with asyncio. Finally, it returns every Mediasite link found in the top search results and pretty-prints them.

In [None]:
from duckduckgo_search import DDGS
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
import nest_asyncio
import asyncio
from pprint import pprint
import random
import time

# Colab/Jupyter kernels already have an asyncio event loop running. Patching it with
# nest_asyncio lets the asyncio.run() call in search_and_crawl() work from inside that loop.
nest_asyncio.apply()

def search_duckduckgo(query, max_results=5):
    """Run a DuckDuckGo text search and return the result URLs.

    Args:
        query: Search string passed to ``DDGS.text``.
        max_results: Upper bound on the number of results requested.

    Returns:
        A list with the ``'href'`` value of every result, in the order
        the search returned them. Each URL is also printed.
    """
    # The client is only needed while the results are being pulled.
    with DDGS() as client:
        hits = [hit for hit in client.text(query, max_results=max_results)]

    print(f"Search results for: {query}")

    found = []
    for hit in hits:
        target = hit['href']
        print(f"Result URL: {target}")
        found.append(target)
    return found

async def fetch_mediasite_links(url):
    """Render ``url`` and return every anchor href that mentions Mediasite.

    The page is fetched with an ``AsyncHTMLSession``, its JavaScript is
    rendered, and the rendered HTML is scanned for ``<a>`` tags whose
    ``href`` contains "mediasite" (matched case-insensitively, so paths
    such as ``/Mediasite/Play/...`` are found as well).

    This is best-effort: any error while fetching, rendering or parsing is
    printed and an empty list is returned instead of raising.

    Args:
        url: Address of the page to crawl.

    Returns:
        A list of matching href strings in document order (possibly empty).
    """
    session = AsyncHTMLSession()
    try:
        response = await session.get(url)
        # Render the page's JavaScript so dynamically inserted links are present.
        await response.html.arender()
        soup = BeautifulSoup(response.html.html, 'html.parser')
        mediasite_links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            # Compare on a lower-cased copy, but keep the original href text.
            if href and 'mediasite' in href.lower():
                mediasite_links.append(href)
                print('Mediasite link found:', href)
        if not mediasite_links:
            print(f"No Mediasite links found in {url}")
        return mediasite_links
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []
    finally:
        # Always release the session (and the browser arender() may have started);
        # otherwise every crawled URL leaves a browser process behind.
        try:
            await session.close()
        except Exception as close_error:
            # Keep the never-raises contract even if cleanup itself fails.
            print(f"Error closing session for {url}: {close_error}")

async def main(urls):
    """Crawl all ``urls`` concurrently and merge the links they yield.

    Args:
        urls: Iterable of page addresses handed to ``fetch_mediasite_links``.

    Returns:
        One flat list holding each page's links, pages kept in input order.
    """
    per_page = await asyncio.gather(*(fetch_mediasite_links(page) for page in urls))

    # Flatten the list-of-lists returned by gather().
    merged = []
    for page_links in per_page:
        merged.extend(page_links)
    return merged

def search_and_crawl(query, max_results=3):
    """Search DuckDuckGo for ``query`` and crawl the hits for Mediasite links.

    Args:
        query: Search string.
        max_results: How many search results to crawl.

    Returns:
        The flat list of links produced by ``main`` for the result URLs, or
        an empty list when the search returned nothing.
    """
    urls = search_duckduckgo(query, max_results)

    if urls:
        print(f"Fetching Mediasite links from top {len(urls)} results...")
        # Drive the async crawl to completion from this synchronous function.
        return asyncio.run(main(urls))

    print(f"No results found for query: {query}")
    return []

# Example usage: search for the TRS board-meeting webcast pages, crawl the top
# results (search_and_crawl defaults to max_results=3), and pretty-print the links found.
query = 'TRS board meeting webcasts'
result = search_and_crawl(query)
pprint(result)

Search results for: TRS board meeting webcasts
Result URL: https://www.trs.texas.gov/Pages/board_meeting_webcasts.aspx
Result URL: https://www.trs.texas.gov/Pages/board_meeting_calendar.aspx
Result URL: https://www.trs.texas.gov/Pages/board.aspx
Fetching Mediasite links from top 3 results...


[INFO] Starting Chromium download.
INFO:pyppeteer.chromium_downloader:Starting Chromium download.
100%|██████████| 183M/183M [00:06<00:00, 26.5Mb/s]
[INFO] Beginning extraction
INFO:pyppeteer.chromium_downloader:Beginning extraction
[INFO] Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205
INFO:pyppeteer.chromium_downloader:Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205


No Mediasite links found in https://www.trs.texas.gov/Pages/board_meeting_calendar.aspx
Mediasite link found: https://urldefense.com/v3/__https:/trs.mediasite.com/mediasite/Play/f2611598c594445e98b5ca03f4542db41d__;!!KATJWcSg8mjg!-W4mdjR7GTuCKBzwBfbJIkIf3eyrsOh_AglLL4Kzz1635seKeYNUI3N4SeoUhjs4Qy-M7hSxcKBACQy0Y8tCNp0tYHgCN0R1DOqc1w$
Mediasite link found: https://urldefense.com/v3/__https:/trs.mediasite.com/mediasite/Play/812d17a7f17a4fcea1cc4940b758e6d31d__;!!KATJWcSg8mjg!-W4mdjR7GTuCKBzwBfbJIkIf3eyrsOh_AglLL4Kzz1635seKeYNUI3N4SeoUhjs4Qy-M7hSxcKBACQy0Y8tCNp0tYHgCN0RvfA1SuQ$
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/trsjuly/browse/null/title-az/null/0/77e5dbd7e3974351a3593b82d8de604f14
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/trsmaymeeting2024/browse/null/title-az/null/0/c0f455d2465c4a66ae3bf37ac7389f2e14
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/20242dayboardmeetinglive/browse/null/title-az/null/0/b5fcba3f698741bb9

In [None]:
# This cell is a simplified, single-page version of the code above: instead of searching first, it takes one page URL
# (here, the Teacher Retirement System of Texas (TRS) board meeting webcasts page), finds every link on that page that
# points to Mediasite-hosted video, and returns those URLs.

In [None]:
from requests_html import AsyncHTMLSession
from bs4 import BeautifulSoup
import nest_asyncio
import asyncio

# Jupyter environments like Colab already run an asyncio event loop in the kernel;
# nest_asyncio patches asyncio so nested use of that loop is allowed.
nest_asyncio.apply()

async def fetch_mediasite_links(url):
    """Render one page and return every anchor href that mentions Mediasite.

    The "mediasite" match is case-insensitive, so paths such as
    ``/Mediasite/Play/...`` are found as well. Errors from fetching,
    rendering or parsing are NOT caught here; they propagate to the caller.
    The session is closed in every case.

    Note: this definition replaces the earlier ``fetch_mediasite_links`` in
    the notebook namespace once this cell has run.

    Args:
        url: Address of the page to crawl.

    Returns:
        A list of matching href strings in document order (possibly empty).
    """
    session = AsyncHTMLSession()
    try:
        #make request to the page
        response = await session.get(url)

        #render the JavaScript
        await response.html.arender()

        #parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(response.html.html, 'html.parser')

        #extract Mediasite links (compare lower-cased, keep the original href text)
        mediasite_links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            if href and 'mediasite' in href.lower():
                mediasite_links.append(href)
                print('Mediasite link found:', href)

        if not mediasite_links:
            print("No Mediasite links found.")
        return mediasite_links
    finally:
        #always release the session (and the browser arender() may have started),
        #even when an error is propagating
        await session.close()

#thin async entry point: awaits the single-page crawl and passes its result through
#NOTE: this redefines the earlier main(urls); re-run that cell before using search_and_crawl again
async def main(url):
    """Return the Mediasite links that ``fetch_mediasite_links`` finds on ``url``."""
    links = await fetch_mediasite_links(url)
    return links

#execute against the TRS board meeting webcasts page
#top-level `await` is used directly in this notebook cell; note that `result`
#overwrites the value stored by the earlier search-and-crawl cell
url = 'https://www.trs.texas.gov/Pages/board_meeting_webcasts.aspx'
result = await main(url)


Mediasite link found: https://urldefense.com/v3/__https:/trs.mediasite.com/mediasite/Play/f2611598c594445e98b5ca03f4542db41d__;!!KATJWcSg8mjg!-W4mdjR7GTuCKBzwBfbJIkIf3eyrsOh_AglLL4Kzz1635seKeYNUI3N4SeoUhjs4Qy-M7hSxcKBACQy0Y8tCNp0tYHgCN0R1DOqc1w$
Mediasite link found: https://urldefense.com/v3/__https:/trs.mediasite.com/mediasite/Play/812d17a7f17a4fcea1cc4940b758e6d31d__;!!KATJWcSg8mjg!-W4mdjR7GTuCKBzwBfbJIkIf3eyrsOh_AglLL4Kzz1635seKeYNUI3N4SeoUhjs4Qy-M7hSxcKBACQy0Y8tCNp0tYHgCN0RvfA1SuQ$
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/trsjuly/browse/null/title-az/null/0/77e5dbd7e3974351a3593b82d8de604f14
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/trsmaymeeting2024/browse/null/title-az/null/0/c0f455d2465c4a66ae3bf37ac7389f2e14
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/20242dayboardmeetinglive/browse/null/title-az/null/0/b5fcba3f698741bb94a64336cfdbe27a14
Mediasite link found: https://trs.mediasite.com/mediasite/Channel/300a