In [43]:
# Root URL of the site being crawled; used to spot external links and to
# resolve relative hrefs into absolute URLs.
DOMAIN = "https://lahs.mvla.net/"
# CSS class of the <a> tags whose hrefs are collected from each page.
LINK_CLASS_NAME = "dropdown-item"
# Directory that receives one .txt file per scraped page.
SAVE_DIR = "../documents/"
# The crawl loop stops once SAVE_DIR contains this many entries.
MAX_DOCS = 150

In [44]:
import os

#Helper functions for dealing with the different files

def read_urls(file_path):
    """Read a list of URLs from a text file, one URL per line.

    Leading and trailing whitespace is stripped from every line. Blank lines
    are skipped so that a stray empty line (for example one added by hand at
    the end of the file) does not turn into an empty-string entry in the
    returned list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of the non-empty, stripped lines in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(file_path, 'r') as file:
        # Iterate the file object directly rather than materialising every
        # line with readlines() first.
        stripped_lines = (line.strip() for line in file)
        return [url for url in stripped_lines if url]


def write_urls(file_path, urls):
    """Overwrite ``file_path`` with the given URLs, one per line.

    Args:
        file_path: Path of the text file to (re)create.
        urls: Iterable of URLs; each one is written followed by a newline.
    """
    with open(file_path, 'w') as out:
        out.writelines(f"{entry}\n" for entry in urls)


def save_doc(url, content):
    """Save the text of a scraped page as a UTF-8 file under SAVE_DIR.

    The file name is derived from the URL: every 'http://' and 'https://'
    occurrence is removed, every '/' is replaced by '_', and '.txt' is
    appended.

    Args:
        url: URL the content was scraped from; determines the file name.
        content: Text to write to the file.
    """
    file_name = url.replace('http://', '').replace('https://', '').replace('/', '_') + '.txt'

    # Create the output directory on first use so that a fresh checkout does
    # not fail with FileNotFoundError when the file is opened below.
    os.makedirs(SAVE_DIR, exist_ok=True)
    file_path = os.path.join(SAVE_DIR, file_name)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)


In [45]:
# URLs recorded in done.txt; scrape_page appends to this list after a page is saved.
done = read_urls("done.txt")
# URLs recorded in pending.txt; this is the queue the crawl loop pops from.
pending = read_urls("pending.txt")

In [46]:
from urllib.parse import urljoin

def is_valid(link):
    """Decide whether an href should be queued for scraping.

    Args:
        link: Raw ``href`` value of an anchor tag. May be ``None`` when the
            tag has no ``href`` attribute.

    Returns:
        ``False`` when the href is missing or empty, is a ``mailto:`` or
        ``tel:`` link, contains 'http' without containing DOMAIN (external
        link), contains 'pdf', or contains 'javascript'. ``True`` otherwise.
    """
    # Anchors without an href yield None; without this guard the substring
    # checks below would raise TypeError.
    if not link:
        return False

    # Non-web schemes cannot be fetched as pages.
    if link.startswith(('mailto:', 'tel:')):
        return False

    if 'http' in link and DOMAIN not in link:  # External link
        return False
    if 'pdf' in link:  # Not a webpage
        return False
    if 'javascript' in link:
        return False

    return True

def get_links(soup):
    """Queue every new, valid link found on a parsed page.

    Looks at each <a> tag carrying the LINK_CLASS_NAME class, keeps the hrefs
    accepted by is_valid, turns relative ones into absolute URLs based on
    DOMAIN, and appends those not already in ``done`` or ``pending`` to
    ``pending``. The pending list is then written back to pending.txt.

    Args:
        soup: Parsed page to search for anchor tags.
    """
    for anchor in soup.find_all('a', class_=LINK_CLASS_NAME):
        href = anchor.get('href')
        if not is_valid(href):
            continue

        # Hrefs without 'http' are treated as relative and resolved against
        # the site root.
        absolute = href if 'http' in href else urljoin(DOMAIN, href)

        if absolute not in done and absolute not in pending:
            pending.append(absolute)

    write_urls("pending.txt", pending)

In [47]:
import requests
from bs4 import BeautifulSoup

def scrape_page(url, timeout=10):
    """Download one page, queue its links, and save its visible text.

    On success the URL is appended to ``done`` and done.txt is rewritten.
    Any exception raised along the way is caught and reported with print()
    so that a single bad page does not stop the crawl.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection could block the crawl indefinitely.

    Returns:
        True if the page was scraped and saved, False if any step failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect links before the header elements are removed below, since
        # decompose() deletes them from the tree.
        # NOTE(review): presumably the dropdown links live inside the
        # 'top-bar-header' elements - confirm against the site markup.
        get_links(soup)

        # Drop the header so its text is not repeated in every saved document.
        for nav in soup.find_all(class_='top-bar-header'):
            nav.decompose()

        save_doc(url, soup.get_text(separator='\n', strip=True))
        done.append(url)
        write_urls("done.txt", done)
        return True
    except Exception as e:
        # Deliberately broad: the crawl is best-effort and moves on to the
        # next URL after reporting the failure.
        print(f"Error scraping {url}: {e}")
        return False

In [48]:
# Make sure the output directory exists before its entries are counted;
# os.listdir raises FileNotFoundError on a missing directory.
os.makedirs(SAVE_DIR, exist_ok=True)

# Crawl in FIFO order until the queue is empty or SAVE_DIR holds MAX_DOCS entries.
while pending and len(os.listdir(SAVE_DIR)) < MAX_DOCS:
    url = pending.pop(0)
    scrape_page(url)