In [None]:
import datetime
import json
import os
import pathlib
import time
from typing import List

import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup

In [None]:
# Notebook-level settings: first and last page number of the scrape window.
from_page, to_page = 1, 4
# Location of the JSON link file, kept as a pathlib.Path.
link_file = pathlib.Path('./data/linklist.json')

In [None]:
def load_known_link_ids(link_file: str) -> List[str]:
    """Load the IDs of all links already recorded in ``link_file``.

    The file is read as JSON and every record's ``'id'`` value is collected.
    If the file does not exist, an empty file is created (together with any
    missing parent directories) and an empty list is returned.

    Args:
        link_file (str): Path of the JSON link file. Path-like objects such
            as ``pathlib.Path`` are accepted as well.

    Returns:
        List[str]: The ``'id'`` of every record, in file order. Empty when
            the file was missing or does not contain valid JSON.

    Raises:
        KeyError: If a record in the file has no ``'id'`` key.
    """
    if not os.path.exists(link_file):
        # Create the parent directory first; open(..., 'w') alone fails with
        # FileNotFoundError when the directory does not exist yet.
        parent_dir = os.path.dirname(link_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # The context manager closes the file; nothing is written on purpose.
        with open(link_file, 'w'):
            pass
        return []

    with open(link_file) as f:
        try:
            link_list = json.load(f)
        except json.JSONDecodeError:
            # A zero-byte file (e.g. one created by the branch above) ends up here.
            print(f'Link file {link_file} is empty')
            return []
    return [linfo['id'] for linfo in link_list]


In [None]:
def _read_link_records(link_file):
    """Return the records stored in ``link_file``, or ``[]`` if it is missing, empty or not valid JSON."""
    try:
        with open(link_file) as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []


def scrape_links(from_page: int, to_page: int, link_file: str) -> None:
    """Scrape article links from the archive search and store them in ``link_file``.

    Requests every archive page from ``from_page`` to ``to_page`` (inclusive),
    sleeping two seconds before each request. For each matching entry whose
    ``href`` is not yet known, a record with the keys ``id``, ``link``,
    ``name``, ``date_added`` and ``date_downloaded`` is created. After every
    page the complete record list (records already in the file plus all
    records found so far) is written back to ``link_file`` as JSON.
    Scraping stops early as soon as a page contains the element that is
    treated as the "no results" marker. A progress message and the HTTP
    status code are printed for every 10th page.

    Args:
        from_page (int): The first page to scrape links from.
        to_page (int): The last page to scrape links from.
        link_file (str): The JSON file the link records are stored in.
    """
    # load_known_link_ids also creates the file when it is missing.
    # A set makes the duplicate check below a constant-time lookup.
    known_link_ids = set(load_known_link_ids(link_file))
    # The file is rewritten (not appended to) after each page, so the
    # records that are already on disk have to be carried along.
    link_list = _read_link_records(link_file)

    i = from_page
    while i <= to_page:
        time.sleep(2)  # pause between requests
        search_response = requests.get(
            f'https://magic.wizards.com/en/news/archive?search&page={i}&category=all&author=all&order=newest',
            timeout=30,  # without a timeout a stalled connection blocks forever
        )
        soup = BeautifulSoup(search_response.text, 'html.parser')

        # Check if page is empty
        # NOTE(review): presumably this class only occurs on a page without
        # results - confirm against the site's current markup.
        entry = soup.find(class_="css-36asz")
        if entry:
            print(f'Response from page {i} :'+entry.text.strip())
            break

        # Find all entries on the page
        new_link_list = []
        for entry in soup.find_all(class_="css-9f4rq"):
            parent = entry.find_parent()
            link_id = parent.get('href') if parent is not None else None
            if not link_id:
                # No href to build a link from; concatenating None below
                # would raise TypeError.
                continue
            if link_id in known_link_ids:
                print(f'Link with Id {link_id} already exists')
                continue
            link_info = {
                'id': link_id,
                'link': 'https://magic.wizards.com' + link_id,
                'name': parent.text,
                'date_added': datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
                'date_downloaded': ''
            }
            print(f'Adding {link_id}')
            new_link_list.append(link_info)
            # Remember the ID so the same link is not added twice in one run.
            known_link_ids.add(link_id)

        # Persist everything collected so far; writing after each page keeps
        # the results of earlier pages if a later request fails.
        link_list.extend(new_link_list)
        with open(link_file, 'w') as j:
            json.dump(link_list, j, indent=4)

        if i % 10 == 0:
            print(f'Page: {i}')
            print(search_response.status_code)
        i += 1

In [None]:
def main():
    """Scrape pages 1 to 50 into ``./data/links.json`` and report the number of stored links."""
    # The file to store the links in
    link_file = './data/links.json'
    # The first page to scrape links from
    from_page = 1
    # The last page to scrape links from
    to_page = 50
    # Scrape the links from the website
    scrape_links(from_page, to_page, link_file)
    # Load the known link IDs from the file. The loader defined in this
    # notebook is load_known_link_ids and it returns a single list of IDs.
    known_link_ids = load_known_link_ids(link_file)
    print(f'{len(known_link_ids)} links stored in {link_file}')


if __name__ == '__main__':
    # Call the main method when the module is executed as a script
    main()

In [None]:
def test_scrape_links():
    """Scrape two pages into a fresh test file and check the number of stored links.

    This test calls ``scrape_links`` for real, so it needs network access.
    """
    # Test data
    link_file = './test_data/test_links.json'
    from_page = 1
    to_page = 2

    # Start from a clean state: a file left over from an earlier run would
    # add its old records to the count, and a missing directory would make
    # the file creation fail.
    os.makedirs(os.path.dirname(link_file), exist_ok=True)
    if os.path.exists(link_file):
        os.remove(link_file)

    # Call the scrape_links method with the test data
    scrape_links(from_page, to_page, link_file)

    # Assert that the link file was created
    assert os.path.exists(link_file)

    # Load the links from the file
    with open(link_file) as f:
        links = json.load(f)

    # Assert that the correct number of links was scraped
    # 20 links are expected to be scraped from 2 pages
    assert len(links) == 20, f'expected 20 links, found {len(links)}'

def test_load_link_ids():
    """Check that the test link file yields 20 IDs and holds 20 records.

    Reads the same file that ``test_scrape_links`` writes, so that test has
    to run first.
    """
    # Test data
    link_file = './test_data/test_links.json'

    # Load the link IDs through the notebook's loader, which returns a
    # single list of IDs.
    known_link_ids = load_known_link_ids(link_file)

    # Load the full records straight from the file
    with open(link_file) as f:
        links = json.load(f)

    # Assert that the correct number of link IDs was loaded
    assert len(known_link_ids) == 20 # 20 links are expected to be in the file
    assert len(links) == 20 # 20 links are expected to be in the file

def scrape_test():
    """Run the scraper checks in order: the scrape test first, then the load test."""
    for check in (test_scrape_links, test_load_link_ids):
        check()


In [None]:
def main():
    """Run the scrape tests, then scrape pages 1 to 50 into ``./data/links.json``.

    This definition replaces the ``main`` defined in an earlier cell; the
    difference is the ``scrape_test()`` call before scraping.
    """
    # The file to store the links in
    link_file = './data/links.json'
    # The first page to scrape links from
    from_page = 1
    # The last page to scrape links from
    to_page = 50
    # Run the scrape tests
    scrape_test()
    # Scrape the links from the website
    scrape_links(from_page, to_page, link_file)
    # Load the known link IDs from the file. The loader defined in this
    # notebook is load_known_link_ids and it returns a single list of IDs.
    known_link_ids = load_known_link_ids(link_file)
    print(f'{len(known_link_ids)} links stored in {link_file}')


if __name__ == '__main__':
    # Call the main method when the module is executed as a script
    main()

In [1]:
# Run the scraper self-checks. scrape_test has to be defined in the kernel
# already: executing this cell before the cell that defines it raises NameError.
scrape_test()

NameError: name 'scrape_test' is not defined