In [9]:
import datetime
import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
import json
import os
import pathlib
import time
from typing import List, Tuple

In [10]:
from_page = 1
to_page = 4
link_file = pathlib.Path('./data/linklist.json')

In [34]:
def load_known_link_ids(link_file: str) -> Tuple[List[str], List[dict]]:
    """Loads known link IDs from a file and returns them as a list.

    If the file does not exist, it creates an empty file.

    Args:
        link_file (str): The file to load the link IDs from.

    Returns:
        Tuple[List[str], List[dict]]: A tuple containing the known link IDs and the link list.
    """
    known_link_ids = []
    link_list = []
    if os.path.exists(link_file):
        with open(link_file) as f:
            try:
                link_list = json.load(f)
                for linfo in link_list:
                    known_link_ids.append(linfo['id'])
            except json.JSONDecodeError:
                print(f'Link file {link_file} is empty')
    else:
        with open(link_file, 'w') as j:
            j.close()
    return known_link_ids, link_list


In [54]:
def scrape_links(from_page: int, to_page: int, link_file: str) -> None:
    """Scrapes links from a website and appends them to a file.

    The function makes a GET request to the website's search API for each page between `from_page` and `to_page` inclusive.
    If a link has not been seen before, it is added to the file. The function also outputs a message for every 10th page it
    processes.

    Args:
        from_page (int): The first page to scrape links from.
        to_page (int): The last page to scrape links from.
        link_file (str): The file to append the new links to.
    """
    known_link_ids, link_list = load_known_link_ids(link_file)

    has_results = True
    i = from_page
    while has_results and i <= to_page:
        new_link_list = []  
        time.sleep(2)
        search_response = requests.get(f'https://magic.wizards.com/en/news/archive?search&page={i}&category=all&author=all&order=newest')
        page_result = search_response.text
        soup = BeautifulSoup(page_result, 'html.parser')

        # Check if page is empty
        entry = soup.find(class_="css-36asz")
        if entry:
            print(f'Response from page {i} :'+entry.text.strip())
            has_results = False
            break

        # Find all entries on the page
        entry_list = soup.find_all(class_="css-9f4rq")
        for entry in entry_list:
            parent = entry.findParent()
            link_id = parent.get('href')
            if link_id not in known_link_ids:
                link_info = {
                    'id': link_id,
                    'link': 'https://magic.wizards.com' + link_id,
                    'name': parent.text,
                    'date_added': datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S"),
                    'date_downloaded': ''
                }
                print(f'Adding {link_id}')
                new_link_list.append(link_info)
            else:
                print(f'Link with Id {link_id} already exists')

        link_list = link_list + new_link_list

        if i % 10 == 0:
            print(f'Page: {i}')
            print(search_response.status_code)
        i += 1

        
    with open(link_file, 'w') as j:
        json.dump(link_list, j, indent=4)
    

In [55]:
# def main():
#     # The file to store the links in
#     link_file = './data/links.json'
#     # The first page to scrape links from
#     from_page = 1
#     # The last page to scrape links from
#     to_page = 50
#     # Scrape the links from the website
#     scrape_links(from_page, to_page, link_file)
#     # Load the known link IDs from the file
#     known_link_ids, link_list = load_known_link_ids(link_file)

# if __name__ == '__main__':
#     # Call the main method when the module is executed as a script
#     main()

In [59]:
def test_scrape_links():
    # Test data
    link_file = './test_data/test_links.json'
    from_page = 1
    to_page = 2
    
    # Call the scrape_links method with the test data
    scrape_links(from_page, to_page, link_file)
    
    # Assert that the link file was created
    assert os.path.exists(link_file)
    
    # Load the links from the file
    with open(link_file) as f:
        links = json.load(f)
    
    # Assert that the correct number of links was scraped
    assert len(links) == 10 # 10 links are expected to be scraped from 2 pages

def scrape_test():
    test_scrape_links()



In [62]:
scrape_test()

Link with Id /en/news/announcements/get-a-phyrexian-click-wheel-gift-from-your-local-game-store already exists
Link with Id /en/news/mtg-arena/alchemy-phyrexia-card-image-gallery already exists
Link with Id /en/news/mtg-arena/mtg-arena-announcements-february-27-2023 already exists
Link with Id /en/news/making-magic/one-thousand-one-hundred-and-counting already exists
Link with Id /en/news/announcements/on-june-23-experience-middle-earth-like-never-before already exists
Link with Id /en/news/announcements/commander-masters-arrives-august-4-2023 already exists
Link with Id /en/news/mtg-arena/mtg-arena-announcements-february-20-2023 already exists
Link with Id /en/news/mtg-arena/arena-open-phyrexia-all-will-be-one-march-4-5 already exists
Link with Id /en/news/making-magic/phyrexia-all-will-be-one-vision-design-handoff-document-part-2 already exists
Link with Id /en/news/announcements/a-first-look-at-march-of-the-machine already exists


In [63]:
def download_article_html(link: str) -> bool:
    # Use a flag to track the success of the download
    success = False
    
    try:
        response = requests.get(link)
        response.raise_for_status()
        file_name = './data/raw_html/' + link.split('/')[-1]
        with open(file_name, 'w') as f:
            f.write(response.text)
    except requests.exceptions.HTTPError as ex:
        print(f"Error downloading '{link}': {ex}")
    except Exception as ex:
        print(f"Error writing to file '{file_name}': {ex}")
    else:
        success = True
        print(f"Download of '{link}' was successful")
    
    return success

In [64]:
def download_html_files(link_file: str, skip_existing: bool = False):
    """
    Downloads the HTML files for all links in the link file.
    
    Args:
        link_file: The file that contains the links to download.
        skip_existing: Whether to skip links that have already been downloaded.
    
    Returns:
        None
    """
    # The temporary file to store the links in
    link_file_tmp = link_file + '.tmp'
    # Open the link file for reading
    with open(link_file) as lr:
        # The number of links that have been downloaded
        count = 0
        # Load the links from the file
        links = json.load(lr)
        # Loop over the links
        for link_info in links:
            # Get the link
            link = link_info['link']
            # Check if the link has already been downloaded
            if link_info['date_downloaded'] == '' and not skip_existing:
                # Download the HTML file for the link
                download_article_html(link)
                # Update the date the link was downloaded
                link_info['date_downloaded'] = datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
                # Increment the number of links that have been downloaded
                count += 1
            else:
                print(f'Link {link} already downloaded')
        
        # Write the updated links to the temporary file
        with open(link_file_tmp, 'w') as lw:
            json.dump(links, lw, indent=4) 
        # Replace the original link file with the temporary file
        shutil.move(link_file_tmp, link_file)
        print(f'Download of {count} html files successful')