## Scrape https://magic.wizards.com/en/news/archive and collect the articles

Imports

In [None]:
import json
import pathlib
import shutil
import datetime
import requests


In [None]:
# Notebook-level settings: the JSON link list, a sibling '.tmp' path derived
# from it, and a flag for skipping already-downloaded links.
# NOTE(review): the functions visible in this notebook take their own
# parameters / build their own temp path — confirm where these are consumed.
link_file = pathlib.Path('data') / 'linklist.json'
link_file_tmp = link_file.with_name(link_file.name + '.tmp')
skip_existing = True

In [None]:
def download_article_html(link: str) -> bool:
    """
    Downloads the page behind a link and stores it under ./data/raw_html/.

    The file name is the last path segment of the link (a trailing '/' is
    ignored so the name is never empty).

    Args:
        link: The URL of the article to download.

    Returns:
        True if the page was fetched and written to disk, False otherwise.
        Failures are reported with print() and are not raised.
    """
    # Compute the target name up front so it is always defined when an
    # error message needs it.
    file_name = './data/raw_html/' + link.rstrip('/').split('/')[-1]

    # Step 1: fetch the page. RequestException covers HTTP error statuses
    # as well as connection problems and timeouts.
    try:
        # A timeout keeps a stalled server from blocking the whole run
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as ex:
        print(f"Error downloading '{link}': {ex}")
        return False

    # Step 2: write the page to disk. An explicit encoding avoids
    # platform-dependent failures on non-ASCII article text.
    try:
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(response.text)
    except (OSError, UnicodeError) as ex:
        print(f"Error writing to file '{file_name}': {ex}")
        return False

    print(f"Download of '{link}' was successful")
    return True

In [None]:
def download_html_files(link_file: str, skip_existing: bool = False):
    """
    Downloads the HTML files for the links in the link file.

    The link file is a JSON list of objects, each with a 'link' URL and a
    'date_downloaded' string (empty when the link has not been downloaded).
    After processing, the updated list is written to '<link_file>.tmp' and
    then moved over the original link file.

    Args:
        link_file: The file that contains the links to download. May be a
            string or a path-like object such as pathlib.Path.
        skip_existing: If True, links that already have a download date are
            skipped. If False, every link is downloaded (again).

    Returns:
        None
    """
    # Normalise to str so both plain strings and pathlib.Path are accepted
    link_path = str(link_file)
    # The temporary file to store the links in
    link_file_tmp = link_path + '.tmp'

    # Load the links and close the file again before it gets replaced below
    with open(link_path) as lr:
        links = json.load(lr)

    # The number of links that have been downloaded successfully
    count = 0
    for link_info in links:
        link = link_info['link']

        # Skip links with a download date only when the caller asked for it
        if skip_existing and link_info.get('date_downloaded'):
            print(f'Link {link} already downloaded')
            continue

        # A failed download must not be stamped or counted, so it is
        # retried on the next run.
        if not download_article_html(link):
            continue

        link_info['date_downloaded'] = datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S")
        count += 1

    # Write to a temporary file first, then replace the original, so an
    # interrupted write does not leave a truncated link file behind.
    with open(link_file_tmp, 'w') as lw:
        json.dump(links, lw, indent=4)
    shutil.move(link_file_tmp, link_path)
    print(f'Download of {count} html files successful')
        

       
    

In [None]:
def main():
    """
    Entry point: look up the known link IDs, then download the articles.

    NOTE(review): load_known_link_ids and download_articles are not defined
    in the visible part of this notebook — confirm they exist elsewhere.
    NOTE(review): the path used here ("./data/links.json") differs from the
    module-level link_file setting — confirm which one is intended.
    """
    # Path of the link file handed to both helpers
    link_file = "./data/links.json"
    # Existing articles are not skipped in this run
    skip_existing = False

    known_link_ids = load_known_link_ids(link_file)
    download_articles(link_file, known_link_ids, skip_existing)


# Run main() only when executed as a script, not when imported
if __name__ == '__main__':
    main()