In [2]:
import os
import requests
import shutil
from typing import List, Dict

def download_html(
    url: str, output_folder: str, output_file: str, timeout: float = 30.0
) -> None:
    """Download the page at ``url`` and save its text to ``output_file`` as UTF-8.

    Args:
        url: Address of the HTML page to fetch.
        output_folder: Directory that is created (with parents) before writing.
        output_file: Path of the file to write. Its parent directory is created
            as well, in case it is not ``output_folder``.
        timeout: Value forwarded to ``requests.get(timeout=...)``, in seconds.
            Without one, an unresponsive server can block the call forever.

    Returns:
        None. Failures raised as ``requests.exceptions.RequestException``
        (connection errors, timeouts, HTTP error statuses) are printed to
        stdout instead of being propagated.
    """
    try:
        # Send a GET request to the URL, bounded by the timeout
        response: requests.Response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # If the server declares no charset, requests falls back to ISO-8859-1
        # for text/* bodies, which garbles UTF-8 pages. Use the encoding
        # detected from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding or "utf-8"

        # Ensure the output folder and the file's own parent directory exist
        os.makedirs(output_folder, exist_ok=True)
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Write the HTML content to the file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(response.text)

        print(f"HTML content successfully saved to {output_file}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while downloading the HTML: {e}")

def download_pdf(
    url: str, output_folder: str, output_file: str, timeout: float = 30.0
) -> None:
    """Stream the file at ``url`` to ``output_file`` in binary mode.

    The body is first written to ``<output_file>.part`` and only moved onto
    ``output_file`` once the whole stream has been received, so an interrupted
    transfer never leaves a truncated PDF at the final path.

    Args:
        url: Address of the PDF to fetch.
        output_folder: Directory that is created (with parents) before writing.
        output_file: Final path of the downloaded file. Its parent directory is
            created as well, in case it is not ``output_folder``.
        timeout: Value forwarded to ``requests.get(timeout=...)``, in seconds.
            Without one, an unresponsive server can block the call forever.

    Returns:
        None. Failures raised as ``requests.exceptions.RequestException``
        (connection errors, timeouts, HTTP error statuses, dropped streams)
        are printed to stdout instead of being propagated.
    """
    partial_file = f"{output_file}.part"
    try:
        # With stream=True the connection stays open until the response is
        # closed; the context manager releases it on every exit path.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Ensure the output folder and the file's own parent directory exist
            os.makedirs(output_folder, exist_ok=True)
            parent_dir = os.path.dirname(output_file)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

            # Write the PDF content chunk by chunk to the temporary file
            with open(partial_file, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        # Publish the finished download under its final name
        os.replace(partial_file, output_file)
        print(f"PDF successfully saved to {output_file}")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while downloading the PDF: {e}")
    finally:
        # A .part file still present here belongs to an unfinished transfer
        if os.path.exists(partial_file):
            os.remove(partial_file)

def create_folder(folder_path: str) -> None:
    """Make sure ``folder_path`` exists, creating missing parents as needed.

    The outcome is printed to stdout; any exception raised while creating the
    directory is reported the same way rather than propagated.
    """
    try:
        os.makedirs(folder_path, exist_ok=True)
        confirmation = f"Folder '{folder_path}' has been created (or already exists)."
        print(confirmation)
    except Exception as exc:
        print(f"An error occurred while creating the folder: {exc}")

def clear_folder(folder_path: str) -> None:
    """Delete everything inside ``folder_path`` while keeping the folder itself.

    Regular files and symbolic links are unlinked; subdirectories are removed
    together with their contents. A missing folder, or any other exception
    raised along the way, is reported on stdout instead of being propagated.
    """
    try:
        for entry_name in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry_name)
            is_file_or_link = os.path.isfile(entry_path) or os.path.islink(entry_path)
            if is_file_or_link:
                # Files and symlinks (including links to directories) are unlinked
                os.unlink(entry_path)
                continue
            if os.path.isdir(entry_path):
                # Real directories are removed recursively
                shutil.rmtree(entry_path)
        print(f"Contents of the folder '{folder_path}' have been deleted.")
    except FileNotFoundError:
        print(f"Folder '{folder_path}' does not exist.")
    except Exception as exc:
        print(f"An error occurred while clearing the folder: {exc}")




In [3]:
html_output_folder = "data/raw/html"
create_folder(html_output_folder)
clear_folder(html_output_folder)  # remove anything already in the folder

# (source URL, page title) pairs; the title is also the saved file's base name.
# URLs are kept canonical: the first entry previously carried ad-click tracking
# parameters (utm_*, hsa_*, gclid) and a fragment, none of which select page
# content, and which would otherwise be copied into any index built from this list.
_html_sources = [
    ("https://weaviate.io/blog/rag-evaluation", "An Overview on RAG Evaluation"),
    (
        "https://weaviate.io/blog/retrieval-evaluation-metrics",
        "Evaluation Metrics for Search and Recommendation Systems",
    ),
    ("https://www.meridianenergy.co.nz/ev/electric-hybrid-cars", "Electric and hybrid cars"),
    (
        "https://www.meridianenergy.co.nz/ev/buying-ev",
        "Is it worth buying a second-hand Nissan Leaf",
    ),
]

# One record per page: where to fetch it from and where to store it.
html_pages: List[Dict[str, str]] = [
    {
        "url": source_url,
        "output_filename": os.path.join(html_output_folder, f"{title}.html"),
    }
    for source_url, title in _html_sources
]



Folder 'data/raw/html' has been created (or already exists).
Contents of the folder 'data/raw/html' have been deleted.


In [4]:

# Fetch every listed page into the HTML folder; download_html prints the
# outcome (success or request error) for each one.
for page in html_pages:
    url, output_filename = page["url"], page["output_filename"]
    download_html(
        url=url,
        output_folder=html_output_folder,
        output_file=output_filename,
    )

HTML content successfully saved to data/raw/html/An Overview on RAG Evaluation.html
HTML content successfully saved to data/raw/html/Evaluation Metrics for Search and Recommendation Systems.html
HTML content successfully saved to data/raw/html/Electric and hybrid cars.html
HTML content successfully saved to data/raw/html/Is it worth buying a second-hand Nissan Leaf.html


In [5]:
pdf_output_folder = "data/raw/pdf"
create_folder(pdf_output_folder)
clear_folder(pdf_output_folder)  # remove anything already in the folder

# (source URL, document title) pairs; the title is also the saved file's base name.
_pdf_sources = [
    (
        "https://www.meridianenergy.co.nz/public/Investors/Reports-and-presentations/Monthly-operating-reports/FY25/February-2025-monthly-operating-report.pdf",
        "Monthly operating report for February 2025 (PDF)",
    ),
    (
        "https://www.meridianenergy.co.nz/public/Investors/Reports-and-presentations/Monthly-operating-reports/FY25/January-2025-monthly-operating-report.pdf",
        "Monthly operating report for January 2025 (PDF)",
    ),
    (
        "https://r2.lmor152.com/GLAM%20Thesis.pdf",
        "Geocoding via LINZ Address Matching",
    ),
]

# One record per document: where to fetch it from and where to store it.
pdf_pages: list[dict[str, str]] = [
    {
        "url": source_url,
        "output_filename": os.path.join(pdf_output_folder, f"{title}.pdf"),
    }
    for source_url, title in _pdf_sources
]

Folder 'data/raw/pdf' has been created (or already exists).
Contents of the folder 'data/raw/pdf' have been deleted.


In [6]:
# Fetch every listed document into the PDF folder; download_pdf prints the
# outcome (success or request error) for each one.
for page in pdf_pages:
    url, output_filename = page["url"], page["output_filename"]
    download_pdf(
        url=url,
        output_folder=pdf_output_folder,
        output_file=output_filename,
    )

PDF successfully saved to data/raw/pdf/Monthly operating report for February 2025 (PDF).pdf
PDF successfully saved to data/raw/pdf/Monthly operating report for January 2025 (PDF).pdf
PDF successfully saved to data/raw/pdf/Geocoding via LINZ Address Matching.pdf


In [7]:
import json
from typing import Dict, List

# Bundle both source lists under the keys used in the on-disk index.
combined_data: dict[str, List[Dict[str, str]]] = dict(
    html_pages=html_pages,
    pdf_pages=pdf_pages,
)

# Write the index as UTF-8 JSON with 4-space indentation.
output_json_path = "data/raw/index.json"
with open(output_json_path, mode="w", encoding="utf-8") as json_file:
    json.dump(combined_data, fp=json_file, indent=4)

print(f"Combined data successfully written to {output_json_path}")

Combined data successfully written to data/raw/index.json
