In [1]:
import requests
import json
import time
from pathlib import Path

def fetch_page(url, retries=2, timeout=10):
    """Fetch the body of a page, retrying when an attempt fails.

    Args:
        url: Address requested with HTTP GET.
        retries: Total number of attempts to make (not the number of extra
            attempts after the first one). With 0 no request is sent.
        timeout: Seconds to wait for the server before an attempt is given
            up; passed straight to ``requests.get``.

    Returns:
        The response text of the first attempt answered with status 200,
        or ``None`` when every attempt fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for attempt in range(retries):
        try:
            # Without a timeout a stalled server would block this call forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            print(f"Warning: Unable to fetch page {url}, status code: {response.status_code}")
        except Exception as e:
            # Best effort: report the failure and move on to the next attempt.
            print(f"Warning: Exception occurred while fetching page {url}: {e}")
        # Pause between attempts only; sleeping after the last one would just
        # delay the caller without a following request.
        if attempt < retries - 1:
            time.sleep(1)
    print(f"Error: Failed to fetch page {url} after {retries} retries.")
    return None

def load_last_saved_page(file_path):
    """Return the page number stored in the progress file.

    The result is 0 when the file does not exist, and also when its contents
    cannot be parsed as an integer (a warning is printed in that case).
    """
    if not file_path.exists():
        return 0

    try:
        stored = file_path.read_text(encoding="utf-8")
        return int(stored.strip())
    except ValueError:
        print("Warning: Failed to read last saved page. Starting from the beginning.")
        return 0

def save_last_page(file_path, page_number):
    """Store the given page number in the progress file, replacing its contents."""
    file_path.write_text(str(page_number), encoding="utf-8")


## Download and save pages


In [2]:
import time

# Input and output locations
input_file = Path("../data/all_urls.json")  # JSON file with the records to download
output_dir = Path("../data/pages")  # Directory that receives the saved pages
last_saved_file = Path("../data/last_page_saved.txt")  # Progress checkpoint file

# Number of records processed between two progress checkpoints
CHECKPOINT_EVERY = 100

# Make sure the target directories exist
output_dir.mkdir(parents=True, exist_ok=True)
last_saved_file.parent.mkdir(parents=True, exist_ok=True)

# Load the records; each one is read below through .get("url") / .get("id")
with input_file.open("r", encoding="utf-8") as file:
    urls = json.load(file)

total_pages = len(urls)
# Resume after the last checkpointed record (0 means start from the beginning)
start_index = load_last_saved_page(last_saved_file)

start_time = time.time()  # Reference point for the time estimate

for index, record in enumerate(urls[start_index:], start=start_index + 1):
    url = record.get("url")
    page_id = record.get("id")
    page_saved = False

    if not url or not page_id:
        print(f"Warning: Missing URL or ID in record: {record}")
    else:
        # Download the page content
        print(f"Fetching page {index}/{total_pages} for ID: {page_id}")
        content = fetch_page(url)
        if content is None:
            print(f"Warning: Could not fetch page for ID: {page_id}")
        else:
            # Write the page to its own file, named after the record ID
            output_file = output_dir / f"{page_id}.html"
            try:
                with output_file.open("w", encoding="utf-8") as file:
                    file.write(content)
                print(f"Page {index}/{total_pages} saved for ID: {page_id}")
                page_saved = True
            except Exception as e:
                print(f"Error: Failed to save page for ID: {page_id}. Exception: {e}")

    # Checkpoint on every CHECKPOINT_EVERY-th record, whether or not that
    # record was downloaded. The checkpoint counts records processed, not
    # pages saved (the final save below also ignores failures), so a skipped
    # or failed record must not cancel it.
    if index % CHECKPOINT_EVERY == 0:
        save_last_page(last_saved_file, index)
        print(f"Saved progress at page {index} to {last_saved_file}")

    # Estimate the remaining time from the average pace of this run
    if page_saved:
        elapsed_time = time.time() - start_time
        pages_processed = index - start_index
        if pages_processed > 0:
            estimated_total_time = elapsed_time / pages_processed * (total_pages - start_index)
            estimated_remaining_time = estimated_total_time - elapsed_time
            print(f"Estimated time to finish: {estimated_remaining_time:.2f} seconds")

# Record that every entry has been processed
save_last_page(last_saved_file, total_pages)
print(f"Finished saving pages. Progress saved at {last_saved_file}.")


Fetching page 1/640 for ID: 642329777923089
Page 1/640 saved for ID: 642329777923089
Estimated time to finish: 544.22 seconds
Fetching page 2/640 for ID: 106555413684827
Page 2/640 saved for ID: 106555413684827
Estimated time to finish: 442.58 seconds
Fetching page 3/640 for ID: 392594704211408
Page 3/640 saved for ID: 392594704211408
Estimated time to finish: 427.83 seconds
Fetching page 4/640 for ID: 718355094498193
Page 4/640 saved for ID: 718355094498193
Estimated time to finish: 421.34 seconds
Fetching page 5/640 for ID: 554325886939280
Page 5/640 saved for ID: 554325886939280
Estimated time to finish: 435.41 seconds
Fetching page 6/640 for ID: 110143484926250
Page 6/640 saved for ID: 110143484926250
Estimated time to finish: 422.87 seconds
Fetching page 7/640 for ID: 383837113922516
Page 7/640 saved for ID: 383837113922516
Estimated time to finish: 437.29 seconds
Fetching page 8/640 for ID: 192197701926815
Page 8/640 saved for ID: 192197701926815
Estimated time to finish: 475.76 