In [1]:
import os
import json
import requests
from pathlib import Path
from time import sleep, time

def load_progress(progress_file):
    """Wczytuje informacje o postępie z pliku."""
    if Path(progress_file).exists():
        with open(progress_file, "r", encoding="utf-8") as file:
            return json.load(file)
    return {}

def save_progress(progress_file, progress_data):
    """Zapisuje informacje o postępie do pliku."""
    with open(progress_file, "w", encoding="utf-8") as file:
        json.dump(progress_data, file, ensure_ascii=False, indent=4)

def download_images(json_file, output_folder, progress_file):
    """Pobiera zdjęcia z ofert zapisanych w pliku JSON i zapisuje je w podanym folderze."""
    # Tworzenie katalogu na obrazy
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Wczytywanie postępu
    progress_data = load_progress(progress_file)

    # Wczytywanie danych z pliku JSON
    with open(json_file, "r", encoding="utf-8") as file:
        offers = json.load(file)

    total_offers = len(offers)
    already_processed = sum(1 for offer_id, status in progress_data.items() if status["status"] == "success")
    start_time = time()
    processed_offers = 0  # Liczba faktycznie przetworzonych ofert

    for i, offer in enumerate(offers, start=1):
        offer_id = offer.get("id")
        images = offer.get("zdjecia", [])[:12]  # Ograniczenie do maksymalnie 12 zdjęć

        # Sprawdzenie, czy oferta była już pobrana
        if progress_data.get(offer_id, {}).get("status") == "success":
            print(f"Offer {already_processed + i}/{total_offers} (ID: {offer_id}) already downloaded. Skipping.")
            continue

        print(f"Processing offer {already_processed + i}/{total_offers} (ID: {offer_id})...")

        # Liczba faktycznie przetworzonych ofert
        processed_offers += 1
        offer_success = True
        for index, img_url in enumerate(images):
            # Generowanie nazwy pliku
            image_filename = f"{offer_id}_{index}.jpg"
            image_path = output_path / image_filename

            # Pobieranie obrazu z maksymalnie 3 próbami
            success = False
            for attempt in range(3):  # Maksymalnie 3 próby
                try:
                    response = requests.get(img_url, stream=True, timeout=10)
                    response.raise_for_status()  # Sprawdzenie poprawności żądania

                    # Zapis obrazu do pliku
                    with open(image_path, "wb") as img_file:
                        for chunk in response.iter_content(1024):
                            img_file.write(chunk)

                    print(f"Downloaded: {image_filename}")
                    success = True
                    break  # Wyjście z pętli prób, jeśli się uda

                except Exception as e:
                    print(f"Attempt {attempt + 1}/3 failed for {img_url}: {e}")
                    sleep(0.1)  # Krótkie opóźnienie przed ponowną próbą

            if not success:
                offer_success = False
                print(f"Failed to download {img_url} for ID {offer_id} after 3 attempts.")

        # Aktualizacja postępu
        progress_data[offer_id] = {"status": "success" if offer_success else "failed"}
        if (already_processed + i) % 10 == 0 or i == total_offers:  # Zapisywanie co 10 ofert
            save_progress(progress_file, progress_data)
            print(f"Progress saved after {already_processed + i} offers.")

        # Przewidywany czas do końca
        elapsed_time = time() - start_time
        avg_time_per_offer = elapsed_time / processed_offers if processed_offers > 0 else 0
        remaining_time = avg_time_per_offer * (total_offers - (already_processed + i))
        print(f"Estimated time remaining: {remaining_time / 60:.2f} minutes.")

# Plik wejściowy JSON, folder na zdjęcia i plik postępu
json_file = "../data/all_offers.json"
output_folder = "../data/images"
progress_file = "../data/progress.json"



In [2]:
# Uruchomienie funkcji
download_images(json_file, output_folder, progress_file)

Offer 4981/14679 (ID: 100002067157270) already downloaded. Skipping.
Offer 4982/14679 (ID: 100008750780405) already downloaded. Skipping.
Offer 4983/14679 (ID: 100016885962989) already downloaded. Skipping.
Offer 4984/14679 (ID: 100023027395321) already downloaded. Skipping.
Offer 4985/14679 (ID: 100049777153183) already downloaded. Skipping.
Offer 4986/14679 (ID: 100054620611557) already downloaded. Skipping.
Offer 4987/14679 (ID: 100064809496710) already downloaded. Skipping.
Offer 4988/14679 (ID: 100075118000516) already downloaded. Skipping.
Offer 4989/14679 (ID: 100078009534233) already downloaded. Skipping.
Offer 4990/14679 (ID: 100087970532231) already downloaded. Skipping.
Offer 4991/14679 (ID: 100097644897111) already downloaded. Skipping.
Offer 4992/14679 (ID: 100101892292338) already downloaded. Skipping.
Offer 4993/14679 (ID: 100103490336814) already downloaded. Skipping.
Offer 4994/14679 (ID: 100108866936167) already downloaded. Skipping.
Offer 4995/14679 (ID: 100113549040

KeyboardInterrupt: 