In [7]:
import hashlib
import requests
from bs4 import BeautifulSoup
import re
import json
import time
from pathlib import Path

def fetch_page(url, retries=2, timeout=10):
    """Download a page and return its body as text, retrying on failure.

    Args:
        url: Address of the page to request.
        retries: Total number of attempts to make (not the number of extra
            attempts). With 0 no request is sent and None is returned.
        timeout: Seconds to wait on the server during a single attempt before
            that attempt is treated as failed.

    Returns:
        The response text of the first attempt that answers with HTTP 200,
        or None when every attempt failed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for attempt in range(retries):
        try:
            # requests applies no timeout by default, so a stalled connection
            # would otherwise block the whole crawl indefinitely.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
            print(f"Warning: Unable to fetch page {url}, status code: {response.status_code}")
        except Exception as e:
            # Best effort: report the problem and fall through to the next attempt.
            print(f"Warning: Exception occurred while fetching page {url}: {e}")
        # Pause between attempts only; sleeping after the last one just delays the caller.
        if attempt < retries - 1:
            time.sleep(1)
    return None

def alphanumeric_to_numeric_id(alphanumeric_id):
    """Derive a digits-only ID from an arbitrary identifier string.

    The identifier is hashed with SHA-256, the digest is read as one unsigned
    big-endian integer, and the first 15 characters of its decimal form are
    returned. The same input always yields the same output; because the
    digest is truncated, distinct inputs are not guaranteed distinct outputs.

    Args:
        alphanumeric_id: Text identifier to convert (encoded with the default
            str.encode() encoding before hashing).

    Returns:
        A string of at most 15 decimal digits.
    """
    digest_bytes = hashlib.sha256(alphanumeric_id.encode()).digest()
    decimal_text = str(int.from_bytes(digest_bytes, "big"))
    return decimal_text[:15]

def scrape_offers(base_url, max_pages=100):
    """Gather the listing elements from pages 1..max_pages of a results URL.

    Each page is requested as ``{base_url}?page={n}`` through fetch_page and
    parsed with BeautifulSoup's "html.parser". Every <article> whose class
    matches ``ooa-1yux8sr`` is collected (presumably the site's generated
    class for a listing card; verify against the live markup).

    A page that cannot be downloaded, or that contains no matching element,
    is reported on stdout and skipped; the remaining pages are still visited.

    Args:
        base_url: Results URL without the ``page`` query parameter.
        max_pages: Number of the last page to request.

    Returns:
        A flat list of the matching elements from all pages, in page order.
    """
    collected = []

    for page in range(1, max_pages + 1):
        print(f"Fetching page {page}...")
        markup = fetch_page(f"{base_url}?page={page}")

        if markup:
            document = BeautifulSoup(markup, "html.parser")
            articles = document.find_all("article", class_=re.compile(r"ooa-1yux8sr"))
            if articles:
                collected += articles
            else:
                print(f"No items found on page {page}.")
        else:
            print(f"Skipping page {page} due to errors.")

    return collected

def parse_offers(items):
    """Reduce listing elements to records holding their URL and numeric ID.

    For every element the first <h2> whose class matches ``ooa-1kyyooz`` is
    located and the ``href`` of the first <a> inside it is taken as the offer
    URL. The text after the URL's last ``-`` and before the following ``.``
    is passed to alphanumeric_to_numeric_id to produce the ID.

    Elements without that heading, without a link, or with an empty ``href``
    are skipped with a warning on stdout. Any exception raised while handling
    one element is also reported and that element is skipped.

    Args:
        items: Iterable of parsed elements exposing ``find`` (e.g. the list
            returned by scrape_offers).

    Returns:
        A list of ``{"url": ..., "id": ...}`` dicts, in input order.
    """
    parsed = []

    for article in items:
        try:
            heading = article.find("h2", class_=re.compile(r"ooa-1kyyooz"))
            if heading:
                anchor = heading.find("a")
                if anchor and anchor.get("href"):
                    link = anchor["href"]
                    # e.g. ".../some-model-ID123.html" -> "ID123"
                    last_segment = link.split('-')[-1]
                    raw_id = last_segment.split('.')[0]
                    numeric_id = alphanumeric_to_numeric_id(raw_id)
                    parsed.append({"url": link, "id": numeric_id})
                else:
                    print("Warning: Title or URL not found.")
            else:
                print("Warning: Title container not found.")
        except Exception as e:
            print(f"Warning: Failed to parse an offer: {e}")

    return parsed



In [8]:
base_url = "https://www.otomoto.pl/osobowe/bmw"
max_pages = 20  # upper limit on the number of result pages to crawl

# Download the listing elements, then reduce each one to its URL and numeric ID.
all_items_raw = scrape_offers(base_url, max_pages=max_pages)
all_items_parsed = parse_offers(all_items_raw)

# Store the records as a single indented JSON array (not JSON Lines),
# creating the target directory first when it does not exist yet.
output_path = Path("../data/all_urls.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(
    json.dumps(all_items_parsed, ensure_ascii=False, indent=4),
    encoding="utf-8",
)

print(f"Zapisano {len(all_items_parsed)} ogłoszeń do pliku '{output_path}'.")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Zapisano 640 ogłoszeń do pliku '..\data\all_urls.json'.
