# In [25]:
import json
from pathlib import Path
from bs4 import BeautifulSoup

def get_car_data(file_path):
    """Parse a saved HTML page and extract information about the car.

    The advert is looked up in ``<script>`` tags whose text contains the
    substring ``"mileage"`` and whose content is a JSON object holding the
    data under ``props -> pageProps -> advert``. The first candidate with a
    non-empty advert payload wins, so a later script that merely mentions
    "mileage" cannot overwrite an already extracted record.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the HTML file. The
            file name without its extension becomes the record's ``"id"``.

    Returns:
        dict: The car record. If candidate scripts parse as JSON objects but
        none carries an advert payload, a record with empty extracted fields
        is returned (same as before for such pages).

    Raises:
        ValueError: If no candidate script contains a JSON object at all.
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # The record id is the file name without its extension.
    file_id = Path(file_path).stem

    parsed_any_object = False
    for script in soup.find_all("script"):
        # Cheap substring filter: only scripts mentioning "mileage" qualify.
        if "mileage" not in script.text:
            continue

        raw_json = script.string
        if raw_json is None:
            # BeautifulSoup yields None when the tag has no single text child.
            continue

        try:
            payload = json.loads(raw_json)
        except json.JSONDecodeError:
            # Not JSON (e.g. inline JavaScript); try the next script.
            continue

        if not isinstance(payload, dict):
            # JSON arrays/scalars cannot hold the props/pageProps/advert path.
            continue
        parsed_any_object = True

        page_props = _as_dict(_as_dict(payload.get("props")).get("pageProps"))
        advert_data = _as_dict(page_props.get("advert"))
        if advert_data:
            return _build_car_record(file_id, advert_data)

    if parsed_any_object:
        # Keep the historical result for pages whose JSON lacks advert data.
        return _build_car_record(file_id, {})

    # Previously this path crashed with UnboundLocalError on ``car_data``.
    raise ValueError(f"No advert JSON found in {file_path}")


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _build_car_record(file_id, advert_data):
    """Build the output record from an advert payload (may be empty)."""
    seller_data = _as_dict(advert_data.get("seller"))
    photos = _as_list(_as_dict(advert_data.get("images")).get("thumbnails"))

    # Flatten the list of {"key": ..., "value": ...} entries into one mapping.
    details_dict = {
        detail.get("key"): detail.get("value")
        for detail in _as_list(advert_data.get("details"))
        if isinstance(detail, dict)
    }

    return {
        "id": file_id,
        "marka": details_dict.get("make"),
        "model": details_dict.get("model"),
        "rok_produkcji": details_dict.get("year"),
        "przebieg": details_dict.get("mileage"),
        "rodzaj_paliwa": details_dict.get("fuel_type"),
        "pojemnosc_silnika": details_dict.get("engine_capacity"),
        "moc_silnika": details_dict.get("engine_power"),
        "skrzynia_biegow": details_dict.get("gearbox"),
        "naped": details_dict.get("transmission"),
        "liczba_drzwi": details_dict.get("door_count"),
        "typ_nadwozia": details_dict.get("body_type"),
        "kolor": details_dict.get("color"),
        "liczba_miejsc": details_dict.get("nr_seats"),
        "stan_pojazdu": details_dict.get("new_used"),
        "bezwypadkowosc": details_dict.get("no_accident"),
        "cena": _as_dict(advert_data.get("price")).get("value"),
        "czy_cesja": "tak" if details_dict.get("leasing_concession") == "Tak" else "nie",
        "rodzaj_ogloszenia": "dealer" if seller_data.get("type") == "PROFESSIONAL" else "prywatne",
        "lokalizacja": _as_dict(seller_data.get("location")).get("city"),
        "zdjecia": [
            photo.get("url")
            for photo in photos
            if isinstance(photo, dict) and photo.get("url")
        ],
    }

def parse_all_files(input_folder, output_file):
    """Process every ``*.html`` file in a folder and save the data as JSON.

    Each file is passed to ``get_car_data``; a file that raises is reported
    on stdout and skipped, so one bad page does not stop the whole run.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.html``.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.
    """
    source_dir = Path(input_folder)
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)

    html_files = list(source_dir.glob("*.html"))
    file_count = len(html_files)

    records = []
    for position, html_file in enumerate(html_files, start=1):
        try:
            print(f"Processing file {position}/{file_count}: {html_file.name}")
            records.append(get_car_data(html_file))
        except Exception as error:
            # Best-effort batch: report the failure and move on.
            print(f"Error processing file {html_file.name}: {error}")

    # Write all collected records as a single JSON array.
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=4)
    print(f"Data saved to {output_file}")




# In [None]:
# Run the program: parse the saved pages and write the combined JSON file.
parse_all_files(
    input_folder="../data/pages",
    output_file="../data/all_offers.json",
)