In [5]:
import json
from pathlib import Path
from bs4 import BeautifulSoup
import re

def get_car_data(file_path):
    """Parse a saved offer HTML page and extract the car's attributes.

    The structured data is read from the first ``<script>`` tag whose content
    mentions "mileage", parses as JSON and holds a non-empty advert under
    ``props -> pageProps -> advert``. The free-text description is read from
    the first ``<div>`` whose class matches ``ooa-unlmzs``.

    Args:
        file_path: Path (``str`` or ``Path``) of the UTF-8 HTML file. The file
            name without its extension becomes the record's ``id``.

    Returns:
        dict: One flat record keyed by Polish field names (``marka``,
        ``model``, ``cena``, ...). Values missing from the advert are ``None``;
        the ``== "Tak"`` flags are ``False`` when the detail is missing.
        ``opis`` is a one-element list with the joined paragraph text, or an
        empty list when the page has no description block. ``zdjecia`` is the
        list of non-empty thumbnail URLs.

    Raises:
        ValueError: If no script on the page contains a usable advert payload.
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # The file name without its extension serves as the record id.
    file_id = Path(file_path).stem

    # The description lives in the HTML rather than in the JSON payload, so it
    # is extracted once, outside the script loop. A page without the block
    # yields an empty description instead of failing the whole offer.
    # NOTE(review): "ooa-unlmzs" looks like a generated CSS class name --
    # confirm it is still valid whenever the page markup changes.
    description_blocks = soup.find_all("div", class_=re.compile(r"ooa-unlmzs"))
    if description_blocks:
        paragraphs = description_blocks[0].find_all("p")
        descriptions = [" ".join(p.get_text(strip=True) for p in paragraphs)]
    else:
        descriptions = []

    for script in soup.find_all("script"):
        raw = script.string
        # Cheap substring filter before attempting to parse the script as JSON.
        if not raw or "mileage" not in raw:
            continue

        try:
            data_json = json.loads(raw)
            # ``or {}`` also covers keys that are present but hold JSON null.
            page_props = (data_json.get("props") or {}).get("pageProps") or {}
            advert_data = page_props.get("advert") or {}
            if not advert_data:
                # Valid JSON that merely mentions "mileage"; not the advert.
                continue
            details = advert_data.get("details") or []
            photos = (advert_data.get("images") or {}).get("thumbnails") or []
            seller_data = advert_data.get("seller") or {}
            price_data = advert_data.get("price") or {}
            location = seller_data.get("location") or {}

            # Flatten the list of {"key": ..., "value": ...} entries.
            details_dict = {detail.get("key"): detail.get("value") for detail in details}
            photo_urls = [photo.get("url") for photo in photos if photo.get("url")]
        except (TypeError, AttributeError, json.JSONDecodeError):
            # Not JSON, or not shaped like the expected payload: try the next script.
            continue

        # Return on the first usable advert so that a later, unrelated script
        # cannot overwrite the record.
        return {
            "id": file_id,
            "marka": details_dict.get("make"),
            "model": details_dict.get("model"),
            "rok_produkcji": details_dict.get("year"),
            "przebieg": details_dict.get("mileage"),
            "rodzaj_paliwa": details_dict.get("fuel_type"),
            "pojemnosc_silnika": details_dict.get("engine_capacity"),
            "moc_silnika": details_dict.get("engine_power"),
            "skrzynia_biegow": details_dict.get("gearbox"),
            "naped": details_dict.get("transmission"),
            "liczba_drzwi": details_dict.get("door_count"),
            "typ_nadwozia": details_dict.get("body_type"),
            "kolor": details_dict.get("color"),
            "liczba_miejsc": details_dict.get("nr_seats"),
            "stan_pojazdu": details_dict.get("new_used"),
            "bezwypadkowosc": details_dict.get("no_accident") == "Tak",  # "Tak" -> True, anything else -> False
            "cena": price_data.get("value"),
            "waluta": price_data.get("currency"),
            "czy_cesja": details_dict.get("leasing_concession") == "Tak",
            # True unless the seller type is exactly "PROFESSIONAL" (so also True when the type is missing).
            "czy_prywatny": seller_data.get("type") != "PROFESSIONAL",
            "lokalizacja": location.get("city"),
            "url": advert_data.get("url"),
            "czy_uszkodzony": details_dict.get("damaged") == "Tak",
            "opis": descriptions,
            "zdjecia": photo_urls,
        }

    # Previously this path hit an unbound local variable; fail with a clear message instead.
    raise ValueError(f"No advert data found in {file_path}")

def parse_all_files(input_folder, output_file):
    """Run ``get_car_data`` over every ``*.html`` file in a folder and save the records as JSON.

    Files are matched non-recursively. A file whose parsing raises is reported
    on stdout and skipped, so one bad page does not abort the batch.

    Args:
        input_folder: Directory containing the saved ``*.html`` pages.
        output_file: Destination JSON path; missing parent directories are created.

    Returns:
        None. Progress is printed, and the collected records are written to
        ``output_file`` as a UTF-8 JSON list.
    """
    source_dir = Path(input_folder)
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)

    html_files = [*source_dir.glob("*.html")]
    file_count = len(html_files)

    records = []
    position = 0
    for html_file in html_files:
        position += 1
        try:
            print(f"Processing file {position}/{file_count}: {html_file.name}")
            records.append(get_car_data(html_file))
        except Exception as error:
            # Best-effort batch: log the failure and carry on with the next file.
            print(f"Error processing file {html_file.name}: {error}")

    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    with destination.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=4)
    print(f"Data saved to {output_file}")




In [6]:
# Run the pipeline: parse every saved offer page and write the combined JSON file.
parse_all_files(
    input_folder="../data/pages",
    output_file="../data/all_offers.json",
)

Processing file 1/2912: 434920697260316.html
Processing file 2/2912: 494911174348403.html
Processing file 3/2912: 657574849871564.html
Processing file 4/2912: 147549099306240.html
Processing file 5/2912: 615462214543859.html
Processing file 6/2912: 151069040792899.html
Processing file 7/2912: 108029595066942.html
Processing file 8/2912: 191019645830174.html
Processing file 9/2912: 380574962991336.html
Processing file 10/2912: 571544195309655.html
Processing file 11/2912: 541153943161142.html
Processing file 12/2912: 863148609989819.html
Processing file 13/2912: 103761931209906.html
Processing file 14/2912: 365141085961810.html
Processing file 15/2912: 206561311270981.html
Processing file 16/2912: 663155332433471.html
Processing file 17/2912: 174865707201079.html
Processing file 18/2912: 804559599700343.html
Processing file 19/2912: 274002076377213.html
Processing file 20/2912: 121740929643515.html
Processing file 21/2912: 300460748597422.html
Processing file 22/2912: 151353758782998.ht