Ez a kód egy API-n keresztül adatokat gyűjt, feldolgozza azokat, majd egy SQLite adatbázisba menti. Az API által biztosított információk alapján fájlokat is letölt, PDF-ből szöveges formátumba konvertálja őket, és a megfelelő könyvtárakban tárolja. A kód részletes logokat készít, hogy könnyebb legyen a hibák diagnosztizálása és a működés nyomon követése.

---

## Funkciók és Célok

### 1. **Adatok Tárolása Az Adatbázisban**
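A kód három adatbázis-táblát kezel:
- **`ujbuda_meghivo_mappa` (db_folder):** A mappák metaadatait tárolja, például a dátumot, kategóriát és helyszínt.
- **`ujbuda_napirendi` (db_napirendi):** A napirendi pontok adatait tartalmazza, beleértve a napirendi pont nevét, az előterjesztőt és a hivatkozásokat.
- **`ujbuda_file_det` (db_file_detail):** A napirendi pontokhoz tartozó fájlok metaadatait tárolja (például név, fájlméret, utolsó módosítás dátuma), a kapcsolódó mappa és napirendi pont UUID-jával együtt.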

### 2. **Mappaadatok Feldolgozása**
- Az API segítségével lekéri a mappák UUID-jait és metaadatait.
- Frissíti a meglévő mappaadatokat az adatbázisban, vagy új bejegyzést hoz létre.

### 3. **Napirendi Pontok Lekérése**
- A mappák típusától függően meghatározza a megfelelő API URL-t.
- Lekéri a napirendi pontokat, és frissíti vagy hozzáadja azokat az adatbázishoz.

### 4. **Fájlok Letöltése és Konvertálása**
- Letölti a napirendi pontokhoz kapcsolódó PDF fájlokat.
- A letöltött PDF fájlokat szöveges formátumba konvertálja, és az eredeti mappastruktúrát tükrözve menti el.

In [61]:
import logging
import os
import requests
import sqlite3
import time
import warnings

from dataclasses import dataclass
from math import e as euler
from pathlib import Path

import fitz # pymupdf

warnings.filterwarnings("ignore")

In [52]:
# Locations of the database, the log file and the download/output trees.
DATABASE_PATH = "../onkorm.db"
LOG_FILE_PATH = "../download.log"
PDF_FOLDER = Path("../pdf")
TXT_FOLDER = Path("../txt")

# Make sure both output trees exist before anything is written to them.
for output_folder in (PDF_FOLDER, TXT_FOLDER):
    os.makedirs(output_folder, exist_ok=True)

# Append timestamped INFO-and-above records to the log file.
logging.basicConfig(
    filename=LOG_FILE_PATH,
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

@dataclass
class Onkorm:
    """Configuration of one municipality data source.

    Bundles the API base URL with the names of the SQLite tables that hold
    the data collected from it.
    """
    # Display name of the municipality.
    name: str
    # Root URL that the individual API endpoint paths are appended to.
    base_url: str
    # Name of the table storing folder rows.
    db_folder: str
    # Name of the table storing agenda-item rows.
    db_napirendi: str
    # Name of the table storing per-file metadata rows.
    db_file_detail: str


# Configuration of the Újbuda source: API root plus the three table names
# interpolated into the SQL statements below.
ujbuda = Onkorm(
    name="Újbuda",
    base_url="https://mikrodat.ujbuda.hu/app/cms/api/honlap",
    db_folder="ujbuda_meghivo_mappa",
    db_napirendi="ujbuda_napirendi",
    db_file_detail="ujbuda_file_det"
)

# DDL for the folder table; folder_uuid is UNIQUE, so each folder is stored
# at most once.
create_folder_sql = f"""
CREATE TABLE IF NOT EXISTS {ujbuda.db_folder} (
    folder_uuid TEXT UNIQUE,
    detail_uuid TEXT,
    datum TEXT,
    name TEXT,
    testuletijelolo TEXT,
    targy TEXT,
    napirend TEXT,
    kategoria TEXT,
    nyilvanossagjelolo TEXT,
    idopont TEXT,
    hely TEXT,
    gyujto TEXT,
    folapra TEXT,
    eloterjeszto TEXT,
    dateLastModified TEXT,
    iktatoszam TEXT
);
"""

# DDL for the agenda-item table, keyed by the item's uuid; folder_uuid holds
# the uuid of the folder the item was fetched for.
create_agenda_sql = f"""
CREATE TABLE IF NOT EXISTS {ujbuda.db_napirendi} (
    uuid TEXT PRIMARY KEY,
    folder_uuid TEXT,
    gyujto TEXT,
    targy TEXT,
    name TEXT,
    linkName TEXT,
    napirend TEXT,
    nyilvanossagjelolo TEXT,
    hasPermissions TEXT,
    folapra TEXT,
    eloterjeszto TEXT,
    referencia TEXT
);
"""

# DDL for the file-metadata table; uuid is mandatory and unique, and
# agenda_uuid / folder_uuid record which agenda item and folder a file
# was listed under.
create_file_detail_sql = f"""
CREATE TABLE IF NOT EXISTS {ujbuda.db_file_detail} (
    gyujto TEXT,
    nyilvanossagjelolo TEXT,
    dateLastModified TEXT,
    statetext TEXT,
    name TEXT,
    userLastModified TEXT,
    filesize INTEGER,
    uuid TEXT UNIQUE NOT NULL,
    agenda_uuid TEXT,
    folder_uuid TEXT
);
"""

def create_table(conn, sql):
    """Run the given DDL statement on *conn* and commit the result.

    Args:
        conn: Open ``sqlite3.Connection``.
        sql: A single SQL statement, typically ``CREATE TABLE IF NOT EXISTS``.
    """
    conn.execute(sql)
    conn.commit()
    
def fetch_json(url):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Args:
        url: Absolute URL to request.

    Returns:
        The decoded JSON document, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        Failures are logged, never raised.
    """
    try:
        # NOTE(review): verify=False turns off TLS certificate verification
        # for this request; confirm that this is still required.
        response = requests.get(url, verify=False, timeout=10)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        logging.error(f"Failed to fetch data from {url}: {e}")
    except ValueError as e:
        # Older requests releases report an undecodable body as a plain
        # ValueError rather than a RequestException subclass.
        logging.error(f"Invalid JSON received from {url}: {e}")
    return None
    
import sqlite3

def _quote_identifier(identifier):
    """Return *identifier* as a double-quoted SQL identifier."""
    return '"' + str(identifier).replace('"', '""') + '"'


def insert_into_table(conn, table_name, data):
    """Insert one row into *table_name* and commit.

    Args:
        conn: Open ``sqlite3.Connection``.
        table_name: Name of the target table (interpolated as-is).
        data: Mapping of column name to value.  Column names are quoted as
            identifiers because callers pass keys taken from API responses;
            values are bound as parameters.

    Database errors are printed and logged instead of being raised, so one
    failing row does not stop the caller.
    """
    # Built before the ``try`` so the error handler can always refer to
    # ``values``, even when ``conn.cursor()`` itself fails.
    values = tuple(data.values())
    columns = ", ".join(_quote_identifier(column) for column in data)
    placeholders = ", ".join("?" * len(data))
    insert_query = f"INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"

    try:
        cursor = conn.cursor()
        cursor.execute(insert_query, values)
        conn.commit()
    except sqlite3.Error as e:
        print(e)
        logging.error(f"Database error while inserting values {values}: {e}")

# NOTE: a leftover debugging call that inserted `file_item` at this point was
# removed.  `file_item` is only assigned later, inside the folder-processing
# loop, so the call raised NameError on a fresh top-to-bottom run.

def process_years(conn, table_name, years):
    """Fetch the folders of each year and store those not yet in the table.

    For every entry of *years* the ``/inv/folders?year=...`` endpoint is
    requested through ``fetch_json``.  Each returned folder whose ``uuid`` is
    not yet present in *table_name* is inserted with ``insert_into_table``.

    Args:
        conn: Open ``sqlite3.Connection``.  A ``sqlite3.Cursor`` is accepted
            too; its owning connection is used in that case.
        table_name: Table holding the folder rows; used both for the
            existence check and for the insert.
        years: Iterable of year values interpolated into the request URL.
    """
    # A Cursor exposes its connection as ``.connection``; a Connection has no
    # such attribute and is used directly.
    connection = getattr(conn, "connection", conn)
    cursor = connection.cursor()

    for year in years:
        folder_url = f"{ujbuda.base_url}/inv/folders?year={year}"
        year_data = fetch_json(folder_url)
        if not year_data:
            continue

        for entry in year_data.get("content") or []:
            folder_uuid = entry.get("uuid")
            if not folder_uuid:
                continue

            cursor.execute(
                f"SELECT 1 FROM {table_name} WHERE folder_uuid = ?",
                (folder_uuid,),
            )
            if cursor.fetchone():
                # Folder already stored: nothing to do.
                continue

            folder_row = {
                "folder_uuid": folder_uuid,  # API "uuid" -> column "folder_uuid"
                "datum": entry.get("datum"),
                "nyilvanossagjelolo": entry.get("nyilvanossagjelolo"),
                "kategoria": entry.get("kategoria"),
                "idopont": entry.get("idopont"),
                "hely": entry.get("hely"),
            }

            # insert_into_table calls .cursor() and .commit() on its first
            # argument, so it needs the connection rather than the cursor.
            insert_into_table(connection, table_name, folder_row)

        connection.commit()

In [42]:
# Create the folder, agenda and file-detail tables; each statement uses
# IF NOT EXISTS, so existing tables are left untouched.
with sqlite3.connect(DATABASE_PATH) as conn:
    for ddl_statement in (create_folder_sql, create_agenda_sql, create_file_detail_sql):
        create_table(conn, ddl_statement)

In [21]:
# Fetch the list of available years, then store the folders of each year.
years_url = f"{ujbuda.base_url}/inv/years"
years_data = fetch_json(years_url)
with sqlite3.connect(DATABASE_PATH) as conn:
    if years_data:
        # Pass the connection itself: process_years calls conn.cursor(),
        # which a Cursor object does not provide.
        process_years(conn, ujbuda.db_folder, years_data.get("content") or [])

In [13]:
# Load the UUID of every stored folder, leaving out empty values.
with sqlite3.connect(DATABASE_PATH) as conn:
    cursor = conn.cursor()
    query = f"SELECT folder_uuid FROM {ujbuda.db_folder}"
    cursor.execute(query)
    folder_uuid_list = []
    for (stored_uuid,) in cursor.fetchall():
        if stored_uuid:
            folder_uuid_list.append(stored_uuid)

In [56]:
# For every stored folder: refresh its details, store its agenda items and
# file metadata, then download each listed file and extract its text.
for folder_uuid in folder_uuid_list[:]:

    # Pause about 2.718 s between folders (presumably to go easy on the API).
    time.sleep(euler)

    logging.info(f"Processing folder: {folder_uuid}")

    # Fetch folder details
    folder_detail_url = f"{ujbuda.base_url}/detail?id={folder_uuid}"
    folder_detail_data = fetch_json(folder_detail_url)
    if not folder_detail_data:
        logging.error(f"Failed to fetch folder details for UUID {folder_uuid}")
        continue

    folder_details = folder_detail_data.get("content", {})
    if not folder_details:
        logging.warning(f"No details found for folder UUID {folder_uuid}")
        continue

    folder_details["folder_uuid"] = folder_uuid
    # The detail record's own uuid is only needed as ``id2`` of the committee
    # agenda URL below; it is removed so it is not written by the UPDATE.
    # NOTE(review): the folder table has a detail_uuid column that this loop
    # never fills - confirm whether this value belongs there.
    detail_uuid = folder_details.pop("uuid", None)

    # The API calls the field "nev"; the table column is "name".
    if "nev" in folder_details:
        folder_details["name"] = folder_details.pop("nev")

    # Update folder details in the database
    try:
        with sqlite3.connect(DATABASE_PATH) as conn:
            cursor = conn.cursor()
            values = ", ".join([f"{key} = ?" for key in folder_details if key != "folder_uuid"])
            sql = f"UPDATE {ujbuda.db_folder} SET {values} WHERE folder_uuid = ?"
            cursor.execute(sql, tuple(folder_details[key] for key in folder_details if key != "folder_uuid") + (folder_uuid,))
            conn.commit()
    except sqlite3.Error as e:
        # sqlite3.Error (not just DatabaseError) so that parameter-binding
        # failures reported as InterfaceError are handled here as well.
        logging.error(f"Database error while updating folder UUID {folder_uuid}: {e}")
        continue

    session_type = folder_details.get("name", "").lower()

    # Determine agenda URL
    if "bizottság" in session_type:
        agenda_url = f"{ujbuda.base_url}/inv/list?id={folder_uuid}&id2={detail_uuid}"
    elif session_type == "képviselő-testület":
        agenda_url = f"{ujbuda.base_url}/inv/listtest?id={folder_uuid}"
    else:
        logging.warning(f"Unknown session type for folder {folder_uuid}")
        continue

    # Fetch agenda data
    agenda_data = fetch_json(agenda_url)
    if not agenda_data or not agenda_data.get("content"):
        logging.info(f"No agenda data for folder {folder_uuid}")
        continue

    for agenda_item in agenda_data["content"]:
        agenda_item["folder_uuid"] = folder_uuid
        uuid = agenda_item.get("uuid")
        if not uuid:
            logging.warning(f"Agenda item missing UUID for folder {folder_uuid}")
            continue

        # Insert agenda item into the database (only if not stored yet)
        try:
            with sqlite3.connect(DATABASE_PATH) as conn:
                cursor = conn.cursor()
                cursor.execute(f"SELECT 1 FROM {ujbuda.db_napirendi} WHERE uuid = ?", (uuid,))
                if not cursor.fetchone():
                    columns = list(agenda_item.keys())
                    placeholders = ", ".join(["?"] * len(columns))
                    sql = f"INSERT INTO {ujbuda.db_napirendi} ({', '.join(columns)}) VALUES ({placeholders})"
                    cursor.execute(sql, tuple(agenda_item[col] for col in columns))
                    conn.commit()
        except sqlite3.Error as e:
            logging.error(f"Database error while inserting agenda item UUID {uuid}: {e}")
            continue

    # Process files for agenda items
    for agenda_item in agenda_data["content"]:
        if agenda_item.get("napirend") == "0":
            # Skip downloading the invite
            continue

        file_name = agenda_item.get("name")
        agenda_uuid = agenda_item.get("uuid")
        if not file_name or not agenda_uuid:
            logging.warning(f"Missing file_name or agenda_uuid for folder {folder_uuid}")
            continue

        body_dok_url = f"{ujbuda.base_url}/elo/djav?uuid={folder_uuid}&uuid2={agenda_uuid}"
        try:
            body_file_json = fetch_json(body_dok_url)
            if not body_file_json or not body_file_json.get("content"):
                logging.warning(f"No file content for agenda {agenda_uuid}")
                continue
        except Exception as e:
            logging.error(f"Error fetching files for agenda UUID {agenda_uuid}: {e}")
            continue

        # Store the metadata of every listed file that is not in the table yet
        with sqlite3.connect(DATABASE_PATH) as conn:
            cursor = conn.cursor()
            for file_item in body_file_json.get("content", []):
                file_item["folder_uuid"] = folder_uuid
                file_item["agenda_uuid"] = agenda_uuid
                file_uuid = file_item.get("uuid")
                cursor.execute(f"SELECT 1 FROM {ujbuda.db_file_detail} WHERE uuid = ?", (file_uuid,))
                if not cursor.fetchone():
                    insert_into_table(conn,ujbuda.db_file_detail,file_item)
                    logging.info(f"{file_uuid} added to database")
                else:
                    # An already-stored file is the normal case on re-runs,
                    # not an error.
                    logging.info(f"{file_uuid} already in database")

        for file_item in body_file_json.get("content", []):

            file_name = file_item.get("name")
            file_uuid = file_item.get("uuid")

            if not file_name or not file_uuid:
                logging.warning(f"Missing file details for agenda UUID {agenda_uuid}")
                continue

            file_download_url = f"{ujbuda.base_url}/getfile/{file_uuid}/{file_name}"
            save_path = PDF_FOLDER / folder_uuid / agenda_uuid / file_name

            # All three path components come from the remote API; refuse any
            # combination that would resolve outside the download folder.
            if PDF_FOLDER.resolve() not in save_path.resolve().parents:
                logging.error(f"Refusing unsafe file path {save_path} for agenda {agenda_uuid}")
                continue

            # Download and save file
            # NOTE(review): unlike fetch_json, this request keeps TLS
            # verification enabled - confirm the difference is intended.
            try:
                file_response = requests.get(file_download_url, timeout=10)
                file_response.raise_for_status()
                os.makedirs(save_path.parent, exist_ok=True)
                with open(save_path, "wb") as file:
                    file.write(file_response.content)
                logging.info(f"Downloaded file {file_name} for agenda {agenda_uuid}")
            except (requests.RequestException, OSError) as e:
                # OSError covers failures while creating folders or writing
                # the file, so one bad file does not abort the whole run.
                logging.error(f"Error downloading file {file_name}: {e}")
                continue

            # Convert to text and save
            try:
                with fitz.open(save_path) as doc:
                    # Swap a trailing ".pdf" (any letter case) for ".txt";
                    # any other name gets ".txt" appended, so the text tree
                    # only ever contains .txt files.
                    source_name = Path(file_name)
                    if source_name.suffix.lower() == ".pdf":
                        txt_name = source_name.with_suffix(".txt")
                    else:
                        txt_name = source_name.with_name(source_name.name + ".txt")
                    txt_path = TXT_FOLDER / folder_uuid / agenda_uuid / txt_name
                    os.makedirs(txt_path.parent, exist_ok=True)
                    with open(txt_path, "wb") as out:
                        for page in doc:
                            text = page.get_text().encode("utf8")
                            out.write(text)
                            # Form feed (0x0C) written after each page.
                            out.write(bytes((12,)))
                    logging.info(f"Converted PDF to text: {txt_path}")
            except Exception as e:
                logging.error(f"Error converting file {file_name} to text: {e}")
