In [None]:


Section 0 – Imports & setup

- Import SQLAlchemy engine (via your existing project setup).
- Import CRUD functions you already have (Books, Authors, etc.).
- Import tools for HTTP requests and file handling.



# Database / SQL
from sqlalchemy import text
# from db_connection import get_engine   # if you created this helper

# HTTP for API calls
import requests

# Data handling
import pandas as pd
from pathlib import Path

# Reuse existing CRUD modules (adapt the paths/names to your project)
# from CRUD_Books import get_book_by_id, update_book_details, get_books, update_book_status
# from CRUD_Authors import create_author, get_author_by_name, get_author_by_id

def get_engine():
    """
    Return the SQLAlchemy engine used by this notebook (placeholder).

    TODO: replace the body with the connection setup you already use in
    CRUD-Books.py. A typical implementation looks like this:

        from sqlalchemy import create_engine
        import urllib.parse

        schema = "lianes_library"
        user = "root"
        raw_password = "..."   # better: read it from an environment variable
        password = urllib.parse.quote_plus(raw_password)
        host = "127.0.0.1"
        port = 3306

        connection_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{schema}"
        engine = create_engine(connection_string)

    Raises:
        NotImplementedError: always, until the placeholder is replaced.
    """
    # Once an engine exists, swap the raise below for `return engine`.
    message = "Implement get_engine() to return your SQLAlchemy engine."
    raise NotImplementedError(message)

In [None]:
ISBN_CORRIGIDOS_PATH = Path("data") / "isbn_corrigidos.csv"

def load_isbn_corrigidos(path: Path = ISBN_CORRIGIDOS_PATH) -> set:
    """
    Load previously processed ISBNs from a CSV file with an 'isbn' column.

    Args:
        path: Location of the CSV file. Defaults to ISBN_CORRIGIDOS_PATH.

    Returns:
        A set of ISBN strings. Empty if the file does not exist yet or
        contains no data.

    Raises:
        KeyError: if the file exists but has no 'isbn' column.
    """
    if not path.exists():
        return set()

    try:
        # Read everything as text: letting pandas infer an integer dtype would
        # strip leading zeros ("0306406152" -> "306406152"), and the value
        # would then never match the ISBN coming from the database.
        # keep_default_na=False keeps empty cells as "" instead of NaN/"nan".
        df = pd.read_csv(path, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        # A zero-byte file carries no ISBNs; treat it like a missing file.
        return set()

    # Drop blank entries and stray surrounding whitespace.
    return {value.strip() for value in df["isbn"] if value.strip()}

def save_isbn_corrigidos(isbn_set: set, path: Path = ISBN_CORRIGIDOS_PATH):
    """
    Persist the set of processed ISBNs to disk as a one-column CSV ('isbn').

    The file is overwritten on every call (simple approach). The parent
    directory is created when missing, so the very first run does not fail
    just because the data folder does not exist yet.

    Args:
        isbn_set: ISBN values to store; each one is written as a string.
        path: Destination CSV file. Defaults to ISBN_CORRIGIDOS_PATH.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # Normalise to strings before sorting so a stray int in the set cannot
    # make sorted() fail on a str/int comparison.
    isbns = sorted({str(isbn) for isbn in isbn_set})
    pd.DataFrame({"isbn": isbns}).to_csv(path, index=False)

## Selecting books that need author information

We want to select only:

- books where `author_id` (or similar) is NULL or missing,
- AND their ISBN is not in `isbn_corrigidos`.

You may already have a helper in `CRUD_Books.py`.
If not, this notebook will outline one.

In [None]:
def get_books_needing_authors(engine, isbn_corrigidos: set, limit: int = 1000):
    """
    Fetch books that still need an author and were not processed before.

    Runs a SELECT on the `books` table for rows where `author_id` is NULL
    and the ISBN is not in `isbn_corrigidos`, and returns at most `limit`
    of them.

    TODO: adjust table/column names to your actual schema. This assumes:
      - books has the columns book_id, ISBN, author_id, title
      - author_id is NULL for books without a mapped author

    Args:
        engine: SQLAlchemy engine (anything exposing `connect()`).
        isbn_corrigidos: ISBNs to exclude; each is sent as a bound parameter.
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts, one per row, with the selected columns as keys.

    Note:
        Every excluded ISBN becomes one bound parameter. For very large
        sets, consider filtering in Python or using a temporary table.
    """
    # Snapshot the set once so placeholder names and values come from the
    # same ordering.
    isbn_params = {f"isbn{i}": str(isbn) for i, isbn in enumerate(isbn_corrigidos)}

    # Only emit the NOT IN clause when there is something to exclude; an
    # empty `IN ()` list is not valid SQL.
    exclusion_clause = ""
    if isbn_params:
        placeholders = ", ".join(f":{name}" for name in isbn_params)
        exclusion_clause = f"AND ISBN NOT IN ({placeholders})"

    # Only generated placeholder names are interpolated into the SQL text;
    # the ISBN values themselves are always bound. ORDER BY keeps batches
    # deterministic between runs.
    base_query = f"""
        SELECT book_id, ISBN, title
        FROM books
        WHERE author_id IS NULL
          {exclusion_clause}
        ORDER BY book_id
        LIMIT :limit
    """

    params = dict(isbn_params)
    params["limit"] = limit

    with engine.connect() as conn:
        rows = conn.execute(text(base_query), params).fetchall()
        # Row objects are converted to plain dicts for the caller.
        return [dict(row._mapping) for row in rows]

## Fetching book metadata from an external API (by ISBN)

We’ll define a placeholder function and suggest APIs to use:
- Open Library API
- Google Books API

The function should return a dict with at least `authors` as a list of strings.

In [None]:
def fetch_book_metadata_from_api(isbn: str) -> dict | None:
    """
    Fetch book metadata (especially authors) from an external API using ISBN.

    TODO: implement actual API calls here.

    Steps:
    1. Build the URL for the chosen API (Open Library, Google Books, etc.).
    2. Call `requests.get(url, timeout=...)`.
    3. Check response status, parse JSON (`response.json()`).
    4. Extract:
        - authors as a list of strings.
        - optionally: title, publisher, etc.
    5. Return a dict like: {"authors": [...], "title": "..."}.

    If nothing is found or an error occurs, return None.
    """
    # Example skeleton for Open Library (fill as needed):
    # url = f"https://openlibrary.org/isbn/{isbn}.json"
    # try:
    #     resp = requests.get(url, timeout=10)
    #     if resp.status_code != 200:
    #         return None
    #     data = resp.json()
    #     # parse authors depending on API structure
    #     authors = [...]
    #     title = data.get("title")
    #     return {"authors": authors, "title": title}
    # except Exception as e:
    #     print(f"Error fetching ISBN {isbn}: {e}")
    #     return None
    raise NotImplementedError("Implement API call for your chosen service.")

In [None]:
def get_or_create_author(engine, author_name: str):
    """
    Get an existing author_id by name, or create a new author and return its id.

    This function is a wrapper around your CRUD logic in CRUD_Authors.py.

    Args:
        engine: SQLAlchemy engine (anything exposing `begin()`).
        author_name: Author name to match exactly against `authors.name`.

    Returns:
        The author_id of the existing or newly inserted row, or None if the
        id could not be determined after the insert.
    """
    select_query = text("SELECT author_id FROM authors WHERE name = :name")
    insert_query = text("INSERT INTO authors (name) VALUES (:name)")
    params = {"name": author_name}

    # engine.begin() commits when the block exits normally and rolls back on
    # an exception. With a plain engine.connect() the INSERT is discarded on
    # close unless commit() is called explicitly (SQLAlchemy 2.x behaviour).
    with engine.begin() as conn:
        # 1. Try to find an existing author
        row = conn.execute(select_query, params).fetchone()
        if row:
            return row._mapping["author_id"]

        # 2. Insert a new author
        res = conn.execute(insert_query, params)
        author_id = getattr(res, "lastrowid", None)

        # Not every driver reports lastrowid, and inserted_primary_key is not
        # available for textual statements, so fall back to reading the row
        # back inside the same transaction.
        if not author_id:
            row = conn.execute(select_query, params).fetchone()
            author_id = row._mapping["author_id"] if row else None
        return author_id

## Linking book to authors

Two options:

1. **Simple model (1 author per book)**  
   - `books` has a single `author_id` column.  
   - If multiple authors are returned, you decide: use the first or extend to a join table.

2. **Many-to-many model** (recommended if you really have multi-author works)
   - Create `book_authors` table with (book_id, author_id).
   - Do NOT store `author_id` directly in `books`.

This notebook outlines the simple version (1 author per book).

In [None]:
def set_book_author_id(engine, book_id: int, author_id: int):
    """
    Update `books.author_id` for a given book.

    Assumes `books` table has `author_id` column.
    If not, you must ALTER TABLE books ADD COLUMN author_id INT, etc.

    Args:
        engine: SQLAlchemy engine (anything exposing `begin()`).
        book_id: Primary key of the book row to update.
        author_id: Author id to store on that book.
    """
    query = text(
        "UPDATE books SET author_id = :author_id WHERE book_id = :book_id"
    )

    # engine.begin() wraps the UPDATE in a transaction that is committed when
    # the block exits normally; a bare engine.connect() would leave the
    # change uncommitted unless commit() is called explicitly.
    with engine.begin() as conn:
        conn.execute(query, {"author_id": author_id, "book_id": book_id})

## Main processing function

We now assemble everything:

- Load `isbn_corrigidos`.
- Get a batch of books that have `author_id` missing and ISBN not in `isbn_corrigidos`.
- For each book: fetch metadata, create/get authors, link book, mark ISBN processed.
- Save `isbn_corrigidos`.

In [None]:
def process_books_missing_authors(batch_size: int = 100):
    """
    Fill in missing authors for one batch of books.

    Steps:
    - Load isbn_corrigidos (ISBNs handled in earlier runs).
    - Fetch up to `batch_size` books that still need an author.
    - For each book: look up its authors via the external API, get or create
      the first author, link the book to it, and mark the ISBN as processed.
    - Save isbn_corrigidos, even if the batch is interrupted by an error,
      so finished ISBNs are not looked up again on the next run.

    Args:
        batch_size: Maximum number of books to handle in this call.
    """
    engine = get_engine()
    isbn_corrigidos = load_isbn_corrigidos()

    # 1. Get a batch of books needing authors
    books_to_process = get_books_needing_authors(engine, isbn_corrigidos, limit=batch_size)

    if not books_to_process:
        print("No books needing authors found.")
        return

    try:
        for book in books_to_process:
            book_id = book["book_id"]
            isbn = str(book["ISBN"])
            title = book.get("title")

            print(f"\nProcessing book_id={book_id}, ISBN={isbn}, title={title}")

            # Skip if ISBN is already in the corrected list (double-check;
            # also covers two books in the same batch sharing an ISBN).
            if isbn in isbn_corrigidos:
                print(f"  -> ISBN {isbn} already in isbn_corrigidos, skipping.")
                continue

            # 2. Call external API
            try:
                meta = fetch_book_metadata_from_api(isbn)
            except NotImplementedError:
                print("fetch_book_metadata_from_api is not implemented yet.")
                break

            # Keep only usable names: non-empty strings, whitespace trimmed.
            raw_authors = (meta or {}).get("authors") or []
            authors = [
                name.strip()
                for name in raw_authors
                if isinstance(name, str) and name.strip()
            ]

            if not authors:
                print(f"  -> No author info found for ISBN {isbn}.")
                # Still record the ISBN to avoid re-trying it forever.
                isbn_corrigidos.add(isbn)
                continue

            print(f"  -> Found authors: {authors}")

            # 3. For now, handle first author only (simple model)
            main_author_name = authors[0]
            author_id = get_or_create_author(engine, main_author_name)

            if author_id is None:
                # Without an id there is nothing to link. Leave the ISBN
                # unmarked so the book is picked up again on a later run.
                print(f"  -> Could not get an author_id for '{main_author_name}', skipping.")
                continue

            # 4. Link book <-> author
            set_book_author_id(engine, book_id, author_id)
            print(f"  -> Linked book_id={book_id} to author_id={author_id} ({main_author_name})")

            # 5. Mark this ISBN as processed
            isbn_corrigidos.add(isbn)
    finally:
        # 6. Persist updated isbn_corrigidos, also when the loop was aborted
        #    by an exception (e.g. a database error half-way through).
        save_isbn_corrigidos(isbn_corrigidos)

    print("\nDone. Updated isbn_corrigidos and book author links.")

In [None]:
# Example manual run (test with a small batch first)
# process_books_missing_authors(batch_size=20)

# TODO: Uncomment and run after implementing get_engine() and fetch_book_metadata_from_api()