In [None]:
import json
import os
import time
from datetime import datetime

import requests

# Connection settings. The endpoint and token are read from the environment so
# that secrets never have to be typed into (or saved with) the notebook; both
# fall back to "" when the variables are unset.
API_URL = os.environ.get("BOOK_API_URL", "")
API_TOKEN = os.environ.get("BOOK_API_TOKEN", "")

# Headers sent with every API request; the token is passed verbatim as the
# value of the authorization header.
HEADERS = {
    "content-type": "application/json",
    "authorization": API_TOKEN,
}

# Seed dataset that is read at start-up, and the catalog file that is
# loaded/saved by load_catalog() / save_catalog().
INITIAL_DATASET_PATH = r"E:\book_recommender\data\initial_dataset.json"
CATALOG_PATH = r"E:\book_recommender\data\catalog.json"


In [11]:
# Load the seed dataset. The flattening below treats it as a mapping whose
# values are iterables of titles. The encoding is given explicitly because
# open() otherwise decodes with the platform default, which is not UTF-8 on
# every system.
with open(INITIAL_DATASET_PATH, "r", encoding="utf-8") as f:
    initial_data = json.load(f)

# Flatten every per-key title list into one list, then drop repeats.
# dict.fromkeys keeps the first occurrence of each title and preserves order.
all_titles = [title for titles in initial_data.values() for title in titles]
unique_titles = list(dict.fromkeys(all_titles))
print("total titles:", len(unique_titles))


total titles: 161


In [16]:
def load_catalog():
    """Read the catalog JSON stored at CATALOG_PATH.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file, or an empty dict when
        the file does not exist or cannot be read/parsed.

    A missing file is the normal first-run case and is handled silently. Any
    other read or parse problem is printed before falling back to ``{}``, so
    that an unreadable catalog is not mistaken for an empty one without the
    user noticing.
    """
    try:
        with open(CATALOG_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print(f"Could not read catalog at {CATALOG_PATH}: {exc}")
        return {}
def save_catalog(catalog):
    """Serialise ``catalog`` as indented JSON to CATALOG_PATH.

    Parameters
    ----------
    catalog : object
        Any JSON-serialisable value; written with ``indent=4``.

    The data is first written to a sibling ``.tmp`` file and then moved over
    the real path with ``os.replace``. If the dump is interrupted or raises,
    the previously saved catalog is left untouched instead of being truncated.
    """
    tmp_path = f"{CATALOG_PATH}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(catalog, f, indent=4)
    # Only swap the file in once the dump has completed successfully.
    os.replace(tmp_path, CATALOG_PATH)
# Module-level catalog object shared by the cells below; initialised from
# whatever load_catalog() returns.
catalog=load_catalog()

In [18]:
def search_book(title, timeout=10):
    """Look up one book by title through the GraphQL search endpoint.

    Parameters
    ----------
    title : str
        Text passed as the ``$query`` variable of the search.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Defaults to 10.

    Returns
    -------
    object or None
        The ``document`` entry of the first search hit, or ``None`` when the
        response contains ``errors``, when there are no hits, or when the
        request or response handling raises. Problems are printed, never
        raised to the caller.
    """
    query = """
    query SearchBook($query: String!) {
      search(query: $query, query_type: "Book", per_page: 1) {
        results
        ids
      }
    }
    """

    payload = {
        "query": query,
        "variables": {"query": title}
    }

    try:
        # Without a timeout requests waits indefinitely, so a single stalled
        # connection would hang the whole import loop.
        r = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
        data = r.json()

        # API error
        if "errors" in data:
            print("API Error:", data["errors"])
            return None

        results = data["data"]["search"]["results"]

        # An empty/absent result object or hit list means "not found"; it is
        # not reported as a failed request.
        hits = results.get("hits") if results else None
        if not hits:
            return None

        # Return the main metadata object
        return hits[0]["document"]

    except Exception as e:
        # Best-effort lookup: report the problem and let the caller skip.
        print("Request failed:", e)
        return None


In [19]:
def clean_document(doc, added_source="initial"):
    """Convert a raw search document into the catalog record layout.

    Parameters
    ----------
    doc : dict
        Raw metadata mapping; every field is read with ``.get`` so missing
        keys are tolerated.
    added_source : str, optional
        Value stored under ``"added_source"``. Defaults to ``"initial"``.

    Returns
    -------
    dict
        Record with the keys ``book_id``, ``title``, ``authors``,
        ``description``, ``genres``, ``tags``, ``moods``, ``rating``,
        ``pages``, ``published_year``, ``image_url``, ``isbn_list``,
        ``embedding_vector_path`` (always ``None``), ``added_source`` and
        ``last_updated`` (today's local date as ``YYYY-MM-DD``).

    Fields that have a list or string default also fall back to that default
    when the key is present but holds ``None``, and a null or non-mapping
    ``image`` yields ``image_url=None`` instead of raising.
    """
    # A plain doc.get("image", {}) still returns None when the key exists
    # with a null value, which would break the chained .get("url").
    image = doc.get("image")
    image_url = image.get("url") if isinstance(image, dict) else None

    return {
        "book_id": doc.get("id"),
        "title": doc.get("title"),
        "authors": doc.get("author_names") or [],
        "description": doc.get("description") or "",
        "genres": doc.get("genres") or [],
        "tags": doc.get("tags") or [],
        "moods": doc.get("moods") or [],
        "rating": doc.get("rating"),
        "pages": doc.get("pages"),
        "published_year": doc.get("release_year"),
        "image_url": image_url,
        "isbn_list": doc.get("isbns") or [],
        "embedding_vector_path": None,
        "added_source": added_source,
        "last_updated": datetime.now().strftime("%Y-%m-%d"),
    }


In [23]:
# Pause applied after every API call (seconds).
REQUEST_DELAY_SECONDS = 0.5

for title in unique_titles:
    print(f"\nSearching for: {title}")

    # Step 1 — Fetch metadata from API
    doc = search_book(title)

    # Step 2 — Respect rate limits. The pause follows every request, including
    # the ones whose result is skipped below, because those calls hit the API
    # as well.
    time.sleep(REQUEST_DELAY_SECONDS)

    if doc is None:
        print(" Not found, skipping")
        continue

    # Step 3 — Clean the raw metadata
    cleaned = clean_document(doc)
    book_id = cleaned["book_id"]

    if not book_id:
        print("No book_id returned, skipping")
        continue

    # Step 4 — Check if ID already exists in catalog.
    # JSON object keys are always strings, so a catalog reloaded from disk is
    # keyed by str; normalising here keeps the duplicate check working even if
    # the API returns a numeric id.
    catalog_key = str(book_id)
    if catalog_key in catalog:
        print(f" Already exists in catalog (ID: {book_id}) — skipping")
        continue

    # Step 5 — Insert into catalog and persist straight away so an interrupted
    # run keeps everything fetched so far.
    catalog[catalog_key] = cleaned
    save_catalog(catalog)

    print(f" Added: {cleaned['title']} (ID: {book_id})")



Searching for: The Da Vinci Code
 Added: The Da Vinci Code (ID: 373163)

Searching for: Angels & Demons
 Added: Angels & Demons (ID: 123621)

Searching for: Treasure Island
 Added: Treasure Island (ID: 382607)

Searching for: The Lost City of Z
 Added: The Lost City of Z: A Tale of Deadly Obsession in the Amazon (ID: 374999)

Searching for: The Three Musketeers
 Added: The Three Musketeers (ID: 704789)

Searching for: The Bourne Identity
 Added: The Bourne Identity (ID: 15201)

Searching for: The Hunt for Red October
 Added: The Hunt for Red October (ID: 63707)

Searching for: Life of Pi
 Added: Life of Pi (ID: 259662)

Searching for: Shogun
 Added: Shōgun (ID: 6705)

Searching for: Ready Player One
 Added: Ready Player One (ID: 26363)

Searching for: The Martian
 Added: The Martian (ID: 292354)

Searching for: The Count of Monte Cristo
 Added: The Count of Monte Cristo (ID: 473869)

Searching for: The Hunger Games
 Added: The Hunger Games (ID: 88639)

Searching for: Jurassic Park
 Ad

In [24]:

# Final report: how many entries the in-memory catalog holds, and the path
# that save_catalog() writes to.
catalog_size = len(catalog)
print("Total books in catalog:", catalog_size)
print("Catalog saved at:", CATALOG_PATH)


Total books in catalog: 161
Catalog saved at: E:\book_recommender\data\catalog.json
