# Dataset CRUD Testing

Testing the CRUD operations and dataset import functionality.

**Prerequisites:** Run `make setup` to start PostgreSQL and run migrations.

In [1]:
import polars as pl
from pathlib import Path

from bookdb.db.session import SessionLocal
from bookdb.datasets.crud import BookCRUD, AuthorCRUD, UserCRUD, BookListCRUD
from bookdb.datasets.processor import import_dataset, preview_dataset, read_file, import_authors, import_books

# Data directory (relative to this notebook)
DATA_DIR = Path("../../data")

## 1. Test CRUD Operations w/ fake data

In [3]:
session = SessionLocal()

# Create an author
author = AuthorCRUD.get_or_create(session, "George Orwell")
print(f"Author: {author.name} (ID: {author.id})")

# Create a book with authors
book = BookCRUD.create_with_authors(
    session,
    author_names=["George Orwell"],
    title="Animal Farm",
    pages_number=112,
    publish_year=1945,
)
print(f"Book: {book.title} by {[a.name for a in book.authors]}")

session.commit()

Author: George Orwell (ID: 1)
Book: Animal Farm by ['George Orwell']


In [4]:
# Search for books
results = BookCRUD.search_by_title(session, "Animal")
print(f"Found {len(results)} books:")
for b in results:
    print(f"  - {b.title} ({b.publish_year})")

Found 3 books:
  - Animal Farm (1945)
  - Animal Farm (1945)
  - Animal Farm (1945)


In [5]:
# Create user and book list
user = UserCRUD.get_or_create(session, "reader@example.com", "Book Reader")
book_list = BookListCRUD.create(session, user.id, "Classics")
BookListCRUD.add_book(session, book_list.id, book.id)

print(f"User: {user.name}")
print(f"List '{book_list.name}' has {len(book_list.books)} book(s)")

session.commit()

User: Book Reader
List 'Classics' has 1 book(s)


## 2. Test import_dataset with Sample CSV

In [6]:
# Create a sample CSV
sample = pl.DataFrame({
    "title": ["The Great Gatsby", "1984", "Pride and Prejudice", "Brave New World"],
    "authors": ["F. Scott Fitzgerald", "George Orwell", "Jane Austen", "Aldous Huxley"],
    "pages_number": [180, 328, 279, 268],
    "publisher_name": ["Scribner", "Secker & Warburg", "T. Egerton", "Chatto & Windus"],
    "publish_year": [1925, 1949, 1813, 1932],
})

sample_path = Path("/tmp/sample_books.csv")
sample.write_csv(sample_path)
print(f"Created: {sample_path}")
sample

Created: /tmp/sample_books.csv


title,authors,pages_number,publisher_name,publish_year
str,str,i64,str,i64
"""The Great Gatsby""","""F. Scott Fitzgerald""",180,"""Scribner""",1925
"""1984""","""George Orwell""",328,"""Secker & Warburg""",1949
"""Pride and Prejudice""","""Jane Austen""",279,"""T. Egerton""",1813
"""Brave New World""","""Aldous Huxley""",268,"""Chatto & Windus""",1932


In [7]:
# Preview the dataset before importing
preview = preview_dataset(sample_path, n=3)
for item in preview:
    print(f"Book: {item['book']['title']}")
    print(f"  Authors: {item['authors']}")
    print(f"  Year: {item['book']['publish_year']}\n")

Book: The Great Gatsby
  Authors: ['F. Scott Fitzgerald']
  Year: 1925

Book: 1984
  Authors: ['George Orwell']
  Year: 1949

Book: Pride and Prejudice
  Authors: ['Jane Austen']
  Year: 1813



In [8]:
# Import the dataset
stats = import_dataset(sample_path)
print("Import stats:")
for k, v in stats.items():
    print(f"  {k}: {v}")

Import stats:
  rows: 4
  books_created: 4
  books_skipped: 0
  authors_createderrors: 0


In [9]:
# Verify the imported books
session = SessionLocal()

gatsby = BookCRUD.get_by_title(session, "The Great Gatsby")
if gatsby:
    b = gatsby[0]
    print(f"{b.title}")
    print(f"  Authors: {[a.name for a in b.authors]}")
    print(f"  Published: {b.publish_year}")
    print(f"  Pages: {b.pages_number}")

The Great Gatsby
  Authors: ['F. Scott Fitzgerald']
  Published: 1925
  Pages: 180


## 4. Test import_dataset with Parquet

In [10]:
# Preview dataset

PROJECT_ROOT = Path.cwd().parent.parent
DATA_DIR = PROJECT_ROOT / "data"

file_path = DATA_DIR / "raw_goodreads_book_authors.parquet"

# stats = import_dataset(file_path)

preview = preview_dataset(file_path)
for item in preview:
    print(item)
    print(f"{item['row']['name']}")

{'error': 'Missing title', 'row': {'average_rating': '3.98', 'author_id': '604031', 'text_reviews_count': '7', 'name': 'Ronald J. Fields', 'ratings_count': '49'}}
Ronald J. Fields
{'error': 'Missing title', 'row': {'average_rating': '4.08', 'author_id': '626222', 'text_reviews_count': '28716', 'name': 'Anita Diamant', 'ratings_count': '546796'}}
Anita Diamant
{'error': 'Missing title', 'row': {'average_rating': '3.92', 'author_id': '10333', 'text_reviews_count': '5075', 'name': 'Barbara Hambly', 'ratings_count': '122118'}}
Barbara Hambly
{'error': 'Missing title', 'row': {'average_rating': '3.68', 'author_id': '9212', 'text_reviews_count': '36262', 'name': 'Jennifer Weiner', 'ratings_count': '888522'}}
Jennifer Weiner
{'error': 'Missing title', 'row': {'average_rating': '3.82', 'author_id': '149918', 'text_reviews_count': '96', 'name': 'Nigel Pennick', 'ratings_count': '1740'}}
Nigel Pennick


In [11]:
# Import
stats = import_authors(file_path)
print(stats)

{'rows': 829529, 'authors_created': 825736, 'authors_skipped': 3788, 'errors': 5}


## 7. Cleanup

In [None]:
session.close()
print("Session closed")

Session closed
