# Aggregate Goodreads and Bookclub data

In [7]:
from pathlib import Path

from scifi.data_processor import process_bookclub_data, save_processed_data
from scifi.utils import get_active_book_suggesters

In [8]:
# Input locations, all resolved relative to the shared data directory
data_root = Path("data")
bookclub_dir = data_root / "bookclub"

goodreads_dir = data_root / "goodreads" / "clean"
bookclub_path = bookclub_dir / "bookclub.csv"
manual_ratings_path = bookclub_dir / "manual_ratings.csv"

In [9]:
# Run the data processor over the three configured source locations
source_paths = {
    "goodreads_dir": goodreads_dir,
    "bookclub_path": bookclub_path,
    "manual_ratings_path": manual_ratings_path,
}
bookclub_processed_df, unmatched_df, goodreads_df = process_bookclub_data(**source_paths)

# Report the size of each returned frame, one line per frame
summary_lines = [
    "✅ Processing complete!",
    f"📚 Processed {len(bookclub_processed_df)} books",
    f"📖 Combined {len(goodreads_df)} Goodreads entries",
    f"❓ Found {len(unmatched_df)} unmatched books",
]
print("\n".join(summary_lines))

✅ Processing complete!
📚 Processed 76 books
📖 Combined 2069 Goodreads entries
❓ Found 2 unmatched books


In [10]:
# Hand the three frames to the data processor's save helper, in the order it expects
frames_to_save = (bookclub_processed_df, unmatched_df, goodreads_df)
save_processed_data(*frames_to_save)
print("📁 Data saved to CSV files!")

📁 Data saved to CSV files!


Which books have we read that no one seems to remember?

In [11]:
# Print the entries that process_bookclub_data returned as unmatched, as a plain-text table
print(unmatched_df)

shape: (2, 6)
┌───────┬────────────┬───────────────────┬─────────────────────┬──────────────┬────────────────────┐
│ index ┆ date       ┆ title             ┆ author              ┆ suggested_by ┆ location           │
│ ---   ┆ ---        ┆ ---               ┆ ---                 ┆ ---          ┆ ---                │
│ i64   ┆ date       ┆ str               ┆ str                 ┆ str          ┆ str                │
╞═══════╪════════════╪═══════════════════╪═════════════════════╪══════════════╪════════════════════╡
│ 34    ┆ 2019-02-20 ┆ Saga comic series ┆ Bryan K. Vaughan &  ┆ David        ┆ De Doelen          │
│       ┆            ┆                   ┆ Fiona Stapl…        ┆              ┆                    │
│ 51    ┆ 2021-05-22 ┆ The Things        ┆ Peter Watts         ┆ Peter        ┆ Laurynas -         │
│       ┆            ┆                   ┆                     ┆              ┆ Keizersgracht 92k  │
└───────┴────────────┴───────────────────┴─────────────────────┴─────────────

Print the number of books suggested by each member of the book club.

In [12]:
# Show active book suggesters: one dot-padded line per member with their suggestion count
active_suggesters = get_active_book_suggesters(bookclub_processed_df)
print("📊 Active book suggesters:")

for row in active_suggesters.iter_rows(named=True):
    count = row["count"]
    # Pluralise the noun so a single suggestion reads "1 book" rather than "1 books"
    noun = "book" if count == 1 else "books"
    # `.<15` left-aligns the name and pads it with dots to 15 characters
    print(f"- {row['suggested_by']:.<15} {count} {noun}")

📊 Active book suggesters:
- Laurynas....... 11 books
- Dion........... 11 books
- Robert......... 11 books
- David.......... 8 books
- Thirsa & Koen.. 8 books
- gezamenlijk.... 4 books
- Marloes........ 4 books
- Peter.......... 4 books
- Koen........... 3 books
- Thirsa......... 2 books
- Thomas......... 2 books
- Koen_M......... 1 books
