# Aggregate Goodreads and Bookclub data

In [1]:
from pathlib import Path

from scifi.utils import (
    get_reviewer_mapping,
    match_dataframes,
    merge_manual_ratings,
    pivot_goodreads_data,
    read_bookclub,
    read_combine_goodreads,
    read_manual_ratings,
)

In [2]:
# Input locations (relative paths, so they resolve against the working directory)
manual_ratings_path: Path = Path("data/goodreads/manual_ratings.csv")  # manual ratings CSV
bookclub_path: Path = Path("data/bookclub_source.csv")  # book club source CSV
goodreads_dir: Path = Path("data/goodreads/clean")  # directory passed to read_combine_goodreads

In [3]:
# Load all Goodreads CSV files found under goodreads_dir into one combined frame
goodreads_df = read_combine_goodreads(goodreads_dir)

# Fetch the reviewer mapping once, then pivot the combined data:
# group it by book and calculate the average ratings
reviewer_mapping = get_reviewer_mapping()
goodreads_pivot_df = pivot_goodreads_data(
    goodreads_df=goodreads_df,
    reviewer_mapping=reviewer_mapping,
)

In [4]:
# Load the book club source data
bookclub_df = read_bookclub(bookclub_path)

# Both joins below use the same frames and the same key; only `how` differs
match_kwargs = dict(
    bookclub_df=bookclub_df,
    goodreads_pivot_df=goodreads_pivot_df,
    on="title",
)

# Inner join: book club records whose title also appears in the Goodreads data
bookclub_processed_df = match_dataframes(**match_kwargs, how="inner")

# Anti join: book club records with no matching title in the Goodreads data
unmatched_df = match_dataframes(**match_kwargs, how="anti")

In [5]:
# Load the manually entered ratings
manual_ratings_df = read_manual_ratings(manual_ratings_path)

# Fold the manual ratings into the matched book club data, keyed on title.
# NOTE(review): this rebinds bookclub_processed_df from itself, so re-running this
# cell on its own feeds the already-merged frame back in -- presumably you want to
# re-run from the matching cell instead; confirm against merge_manual_ratings.
bookclub_processed_df = merge_manual_ratings(
    on="title",
    manual_ratings_df=manual_ratings_df,
    bookclub_processed_df=bookclub_processed_df,
)

In [6]:
# Order the processed book club data by its date column
bookclub_processed_df = bookclub_processed_df.sort("date")

# Destination CSV -> frame to write there (written in this order)
csv_outputs = {
    "data/bookclub_processed.csv": bookclub_processed_df,
    "data/goodreads/goodreads_unmatched.csv": unmatched_df,
    "data/goodreads/goodreads_combined.csv": goodreads_df,
}
for csv_path, frame in csv_outputs.items():
    frame.write_csv(csv_path)

Which books have we read that no one seems to remember? These are the book club entries with no matching title in the Goodreads data:

In [7]:
# Leave the frame as the cell's last expression so Jupyter renders it with its
# rich repr instead of the plain-text table that print() produces.
unmatched_df

shape: (3, 6)
┌───────┬────────────┬───────────────────┬───────────────────┬─────────────────┬───────────────────┐
│ index ┆ date       ┆ title             ┆ author            ┆ suggested_by    ┆ location          │
│ ---   ┆ ---        ┆ ---               ┆ ---               ┆ ---             ┆ ---               │
│ i64   ┆ date       ┆ str               ┆ str               ┆ str             ┆ str               │
╞═══════╪════════════╪═══════════════════╪═══════════════════╪═════════════════╪═══════════════════╡
│ 34    ┆ 2019-02-20 ┆ Saga comic series ┆ Bryan K. Vaughan  ┆ David           ┆ De Doelen         │
│       ┆            ┆                   ┆ & Fiona Stapl…    ┆                 ┆                   │
│ 51    ┆ 2021-05-22 ┆ The Things        ┆ Peter Watts       ┆ Allen (stemmen) ┆ Laurynas -        │
│       ┆            ┆                   ┆                   ┆                 ┆ Keizersgracht 92k │
│ 57    ┆ 2022-06-06 ┆ One Flew Over the ┆ Ken Kesey         ┆ David         