## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import boto3
from botocore.config import Config
import dask.dataframe as dd

## 2. Load and Prepare Data

In [7]:
import os
import boto3
from botocore.config import Config

def download_from_r2(object_name, local_path, bucket_name="bookdbio", s3_client=None):
    """Download a single object from the R2 bucket to a local file.

    Args:
        object_name: Key of the object inside the bucket.
        local_path: Destination path on disk. Its parent directory is
            created if it does not exist yet.
        bucket_name: Name of the bucket to read from.
        s3_client: Optional S3-compatible client exposing ``download_file``.
            When omitted, a boto3 client is built from the environment
            variables ``R2_ENDPOINT_URL``, ``R2_ACCESS_KEY_ID`` and
            ``R2_SECRET_ACCESS_KEY`` so that no credentials live in the
            notebook.

    Returns:
        None. Success or failure is reported with ``print``; errors
        (including missing environment variables) are caught and printed
        rather than raised, so a failed download does not stop the notebook.
    """
    # Make sure the destination directory exists; exist_ok makes this idempotent.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        if s3_client is None:
            # Credentials come from the environment, never from the notebook.
            s3_client = boto3.client(
                "s3",
                endpoint_url=os.environ["R2_ENDPOINT_URL"],
                aws_access_key_id=os.environ["R2_ACCESS_KEY_ID"],
                aws_secret_access_key=os.environ["R2_SECRET_ACCESS_KEY"],
                region_name="auto",
                config=Config(signature_version="s3v4"),
            )
        s3_client.download_file(bucket_name, object_name, local_path)
        print(f"Successfully downloaded {object_name} to {local_path}")
    except Exception as e:
        print(f"Download failed for {object_name}: {e}")

In [8]:
def list_bucket_contents(bucket_name="bookdbio", s3_client=None):
    """Print the key of every object stored in the R2 bucket.

    Args:
        bucket_name: Name of the bucket to list.
        s3_client: Optional S3-compatible client exposing ``get_paginator``.
            When omitted, a boto3 client is built from the environment
            variables ``R2_ENDPOINT_URL``, ``R2_ACCESS_KEY_ID`` and
            ``R2_SECRET_ACCESS_KEY`` so that no credentials live in the
            notebook.

    Returns:
        None. Keys are printed one per line; errors (including missing
        environment variables) are caught and printed rather than raised.
    """
    try:
        if s3_client is None:
            # Credentials come from the environment, never from the notebook.
            s3_client = boto3.client(
                "s3",
                endpoint_url=os.environ["R2_ENDPOINT_URL"],
                aws_access_key_id=os.environ["R2_ACCESS_KEY_ID"],
                aws_secret_access_key=os.environ["R2_SECRET_ACCESS_KEY"],
                region_name="auto",
                config=Config(signature_version="s3v4"),
            )

        # A single list_objects_v2 call returns only one page of keys, so
        # walk every page to avoid silently truncating large buckets.
        paginator = s3_client.get_paginator("list_objects_v2")
        found_any = False
        for page in paginator.paginate(Bucket=bucket_name):
            # Pages without objects carry no 'Contents' entry.
            for obj in page.get("Contents", []):
                if not found_any:
                    print("Available files in bucket:")
                    found_any = True
                print(f"- {obj['Key']}")

        if not found_any:
            print("Bucket is empty")
    except Exception as e:
        print(f"Error listing bucket contents: {e}")

In [9]:
# Print the object keys in the default bucket to see what can be downloaded.
list_bucket_contents()

Available files in bucket:
- data/author_id_map.csv
- data/authors.parquet
- data/book_id_map.csv
- data/book_texts.parquet
- data/book_texts_reduced.parquet
- data/books.parquet
- data/books_dedup.parquet
- data/books_triplets.parquet
- data/books_triplets_reduced.parquet
- data/books_works.parquet
- data/interactions.parquet
- data/interactions_dedup.parquet
- data/interactions_prepared_ncf.parquet
- data/interactions_prepared_ncf_reduced.parquet
- data/item_id_map_reduced.csv
- data/new_authors.parquet
- data/new_books.parquet
- data/reduced_book_ids.csv
- data/reduced_books.parquet
- data/reduced_interactions.parquet
- data/reduced_reviews.parquet
- data/reduced_user_ids.csv
- data/reviews_dedup.parquet
- data/user_id_map.csv
- data/user_id_map_reduced.csv
- db/bookdb.sql


In [8]:
# Fetch the reduced books table; the local path mirrors the bucket key, and
# download_from_r2 creates the local "data/" directory if it is missing.
download_from_r2("data/reduced_books.parquet", "data/reduced_books.parquet")

Successfully downloaded data/reduced_books.parquet to data/reduced_books.parquet


In [9]:
# Open the downloaded file as a Dask DataFrame (dd is dask.dataframe), not a pandas one.
books_df = dd.read_parquet("data/reduced_books.parquet")

In [10]:
# Preview the first rows of the books table to check the columns loaded as expected.
books_df.head() 

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
3,743294297.0,3282,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,B002ENBLOK,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,6243154,Best Friends Forever,Best Friends Forever
15,800759494.0,2885,[],US,,"[{'count': '9381', 'name': 'to-read'}, {'count...",,False,3.91,B00B853QPM,...,,,,https://www.goodreads.com/book/show/89375.90_M...,https://s.gr-assets.com/assets/nophoto/book/11...,89375,68157,2957021,90 Minutes in Heaven: A True Story of Death an...,90 Minutes in Heaven: A True Story of Death an...
479,,346,[274178],US,en-GB,"[{'count': '6001', 'name': 'to-read'}, {'count...",B0055Q8HDG,True,3.86,B0055Q8HDG,...,6.0,,2011.0,https://www.goodreads.com/book/show/11731782-c...,https://images.gr-assets.com/books/1352764436m...,11731782,5125,16680541,"Collide (Collide, #1)","Collide (Collide, #1)"
583,395083621.0,1396,[],US,en-US,"[{'count': '549', 'name': 'history'}, {'count'...",,False,3.13,B008QI6EHQ,...,,,,https://www.goodreads.com/book/show/54270.Mein...,https://images.gr-assets.com/books/1395618385m...,54270,16837,2049624,Mein Kampf,Mein Kampf
807,60773758.0,1223,[163036],US,en-US,"[{'count': '7188', 'name': 'to-read'}, {'count...",,False,3.91,B000GCFWXW,...,10.0,,2005.0,https://www.goodreads.com/book/show/38568.A_Qu...,https://images.gr-assets.com/books/1410129015m...,38568,32140,2621331,A Quick Bite (Argeneau #1),A Quick Bite (Argeneau #1)


In [11]:
# Fetch the reduced user-book interactions table; the local path mirrors the bucket key.
download_from_r2("data/reduced_interactions.parquet", "data/reduced_interactions.parquet")

Successfully downloaded data/reduced_interactions.parquet to data/reduced_interactions.parquet


In [13]:
# Open the downloaded file as a Dask DataFrame (dd is dask.dataframe), not a pandas one.
interactions_df = dd.read_parquet("data/reduced_interactions.parquet")

In [14]:
# Preview the first rows of the interactions table to check the columns loaded as expected.
interactions_df.head()

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,6480781,c8676124d8829874576fcb868af89315,True,5,,Mon Mar 20 23:58:16 -0700 2017,Wed Mar 22 11:47:49 -0700 2017,,
1,8842281e1d1347389f2ab93d60773d4d,29584452,dacadc8f32e80bbdb5cd052a84ad5c63,True,4,,Tue Nov 15 09:23:32 -0800 2016,Tue Dec 13 11:00:02 -0800 2016,Wed Nov 23 00:00:00 -0800 2016,Fri Nov 18 00:00:00 -0800 2016
2,8842281e1d1347389f2ab93d60773d4d,28119237,7a8dc8ab7f3c0084be8150d7e5bd40cb,True,4,A fascinating book about community and belongi...,Thu Sep 22 16:19:12 -0700 2016,Thu Sep 22 16:19:12 -0700 2016,Tue Nov 22 00:00:00 -0800 2016,
3,8842281e1d1347389f2ab93d60773d4d,186074,1d12addadc0c737dcd29c362c936a266,True,5,,Tue Jul 12 19:21:10 -0700 2016,Wed Mar 22 11:47:46 -0700 2017,,
4,8842281e1d1347389f2ab93d60773d4d,15839976,c7baa53f6f7d554ed9c3859f0d400d19,True,5,,Tue May 10 11:34:44 -0700 2016,Wed Mar 22 11:47:45 -0700 2017,,


In [None]:
# NOTE(review): "data/reduced_users.parquet" is not among the keys printed by
# list_bucket_contents() earlier in this notebook (the closest match is
# "data/reduced_user_ids.csv") — confirm the object name before running,
# otherwise this call will only print a "Download failed" message.
download_from_r2("data/reduced_users.parquet", "data/reduced_users.parquet")