## 1. Imports

Importing required dependencies.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import boto3
from botocore.config import Config
import dask.dataframe as dd

#### Set up functions to import data from the cloud
* download_from_r2
* list_bucket_contents


In [5]:
import os
import boto3
from botocore.config import Config

def download_from_r2(object_name, local_path, bucket_name="bookdbio"):
    # ensure parent dir exists
    parent_dir = os.path.dirname(local_path)
    if parent_dir and not os.path.isdir(parent_dir):
        os.makedirs(parent_dir, exist_ok=True)

    s3 = boto3.client('s3',
        endpoint_url = f"https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
        aws_access_key_id = '85fec6dd1268801ac8c1c59175ba0b76',
        aws_secret_access_key = '798b753bab748f2c7f5e0f46fd6506b7f0b206e362b1e00055d060a72b88d55d',
        config = Config(signature_version='s3v4')
   )

    try:
        s3.download_file(bucket_name, object_name, local_path)
        print(f"Successfully downloaded {object_name} to {local_path}")
    except Exception as e:
        print(f"Download failed for {object_name}: {e}")

In [2]:
def list_bucket_contents(bucket_name="bookdbio"):
    """List all objects in the R2 bucket"""
    s3 = boto3.client('s3',
        endpoint_url = f"https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
        aws_access_key_id = '85fec6dd1268801ac8c1c59175ba0b76',
        aws_secret_access_key = '798b753bab748f2c7f5e0f46fd6506b7f0b206e362b1e00055d060a72b88d55d',
        config = Config(signature_version='s3v4')
   )
    
    try:
        response = s3.list_objects_v2(Bucket=bucket_name)
        if 'Contents' in response:
            print("Available files in bucket:")
            for obj in response['Contents']:
                print(f"- {obj['Key']}")
        else:
            print("Bucket is empty")
    except Exception as e:
        print(f"Error listing bucket contents: {e}")

In [None]:
def upload_to_r2(files, bucket_name):
    """
    Upload multiple files to Cloudflare R2 bucket
    
    Args:
        files (list): List of tuples containing (file_path, object_name)
        bucket_name (str): R2 bucket name
    """
    # Configure R2 client
    s3 = boto3.client('s3',
        endpoint_url = f"https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
        aws_access_key_id = '85fec6dd1268801ac8c1c59175ba0b76',
        aws_secret_access_key = '798b753bab748f2c7f5e0f46fd6506b7f0b206e362b1e00055d060a72b88d55d',
        config = Config(signature_version='s3v4')
   )

    for file_path, object_name in files:
        try:
            s3.upload_file(file_path, bucket_name, object_name)
            print(f"Successfully uploaded {file_path} to {object_name}")
        except Exception as e:
            print(f"Upload failed for {file_path}: {e}")


In [None]:
list_bucket_contents()

## Load Data From The Cloud

### 1. reduced_books

Reduced set of books: contains book meta data.

In [6]:
download_from_r2("data/reduced_books.parquet", "data/reduced_books.parquet")

Successfully downloaded data/reduced_books.parquet to data/reduced_books.parquet


In [7]:
books_df = dd.read_parquet("data/reduced_books.parquet")

In [None]:
books_df.head() 

### 2. reduced_interactions

Reduced set of interactions: contains information about a users interaction with a book.

In [9]:
download_from_r2("data/reduced_interactions.parquet", "data/reduced_interactions.parquet")

Successfully downloaded data/reduced_interactions.parquet to data/reduced_interactions.parquet


In [10]:
interactions_df = dd.read_parquet("data/reduced_interactions.parquet")

In [None]:
interactions_df.head()

In [12]:
# Count the number of unique user IDs in the interactions dataframe
unique_users_count = interactions_df['user_id'].nunique().compute()
print(f"Number of unique user IDs: {unique_users_count}")


Number of unique user IDs: 205242


## Data Engineering
Altering datasets, so they can be used to generate training inputs for the cross-encoder.

### 1. reduced_books

Preparing book metadat dataset so it can be used to generate user contexts for the cross-encoder. This is done by:
* Dropping unnecessary columns
* Extracting book genres
* Adding book authors

In [13]:
def analyze_dataframe(df):
    # Get column info
    cols = df.columns
    dtypes = df.dtypes
    
    # Calculate total rows
    total_rows = len(df.compute())
    
    # Initialize lists to store results
    results = []
    
    # Analyze each column
    for col in cols:
        # Count non-null values
        non_null_count = df[col].count().compute()
        null_count = total_rows - non_null_count
        null_percentage = (null_count / total_rows) * 100
        
        results.append({
            'Column': col,
            'Data Type': str(dtypes[col]),
            'Non-Null Count': non_null_count,
            'Null Count': null_count,
            'Null Percentage': f'{null_percentage:.2f}%'
        })
    
    # Convert results to pandas DataFrame for better display
    results_df = pd.DataFrame(results)
    return results_df.sort_values('Null Percentage', ascending=False)

# Display the analysis
print("DataFrame Analysis:")
display(analyze_dataframe(books_df))

DataFrame Analysis:


Unnamed: 0,Column,Data Type,Non-Null Count,Null Count,Null Percentage
0,isbn,string,17663,0,0.00%
15,publisher,string,17663,0,0.00%
27,title,string,17663,0,0.00%
26,work_id,string,17663,0,0.00%
25,ratings_count,string,17663,0,0.00%
24,book_id,int64,17663,0,0.00%
23,image_url,string,17663,0,0.00%
22,url,string,17663,0,0.00%
21,publication_year,string,17663,0,0.00%
20,edition_information,string,17663,0,0.00%


In [None]:
books_df.set_index('book_id', inplace=True)


In [None]:
books_df.head()

In [16]:

def extract_genres(popular_shelves):
    """
    Extracts potential genres from a list of popular shelves dictionaries,
    adding only the base genre keyword found.

    Args:
        popular_shelves: A list of dictionaries, where each dictionary has
                         'count' and 'name' keys.

    Returns:
        A list of unique base genre names found, or an empty list on error.
    """
    try:
        if not isinstance(popular_shelves, np.ndarray) or len(popular_shelves) == 0:
            return []
        
        # Use a set to store unique base genres found
        found_genres = set() 
        
        genre_keywords = [
            'action', 'adventure', 'comedy', 'crime', 'mystery', 'textbook', 'children', 'mathematics', 'fantasy',
            'historical', 'horror', 'romance', 'satire', 'science fiction',
            'scifi', 'speculative fiction', 'thriller', 'western', 'paranormal',
            'dystopian', 'urban fantasy', 'contemporary', 'young adult', 'ya',
            'middle grade', 'children\'s', 'literary fiction', 'magic realism',
            'historical fiction', 'gothic', 'suspense', 'biography', 'memoir',
            'nonfiction', 'poetry', 'drama', 'historical romance',
            'fantasy romance', 'romantic suspense', 'science fiction romance',
            'contemporary romance', 'paranormal romance', 'epic fantasy',
            'dark fantasy', 'sword and sorcery', 'steampunk', 'cyberpunk',
            'apocalyptic', 'post-apocalyptic', 'alternate history',
            'superhero', 'mythology', 'fairy tales', 'folklore', 'war',
            'military fiction', 'spy fiction', 'political fiction', 'social science fiction',
            'techno-thriller', 'medical thriller', 'legal thriller',
            'psychological thriller', 'cozy mystery', 'hardboiled', 'noir',
            'coming-of-age', 'lgbtq+', 'christian fiction', 'religious fiction',
            'humor', 'travel', 'food', 'cooking', 'health', 'self-help',
            'business', 'finance', 'history', 'science', 'technology', 'nature',
            'art', 'music', 'philosophy', 'education', 'true crime', 'spiritual',
            'anthology', 'short stories', 'plays', 'screenplays', 'graphic novel',
            'comics', 'manga', 'erotica', 'new adult', 'chick lit', 'womens fiction',
            'sports fiction', 'family saga', ' Regency romance', 'literature'
        ]
        # Sort keywords by length descending to match longer phrases first (e.g., "science fiction" before "science")
        genre_keywords.sort(key=len, reverse=True)

        ignore_keywords = ['to-read', 'owned', 'hardcover', 'shelfari-favorites', 'series', 'might-read',
                           'dnf-d', 'hambly-barbara', 'strong-females', 'first-in-series',
                           'no-thanks-series-collections-boxes', 'entertaining-but-limited',
                           'kate-own', 'e-book', 'compliation', 'my-books',
                           'books-i-own-but-have-not-read', 'everything-owned', 'books-to-find',
                           'i-own-it', 'favorite', 'not-read', 'read-some-day', 'library',
                           'audiobooks', 'status-borrowed', 'owned-books',
                           'spec-fic-awd-locus-nom', '01', 'hardbacks', 'paper', 'german',
                           'hardback', 'physical-scifi-fantasy', 'childhood-favorites',
                           'bundle-same-author', 'aa-sifi-fantasy', 'ready-to-read',
                           'bought-on-flee-markets', 'fantasy-general', 'hardcopy', 'box-2',
                           'unfinished', 'magic', 'duplicates', 'favorites', 'books-i-own',
                           'fantasy-classic', 'own-hard-copy', 'fantasy-read',
                           'book-club-edition', 'sci-fi-or-fantasy', 'fiction-fantasy',
                           'fiction-literature-poetry', 'paused-hiatus', 'status—borrowed',
                           'recs-fantasy', 'fantasy-scifi', 'omnibus', 'speculative',
                           'sf--fantasy', 'in-my-home-library', 'fant-myth-para-vamps',
                           'read-in-my-20s']

        for shelf in popular_shelves:
            if not isinstance(shelf, dict) or 'name' not in shelf:
                continue
            
            shelf_name = shelf['name'].lower().strip() # Normalize shelf name

            # Skip if shelf name contains any ignore keywords
            if any(ignore in shelf_name for ignore in ignore_keywords):
                continue

            # Check if any genre keyword is present in the shelf name
            for keyword in genre_keywords:
                # Use word boundaries or careful checks to avoid partial matches (e.g., 'art' in 'heart')
                # Simple substring check for now, might need refinement depending on data
                if keyword in shelf_name: 
                    found_genres.add(keyword) # Add the base keyword
                    # Optional: break here if you only want the first/longest match per shelf
                    # break 

        return sorted(list(found_genres))
    except Exception as e:
        print(f"Error in extract_genres function: {e}")
        # Log the error message
        logging.error("Error in extract_genres function", exc_info=True)
        return []

In [None]:
books_df.head()

In [18]:
download_from_r2("data/new_authors.parquet", "data/new_authors.parquet")

Successfully downloaded data/new_authors.parquet to data/new_authors.parquet


In [19]:
authors_df = dd.read_parquet("data/new_authors.parquet")

In [None]:
# Create reduced DataFrame
reduced_books_df = books_df[['book_id', 'title', 'description']].copy()

# Modify extract_genres to return a string instead of a list
def extract_genres_string(shelves):
    genres = extract_genres(shelves)
    return ','.join(genres) if genres else ''

# Apply the modified function to get string representation of genres
reduced_books_df['genre'] = books_df['popular_shelves'].apply(extract_genres_string)

# Convert authors to string representation as well
def get_author_names(author_ids):
    author_names = []
    for author_id in author_ids:
        try:
            name = authors_df.loc[authors_df['author_id'] == author_id]['name'].compute().values[0]
            author_names.append(name)
        except:
            continue
    return ','.join(author_names)

reduced_books_df['authors'] = books_df['authors'].apply(get_author_names)

# Display sample of the reduced DataFrame
print("\nSample of reduced books DataFrame:")
print(reduced_books_df.head())

# Display genre distribution (need to split the strings for counting)
print("\nGenre distribution:")
genre_counts = reduced_books_df['genre'].apply(lambda x: x.split(',') if x else []).explode().value_counts()
print(genre_counts)

**Downloading as parquet** to be used in the finetuning data generation script.

In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import psutil

# Monitor memory usage
def print_memory_usage():
    process = psutil.Process()
    print(f"Memory usage: {process.memory_info().rss / 1024 / 1024:.2f} MB")

# Before processing
print_memory_usage()

# Set up progress bar
with ProgressBar():
    # Process your DataFrame
    result = reduced_books_df.compute()  # or whatever operation you're doing
    
    # Save to parquet with optimizations
    result.to_parquet(
        'data/reduce_books_df.parquet',
        compression='snappy',
        index=False,
        engine='pyarrow'
    )

# After processing
print_memory_usage()

In [None]:
test = dd.read_parquet('data/reduce_books_df.parquet')

In [None]:
# Create a DataFrame with user_id and their read books sorted by rating
def create_user_books_df(interactions_df):
    """
    Create a DataFrame with user_id and their read books sorted by rating.
    
    Args:
        interactions_df: DataFrame containing user-book interactions
        
    Returns:
        DataFrame with columns 'user_id' and 'books_read'
    """
    import pandas as pd
    
    # Get all unique users
    unique_users = interactions_df['user_id'].unique().compute().tolist()
    
    # Create a list to store user_id and read_books pairs
    user_books_data = []
    
    # For each user, get their read books sorted by rating
    for user_id in unique_users:
        # Filter interactions for the specific user
        user_interactions = interactions_df[interactions_df['user_id'] == user_id]
        
        # Sort by rating in descending order
        sorted_interactions = user_interactions.sort_values(by='rating', ascending=False)
        
        # Get all book_ids
        read_books = sorted_interactions['book_id'].compute().tolist()
        
        user_books_data.append({
            'user_id': user_id,
            'books_read': read_books
        })
    
    # Create DataFrame from the collected data
    user_books_df = pd.DataFrame(user_books_data)
    
    return user_books_df

# Create the user_books_df with all users
user_books_df = create_user_books_df(interactions_df)

# Display sample of the DataFrame
print("\nSample of user_books_df:")
print(user_books_df.head())


In [None]:
user_books_df.shape

In [None]:
# Sample 50,000 users from the pandas DataFrame
sampled_users_book_pd = user_books_df.sample(n=50000, random_state=42)

# If you need it back as a Dask DataFrame
sampled_users_book = dd.from_pandas(sampled_users_book_pd, npartitions=10)

In [31]:
test3 = dd.read_parquet("data/sampled_users_book.parquet")

In [29]:
test3.head()

Unnamed: 0,user_id,books_read,num_books
0,001af7947e217e17694c5a9c097afffb,"[57854, 34, 7332, 5470, 9646, 14142, 11138, 17...",38
1,0006260f85929db85eddee3a0bd0e504,"[29056083, 357, 5358, 78129, 375802, 10428708,...",20
2,000bcda59ab565512f51f9e1f531b5e5,"[862041, 2767052, 8933944, 3685, 1772910, 2641...",60
3,0005f52944ea1992e95d61f287acaea9,"[2219694, 169875, 18335634, 23705512, 7171637,...",65
4,000883382802f2d95a3dd545bb953882,"[22402154, 13372690, 13104080, 10429045, 67523...",155


In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import psutil

# Monitor memory usage
def print_memory_usage():
    process = psutil.Process()
    print(f"Memory usage: {process.memory_info().rss / 1024 / 1024:.2f} MB")

# Before processing
print_memory_usage()

# Set up progress bar
with ProgressBar():
    # Process your DataFrame
    result = sampled_users_book.compute()  # or whatever operation you're doing
    
    # Save to parquet with optimizations
    result.to_parquet(
        'data/sampled_users_book.parquet',
        compression='snappy',
        index=False,
        engine='pyarrow'
    )

# After processing
print_memory_usage()

In [30]:
download_from_r2("data/book_texts_reduced.parquet", "data/book_texts_reduced.parquet")

Successfully downloaded data/book_texts_reduced.parquet to data/book_texts_reduced.parquet


## Uploading data to Cloud

In [None]:
# Upload files
files_to_upload = [
    ("data/reduce_books_df.parquet", "data/sampled_users_book.parquet"),
]

upload_to_r2(files_to_upload, "bookdbio")

## Checking Data Before Running Script

In [None]:
download_from_r2("data/sampled_users_book.parquet")

In [33]:
import pandas as pd
import dask.dataframe as dd

# Load and check the input files
print("=== User-Book Interactions ===")
user_books = dd.read_parquet('data/sampled_users_book.parquet')
user_books_pd = user_books.compute()
print("Shape:", user_books_pd.shape)
print("\nColumns:", user_books_pd.columns.tolist())
print("\nSample:")
print(user_books_pd.head())
print("\nBooks_read column type:", type(user_books_pd['books_read'].iloc[0]))
print("Sample books_read value:", user_books_pd['books_read'].iloc[0])

print("\n=== Book Metadata ===")
books_df = dd.read_parquet('data/reduce_books_df.parquet')
books_pd = books_df.compute()
print("Shape:", books_pd.shape)
print("\nColumns:", books_pd.columns.tolist())
print("\nSample:")
print(books_pd.head())

print("\n=== Book Texts ===")
book_texts = dd.read_parquet('data/book_texts_reduced.parquet')
book_texts_pd = book_texts.compute()
print("Shape:", book_texts_pd.shape)
print("\nColumns:", book_texts_pd.columns.tolist())
print("\nSample:")
print(book_texts_pd.head())

=== User-Book Interactions ===
Shape: (50000, 3)

Columns: ['user_id', 'books_read', 'num_books']

Sample:
                            user_id  \
0  001af7947e217e17694c5a9c097afffb   
1  0006260f85929db85eddee3a0bd0e504   
2  000bcda59ab565512f51f9e1f531b5e5   
3  0005f52944ea1992e95d61f287acaea9   
4  000883382802f2d95a3dd545bb953882   

                                          books_read  num_books  
0  [57854, 34, 7332, 5470, 9646, 14142, 11138, 17...         38  
1  [29056083, 357, 5358, 78129, 375802, 10428708,...         20  
2  [862041, 2767052, 8933944, 3685, 1772910, 2641...         60  
3  [2219694, 169875, 18335634, 23705512, 7171637,...         65  
4  [22402154, 13372690, 13104080, 10429045, 67523...        155  

Books_read column type: <class 'str'>
Sample books_read value: [57854, 34, 7332, 5470, 9646, 14142, 11138, 17343, 30633, 92003, 23617, 46654, 15241, 18512, 77566, 538845, 1519, 665, 15997, 3836, 33, 597790, 30659, 102920, 23613, 103390, 5129, 11149, 100915, 767

## Check Training Data

In [36]:
import pandas as pd
import dask.dataframe as dd

# Load the training pairs
training_pairs = dd.read_parquet('data/training_pairs.parquet')

# Convert to pandas for easier inspection
training_pairs_pd = training_pairs.compute()

# Display basic information
print("Shape of the dataset:", training_pairs_pd.shape)
print("\nColumns:", training_pairs_pd.columns.tolist())
print("\nSample of the data:")
print(training_pairs_pd.head())

# Check the distribution of labels
print("\nLabel distribution:")
print(training_pairs_pd['label'].value_counts())

# Check some example user contexts and book texts
print("\nExample user context:")
print(training_pairs_pd['user_ctx'].iloc[0])
print("\nExample book text:")
print(training_pairs_pd['book_text'].iloc[0])

Shape of the dataset: (0, 5)

Columns: ['user_id', 'book_id', 'user_ctx', 'book_text', 'label']

Sample of the data:
Empty DataFrame
Columns: [user_id, book_id, user_ctx, book_text, label]
Index: []

Label distribution:
Series([], Name: count, dtype: int64)

Example user context:


IndexError: single positional indexer is out-of-bounds