# This notebook demonstrates the entire recommendation pipeline

In [8]:
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
import matplotlib.pyplot as plt
import boto3
from botocore.config import Config
import dask.dataframe as dd

In [18]:
chosen_user_id = 12345

## Import Necessary Data and Create Book Library and User Library

In the actual pipeline this is not necessary as we have databases setup, but for the purpose of this demo and experimentation we need load these sets localy into the notebook to create a user library.

In [5]:
import os
import boto3
from botocore.config import Config

def download_from_r2(object_name, local_path, bucket_name="bookdbio"):
    # ensure parent dir exists
    parent_dir = os.path.dirname(local_path)
    if parent_dir and not os.path.isdir(parent_dir):
        os.makedirs(parent_dir, exist_ok=True)

    s3 = boto3.client('s3',
        endpoint_url = f"https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
        aws_access_key_id = '85fec6dd1268801ac8c1c59175ba0b76',
        aws_secret_access_key = '798b753bab748f2c7f5e0f46fd6506b7f0b206e362b1e00055d060a72b88d55d',
        config = Config(signature_version='s3v4')
   )

    try:
        s3.download_file(bucket_name, object_name, local_path)
        print(f"Successfully downloaded {object_name} to {local_path}")
    except Exception as e:
        print(f"Download failed for {object_name}: {e}")

In [6]:
def list_bucket_contents(bucket_name="bookdbio"):
    """List all objects in the R2 bucket"""
    s3 = boto3.client('s3',
        endpoint_url = f"https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
        aws_access_key_id = '85fec6dd1268801ac8c1c59175ba0b76',
        aws_secret_access_key = '798b753bab748f2c7f5e0f46fd6506b7f0b206e362b1e00055d060a72b88d55d',
        config = Config(signature_version='s3v4')
   )
    
    try:
        response = s3.list_objects_v2(Bucket=bucket_name)
        if 'Contents' in response:
            print("Available files in bucket:")
            for obj in response['Contents']:
                print(f"- {obj['Key']}")
        else:
            print("Bucket is empty")
    except Exception as e:
        print(f"Error listing bucket contents: {e}")

### 1. Import books_df

Contains book metadata for complete library.

In [None]:
download_from_r2("data/reduced_books.parquet", "data/reduced_books.parquet")

In [9]:
books_df = dd.read_parquet("data/reduced_books.parquet")

In [None]:
books_df.head() 

### 2. Import reduced_interactions

Reduced set of interactions: contains information about a users interaction with a book.

In [11]:
download_from_r2("data/reduced_interactions.parquet", "data/reduced_interactions.parquet")

Successfully downloaded data/reduced_interactions.parquet to data/reduced_interactions.parquet


In [12]:
interactions_df = dd.read_parquet("data/reduced_interactions.parquet")

### 3. Import authors

Used to create complete book information.

In [None]:
download_from_r2("data/new_authors.parquet", "data/new_authors.parquet")

In [14]:
authors_df = dd.read_parquet("data/new_authors.parquet")

### 4. Adding Authors and Extracting Genres to Improve Books Dataset

In [15]:

def extract_genres(popular_shelves):
    """
    Extracts potential genres from a list of popular shelves dictionaries,
    adding only the base genre keyword found.

    Args:
        popular_shelves: A list of dictionaries, where each dictionary has
                         'count' and 'name' keys.

    Returns:
        A list of unique base genre names found, or an empty list on error.
    """
    try:
        if not isinstance(popular_shelves, np.ndarray) or len(popular_shelves) == 0:
            return []
        
        # Use a set to store unique base genres found
        found_genres = set() 
        
        genre_keywords = [
            'action', 'adventure', 'comedy', 'crime', 'mystery', 'textbook', 'children', 'mathematics', 'fantasy',
            'historical', 'horror', 'romance', 'satire', 'science fiction',
            'scifi', 'speculative fiction', 'thriller', 'western', 'paranormal',
            'dystopian', 'urban fantasy', 'contemporary', 'young adult', 'ya',
            'middle grade', 'children\'s', 'literary fiction', 'magic realism',
            'historical fiction', 'gothic', 'suspense', 'biography', 'memoir',
            'nonfiction', 'poetry', 'drama', 'historical romance',
            'fantasy romance', 'romantic suspense', 'science fiction romance',
            'contemporary romance', 'paranormal romance', 'epic fantasy',
            'dark fantasy', 'sword and sorcery', 'steampunk', 'cyberpunk',
            'apocalyptic', 'post-apocalyptic', 'alternate history',
            'superhero', 'mythology', 'fairy tales', 'folklore', 'war',
            'military fiction', 'spy fiction', 'political fiction', 'social science fiction',
            'techno-thriller', 'medical thriller', 'legal thriller',
            'psychological thriller', 'cozy mystery', 'hardboiled', 'noir',
            'coming-of-age', 'lgbtq+', 'christian fiction', 'religious fiction',
            'humor', 'travel', 'food', 'cooking', 'health', 'self-help',
            'business', 'finance', 'history', 'science', 'technology', 'nature',
            'art', 'music', 'philosophy', 'education', 'true crime', 'spiritual',
            'anthology', 'short stories', 'plays', 'screenplays', 'graphic novel',
            'comics', 'manga', 'erotica', 'new adult', 'chick lit', 'womens fiction',
            'sports fiction', 'family saga', ' Regency romance', 'literature'
        ]
        # Sort keywords by length descending to match longer phrases first (e.g., "science fiction" before "science")
        genre_keywords.sort(key=len, reverse=True)

        ignore_keywords = ['to-read', 'owned', 'hardcover', 'shelfari-favorites', 'series', 'might-read',
                           'dnf-d', 'hambly-barbara', 'strong-females', 'first-in-series',
                           'no-thanks-series-collections-boxes', 'entertaining-but-limited',
                           'kate-own', 'e-book', 'compliation', 'my-books',
                           'books-i-own-but-have-not-read', 'everything-owned', 'books-to-find',
                           'i-own-it', 'favorite', 'not-read', 'read-some-day', 'library',
                           'audiobooks', 'status-borrowed', 'owned-books',
                           'spec-fic-awd-locus-nom', '01', 'hardbacks', 'paper', 'german',
                           'hardback', 'physical-scifi-fantasy', 'childhood-favorites',
                           'bundle-same-author', 'aa-sifi-fantasy', 'ready-to-read',
                           'bought-on-flee-markets', 'fantasy-general', 'hardcopy', 'box-2',
                           'unfinished', 'magic', 'duplicates', 'favorites', 'books-i-own',
                           'fantasy-classic', 'own-hard-copy', 'fantasy-read',
                           'book-club-edition', 'sci-fi-or-fantasy', 'fiction-fantasy',
                           'fiction-literature-poetry', 'paused-hiatus', 'statusâ€”borrowed',
                           'recs-fantasy', 'fantasy-scifi', 'omnibus', 'speculative',
                           'sf--fantasy', 'in-my-home-library', 'fant-myth-para-vamps',
                           'read-in-my-20s']

        for shelf in popular_shelves:
            if not isinstance(shelf, dict) or 'name' not in shelf:
                continue
            
            shelf_name = shelf['name'].lower().strip() # Normalize shelf name

            # Skip if shelf name contains any ignore keywords
            if any(ignore in shelf_name for ignore in ignore_keywords):
                continue

            # Check if any genre keyword is present in the shelf name
            for keyword in genre_keywords:
                # Use word boundaries or careful checks to avoid partial matches (e.g., 'art' in 'heart')
                # Simple substring check for now, might need refinement depending on data
                if keyword in shelf_name: 
                    found_genres.add(keyword) # Add the base keyword
                    # Optional: break here if you only want the first/longest match per shelf
                    # break 

        return sorted(list(found_genres))
    except Exception as e:
        print(f"Error in extract_genres function: {e}")
        # Log the error message
        logging.error("Error in extract_genres function", exc_info=True)
        return []

In [16]:
# Create reduced DataFrame
reduced_books_df = books_df[['book_id', 'title', 'description']].copy()

# Modify extract_genres to return a string instead of a list
def extract_genres_string(shelves):
    genres = extract_genres(shelves)
    return ','.join(genres) if genres else ''

# Apply the modified function to get string representation of genres
reduced_books_df['genre'] = books_df['popular_shelves'].apply(extract_genres_string)

# Convert authors to string representation as well
def get_author_names(author_ids):
    author_names = []
    for author_id in author_ids:
        try:
            name = authors_df.loc[authors_df['author_id'] == author_id]['name'].compute().values[0]
            author_names.append(name)
        except:
            continue
    return ','.join(author_names)

reduced_books_df['authors'] = books_df['authors'].apply(get_author_names)

# Display sample of the reduced DataFrame
print("\nSample of reduced books DataFrame:")
print(reduced_books_df.head())

# Display genre distribution (need to split the strings for counting)
print("\nGenre distribution:")
genre_counts = reduced_books_df['genre'].apply(lambda x: x.split(',') if x else []).explode().value_counts()
print(genre_counts)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('popular_shelves', 'object'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('authors', 'object'))




Sample of reduced books DataFrame:
      book_id                                              title  \
3     6066819                               Best Friends Forever   
15      89375  90 Minutes in Heaven: A True Story of Death an...   
479  11731782                              Collide (Collide, #1)   
583     54270                                         Mein Kampf   
807     38568                         A Quick Bite (Argeneau #1)   

                                           description  \
3    Addie Downs and Valerie Adler were eight when ...   
15   As he is driving home from a minister's confer...   
479  Sherry has always known there was something ou...   
583  Madman, tyrant, animal - history has given Ado...   
807  That hot guy tied to Lissianna Argeneau's bed?...   

                                                 genre  \
3    coming-of-age,contemporary,drama,humor,mystery...   
15     biography,memoir,nonfiction,self-help,spiritual   
479  contemporary,dystopian,fant

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('genre', 'object'))



In [None]:
### 4. Creating User Library
def get_user_library(user_id: str, interactions_df: pd.DataFrame) -> List[int]:
    """
    Get all book IDs associated with a specific user from the interactions DataFrame.
    
    Args:
        user_id: The ID of the user to get books for (as string)
        interactions_df: DataFrame containing user-book interactions with columns 'user_id' and 'book_id'
        
    Returns:
        List of book IDs that the user has interacted with
    """
    # Filter interactions for the specific user and get their book IDs
    user_books = interactions_df[interactions_df['user_id'] == user_id]['book_id'].tolist()
    return user_books

chosen_user_library = get_user_library(chosen_user_id)


## Generating Candidates Using Base Models

In [2]:
user_embeddings_path = "../embeddings/gmf_user_embeddings.parquet"
book_embeddings_path = "../embeddings/gmf_book_embeddings.parquet"
sbert_embeddings_path = "../embeddings/SBERT_embeddings.parquet"

user_embeddings_df = pd.read_parquet(user_embeddings_path)
book_embeddings_df = pd.read_parquet(book_embeddings_path)
sbert_embeddings_df = pd.read_parquet(sbert_embeddings_path)

user_embeddings_df['user_id'] = user_embeddings_df['user_id'].astype(int)
book_embeddings_df['item_id'] = book_embeddings_df['item_id'].astype(int)

# Store GMF embedding column names
gmf_embedding_cols = [str(i) for i in range(32)]

### Ranking based on NCF

Return NCF Candidates

In [3]:
def get_gmf_ranking(user_id: str, 
                   user_embeddings_df: pd.DataFrame,
                   book_embeddings_df: pd.DataFrame,
                   k: int = 250) -> List[str]:
    """
    Generate book recommendations using only GMF embeddings.
    
    Args:
        user_id: The user ID to get recommendations for
        user_embeddings_df: DataFrame containing user embeddings
        book_embeddings_df: DataFrame containing book embeddings
        k: Number of recommendations to return (default 10)
        
    Returns:
        List of recommended book IDs
    """
    # Convert user_id to int
    user_id_int = int(user_id)
    
    # Get user embedding
    user_row = user_embeddings_df[user_embeddings_df['user_id'] == user_id_int]
    if len(user_row) == 0:
        raise ValueError(f"No embedding found for user {user_id}")
    
    # Get embedding columns (assuming they're named 0-31)
    embedding_cols = [str(i) for i in range(32)]
    user_emb = user_row[embedding_cols].iloc[0].values.tolist()
    
    # Get all book embeddings
    book_scores = []
    for _, book_row in book_embeddings_df.iterrows():
        book_id = str(book_row['item_id'])
        book_emb = book_row[embedding_cols].values.tolist()
        
        # Compute score using dot product
        score = np.dot(user_emb, book_emb)
        if not (np.isnan(score) or np.isinf(score)):
            book_scores.append((book_id, score))
    
    # Sort by score in descending order and return top k
    book_scores.sort(key=lambda x: x[1], reverse=True)
    return [book_id for book_id, _ in book_scores[:k]]

In [4]:


ncf_recommendations = get_gmf_ranking(
    user_id=chosen_user_id,
    user_embeddings_df=user_embeddings_df,
    book_embeddings_df=book_embeddings_df,
    k=250
)

print("Top 10 GMF recommendations for user 12345:")
print(ncf_recommendations)

Top 10 GMF recommendations for user 12345:
['12042.0', '10774.0', '1085.0', '13668.0', '8733.0', '2741.0', '16184.0', '7894.0', '8269.0', '14932.0', '5702.0', '4869.0', '9947.0', '6569.0', '7002.0', '4048.0', '11636.0', '3459.0', '10186.0', '2596.0', '11635.0', '14367.0', '10185.0', '15617.0', '934.0', '17007.0', '16462.0', '16461.0', '933.0', '5207.0', '3603.0', '6417.0', '13945.0', '5977.0', '9410.0', '8676.0', '17006.0', '1865.0', '1349.0', '6418.0', '10184.0', '935.0', '8677.0', '8678.0', '16162.0', '5123.0', '7500.0', '3137.0', '588.0', '15742.0', '4870.0', '6502.0', '9700.0', '4590.0', '9794.0', '13615.0', '8270.0', '14706.0', '8076.0', '1350.0', '17637.0', '17180.0', '13035.0', '11609.0', '16136.0', '2184.0', '11664.0', '8570.0', '10773.0', '4236.0', '12011.0', '9177.0', '11782.0', '12686.0', '16946.0', '10940.0', '9717.0', '4019.0', '1060.0', '16241.0', '675.0', '9508.0', '2786.0', '13336.0', '422.0', '1771.0', '3338.0', '6972.0', '4433.0', '6544.0', '2645.0', '9332.0', '13534.

### Ranking based on SBERT

Return SBERT Candidates

In [None]:
def get_sbert_recommendations(
    user_library: List[str],
    book_embeddings_df: pd.DataFrame,
    sbert_embeddings_df: pd.DataFrame,
    k: int = 250
) -> List[Tuple[str, float]]:
    """
    Get book recommendations based on cosine similarity between user's library and other books.
    
    Args:
        user_library: List of book IDs in user's library
        book_embeddings_df: DataFrame containing book metadata
        sbert_embeddings_df: DataFrame containing SBERT embeddings
        k: Number of recommendations to return
        
    Returns:
        List of tuples (book_id, similarity_score)
    """
    # Get all book IDs excluding user's library
    all_book_ids = sbert_embeddings_df['book_id'].tolist()
    candidate_book_ids = [bid for bid in all_book_ids if str(bid) not in user_library]
    
    # Get embeddings for user's library books
    library_embeddings = []
    for book_id in user_library:
        book_row = sbert_embeddings_df[sbert_embeddings_df['book_id'] == book_id]
        if not book_row.empty:
            # Get embedding columns (assuming they're named 0-31)
            embedding_cols = [str(i) for i in range(32)]
            embedding = book_row[embedding_cols].iloc[0].values
            library_embeddings.append(embedding)
    
    if not library_embeddings:
        return []
    
    # Compute average library embedding
    avg_library_embedding = np.mean(library_embeddings, axis=0)
    
    # Compute similarity scores for candidate books
    scores = {}
    for book_id in candidate_book_ids:
        book_row = sbert_embeddings_df[sbert_embeddings_df['book_id'] == book_id]
        if not book_row.empty:
            # Get embedding columns
            embedding_cols = [str(i) for i in range(32)]
            book_embedding = book_row[embedding_cols].iloc[0].values
            
            # Compute cosine similarity
            similarity = cosine_similarity(avg_library_embedding, book_embedding)
            scores[str(book_id)] = similarity
    
    # Sort by similarity score
    ranked_books = sorted(
        [(book_id, score) for book_id, score in scores.items()],
        key=lambda x: x[1],
        reverse=True
    )
    
    return ranked_books[:k]

def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """
    Compute cosine similarity between two vectors.
    """
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

In [None]:
# Example usage
query = "mystery thriller with plot twists"
recommendations = get_sbert_recommendations(
    query=query,
    book_embeddings_df=book_embeddings_df,
    sbert_embeddings_df=sbert_embeddings_df,
    k=10
)

# Print recommendations
for book_id, score in recommendations:
    print(f"Book ID: {book_id}, Similarity Score: {score:.3f}")

## Rules Based Filtering
	1.	Filters out duplicates and already-read books
	2.	Boosts diversity across genres and authors via a simple MMR-style re-rank
	3.	Leaves you with a cleaned, diversified list (~K=50) to feed into your
    cross-encoder

In [None]:
def filter_candidates(
    candidates: List[str], 
    book_meta: dd.DataFrame, 
    user_history: Set[str]
) -> List[str]:
    """
    1) Remove books the user already read.
    2) Remove duplicates by normalized title.
    """
    seen_titles = set()
    out = []
    for b in candidates:
        if b in user_history:
            continue
        # Get title from dask dataframe
        title = book_meta[book_meta['item_id'] == int(b)]['title'].compute().iloc[0].lower().strip()
        if title in seen_titles:
            continue
        seen_titles.add(title)
        out.append(b)
    return out

In [None]:
def mmr_diversify(
    candidates: List[str],
    book_meta: dd.DataFrame,
    initial_scores: Dict[str, float],
    k: int = 50,
    lambda_param: float = 0.7
) -> List[str]:
    """
    Maximal Marginal Relevance for genre/author diversity.

    - initial_scores[b]: the rule-based or CF/SBERT score for b
    - book_meta[b]['genres'] & ['authors'] are lists
    - k: target number of books to pick
    - lambda_param: trade-off between relevance vs. diversity
    """
    # precompute embeddings as sets
    rep = {}
    for b in candidates:
        rep[b] = set(book_meta[b].get('genres', [])) | set(book_meta[b].get('authors', []))
    
    selected = []
    # pick the highest-scored book first
    first = max(candidates, key=lambda b: initial_scores.get(b, 0))
    selected.append(first)
    
    # remaining pool
    pool = set(candidates) - {first}
    
    # MMR loop
    while pool and len(selected) < k:
        mmr_scores = {}
        for b in pool:
            relevance = initial_scores.get(b, 0)
            # diversity = max similarity with any already-selected
            sim_to_sel = max(
                len(rep[b].intersection(rep[s])) / len(rep[b].union(rep[s])) 
                if rep[b] and rep[s] else 0
                for s in selected
            )
            mmr_scores[b] = lambda_param * relevance - (1 - lambda_param) * sim_to_sel
        # pick best
        best = max(mmr_scores, key=mmr_scores.get)
        selected.append(best)
        pool.remove(best)
    return selected

In [None]:
def rule_and_diverse_rerank(
    raw_candidates: List[str],
    book_meta: dd.DataFrame,
    user_history: Set[str],
    base_scores: Dict[str, float],
    final_k: int = 50
) -> List[str]:
    # 1) Hard filter
    clean = filter_candidates(raw_candidates, book_meta, user_history)

    # 2)  Apply simple heuristics to adjust base_scores to boost recent publications
    for b in clean:
        base_scores[b] += 0.01 * (book_meta[b]['pub_year'] - 2000)

    # 3) Diversify via MMR
    diversified = mmr_diversify(clean, book_meta, base_scores, k=final_k)

    return diversified

### Obtain final candidates for cross-encoder

In [None]:

raw = cf_candidates + sbert_candidates
user_hist = user_library
# Weighting can be altered
scores = {b: cf_score[b] + 0.5*sbert_score[b] for b in raw}
top50 = rule_and_diverse_rerank(raw, reduced_books_df, Set(user_hist), scores, final_k=50)


## Cross Encoder Reranking