## Reranker Logic Notebook

In [46]:
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from sentence_transformers import CrossEncoder

class LocalBookReranker:
    def __init__(
        self,
        user_embeddings_path: str = "../embeddings/gmf_user_embeddings.parquet",
        book_embeddings_path: str = "../embeddings/gmf_book_embeddings.parquet",
        sbert_embeddings_path: str = "../embeddings/SBERT_embeddings.parquet",
        diversity_weight: float = 0.3,
        gmf_weight: float = 0.7,
        sbert_weight: float = 0.3,
        k: int = 10,
        embedding_dim: int = 32
    ):
        """Load the embedding tables and store the reranking settings.

        Args:
            user_embeddings_path: Parquet file with a ``user_id`` column plus
                the GMF embedding columns.
            book_embeddings_path: Parquet file with an ``item_id`` column plus
                the GMF embedding columns.
            sbert_embeddings_path: Parquet file with the SBERT data; it is
                loaded as-is, without any ID conversion.
            diversity_weight: Weight of the diversity term in
                ``_diversity_rerank``.
            gmf_weight: Weight of the GMF score in
                ``_incorporate_sbert_scores``.
            sbert_weight: Weight of the SBERT score in
                ``_incorporate_sbert_scores``.
            k: Maximum number of items returned by the ranking methods.
            embedding_dim: Number of GMF embedding columns. The columns are
                expected to be named ``"0"`` .. ``str(embedding_dim - 1)``.
                Defaults to 32, the width that used to be hard-coded.
        """
        print("Loading embeddings...")
        self.user_embeddings_df = pd.read_parquet(user_embeddings_path)
        self.book_embeddings_df = pd.read_parquet(book_embeddings_path)
        self.sbert_embeddings_df = pd.read_parquet(sbert_embeddings_path)

        # IDs are compared against integers everywhere else in the class, so
        # normalise both ID columns to int once, right after loading.
        self.user_embeddings_df['user_id'] = self.user_embeddings_df['user_id'].astype(int)
        self.book_embeddings_df['item_id'] = self.book_embeddings_df['item_id'].astype(int)

        # Names of the GMF embedding columns shared by the user and book tables.
        self.gmf_embedding_cols = [str(i) for i in range(embedding_dim)]

        self.diversity_weight = diversity_weight
        self.gmf_weight = gmf_weight
        self.sbert_weight = sbert_weight
        self.k = k
        print("Embeddings loaded successfully!")

    def _get_initial_ranking(
        self,
        candidates: List[str],
        gmf_scores: Dict[str, float]
    ) -> List[Tuple[str, float]]:
        """Create initial ranking based on GMF scores"""
        print(f"Debug - Number of candidates: {len(candidates)}")
        print(f"Debug - Number of GMF scores: {len(gmf_scores)}")
        print(f"Debug - First few candidates: {candidates[:5]}")
        print(f"Debug - First few GMF scores keys: {list(gmf_scores.keys())[:5]}")
        print(f"Debug - Sample GMF scores values: {list(gmf_scores.values())[:5]}")
        
        # Convert float book IDs to integer strings
        gmf_scores_fixed = {
            str(int(float(k))): v 
            for k, v in gmf_scores.items()
        }
        
        print(f"Debug - First few fixed GMF scores keys: {list(gmf_scores_fixed.keys())[:5]}")
        
        # Ensure all IDs are strings and filter out any missing scores
        scored_candidates = []
        for book_id in candidates:
            str_id = str(book_id)
            if str_id in gmf_scores_fixed:
                score = gmf_scores_fixed[str_id]
                if not (np.isnan(score) or np.isinf(score)):
                    scored_candidates.append((str_id, score))
        
        print(f"Debug - Number of scored candidates: {len(scored_candidates)}")
        if scored_candidates:
            print(f"Debug - First few scored candidates: {scored_candidates[:5]}")
        
        # Sort by score in descending order
        return sorted(scored_candidates, key=lambda x: x[1], reverse=True)

    def _compute_gmf_scores(
        self,
        user_emb: List[float],
        book_embeddings: Dict[str, List[float]]
    ) -> Dict[str, float]:
        """Compute GMF scores using dot product"""
        scores = {}
        for book_id, book_emb in book_embeddings.items():
            score = np.dot(user_emb, book_emb)
            if not (np.isnan(score) or np.isinf(score)):  # Filter out invalid scores
                scores[str(book_id)] = float(score)  # Ensure score is a regular float
        return scores

    def get_recommendations(
        self,
        user_id: str,
        query: str = None,
        candidate_books: List[str] = None
    ) -> List[str]:
        """Get reranked book recommendations for a user"""
        print(f"\nGetting recommendations for user {user_id}")
        
        # Convert user_id to int for comparison
        user_id_int = int(user_id)
        
        # Get user embedding
        user_emb = self._get_user_embedding(user_id_int)
        if user_emb is None:
            raise ValueError(f"No embedding found for user {user_id}")
        
        # Get candidate books if not provided
        if candidate_books is None:
            candidate_books = [str(id) for id in self.book_embeddings_df['item_id'].tolist()]
        
        print(f"Number of candidate books: {len(candidate_books)}")
        
        # Get book embeddings and compute GMF scores
        book_embeddings = self._get_book_embeddings(candidate_books)
        print(f"Number of book embeddings retrieved: {len(book_embeddings)}")
        
        if not book_embeddings:
            print("Warning: No book embeddings retrieved!")
            return []
        
        gmf_scores = self._compute_gmf_scores(user_emb, book_embeddings)
        print(f"Number of GMF scores computed: {len(gmf_scores)}")
        
        # Get initial ranking
        initial_ranking = self._get_initial_ranking(candidate_books, gmf_scores)
        print(f"Length of initial ranking: {len(initial_ranking)}")
        
        if len(initial_ranking) == 0:
            print("Warning: Initial ranking is empty!")
            return []
        
        return [book_id for book_id, _ in initial_ranking[:self.k]]

    def _get_user_embedding(self, user_id: int) -> List[float]:
        """Get user embedding from dataframe"""
        user_row = self.user_embeddings_df[
            self.user_embeddings_df['user_id'] == user_id
        ]
        if len(user_row) == 0:
            return None
        return user_row[self.gmf_embedding_cols].iloc[0].values.tolist()

    def _get_book_embeddings(self, book_ids: List[str]) -> Dict[str, List[float]]:
        """Get book embeddings from dataframe"""
        # Convert book_ids to integers for comparison
        book_ids_int = [int(id) for id in book_ids]
        book_rows = self.book_embeddings_df[
            self.book_embeddings_df['item_id'].isin(book_ids_int)
        ]
        
        # Create dictionary with string keys for consistency
        return {
            str(row['item_id']): row[self.gmf_embedding_cols].values.tolist()
            for _, row in book_rows.iterrows()
        }
        
    def _incorporate_sbert_scores(
        self,
        initial_ranking: List[Tuple[str, float]],
        sbert_scores: Dict[str, float]
    ) -> List[Tuple[str, float]]:
        """Combine GMF and SBERT scores"""
        combined_scores = []
        for book_id, gmf_score in initial_ranking:
            sbert_score = sbert_scores.get(book_id, 0)
            combined_score = (
                self.gmf_weight * gmf_score + 
                self.sbert_weight * sbert_score
            )
            combined_scores.append((book_id, combined_score))
        return sorted(combined_scores, key=lambda x: x[1], reverse=True)

    def _compute_text_similarity(self, query: str, text: str) -> float:
        """Compute similarity between query and text"""
        # Simple implementation - in practice, use a proper text similarity model
        return 0.5  # Placeholder

    def get_random_user_id(self):
        """Get a random valid user ID for testing"""
        return str(self.user_embeddings_df['user_id'].sample(1).iloc[0])

    def display_user_embedding(self, user_id: str):
        """Display the embedding for a specific user"""
        # Convert user_id to int for comparison
        user_id_int = int(user_id)
        
        user_row = self.user_embeddings_df[
            self.user_embeddings_df['user_id'] == user_id_int
        ]
        
        if len(user_row) == 0:
            print(f"No embedding found for user ID: {user_id}")
            return None
        
        # Get embedding from numbered columns
        embedding = user_row[self.gmf_embedding_cols].iloc[0].values.tolist()
        print(f"User ID: {user_id}")
        print(f"Embedding dimension: {len(embedding)}")
        print("\nFirst 5 values of embedding:")
        print(embedding[:5])
        print("...")
        print("\nLast 5 values of embedding:")
        print(embedding[-5:])
        
        return embedding

    def _compute_sbert_scores(
        self,
        query: str,
        book_ids: List[str]
    ) -> Dict[str, float]:
        """Compute similarity between query and book texts"""
        book_rows = self.sbert_embeddings_df[
            self.sbert_embeddings_df['book_id'].isin(book_ids)
        ]
        scores = {}
        for _, row in book_rows.iterrows():
            similarity = self._compute_text_similarity(query, row['text'])
            scores[str(row['book_id'])] = similarity
        return scores

    def _diversity_rerank(
        self,
        ranked_items: List[Tuple[str, float]],
        book_embeddings: Dict[str, List[float]]
    ) -> List[str]:
        """Apply diversity-aware reranking"""
        selected = []
        candidates = ranked_items.copy()
        
        while len(selected) < self.k and candidates:
            if not selected:
                selected.append(candidates.pop(0)[0])
                continue
            
            diversity_scores = []
            for book_id, base_score in candidates:
                if book_id in book_embeddings:  # Check if embedding exists
                    diversity_score = self._calculate_diversity(
                        book_id,
                        selected,
                        book_embeddings
                    )
                    final_score = (
                        (1 - self.diversity_weight) * base_score +
                        self.diversity_weight * diversity_score
                    )
                    diversity_scores.append((book_id, final_score))
            
            if diversity_scores:  # Check if we have valid scores
                best_candidate = max(diversity_scores, key=lambda x: x[1])[0]
                selected.append(best_candidate)
                candidates = [c for c in candidates if c[0] != best_candidate]
            else:
                break
        
        return selected
    
    def _calculate_diversity(
        self,
        candidate_id: str,
        selected_ids: List[str],
        embeddings: Dict[str, List[float]]
    ) -> float:
        """Calculate diversity score for a candidate"""
        if not selected_ids:
            return 1.0
        
        candidate_emb = embeddings[candidate_id]
        similarities = []
        
        for selected_id in selected_ids:
            selected_emb = embeddings[selected_id]
            similarity = np.dot(candidate_emb, selected_emb)
            similarities.append(similarity)
        
        return 1.0 - np.mean(similarities)
    
    def list_available_users(self, n=10):
        """
        Display first n available user IDs
        
        Args:
            n: Number of users to display (default 10)
        Returns:
            List of user IDs
        """
        users = self.user_embeddings_df['user_id'].tolist()
        print(f"Total number of users: {len(users)}")
        print(f"\nFirst {n} user IDs:")
        for i, user_id in enumerate(users[:n]):
            print(f"{i+1}. {user_id}")
        return users

    def search_user_id(self, partial_id: str):
        """
        Search for user IDs containing the given string
        
        Args:
            partial_id: Part of the user ID to search for
        Returns:
            List of matching user IDs
        """
        matching_users = self.user_embeddings_df[
            self.user_embeddings_df['user_id'].astype(str).str.contains(partial_id)
        ]['user_id'].tolist()
        
        print(f"Found {len(matching_users)} matching users:")
        for i, user_id in enumerate(matching_users[:10]):  # Show first 10 matches
            print(f"{i+1}. {user_id}")
        
        if len(matching_users) > 10:
            print(f"... and {len(matching_users) - 10} more")
        
        return matching_users
    
    # Diagnostic helper for checking the schema of the loaded dataframes.
    def inspect_data_structure(self):
        """
        Print the structure of the loaded dataframes
        """
        print("User Embeddings DataFrame Columns:")
        print(self.user_embeddings_df.columns.tolist())
        print("\nFirst row sample:")
        print(self.user_embeddings_df.iloc[0])
        
        print("\nBook Embeddings DataFrame Columns:")
        print(self.book_embeddings_df.columns.tolist())
        print("\nFirst row sample:")
        print(self.book_embeddings_df.iloc[0])
        
        print("\nSBERT Embeddings DataFrame Columns:")
        print(self.sbert_embeddings_df.columns.tolist())
        print("\nFirst row sample:")
        print(self.sbert_embeddings_df.iloc[0])

    def debug_recommendations(self, user_id: str):
        """Debug the recommendation process"""
        print(f"Debugging recommendations for user {user_id}")
        
        # Convert user_id to int
        user_id_int = int(user_id)
        
        # Check user embedding
        user_emb = self._get_user_embedding(user_id_int)
        print(f"\nUser embedding exists: {user_emb is not None}")
        if user_emb is not None:
            print(f"User embedding dimension: {len(user_emb)}")
        
        # Get some candidate books
        candidate_books = [str(id) for id in self.book_embeddings_df['item_id'].head(100).tolist()]
        print(f"\nNumber of candidate books: {len(candidate_books)}")
        print("First 5 candidate book IDs:", candidate_books[:5])
        
        # Get book embeddings
        book_embeddings = self._get_book_embeddings(candidate_books)
        print(f"Number of book embeddings retrieved: {len(book_embeddings)}")
        
        if len(book_embeddings) == 0:
            print("\nWARNING: No book embeddings retrieved!")
            print("First 5 book IDs in embeddings DataFrame:", 
                  self.book_embeddings_df['item_id'].head().tolist())
        
        # Compute GMF scores
        if user_emb is not None and book_embeddings:
            gmf_scores = self._compute_gmf_scores(user_emb, book_embeddings)
            print(f"Number of GMF scores computed: {len(gmf_scores)}")
            
            # Get initial ranking
            initial_ranking = self._get_initial_ranking(candidate_books, gmf_scores)
            print(f"Length of initial ranking: {len(initial_ranking)}")
            
            if len(initial_ranking) > 0:
                print("\nTop 5 initial scores:")
                for book_id, score in initial_ranking[:5]:
                    print(f"Book {book_id}: {score}")
        
        return None

In [47]:
# Initialize the reranker from the local parquet embedding files
reranker = LocalBookReranker(
    user_embeddings_path="../embeddings/gmf_user_embeddings.parquet",
    book_embeddings_path="../embeddings/gmf_book_embeddings.parquet",
    sbert_embeddings_path="../embeddings/SBERT_embeddings.parquet"
)

# Inspect data structure (prints columns and first row of each dataframe)
reranker.inspect_data_structure()

# Get a random user ID (returned as a string)
random_user = reranker.get_random_user_id()
print(f"\nRandom user ID: {random_user}")

# Display embedding for the random user
print("\nUser embedding details:")
reranker.display_user_embedding(random_user)

# Debug the recommendation process (print-only walkthrough on the first 100 books)
print("\nDebugging recommendations:")
reranker.debug_recommendations(random_user)

# Get recommendations: top-k book IDs by GMF score over all books
recommendations = reranker.get_recommendations(random_user)
print("\nRecommended book IDs:", recommendations)


Loading embeddings...
Embeddings loaded successfully!
User Embeddings DataFrame Columns:
['user_id', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']

First row sample:
user_id    0.000000
0         -1.839949
1         -0.107026
2         -1.247430
3          1.235410
4         -2.745796
5          3.354668
6          1.215874
7         -0.187588
8          0.332815
9          1.625460
10        -1.902273
11        -0.464240
12         0.143594
13         0.419341
14        -1.652989
15         0.224399
16         1.739050
17         0.911732
18        -0.702986
19        -1.368686
20        -0.918555
21        -2.103576
22        -0.870400
23        -3.136705
24         0.554513
25         0.136379
26         1.170567
27         0.576086
28        -0.698404
29         1.002196
30        -1.492236
31        -0.387739
Name: 0, dtype: float64

Book Embeddin