In [13]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai
from IPython.display import Markdown, display
import gradio as gr
import re
import os
from dotenv import load_dotenv

In [14]:
# Read the GoodReads books CSV into the working DataFrame.
# The path is relative to the notebook's location.
Df = pd.read_csv('../Resources/GoodReads_100k_books.csv')

In [15]:
# Preview the first rows of the raw DataFrame
Df.head()


Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


In [16]:
# List the column names available in the dataset
Df.columns.tolist() 

['author',
 'bookformat',
 'desc',
 'genre',
 'img',
 'isbn',
 'isbn13',
 'link',
 'pages',
 'rating',
 'reviews',
 'title',
 'totalratings']

In [17]:
# Drop columns that the rest of the notebook never reads.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
Df = Df.drop(columns=['link', 'bookformat', 'img', 'isbn', 'isbn13'], errors='ignore')

In [18]:
# Tally the missing values per column and print the report
null_check = Df.isna().sum()
print("Count of null values in each column:", null_check, sep="\n")

Count of null values in each column:
author              0
desc             6772
genre           10467
pages               0
rating              0
reviews             0
title               1
totalratings        0
dtype: int64


In [19]:
# Keep only rows that have both a description and a genre: later cells embed
# the 'desc' text and filter on the 'genre' values, so rows missing either
# cannot be used.
Df = Df.dropna(subset=['desc', 'genre'])

In [20]:
# Collect every distinct genre label found in the comma-separated genre strings
genre_set = {
    label
    for genre_field in Df["genre"]
    for label in genre_field.split(",")
}

# Number of distinct genres
len(genre_set)

1179

In [21]:
# Turn each comma-separated genre string into a list of genre labels.
# The isinstance guard makes the cell idempotent: if it is run a second time
# the values are already lists and are left untouched (calling .str.split on
# a column of lists would silently replace every value with NaN).
Df["genre"] = Df["genre"].apply(
    lambda value: value.split(",") if isinstance(value, str) else value
)

# End of preprocessing
Df.head()

Unnamed: 0,author,desc,genre,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Reveals that several hundred thousand Indians ...,"[History, Military History, Civil War, America...",0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Fashion Sourcebook - 1920s is the first book i...,"[Couture, Fashion, Historical, Art, Nonfiction]",576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,The seminal history and analysis of the Hungar...,"[Politics, History]",124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life a...","[Labor, History]",324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
5,Jeffrey Pfeffer,Why is common sense so uncommon when it comes ...,"[Business, Leadership, Romance, Historical Rom...",368,3.73,7,The Human Equation: Building Profits by Puttin...,119


In [22]:
# Gemini configuration: model name and API key.
#
# SECURITY: the API key must only ever come from the environment / a local
# .env file that is excluded from version control. Never paste a literal key
# into this notebook, not even inside a comment. Any key that has ever been
# saved in a shared copy of this notebook should be treated as leaked and
# revoked.

# Load environment variables.
load_dotenv()

# Set the model name for our LLMs.
GEMINI_MODEL = "gemini-1.5-flash"

# Store the API key in a variable (None when GEMINI_API_KEY is not set).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

In [23]:
# Shared state for the recommender helper functions defined in this cell

# Per-genre cache: genre name -> {'df': filtered DataFrame, 'embeddings': description embeddings}.
# Filled by filter_by_genre and read by recommend_books_interface.
genre_cache = {}

# Sentence-transformer model used to embed both book descriptions and user queries
model = SentenceTransformer('all-MiniLM-L6-v2')

def clean_description(text):
    """Clean description text to remove odd characters and formatting issues.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example NaN or None) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Replace common problematic characters (mis-decoded punctuation,
    # backslashes, non-breaking and zero-width spaces).
    replacements = {
        'â€™': "'",
        'â€"': "—",
        'â€œ': '"',
        'â€': '"',
        '\\': '',
        '\xa0': ' ',
        '\u200b': '',
        ',,': ',',
    }

    # Character replacement runs BEFORE the collapsing steps below: turning
    # '\xa0' into a space or deleting '\u200b' / '\\' can itself create new
    # runs of spaces or commas, which would otherwise survive in the output.
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Replace multiple commas with a single one
    text = re.sub(r',+', ',', text)

    # Replace multiple spaces with a single space
    text = re.sub(r' +', ' ', text)

    # Remove leading/trailing whitespace
    return text.strip()

def rank_books(similar_books, weight_similarity=0.5, weight_rating=0.3, weight_totalratings=0.2):
    """Rank books based on similarity, rating, and popularity.

    Args:
        similar_books: DataFrame with ``similarity``, ``rating`` and
            ``totalratings`` columns. It is not modified.
        weight_similarity: Weight of the similarity score in the final score.
        weight_rating: Weight of the normalized rating.
        weight_totalratings: Weight of the normalized (log-scaled) rating count.

    Returns:
        A copy of ``similar_books`` with ``normalized_similarity``,
        ``normalized_rating``, ``normalized_totalratings`` and ``final_score``
        columns added, sorted by ``final_score`` in descending order.
    """
    # Create a copy to avoid modifying the original
    ranked_books = similar_books.copy()

    # The similarity score is used as-is (no rescaling is applied here).
    ranked_books["normalized_similarity"] = ranked_books["similarity"]

    # Normalize rating (assuming ratings are on a 1-5 scale)
    ranked_books["normalized_rating"] = ranked_books["rating"] / 5.0

    # Normalize total ratings using log scale. When no candidate has any
    # ratings the maximum is log1p(0) == 0; dividing by it would give 0/0 = NaN
    # and poison every final score, so popularity contributes 0 in that case.
    log_counts = np.log1p(ranked_books["totalratings"])
    max_log_count = log_counts.max()
    if max_log_count > 0:
        ranked_books["normalized_totalratings"] = log_counts / max_log_count
    else:
        ranked_books["normalized_totalratings"] = 0.0

    # Compute final weighted score
    ranked_books["final_score"] = (
        (ranked_books["normalized_similarity"] * weight_similarity) +
        (ranked_books["normalized_rating"] * weight_rating) +
        (ranked_books["normalized_totalratings"] * weight_totalratings)
    )

    # Sort by final score, best first
    return ranked_books.sort_values(by="final_score", ascending=False)

def filter_by_genre(genre):
    """Filter books by genre and compute embeddings.

    The filtered rows of ``Df`` and the embeddings of their descriptions are
    stored in ``genre_cache[genre]`` so later searches can reuse them.

    Args:
        genre: Genre label to filter on. A book matches when this label is
            contained in its ``genre`` value.

    Returns:
        A status message for display in the UI.
    """
    if not genre:
        return "Please select a genre to continue."

    # If we've already computed this genre, don't recompute
    if genre in genre_cache:
        return f"Using cached data for genre: {genre}. Now enter your description above and click 'Get Recommendations'."

    # Filter by genre
    filtered_df = Df[Df["genre"].apply(lambda x: genre in x)]

    # Nothing to embed or recommend from: report it instead of caching an
    # empty subset that the similarity search cannot work with.
    if filtered_df.empty:
        return f"No books found for genre: {genre}. Please choose a different genre."

    # Compute embeddings for this subset
    embeddings = model.encode(filtered_df["desc"].tolist(), convert_to_tensor=True)

    # Store in cache
    genre_cache[genre] = {
        'df': filtered_df,
        'embeddings': embeddings
    }

    num_books = len(filtered_df)
    return f"Selected genre: {genre} ({num_books} books available). Now enter your description above and click 'Get Recommendations'."

def format_books_for_prompt(ranked_books):
    """Format books data for the Gemini prompt.

    Args:
        ranked_books: DataFrame with ``title``, ``rating``, ``totalratings``,
            ``similarity`` and ``desc`` columns, and optionally ``author``.
            Rows are emitted in the DataFrame's current order.

    Returns:
        One text section per book, numbered from 1 in row order, each followed
        by a blank line. Returns ``""`` for an empty DataFrame.
    """
    sections = []

    # Number books by their position in the ranking. The DataFrame index still
    # holds the row labels of the original dataset, so it must not be used as
    # the book number.
    for position, (_, row) in enumerate(ranked_books.iterrows(), start=1):
        lines = [
            f"BOOK {position}:",
            f"Title: {row['title']}",
        ]

        # Include the author only if the column is present
        if 'author' in row:
            lines.append(f"Author: {row['author']}")

        lines.append(f"Rating: {row['rating']} out of 5 (based on {int(row['totalratings'])} ratings)")
        lines.append(f"Similarity to query: {row['similarity']:.2f}")
        lines.append(f"Description: {row['desc']}")

        sections.append("\n".join(lines) + "\n\n")

    return "".join(sections)

def create_gemini_prompt(books_text, user_query, genre):
    """Create an enhanced prompt for Gemini.

    Args:
        books_text: Pre-formatted text describing the recommended books; it is
            inserted into the prompt verbatim.
        user_query: The user's search text; inserted verbatim, inside quotes.
        genre: The selected genre; inserted verbatim.

    Returns:
        The prompt string asking for a Markdown summary (title, introduction,
        per-book sections, conclusion) of the given books.
    """
    # NOTE(review): user_query and genre are interpolated without any escaping,
    # so whatever the user typed becomes part of the instructions sent to the model.
    prompt = f"""
    You are a literary expert creating beautiful book recommendation summaries.
    
    The user searched for books in the "{genre}" genre with this query: "{user_query}"
    
    Here are the top book recommendations from the algorithm:
    
    {books_text}
    
    Create an elegant, well-structured summary of these book recommendations with:
    
    1. An engaging title that captures the theme of these books
    
    2. A thoughtful introduction that:
       - Acknowledges the user's interest in {genre} books related to "{user_query}"
       - Provides context or background about this type of literature
       - Mentions what these particular books have in common
       - Uses warm, inviting language that shows enthusiasm for the recommendations
    
    3. Individual sections for each book that include:
       - Clear title and author formatting (if author is available)
       - Star rating with number of ratings
       - A concise, compelling paragraph that captures the essence of the book
       - Highlight what makes this book special or particularly relevant to the user's query
       - Avoid truncating descriptions - provide a complete picture of each book
    
    4. A thoughtful conclusion that:
       - Summarizes the value of these recommendations
       - Offers a gentle suggestion about which book might be best to start with
       - Encourages the reader with a positive closing thought
    
    Format your response in Markdown with appropriate headings, emphasis, and structure.
    Make the summary engaging, informative, and polished - as if written by a passionate book lover.
    
    Only include the formatted markdown content in your response, with no additional explanations.
    """
    
    return prompt

def _format_basic_recommendations(books):
    """Render the recommended books as plain Markdown, without calling Gemini."""
    output = "# Book Recommendations\n\n"
    for _, row in books.iterrows():
        output += f"## {row['title']}\n"
        if 'author' in row:
            # The two trailing spaces force a Markdown line break; with a bare
            # newline the Author and Rating lines are rendered as one line.
            output += f"**Author:** {row['author']}  \n"
        output += f"**Rating:** {row['rating']} ({int(row['totalratings'])} ratings)\n\n"
        output += f"{row['desc']}\n\n"  # Show full description
        output += "---\n\n"
    return output


def recommend_books_interface(query, genre, use_gemini=True, top_n=5):
    """Generate recommendations based on query and selected genre.

    Args:
        query: Free-text description of the books the user is looking for.
        genre: Genre label; it must already be present in ``genre_cache``
            (i.e. ``filter_by_genre`` has been run for it).
        use_gemini: When True, ask Gemini to write the summary; if that call
            fails for any reason the basic Markdown formatting is used instead.
        top_n: Number of books to recommend. Defaults to 5.

    Returns:
        Markdown text with the recommendations, or a short message telling the
        user what input is still missing.
    """
    # `query` may be None as well as empty/blank
    if not query or not query.strip():
        return "Please enter a description of the books you're looking for."

    if not genre or genre not in genre_cache:
        return "Please select a genre first and click 'Apply Genre Filter'."

    # Use the cached subset and embeddings
    subset_df = genre_cache[genre]['df']
    book_embeddings = genre_cache[genre]['embeddings']

    # An empty subset cannot be searched (cosine_similarity rejects empty input)
    if len(subset_df) == 0:
        return f"No books are available for genre: {genre}. Please choose a different genre."

    # Find similar books within the subset
    query_embedding = model.encode([query], convert_to_tensor=True)
    similarities = cosine_similarity(query_embedding.cpu().numpy(), book_embeddings.cpu().numpy())[0]

    # Take twice as many candidates as needed so that the rating/popularity
    # re-ranking below has something to choose from.
    top_indices = np.argsort(similarities)[::-1][:top_n * 2]
    similar_books = subset_df.iloc[top_indices].copy()
    similar_books['similarity'] = similarities[top_indices]

    # Rank books and keep the best top_n
    similar_books = rank_books(similar_books).head(top_n)

    # Clean the descriptions
    similar_books['desc'] = similar_books['desc'].apply(clean_description)

    if use_gemini:
        try:
            # Format book data for the prompt
            books_text = format_books_for_prompt(similar_books)

            # Create prompt for Gemini
            prompt = create_gemini_prompt(books_text, query, genre)

            # Generate summary
            model_gemini = ChatGoogleGenerativeAI(google_api_key=GEMINI_API_KEY, model= GEMINI_MODEL, temperature=0.3)
            response = model_gemini.invoke(prompt)
            return response.content

        except Exception as e:
            # Deliberate best-effort: any Gemini failure (missing key, network,
            # quota, ...) is reported and the basic formatting is used instead.
            print(f"Gemini formatting failed: {e}")
            print("Falling back to basic formatting...")

    # Basic formatting
    return _format_basic_recommendations(similar_books)

# All unique genre labels, sorted alphabetically (used as the dropdown choices)
all_genres = sorted({label for labels in Df["genre"] for label in labels})

# If the configuration cell was skipped or found no key, try loading the API
# key again and report the outcome. load_dotenv and os come from the import
# cell at the top of the notebook.
if globals().get("GEMINI_API_KEY") is None:
    load_dotenv()
    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
    if GEMINI_API_KEY:
        print("API key loaded successfully!")
    else:
        print("WARNING: GEMINI_API_KEY not found. Please check your .env file.")



In [24]:
# Build the Gradio UI. Layout, top to bottom: header, "How it works" accordion,
# two tabs (the search form and a static genre overview), the results area,
# example buttons and an "About" footer.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo")) as interface:
    # Header section with emoji instead of image
    gr.Markdown(
        """
        # 📚 AI Book Recommendation System
        
        Discover your next favorite read with our AI-powered book recommendation system.
        This tool analyzes thousands of books to find the perfect matches for your interests.
        """
    )
    
    # Collapsed-by-default usage instructions
    with gr.Accordion("How it works", open=False):
        gr.Markdown(
            """
            ### How to use this tool:
            
            1. **Select a genre** - Choose a book genre you're interested in
            2. **Describe what you're looking for** - Be specific about themes, periods, or styles
            3. **Click "Get Recommendations"** - Our AI will find perfect matches and craft a personalized summary
            
            The system uses semantic similarity to find books that match your interests, then ranks them based on 
            relevance, ratings, and popularity. Google Gemini AI creates detailed, personalized summaries of your recommendations.
            """
        )
    
    with gr.Tabs():
        # Tab 1: the search form (genre picker on the left, query on the right)
        with gr.TabItem("Find Books"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown(
                        """
                        ## Step 1: Select a Genre
                        Choose a genre to narrow down your search
                        """
                    )
                    genre_dropdown = gr.Dropdown(
                        choices=all_genres, 
                        label="Book Genre",
                        info="Filtering by genre improves recommendation quality and speed",
                        container=True
                    )
                    genre_button = gr.Button("Apply Genre Filter", variant="secondary")
                    # Shows the status message returned by filter_by_genre
                    genre_status = gr.Markdown("*Please select a genre to begin*")
                    
                with gr.Column(scale=2):
                    gr.Markdown(
                        """
                        ## Step 2: Describe What You're Looking For
                        Tell us about the kind of books you want to discover
                        """
                    )
                    query_input = gr.Textbox(
                        label="Your Interests", 
                        placeholder="E.g., 'Novels exploring the ethical implications of artificial intelligence' or 'Books about resilience during difficult times'",
                        lines=3,
                        container=True
                    )
                    
                    with gr.Row():
                        with gr.Column(scale=3):
                            # Passed to recommend_books_interface as its use_gemini argument
                            use_gemini = gr.Checkbox(
                                label="Use AI for beautiful formatting", 
                                value=True,
                                info="Creates a polished, personalized summary of your recommendations"
                            )
                        with gr.Column(scale=2):
                            recommend_button = gr.Button("📚 Get Recommendations", variant="primary", scale=2)
    
        # Tab 2: static descriptions of a few genres (no event handlers attached)
        with gr.TabItem("Explore Popular Genres"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Fiction")
                    gr.Markdown(
                        """
                        - Fantasy: Magical worlds and extraordinary beings
                        - Science Fiction: Futuristic scenarios and scientific possibilities
                        - Mystery: Puzzling events and detective work
                        - Historical Fiction: Stories set in the past
                        - Romance: Love stories and relationships
                        """
                    )
                
                with gr.Column():
                    gr.Markdown("### Non-Fiction")
                    gr.Markdown(
                        """
                        - Biography: Life stories of notable people
                        - History: Events of the past
                        - Science: Scientific discoveries and concepts
                        - Philosophy: Explorations of knowledge and existence
                        - Self-Help: Personal improvement and growth
                        """
                    )
    
    # Output area for recommendations - using Column instead of Box
    gr.Markdown("### Your Personalized Book Recommendations")
    with gr.Column():
        recommendations = gr.Markdown()
    
    # Set up event handlers.
    # "Apply Genre Filter" runs filter_by_genre, which caches the genre's books
    # and their embeddings and returns a status message.
    genre_button.click(
        fn=filter_by_genre, 
        inputs=genre_dropdown, 
        outputs=genre_status
    )
    
    # "Get Recommendations" reads the cache filled by the genre filter, so the
    # selected genre must have been applied first.
    recommend_button.click(
        fn=recommend_books_interface, 
        inputs=[query_input, genre_dropdown, use_gemini], 
        outputs=recommendations
    )
    
    # Examples section with better formatting.
    # NOTE: each example button only fills in the query text and the genre
    # dropdown. It does not run filter_by_genre, so "Apply Genre Filter" still
    # has to be clicked before recommendations can be generated.
    # NOTE(review): the example genre names are assumed to exist in
    # all_genres - TODO confirm against the dataset.
    gr.Markdown("### Try these example searches:")
    with gr.Row():
        with gr.Column():
            example1 = gr.Button("Historical fiction about World War II", variant="secondary", size="sm")
            example1.click(
                lambda: ["Books about the aftermath of World War II in Europe", "Historical Fiction"],
                outputs=[query_input, genre_dropdown]
            )
            
            example2 = gr.Button("Space exploration science fiction", variant="secondary", size="sm")
            example2.click(
                lambda: ["Space exploration novels with themes of first contact", "Science Fiction"],
                outputs=[query_input, genre_dropdown]
            )
            
        with gr.Column():
            example3 = gr.Button("Female scientists biographies", variant="secondary", size="sm")
            example3.click(
                lambda: ["Biographies of influential women in science", "Biography"],
                outputs=[query_input, genre_dropdown]
            )
            
            example4 = gr.Button("Victorian detective stories", variant="secondary", size="sm")
            example4.click(
                lambda: ["Detective novels set in Victorian London", "Mystery"],
                outputs=[query_input, genre_dropdown]
            )
    
    # Footer
    gr.Markdown(
        """
        ---
        ### About This Project
        
        This book recommendation system uses natural language processing to match your interests with books you'll love.
        It analyzes book descriptions, ratings, and popularity to find the perfect recommendations.
        
        *Created as a project for AI-powered content recommendation systems.*
        """
    )

# Launch the interface (starts a local web server; see the URL printed below)
interface.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


