<a href="https://colab.research.google.com/github/vvpokhilko/llm-semantic-book-recommender/blob/main/gradio_dashboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Let's set up the environment, make sure we have all installed all the packages we need.
%%writefile requirements.txt
pandas
matplotlib
seaborn
python-dotenv
langchain-community
langchain-openai
langchain-chroma
transformers
gradio
notebook
ipywidgets

Writing requirements.txt


In [2]:
!pip install -r ./requirements.txt -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.1/322.1 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m26.1 MB/s[0m eta [36m0:00:0

In [20]:
# We will be using the OpenAI embeddings model.
# Set up access to OpenAI via an API key that is never written into the notebook.
import os

try:
    # In Colab the key is read from the notebook's secret store.
    from google.colab import userdata
    openai_key = userdata.get('OPENAI_API_KEY')
except ImportError:
    # Outside Colab, fall back to a key that is already exported in the environment.
    openai_key = os.environ.get('OPENAI_API_KEY')

# Fail early with a clear message instead of a TypeError from os.environ.
if not openai_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to the Colab secrets or export it "
        "as an environment variable before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_key

In [8]:
# Let's load the cleaned books csv file from github
from pathlib import Path

import requests
import os

def download_data(source: str) -> Path:
    """Downloads a single file from ``source`` into the local ``data/`` folder.

    The file is stored under the last path component of the URL (any query
    string is ignored). An existing file with the same name is overwritten.

    Args:
        source (str): URL of the file to download.

    Returns:
        pathlib.Path: The ``data/`` directory the file was written to (not the
        path of the file itself).

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.RequestException: If the request fails or times out.

    Example usage:
        download_data(source="https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/data/books_with_emotions.csv")
    """
    # Local import keeps the block self-contained; only the URL path is used for the file name.
    from urllib.parse import urlparse

    # Setup path to data folder
    data_path = Path("data/")

    # Create the data folder unless it already exists and contains something
    if data_path.is_dir() and os.listdir(data_path):
        print(f"[INFO] {data_path} directory exists, skipping directory creation.")
    else:
        print(f"[INFO] Did not find {data_path} directory, creating one...")
        data_path.mkdir(parents=True, exist_ok=True)

    target_file = Path(urlparse(source).path).name
    print(f"[INFO] Downloading {target_file} from {source}...")

    # Fetch first and validate the status, so a failed download never leaves
    # an empty or HTML-error file behind in data/.
    response = requests.get(source, timeout=60)
    response.raise_for_status()
    (data_path / target_file).write_bytes(response.content)

    return data_path

# Fetch the three files the dashboard needs. Every call returns the same
# data/ directory; the value of the final call is what the cell displays.
data_path_1 = download_data(
    source="https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/data/books_with_emotions.csv"
)
data_path_2 = download_data(
    source="https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/cover-not-found.jpg"
)
download_data(
    source="https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/data/tagged_description.txt"
)

[INFO] data directory exists, skipping directory creation.
[INFO] Downloading books_with_emotions.csv from https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/data/books_with_emotions.csv...
[INFO] data directory exists, skipping directory creation.
[INFO] Downloading cover-not-found.jpg from https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/cover-not-found.jpg...
[INFO] data directory exists, skipping directory creation.
[INFO] Downloading tagged_description.txt from https://github.com/vvpokhilko/llm-semantic-book-recommender/raw/main/data/tagged_description.txt...


PosixPath('data')

In [9]:
# Pick the GPU when one is visible to PyTorch, otherwise stay on the CPU
import torch

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [6]:
import pandas as pd
import numpy as np

# Import necessary modules from LangChain for document processing
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

In [10]:
import gradio as gr  # Import Gradio for building the web interface

In [11]:
# Read the cleaned book table (with emotion scores) that was downloaded into data/
books = pd.read_csv(
    filepath_or_buffer="data/books_with_emotions.csv",
)
books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,...,title_and_subtitle,tagged_description,simple_categories,anger,disgust,fear,joy,sadness,surprise,neutral
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,...,Gilead,9780002005883 A NOVEL THAT READERS and critics...,Fiction,0.064134,0.273591,0.928168,0.932797,0.646216,0.967158,0.729603
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,...,Spider's Web: A Novel,9780002261982 A new 'Christie for Christmas' -...,Fiction,0.612619,0.348284,0.942528,0.704422,0.887939,0.11169,0.252545
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,...,Rage of angels,"9780006178736 A memorable, mesmerizing heroine...",Fiction,0.064134,0.104007,0.972321,0.767237,0.549477,0.11169,0.078766
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,...,The Four Loves,9780006280897 Lewis' work on the nature of lov...,Nonfiction,0.351484,0.150723,0.360706,0.251881,0.732685,0.11169,0.078766
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,...,The Problem of Pain,"9780006280934 ""In The Problem of Pain, C.S. Le...",Nonfiction,0.081412,0.184495,0.095043,0.040564,0.88439,0.475881,0.078766


In [31]:
# Ask for a larger rendition of each cover by appending a width hint to the
# thumbnail URL. Rows without a thumbnail stay NaN after the concatenation...
enlarged_urls = books["thumbnail"] + "&fife=w800"

# ...and those rows get the local placeholder image instead.
books["large_thumbnail"] = enlarged_urls.fillna("data/cover-not-found.jpg")

books.head()

Unnamed: 0,isbn13,isbn10,title,authors,categories,thumbnail,description,published_year,average_rating,num_pages,...,tagged_description,simple_categories,anger,disgust,fear,joy,sadness,surprise,neutral,large_thumbnail
0,9780002005883,2005883,Gilead,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,...,9780002005883 A NOVEL THAT READERS and critics...,Fiction,0.064134,0.273591,0.928168,0.932797,0.646216,0.967158,0.729603,http://books.google.com/books/content?id=KQZCP...
1,9780002261982,2261987,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,...,9780002261982 A new 'Christie for Christmas' -...,Fiction,0.612619,0.348284,0.942528,0.704422,0.887939,0.11169,0.252545,http://books.google.com/books/content?id=gA5GP...
2,9780006178736,6178731,Rage of angels,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,...,"9780006178736 A memorable, mesmerizing heroine...",Fiction,0.064134,0.104007,0.972321,0.767237,0.549477,0.11169,0.078766,http://books.google.com/books/content?id=FKo2T...
3,9780006280897,6280897,The Four Loves,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,...,9780006280897 Lewis' work on the nature of lov...,Nonfiction,0.351484,0.150723,0.360706,0.251881,0.732685,0.11169,0.078766,http://books.google.com/books/content?id=XhQ5X...
4,9780006280934,6280935,The Problem of Pain,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,...,"9780006280934 ""In The Problem of Pain, C.S. Le...",Nonfiction,0.081412,0.184495,0.095043,0.040564,0.88439,0.475881,0.078766,http://books.google.com/books/content?id=Kk-uV...


In [25]:
# Load the tagged book descriptions for semantic search.
# The encoding is pinned so the load does not depend on the platform default
# (which is not UTF-8 on every system).
raw_documents = TextLoader("data/tagged_description.txt", encoding="utf-8").load()

In [28]:
import logging

# Only let ERROR and above through on the root logger
logging.root.setLevel(logging.ERROR)


In [29]:
from langchain_core.globals import set_verbose, set_debug

# Disable verbose logging
set_verbose(False)

# Disable debug logging
set_debug(False)

# Split the text on newlines so that each chunk is passed to the vector database
# on its own. chunk_size=1 keeps lines from being merged together;
# NOTE(review): newer langchain-text-splitters releases appear to reject
# chunk_size=0 with a ValueError - confirm against the installed version.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
documents[5]

Document(metadata={'source': 'data/tagged_description.txt'}, page_content="9780006380832 Until Vasco da Gama discovered the sea-route to the East in 1497-9 almost nothing was known in the West of the exotic cultures and wealth of the Indian Ocean and its peoples. It is this civilization and its destruction at the hands of the West that Richard Hall recreates in this book. Hall's history of the exploration and exploitation by Chinese and Arab travellers, and by the Portuguese, Dutch and British alike is one of brutality, betrayal and colonial ambition.")

In [21]:
# Build the Chroma vector store used for semantic similarity search:
# every document is embedded with OpenAIEmbeddings and indexed.
db_books = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
)

In [18]:
def retrieve_semantic_recommendations(
        query: str,
        category: str = None,
        tone: str = None,
        initial_top_k: int = 50,
        final_top_k: int = 16,
) -> pd.DataFrame:
    """
    Retrieves book recommendations based on a semantic query.

    The vector store ``db_books`` is searched for the descriptions closest to
    ``query``; the ISBN at the start of each hit is then looked up in the
    ``books`` dataframe. Results keep the similarity ranking of the search
    unless a tone is selected, in which case they are re-sorted by that tone.

    Args:
        query (str): The user's input describing the desired book.
        category (str, optional): The selected book category. ``None`` and
            ``"All"`` both mean that no category filter is applied.
        tone (str, optional): The selected emotional tone ("Happy",
            "Surprising", "Angry", "Suspenseful" or "Sad"). Any other value,
            including ``None`` and ``"All"``, leaves the order untouched.
        initial_top_k (int): Number of top initial results from the semantic search.
        final_top_k (int): Final number of recommendations after filtering.

    Returns:
        pd.DataFrame: A dataframe containing at most ``final_top_k`` recommended books.
    """

    # Perform a semantic search using the query to retrieve relevant book embeddings
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Each hit starts with the book's ISBN; remember the best (first) rank per ISBN
    rank_by_isbn = {}
    for position, rec in enumerate(recs):
        isbn = int(rec.page_content.strip('"').split()[0])
        rank_by_isbn.setdefault(isbn, position)

    # isin() returns rows in dataframe order, so restore the similarity ranking
    # explicitly - otherwise head() below would not keep the best matches.
    matches = books[books["isbn13"].isin(list(rank_by_isbn))]
    book_recs = (
        matches.assign(_similarity_rank=matches["isbn13"].map(rank_by_isbn))
        .sort_values("_similarity_rank", kind="stable")
        .drop(columns="_similarity_rank")
    )

    # Filter results by category if a specific one is selected
    if category not in (None, "All"):
        book_recs = book_recs[book_recs["simple_categories"] == category]
    book_recs = book_recs.head(final_top_k)

    # Sort results based on the selected emotional tone
    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    sort_column = tone_to_column.get(tone)
    if sort_column is not None:
        # Not in place: book_recs may be a slice of the shared books frame
        book_recs = book_recs.sort_values(by=sort_column, ascending=False)

    return book_recs

In [19]:

def recommend_books(
        query: str,
        category: str,
        tone: str
):
    """
    Generates a list of book recommendations in a format suitable for display in Gradio.

    Args:
        query (str): The user's input describing the desired book.
        category (str): The selected book category.
        tone (str): The selected emotional tone filter.

    Returns:
        list: A list of ``(thumbnail, caption)`` tuples, one per recommended
        book. The caption has the form ``"<title> by <authors>: <description>"``
        with the description cut to its first 30 words.
    """
    max_words = 30

    # Retrieve recommendations using the semantic search function
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # Missing values arrive as NaN (a float), which has no .split()
        description = row["description"]
        words = [] if pd.isna(description) else str(description).split()
        if not words:
            truncated_description = "No description available."
        elif len(words) > max_words:
            truncated_description = " ".join(words[:max_words]) + "..."
        else:
            # Nothing was cut off, so no ellipsis
            truncated_description = " ".join(words)

        # Format author names for better readability
        authors = row["authors"]
        if pd.isna(authors):
            authors_split = []
        else:
            authors_split = [name.strip() for name in str(authors).split(";") if name.strip()]

        if not authors_split:
            authors_str = "Unknown author"
        elif len(authors_split) == 1:
            authors_str = authors_split[0]
        elif len(authors_split) == 2:
            authors_str = f"{authors_split[0]} and {authors_split[1]}"
        else:
            authors_str = f"{', '.join(authors_split[:-1])}, and {authors_split[-1]}"

        # Generate a formatted caption for each book
        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))
    return results


In [32]:
# Define category and tone filter options.
# Missing categories are dropped first: sorted() cannot compare NaN (a float) with str.
categories = ["All"] + sorted(books["simple_categories"].dropna().unique())
tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

# Create a Gradio interface for the book recommender
with gr.Blocks(theme=gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic Book Recommender")

    with gr.Row():
        user_query = gr.Textbox(label="Please enter a description of a book:",
                                placeholder="e.g., A story about forgiveness")
        category_dropdown = gr.Dropdown(choices=categories, label="Select a category:", value="All")
        tone_dropdown = gr.Dropdown(choices=tones, label="Select an emotional tone:", value="All")
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## Recommendations")
    output = gr.Gallery(label="Recommended books", columns=8, rows=2)

    # Link user inputs to the recommendation function
    submit_button.click(fn=recommend_books,
                        inputs=[user_query, category_dropdown, tone_dropdown],
                        outputs=output)

# Launch the dashboard; the guard keeps the UI from starting when this code is imported
if __name__ == "__main__":
    dashboard.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a5448034fea2b780d5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
