In [84]:
import faiss
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from flask import Flask, render_template, request, jsonify

### Functions ###

In [85]:
# Function to determine which columns in the dataset are relevant for embeddings based on the user's query
def determine_embedding_columns(query):
    """
    Identifies the relevant dataset columns for generating embeddings based on keywords in the query.

    Keywords are matched against whole words of the query (case-insensitive,
    punctuation ignored) rather than as raw substrings, so "by" does not fire
    on words such as "baby" or "hobby", nor "name" on "rename". Common
    inflections of each keyword are listed explicitly.

    Args:
        query (str): The user query string.

    Returns:
        list: A list of column names that should be included for embedding generation.
              Always starts with "description" and "characters".
    """
    # Turn every non-alphanumeric character into a space so that punctuation
    # ("title?", "author's") does not stick to the words, then split into a set.
    normalized = "".join(ch if ch.isalnum() else " " for ch in query.lower())
    words = set(normalized.split())

    # (trigger words, columns to add) - evaluated in this order so the output
    # column order stays stable.
    keyword_rules = (
        ({"title", "titles", "titled", "name", "names", "named", "series"}, ["title", "series"]),
        ({"author", "authors", "authored", "writer", "writers", "by"}, ["author"]),
        ({"genre", "genres"}, ["genres"]),
        ({"pages"}, ["pages"]),
    )

    columns = ["description", "characters"]  # Always include these key columns
    for triggers, extra_columns in keyword_rules:
        if words & triggers:
            columns.extend(extra_columns)

    return columns

In [86]:
# Function to create embeddings for all books in the dataset
def create_book_embeddings(df):
    """
    Generates embeddings for each book in the dataset by concatenating embeddings for selected text fields.

    The "description" and "characters" columns are each encoded in a single
    batched ``model.encode`` call (instead of one call per row and field), and
    the two per-column matrices are joined side by side, so every row is
    ``[description embedding | characters embedding]``. Rows are kept in the
    positional order of ``df``.

    Args:
        df (pd.DataFrame): The dataset containing book information. A missing
            column or a missing (NaN/None) cell is encoded as the empty string;
            non-string cells are converted with ``str()``.

    Returns:
        np.ndarray: A 2-D float32 numpy array with one row of embeddings per book.
            For an empty ``df`` the result has shape (0, 0) so that callers can
            still read ``.shape[1]``.
    """
    text_columns = ["description", "characters"]

    # Nothing to encode: return an empty 2-D array without touching the model.
    if len(df) == 0:
        return np.empty((0, 0), dtype='float32')

    column_embeddings = []
    for col in text_columns:
        if col in df.columns:
            texts = ["" if pd.isna(value) else str(value) for value in df[col]]
        else:
            texts = [""] * len(df)
        # One batched call per column; for a list of texts the encoder returns
        # one embedding per text.
        column_embeddings.append(np.asarray(model.encode(texts)))

    # Join the per-column matrices along the feature axis and ensure float32
    # format for FAISS compatibility.
    return np.concatenate(column_embeddings, axis=1).astype('float32')

In [87]:
# Function to clean the query by removing unnecessary words
def clean_query(query):
    """
    Strips stopwords from a user query.

    The query is lowercased and split on whitespace; every token found in the
    module-level ``combined_stopwords`` set is dropped and the remaining tokens
    are joined back together with single spaces.

    Args:
        query (str): The user query string.

    Returns:
        str: The lowercased query without stopwords (may be empty).
    """
    kept_tokens = []
    for token in query.lower().split():
        if token in combined_stopwords:
            continue  # stopword - leave it out
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [88]:
# Function to generate embeddings for the user query
def create_query_embedding(query):
    """
    Creates an embedding for the user query, laid out to match the book embeddings.

    Book vectors are the concatenation of a "description" embedding and a
    "characters" embedding, so the query text is encoded once and the resulting
    vector is repeated twice to obtain the same dimensionality.

    Args:
        query (str): The raw user query string; stopwords are removed here.

    Returns:
        np.ndarray: The 1-D float32 embedding for the query.
    """
    # If every word was a stopword, cleaning leaves nothing to embed; fall back
    # to the lowercased original text rather than encoding an empty string.
    text = clean_query(query) or query.lower()

    # Encode once - both halves of the vector use the same text.
    text_embedding = np.asarray(model.encode(text))

    # One copy per book text field ("description", "characters").
    return np.concatenate([text_embedding, text_embedding]).astype('float32')

In [89]:
# Function to find books based on the user query
def find_books(query, k=3):
    """
    Finds the top-k books that match the user query using FAISS for similarity search.

    Args:
        query (str): The user query string.
        k (int): The maximum number of top results to return.

    Returns:
        list: A list of dictionaries representing the top matching books, best
              match first. Contains fewer than ``k`` entries when the index holds
              fewer books, and is empty when ``k`` is not positive or the index
              is empty.
    """
    # Nothing can be returned for a non-positive k or an empty index.
    if k <= 0 or index.ntotal == 0:
        return []

    query_embedding = create_query_embedding(query)  # Generate embedding for the query

    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    _, indices = index.search(query_embedding.reshape(1, -1), k)  # Search FAISS index

    # FAISS marks "no result" slots with -1; df.iloc[-1] would silently return
    # the last row, so those slots are skipped.
    return [df.iloc[int(idx)].to_dict() for idx in indices[0] if idx >= 0]

In [90]:
# Function to generate a natural language response for the user
def generate_response(query):
    """
    Generates a user-friendly response with the top matching books.

    Args:
        query (str): The user query string.

    Returns:
        str: A well-formatted response string with book recommendations, or an
             apology message when no book was found.
    """
    def field_or_default(book, key, default):
        """Return book[key], or default when the key is absent or the value is missing."""
        value = book.get(key)
        # Book dicts come from DataFrame rows, where an empty cell is stored as
        # NaN (not as an absent key), so a plain dict.get default never applies.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return default
        return value

    books = find_books(query)  # Find books based on the query

    # If no books are found, return a friendly message
    if not books:
        return "Sorry, I couldn't find any books matching your query. Please try again with different keywords."

    # Construct a user-friendly response with book details
    response = "Here's what I found for you: "
    for i, book in enumerate(books, start=1):
        title = field_or_default(book, 'title', 'Unknown Title')
        author = field_or_default(book, 'author', 'Unknown Author')
        rating = field_or_default(book, 'rating', 'N/A')
        response += f"{i}. '{title}' by {author} (Rating: {rating}) "
    return response

### Preparation ###

In [91]:
# Load dataset and sort it by 'rating' in descending order for prioritization
# Read at most the first 5000 rows, then order them so that the
# highest-rated books come first.
df = (
    pd.read_csv('books.csv', nrows=5000)
    .sort_values(by='rating', ascending=False)
)

In [92]:
# Download stopwords and combine them with custom stopwords
nltk.download('stopwords')

# Default English stopwords shipped with NLTK
stop_words = set(stopwords.words('english'))

# Domain words that name dataset fields or the subject itself; they are
# removed from queries in addition to the default stopwords.
custom_stopwords = {
    "book", "books",
    "description", "characters",
    "title", "name", "series",
    "author", "writer", "by",
    "genre", "pages",
}

# Union of both sets, used by clean_query
combined_stopwords = stop_words | custom_stopwords

[nltk_data] Downloading package stopwords to /home/nata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
# Initialize the SentenceTransformer model for embedding generation
# Name of the pretrained sentence-embedding model shared by book and query encoding
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [94]:
# Create book embeddings and build a FAISS index
book_embeddings = create_book_embeddings(df)

# The index dimensionality must equal the width of one book embedding row.
embedding_dim = book_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # Flat index using L2 distance
index.add(book_embeddings)  # Row i of the index corresponds to df.iloc[i]

### Web Page ###

In [None]:
app = Flask(__name__)


@app.route("/main")
def main():
    """Render the ``main.html`` template for requests to ``/main``."""
    page = render_template("main.html")
    return page

@app.route("/send_message", methods=["POST"])
def send_message():
    """
    Answer a user message with book recommendations.

    Expects a JSON object body with a string ``"message"`` field (a missing
    field is treated as an empty message) and returns
    ``{"response": <text>}``. A body that is not a JSON object, or a
    ``"message"`` that is not a string, yields a 400 reply with an
    explanatory ``"response"`` text instead of an unhandled exception.
    """
    # silent=True returns None for an unparsable or non-JSON body instead of raising.
    data = request.get_json(silent=True)

    # Valid JSON may still be a list, a number or null - only objects have .get.
    user_message = data.get("message", "") if isinstance(data, dict) else None
    if not isinstance(user_message, str):
        error = "Invalid request: expected a JSON object with a string 'message' field."
        return jsonify({"response": error}), 400

    response = generate_response(user_message)
    return jsonify({"response": response})

# Start Flask's built-in server with its default settings, but only when this
# code runs as the main module (not when it is imported). The call does not
# return until the server is stopped.
if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


127.0.0.1 - - [19/Dec/2024 02:50:31] "GET /main HTTP/1.1" 200 -
127.0.0.1 - - [19/Dec/2024 02:51:15] "POST /send_message HTTP/1.1" 200 -
127.0.0.1 - - [19/Dec/2024 02:51:23] "POST /send_message HTTP/1.1" 200 -
127.0.0.1 - - [19/Dec/2024 02:51:34] "POST /send_message HTTP/1.1" 200 -
127.0.0.1 - - [19/Dec/2024 02:51:39] "POST /send_message HTTP/1.1" 200 -
