# BOOK RECOMMENDER

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import requests
from random import randint
from time import sleep
from itertools import islice
from pandas import json_normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from IPython.display import Image, display, HTML
from IPython.display import Image, display
from textblob import TextBlob

import warnings
# Ignore all warnings
warnings.filterwarnings('ignore')

## Load Data

In [2]:
# Load the two book tables from CSV files (paths are relative to the working directory):
# - final_df: book catalogue that carries a `cluster` column, used for cluster-based picks
# - goodreads_df: shorter list with ratings, used for direct title/author matches
final_df = pd.read_csv('final_df.csv')
goodreads_df = pd.read_csv('goodreads_df_cleaned.csv')

## Check Data

In [3]:
# Preview the first rows of the clustered catalogue
final_df.head()

Unnamed: 0,title,author,image_url,url,cluster
0,Between Two Fires: American Indians in the Civ...,Laurence M. Hauptman,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001053.Betwee...,1
1,Fashion Sourcebook 1920s,"Charlotte Fiell,Emmanuelle Dirix",https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/10010552-fashi...,5
2,Hungary 56,Andy Anderson,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001077.Hungar...,12
3,All-American Anarchist: Joseph A. Labadie and ...,Carlotta R. Anderson,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001079.All_Am...,6
4,The Human Equation: Building Profits by Puttin...,Jeffrey Pfeffer,https://i.gr-assets.com/images/S/compressed.ph...,https://goodreads.com/book/show/1001090.The_Hu...,8


In [4]:
# (rows, columns) of the clustered catalogue
final_df.shape

(73038, 5)

In [5]:
# Preview the first rows of the Goodreads list
goodreads_df.head()

Unnamed: 0,title,author,image_url,rating,rating_count
0,"Fourth Wing (The Empyrean, #1)",Rebecca Yarros,https://i.gr-assets.com/images/S/compressed.ph...,4.63,844282
1,Happy Place,Emily Henry,https://i.gr-assets.com/images/S/compressed.ph...,4.06,574386
2,Yellowface,R.F. Kuang,https://i.gr-assets.com/images/S/compressed.ph...,3.87,227094
3,"Love, Theoretically",Ali Hazelwood,https://i.gr-assets.com/images/S/compressed.ph...,4.17,239576
4,"Divine Rivals (Letters of Enchantment, #1)",Rebecca Ross,https://i.gr-assets.com/images/S/compressed.ph...,4.26,161237


In [6]:
# (rows, columns) of the Goodreads list
goodreads_df.shape

(100, 5)

In [7]:
# Create a new DataFrame with selected columns for the recommender.
# Only the fields that are shown to the user (title, author, cover image) are kept.
selected_columns = ['title', 'author', 'image_url']
goodreads_rec_df = goodreads_df[selected_columns]

# Display the new DataFrame
goodreads_rec_df.head()

Unnamed: 0,title,author,image_url
0,"Fourth Wing (The Empyrean, #1)",Rebecca Yarros,https://i.gr-assets.com/images/S/compressed.ph...
1,Happy Place,Emily Henry,https://i.gr-assets.com/images/S/compressed.ph...
2,Yellowface,R.F. Kuang,https://i.gr-assets.com/images/S/compressed.ph...
3,"Love, Theoretically",Ali Hazelwood,https://i.gr-assets.com/images/S/compressed.ph...
4,"Divine Rivals (Letters of Enchantment, #1)",Rebecca Ross,https://i.gr-assets.com/images/S/compressed.ph...


In [8]:
# (rows, columns) of the trimmed recommender table
goodreads_rec_df.shape

(100, 3)

In [9]:
# Save to a CSV file; index=False keeps the row index out of the file
goodreads_rec_df.to_csv('goodreads_rec_df.csv', index=False)

## Loading Machine Learning Models

In [10]:
# Load the KMeans model from a pickle file
# NOTE: pickle.load can execute arbitrary code while unpickling, so only load
# artefacts that you created yourself or otherwise trust.
with open('kmeans.pkl', 'rb') as file:
    kmeans = pickle.load(file)

# Load the scaler
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)
    
# Load the encoder
with open('encoder.pkl', 'rb') as file:
    encoder = pickle.load(file)

In [11]:
# Display the loaded clustering model
kmeans

In [12]:
# Display the loaded scaler
scaler

In [13]:
# Display the loaded encoder
encoder

### Verifying the KMeans Model

In [14]:
# Print the shape of the KMeans cluster centers to verify the model is loaded correctly.
# The recorded output is (16, 10); per scikit-learn this is (n_clusters, n_features).
print(kmeans.cluster_centers_.shape)

(16, 10)


## Book Recommender

### Breakdown of the process:

1. **User Input**: Ask the user for a book title and author, ensuring case insensitivity in the matching process.
2. **Recommendation**: If the book is found in the `goodreads_rec_df`, recommend another book randomly from this dataframe.
3. **Google Books API**: If the book is not found, use the Google Books API to extract the required information.
4. **Preprocessing**: Preprocess this new data to fit the model, including sentiment analysis and encoding genres.
5. **Model Prediction**: Use my saved KMeans model to predict the cluster for the new book.
6. **Final Recommendation**: Recommend a book from the `final_df` that is in the same cluster as the user input.

### Some specific considerations:
    
- For the Google Books API step: the API can return several capitalised genres, e.g. ['Fiction', 'Fantasy'], which must be preprocessed to match how the model was trained. The model was trained on lowercase genres with a single genre per book, so the same transformation has to be applied to the genres returned by the Google Books API.

- The function handles the cases where the genre might not be recognized by the encoder and defaults to 'other'. 

- It also avoids recommending the very book the user entered by filtering that book out of the candidate recommendations.

- In summary: The book recommendation process involves several steps, including user input, searching in a predefined list, fetching book info from an API if not found, preprocessing the data, and finally clustering and recommending a book from the same cluster.



### Try the recommender with a book that is present in the goodreads_rec_df

In [15]:
# Function to get user input
def get_user_input():
    """Prompt for a book title and an author name and return both normalised.

    Each answer is stripped of surrounding whitespace and lower-cased so that
    it can be compared case-insensitively later on.

    Returns:
        tuple[str, str]: ``(title, author)``.
    """
    prompts = ("Enter the book title: ", "Enter the author's name: ")
    # The generator is consumed left to right, so the title is asked for first.
    title, author = (input(prompt).strip().lower() for prompt in prompts)
    return title, author

# Function to recommend a book from goodreads_rec_df
def recommend_from_goodreads(title, author, goodreads_df):
    """Recommend a random other book when the requested one is in ``goodreads_df``.

    Args:
        title: Lower-cased title to look up.
        author: Lower-cased author to look up.
        goodreads_df: DataFrame with ``title``, ``author`` and ``image_url`` columns.

    Returns:
        bool: ``True`` if a recommendation was printed and displayed, ``False``
        if the book is not listed or no other title is available.
    """
    titles_lower = goodreads_df['title'].str.lower()
    authors_lower = goodreads_df['author'].str.lower()

    is_listed = ((titles_lower == title) & (authors_lower == author)).any()
    if not is_listed:
        print("Book not found in our list. Searching online...")
        return False

    print("The book is already in our list. Here's a recommendation from our collection:")
    # Every row sharing the requested title is excluded, whatever its author.
    candidates = goodreads_df[titles_lower != title]
    if candidates.empty:
        print("No other books found for recommendation.")
        return False

    pick = candidates.sample(n=1).iloc[0]
    print(f"Title: {pick['title']}")
    print(f"Author: {pick['author']}")
    display(Image(url=pick['image_url'], width=100, unconfined=True))
    return True

# Function to preprocess genre similar to how I did during training
def preprocess_genre(genres):
    """Map a comma-separated genre string onto the genres this recommender recognises.

    The input is lower-cased and split on commas, and each piece is stripped of
    surrounding whitespace. Only genres in the recognised vocabulary are kept,
    in the order they first appear and without duplicates, so the result is
    deterministic.

    Args:
        genres: Comma-separated genre names, e.g. ``"Fiction, Fantasy"``.
            ``None`` or an empty string is treated as "no genre".

    Returns:
        list[str]: Recognised genres in input order, or ``['other']`` when none
        of them is recognised.
    """
    model_genres = ('nonfiction', 'history', 'romance', 'other', 'fiction', 'childrens', 'fantasy')
    if not genres:
        return ['other']

    processed_genres = []
    for raw_genre in genres.lower().split(','):
        genre = raw_genre.strip()
        # Skip unknown genres and repeats; keep first-seen order.
        if genre in model_genres and genre not in processed_genres:
            processed_genres.append(genre)

    return processed_genres or ['other']

# Function to fetch book info from Google Books API
def fetch_book_info(title, author):
    """Look up a book on the Google Books API and return the fields the model needs.

    Only the first search result is used.

    Args:
        title: Book title to search for.
        author: Author name to search for.

    Returns:
        dict | None: A dict with ``pages``, ``rating``, ``description`` and
        ``genre`` (the list produced by ``preprocess_genre``), or ``None`` when
        the search returns no results.

    Raises:
        ValueError: If the API responds with a status code other than 200.
        requests.RequestException: If the request fails or times out.
    """
    # Let requests URL-encode the query so characters such as '#', '&' or '+'
    # in a title cannot truncate or corrupt it. Spaces separate the search
    # terms and are encoded as '+' on the wire.
    query = f'intitle:{title} inauthor:{author}'
    response = requests.get(
        'https://www.googleapis.com/books/v1/volumes',
        params={'q': query},
        timeout=10,  # requests has no default timeout; avoid hanging the notebook
    )
    if response.status_code != 200:
        raise ValueError("Failed to fetch data from Google Books API")

    items = response.json().get('items')
    if not items:
        # No results: the caller treats a falsy return as "book not found online".
        return None

    book_data = items[0].get('volumeInfo', {})
    genres = ', '.join(book_data.get('categories', []))
    return {
        'pages': book_data.get('pageCount', 0),
        'rating': book_data.get('averageRating', 0),
        'description': book_data.get('description', ''),
        'genre': preprocess_genre(genres),
    }

# Function to preprocess and predict the cluster
def preprocess_and_predict(book_info, kmeans, scaler, encoder):
    """Build the feature row for one book and predict its cluster.

    The feature row is laid out as
    ``[scaled pages, scaled rating, polarity, subjectivity, *encoded genre]``.

    Args:
        book_info: Dict with ``pages``, ``rating``, ``description`` and
            ``genre`` (a list of lowercase genre names, or a single string).
        kmeans: Fitted clustering model exposing ``predict``.
        scaler: Fitted scaler exposing ``transform`` for ``[pages, rating]``.
        encoder: Fitted genre encoder exposing ``transform``.

    Returns:
        The array returned by ``kmeans.predict`` for the single feature row.
    """
    genres = book_info['genre']
    if isinstance(genres, str):
        genres = [genres]

    # The notebook text says the model was trained with one genre per book, so
    # the encoder is given a single-genre row. Passing the whole list as one
    # row makes any multi-genre book fail and fall back to 'other'; instead,
    # use the first genre the encoder accepts.
    genre_processed = None
    for genre in genres or []:
        try:
            genre_processed = encoder.transform([[genre]])
            break
        except ValueError:
            continue
    if genre_processed is None:
        # Nothing usable was supplied: encode the catch-all genre.
        genre_processed = encoder.transform([['other']])

    # Some encoders return a sparse matrix; densify before taking the row.
    if hasattr(genre_processed, 'toarray'):
        genre_processed = genre_processed.toarray()
    genre_row = np.asarray(genre_processed, dtype=float)[0]

    # Apply sentiment analysis on the description (a missing one counts as empty text)
    polarity, subjectivity = TextBlob(book_info['description'] or '').sentiment

    # Scale the numerical features 'pages' and 'rating'
    numeric = np.array([[book_info['pages'] or 0, book_info['rating'] or 0]], dtype=float)
    features_scaled = scaler.transform(numeric)

    # Combine scaled numerical features with the sentiment scores and encoded genre
    unscaled = np.concatenate(([polarity, subjectivity], genre_row)).reshape(1, -1)
    features_combined = np.concatenate((features_scaled, unscaled), axis=1)

    # Predict the cluster for the new book
    return kmeans.predict(features_combined)

# Function to recommend a book from the same cluster
def recommend_from_cluster(cluster_label, final_df, title, author):
    """Print and display a random book from the predicted cluster.

    Args:
        cluster_label: Indexable whose first element is the cluster id.
        final_df: DataFrame with ``title``, ``author``, ``image_url``, ``url``
            and ``cluster`` columns.
        title: Lower-cased title of the book the user entered.
        author: Lower-cased author of the book the user entered.

    Returns:
        None. The result is reported through ``print`` and ``display``.
    """
    label = cluster_label[0]
    same_cluster = final_df[final_df['cluster'] == label]
    if same_cluster.empty:
        print("No books found in the same cluster.")
        return

    # Drop the book the user asked about (title and author must both match).
    is_input_book = ((same_cluster['title'].str.lower() == title)
                     & (same_cluster['author'].str.lower() == author))
    candidates = same_cluster[~is_input_book]
    if candidates.empty:
        print("No other books found in the same cluster for recommendation.")
        return

    pick = candidates.sample(n=1).iloc[0]
    print("We recommend this book for you:")
    print(f"Title: {pick['title']}")
    print(f"Author: {pick['author']}")
    display(Image(url=pick['image_url'], width=100, unconfined=True))
    print(f"URL: {pick['url']}")

# Main function to run the recommender
def run_recommender():
    """Run one interactive recommendation round.

    Asks the user for a title and author, then:

    1. If the book is in ``goodreads_rec_df``, recommends another book from it.
    2. Otherwise fetches the book from the Google Books API, predicts its
       cluster with the loaded ``kmeans``/``scaler``/``encoder`` and recommends
       a book from the same cluster in ``final_df``.

    API and network failures are reported with a message instead of a
    traceback, so the cell finishes cleanly.
    """
    title, author = get_user_input()
    if recommend_from_goodreads(title, author, goodreads_rec_df):
        return

    try:
        book_info = fetch_book_info(title, author)
    except (ValueError, requests.RequestException) as error:
        # fetch_book_info raises ValueError on a non-200 response; requests
        # raises RequestException subclasses on connection problems.
        print(f"Could not retrieve the book from the Google Books API: {error}")
        return

    if not book_info:
        print("Unable to find the book online.")
        return

    cluster_label = preprocess_and_predict(book_info, kmeans, scaler, encoder)
    recommend_from_cluster(cluster_label, final_df, title, author)

# Run the recommender (interactive: prompts for a title and an author)
run_recommender()

Enter the book title: happy place
Enter the author's name: Emily Henry
The book is already in our list. Here's a recommendation from our collection:
Title: All the Dangerous Things
Author: Stacy Willingham


### Try the recommender with a book that is NOT present in the goodreads_rec_df

In [18]:
# Run the recommender (interactive: prompts for a title and an author)
run_recommender()

Enter the book title: hunger games
Enter the author's name: Suzanne Collins
Book not found in our list. Searching online...
We recommend this book for you:
Title: Lessons from the Heart
Author: Dorothy Clark


URL: https://goodreads.com/book/show/1421293.Lessons_from_the_Heart


### More recommender tries

In [20]:
# Run the recommender again (interactive: prompts for a title and an author)
run_recommender()

Enter the book title: Yellowface
Enter the author's name: R.F. Kuang
The book is already in our list. Here's a recommendation from our collection:
Title: Go as a River
Author: Shelley Read


In [22]:
# Run the recommender again (interactive: prompts for a title and an author)
run_recommender()

Enter the book title: 1984
Enter the author's name: George Orwell
Book not found in our list. Searching online...
We recommend this book for you:
Title: Osaka: The Merchants' Capital of Early Modern Japan
Author: James L. McClain,Wakita Osamu,Wakita Haruko,Uchida Kusuo


URL: https://goodreads.com/book/show/1076775.Osaka
