In [1]:
import re
import numpy  as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.colors as colors
import warnings
import matplotlib.colors as mcolors
import seaborn as sns

from wordcloud import WordCloud
from sklearn.neighbors import NearestNeighbors
from scipy.cluster.vq import kmeans, vq
from pylab import plot, show
from matplotlib.lines import Line2D
from sklearn.cluster import KMeans
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
warnings.filterwarnings("ignore")

In [3]:
books_data = pd.read_csv("./data/books.csv", on_bad_lines='skip')
books_data.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [4]:
books_data.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [5]:
# Rename columns
new_column_names = {
    'bookID': 'book_id',
    'title': 'title',
    'authors': 'authors',
    'average_rating': 'average_rating',
    'isbn': 'isbn',
    'isbn13': 'isbn13',
    'language_code': 'language_code',
    '  num_pages':'num_of_pages',
    'ratings_count': 'ratings_count',
    'text_reviews_count': 'text_reviews_count',
    'publication_date': 'publication_date',
    'publisher': 'publisher'
}

books_data = books_data.rename(columns=new_column_names)
books_data.head()

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [6]:
books_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   book_id             11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7   num_of_pages        11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [7]:
print(f"dataset contains {books_data.shape[0]} rows and {books_data.shape[1]} columns")

dataset contains 11123 rows and 12 columns


In [8]:
# What are the most frequently repeated books on the list? (Top 20 books)
top_20_frequent_books = books_data['title'].value_counts()[:20]
print(top_20_frequent_books)

title
The Iliad                     9
The Brothers Karamazov        9
The Odyssey                   8
Gulliver's Travels            8
'Salem's Lot                  8
Anna Karenina                 8
A Midsummer Night's Dream     7
The Picture of Dorian Gray    7
Sense and Sensibility         6
The Great Gatsby              6
Macbeth                       6
Treasure Island               6
Robinson Crusoe               6
Romeo and Juliet              6
Collected Stories             6
Jane Eyre                     6
The Secret Garden             6
The Histories                 6
The Scarlet Letter            6
Much Ado about Nothing        5
Name: count, dtype: int64


In [58]:
# Define a color palette with enough unique colors
color_palette = px.colors.qualitative.Plotly

# Create a list of distinct colors for each bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(top_20_frequent_books))]

# Create a Plotly bar plot
fig = go.Figure(data=[go.Bar(
    y=top_20_frequent_books,
    x=top_20_frequent_books.index,
    orientation='v',
    marker=dict(color=bar_colors)  # Set the bar colors using the list of colors
)])

# Update the layout with dark background and white font
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Frequently Mentioned Books (Top 20)",
    yaxis_title="Frequency",
    xaxis_title="Titles",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on y-axis
    )    
)
# Show the plot
fig.show()

In [10]:
### What are the top 20 rated books?
# Sort the dataframe by ratings_count in descending order and select the top 20
top_20_highest_rated = books_data.sort_values('ratings_count', ascending=False).head(20)
top_20_highest_rated

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher
10336,41865,Twilight (Twilight #1),Stephenie Meyer,3.59,0316015849,9780316015844,eng,501,4597666,94265,9/6/2006,Little Brown and Company
1697,5907,The Hobbit or There and Back Again,J.R.R. Tolkien,4.27,0618260307,9780618260300,eng,366,2530894,32871,8/15/2002,Houghton Mifflin
1462,5107,The Catcher in the Rye,J.D. Salinger,3.8,0316769177,9780316769174,eng,277,2457092,43499,1/30/2001,Back Bay Books
307,960,Angels & Demons (Robert Langdon #1),Dan Brown,3.89,1416524797,9781416524793,eng,736,2418736,21303,4/1/2006,Pocket Books
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4415,15881,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,0439064864,9780439064866,eng,341,2293963,34692,6/2/1999,Arthur A. Levine Books / Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
23,34,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien,4.36,0618346252,9780618346257,eng,398,2128944,13670,9/5/2003,Houghton Mifflin Harcourt
2114,7613,Animal Farm,George Orwell/Boris Grabnar/Peter Škerl,3.93,0452284244,9780452284241,eng,122,2111750,29677,5/6/2003,NAL
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.


In [57]:
# Define the number of bars and obtain a color palette with enough unique colors
num_bars = len(top_20_highest_rated)
color_palette = px.colors.qualitative.Set3

# Create a list of distinct colors for each bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(top_20_highest_rated))]

# Create a Plotly bar plot
fig = go.Figure(data=[go.Bar(
    y=top_20_highest_rated['ratings_count'],
    x=top_20_highest_rated['title'],
    orientation='v',
    marker=dict(color=bar_colors)  # Set the bar colors using the list of colors
)])

# Update the layout with dark background and white font
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Most Frequently Rated Books (Top 20)",
    yaxis_title="Number of Ratings",
    xaxis_title="Book Title",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on y-axis
    )
)

# Show the plot
fig.show()

In [12]:
## What is the language distribution of books?
# Count the number of books for each language code
language_counts = books_data['language_code'].value_counts().sort_values()
language_counts =language_counts[17:27]
language_counts

language_code
grc        11
zho        14
mul        19
jpn        46
ger        99
fre       144
en-GB     214
spa       218
en-US    1408
eng      8908
Name: count, dtype: int64

In [51]:
# Define a color palette with enough unique colors
color_palette = px.colors.qualitative.Set3

# Create a list of distinct colors for each bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(language_counts))]

# Create a Plotly bar plot
fig = go.Figure(data=[go.Bar(
    y=language_counts.index,
    x=language_counts,
    orientation = 'h',
    marker=dict(color=bar_colors)  # Set the bar colors using the color palette
)])

# Update the layout with dark background and white font
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Most Common Book Languages",
    yaxis_title="Language",
    xaxis_title="Count",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on y-axis
    )
)


# Show the plot
fig.show()

In [83]:
## Which authors have the most books? (Top 20)
# Group the dataframe by authors and count the number of titles for each author
most_books = books_data.groupby('authors')['title'].count().reset_index().sort_values('title', ascending=False).head(20)
most_books = most_books.sort_values(by='title', ascending=True)
most_books

Unnamed: 0,authors,title
1978,Gabriel García Márquez,20
1331,Dean Koontz,20
71,Alan Dean Foster,21
608,Bill Bryson,21
2873,Janet Evanovich,21
1120,Dan Simmons,21
2167,Gordon Korman,22
2786,James Patterson,23
6030,Terry Pratchett,23
4019,Margaret Weis/Tracy Hickman,23


In [84]:
# Create a list of distinct colors for each bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(most_books))]

# Create a Plotly bar plot
fig = go.Figure(data=[go.Bar(
    x=most_books['title'],
    y=most_books['authors'],
    orientation='h',
    marker=dict(color=bar_colors)  # Set the bar colors using a qualitative color palette
)])

# Update the layout with dark background and white font
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Most Frequent Authors (Top 20)",
    xaxis_title="Number of Books",
    yaxis_title="Author",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on y-axis
    )
)

# Add labels to the bars
for i, val in enumerate(most_books['title']):
    fig.add_annotation(
        x=val + 1,
        y=most_books['authors'].iloc[i],
        text=str(val),
        font=dict(color='white', size=10),
        showarrow=False
    )

# Show the plot
fig.show()

In [85]:
most_text = books_data.sort_values('text_reviews_count', ascending=False).head(20)
most_text=most_text.sort_values(by='text_reviews_count', ascending=True)
most_text

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher,ratings_dist
2764,10210,Jane Eyre,Charlotte Brontë/Michael Mason,4.12,0142437204,9780142437209,eng,532,1409369,27884,2/4/2003,Penguin,Between 4 and 5
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,Between 4 and 5
2951,10917,My Sister's Keeper,Jodi Picoult,4.07,0743454537,9780743454537,eng,423,986711,29652,2/1/2005,Washington Square Press,Between 4 and 5
2114,7613,Animal Farm,George Orwell/Boris Grabnar/Peter Škerl,3.93,0452284244,9780452284241,eng,122,2111750,29677,5/6/2003,NAL,Between 3 and 4
1697,5907,The Hobbit or There and Back Again,J.R.R. Tolkien,4.27,0618260307,9780618260300,eng,366,2530894,32871,8/15/2002,Houghton Mifflin,Between 4 and 5
1782,6288,The Road,Cormac McCarthy,3.97,0307265439,9780307265432,eng,241,606433,34301,9/26/2006,Alfred A. Knopf,Between 3 and 4
2959,10964,Outlander (Outlander #1),Diana Gabaldon,4.23,0440242940,9780440242949,eng,850,673350,34690,7/26/2005,Dell Publishing Company,Between 4 and 5
4415,15881,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,0439064864,9780439064866,eng,341,2293963,34692,6/2/1999,Arthur A. Levine Books / Scholastic Inc.,Between 4 and 5
514,1618,The Curious Incident of the Dog in the Night-Time,Mark Haddon,3.88,1400032717,9781400032716,eng,226,1054308,35537,5/18/2004,Vintage,Between 3 and 4
310,968,The Da Vinci Code (Robert Langdon #2),Dan Brown,3.84,0307277674,9780307277671,eng,489,1679706,35877,3/28/2006,Anchor,Between 3 and 4


In [94]:
# Create a list of distinct colors for each bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(most_text))]

fig = go.Figure(data=[
    go.Bar(
        x=most_text['text_reviews_count'],
        y=most_text['title'],
        orientation='h',
        marker=dict(color=bar_colors),
    )
])

fig.update_layout(
    title='Books with Highest Amount of Text Reviews',
    xaxis_title='Number of Text Reviews',
    yaxis_title='Book Title',
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    coloraxis=dict(colorscale='magma'),
    width=1200,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # Black grid lines with 0.3 alpha on y-axis
    )
)


fig.show()

In [18]:
rating = books_data['average_rating'].astype(float)
rating

0        4.57
1        4.49
2        4.42
3        4.56
4        4.78
         ... 
11118    4.06
11119    4.08
11120    3.96
11121    3.72
11122    3.91
Name: average_rating, Length: 11123, dtype: float64

In [19]:
# Create the histogram
fig = px.histogram(rating, histnorm='density')

# Update layout with dark theme and grid lines, and remove the legend
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Distribution of Average Ratings",
    xaxis_title="Average Rating",
    yaxis_title="Density",
    width=800,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    ),
    showlegend=False  # Remove the legend
)

fig.show()

In [20]:
def segregate(data):
    values = []
    for val in data.average_rating:
        if val >= 0 and val <= 1:
            values.append("Between 0 and 1")
        elif val > 1 and val <= 2:
            values.append("Between 1 and 2")
        elif val > 2 and val <= 3:
            values.append("Between 2 and 3")
        elif val > 3 and val <= 4:
            values.append("Between 3 and 4")
        elif val > 4 and val <= 5:
            values.append("Between 4 and 5")
        else:
            values.append("NaN")
    return values

In [21]:
books_data['ratings_dist'] = segregate(books_data)
books_data.head()

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher,ratings_dist
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,Between 4 and 5
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,Between 4 and 5
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,Between 4 and 5
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,Between 4 and 5
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,Between 4 and 5


In [22]:
print(books_data['ratings_dist'].value_counts().index)
print(books_data['ratings_dist'].value_counts().values)

Index(['Between 3 and 4', 'Between 4 and 5', 'Between 2 and 3',
       'Between 0 and 1', 'Between 1 and 2'],
      dtype='object', name='ratings_dist')
[6285 4735   69   27    7]


In [23]:
## ratings_dist
labels = books_data['ratings_dist'].value_counts().index
values = books_data['ratings_dist'].value_counts().values

colors = ['lightblue', 'darkmagenta', 'coral', 'bisque', 'black']

fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    hole=0.5,
    marker=dict(colors=colors, line=dict(color='#000000', width=1)),
)])

fig.update_layout(
    title="Distribution of Ratings",
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font=dict(color='white'),
    width=600,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    )
)

fig.show()

In [24]:
##  Is there a relationship between number of pages and ratings?

fig = go.Figure(data=go.Scatter(
    y=books_data['num_of_pages'],
    x=books_data['average_rating'],
    orientation='v',
    mode='markers',
    marker=dict(
        color='green',
        size=8,
        opacity=1,
        line=dict(
            color='white',  # Outline color
            width=0.5  # Outline width
    )
    )
    
))

fig.update_layout(
    title="Relationship between Number of Pages and Ratings",
    xaxis_title="Number of Pages",
    yaxis_title="Average Rating",
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font=dict(color='white'),
    width=800,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    ),
)

fig.show()

In [25]:
## Titles Word Cloud
# Concatenate all book titles into a single string
text = ' '.join(books_data['title'])

# Create a WordCloud object with dark background and white font color
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='black',
    colormap='viridis',
    contour_color='white',
    contour_width=2,
).generate(text)

# Generate the word cloud image
wordcloud_image = wordcloud.to_image()

# Create a plotly figure and add the word cloud image
fig = go.Figure()
fig.add_layout_image(
    source=wordcloud_image,
    x=0.5, y=0.5,
    xref='paper', yref='paper',
    sizex=1.2, sizey=1.5,
    xanchor='center', yanchor='middle'
)

# Set the layout properties
fig.update_layout(
    title='Book Titles Word Cloud',
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
    plot_bgcolor='white',
    showlegend=False,
    font=dict(color='black'),
    width=800,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
)

# Display the figure
fig.show()

In [26]:
books_data.head()

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher,ratings_dist
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,Between 4 and 5
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,Between 4 and 5
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,Between 4 and 5
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,Between 4 and 5
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,Between 4 and 5


In [27]:
books_database = books_data[['book_id','title','authors','average_rating','num_of_pages','ratings_count','text_reviews_count','publisher','ratings_dist']]
books_database.head()

Unnamed: 0,book_id,title,authors,average_rating,num_of_pages,ratings_count,text_reviews_count,publisher,ratings_dist
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,652,2095690,27591,Scholastic Inc.,Between 4 and 5
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,Scholastic Inc.,Between 4 and 5
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,352,6333,244,Scholastic,Between 4 and 5
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,435,2339585,36325,Scholastic Inc.,Between 4 and 5
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,2690,41428,164,Scholastic,Between 4 and 5


In [28]:
books_database['ratings_dist'] = books_database['ratings_dist'].replace({
    'Between 0 and 1':0,
    'Between 1 and 2':1,
    'Between 2 and 3':2,
    'Between 3 and 4':3,
    'Between 4 and 5':4
});
books_database.head()

Unnamed: 0,book_id,title,authors,average_rating,num_of_pages,ratings_count,text_reviews_count,publisher,ratings_dist
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,652,2095690,27591,Scholastic Inc.,4
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,Scholastic Inc.,4
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,352,6333,244,Scholastic,4
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,435,2339585,36325,Scholastic Inc.,4
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,2690,41428,164,Scholastic,4


In [29]:
# Creating an instance of the NearestNeighbors model
model = NearestNeighbors(n_neighbors=6, algorithm='ball_tree')

# Fitting the model to the feature matrix of books
model.fit(books_database[['average_rating','num_of_pages','ratings_count','text_reviews_count','ratings_dist']])

# Querying the model to find the nearest neighbors
distance, indices = model.kneighbors(books_database[['average_rating','num_of_pages','ratings_count','text_reviews_count','ratings_dist']])

In [30]:
indices.shape

(11123, 6)

In [31]:
books_database['indices'] = indices.tolist()
books_database['distance']= distance.tolist()
books_database.head()

Unnamed: 0,book_id,title,authors,average_rating,num_of_pages,ratings_count,text_reviews_count,publisher,ratings_dist,indices,distance
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,652,2095690,27591,Scholastic Inc.,4,"[0, 2114, 23, 1, 2116, 4415]","[0.0, 16203.576685707387, 36051.17575120262, 5..."
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,Scholastic Inc.,4,"[1, 23, 2114, 0, 2116, 4415]","[0.0, 28789.06240253232, 41426.26377448973, 57..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,352,6333,244,Scholastic,4,"[2, 254, 8166, 839, 10615, 7824]","[0.0, 77.3889714365038, 78.23977505080137, 85...."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,435,2339585,36325,Scholastic Inc.,4,"[3, 4415, 307, 1462, 1, 1697]","[0.0, 45651.31333291081, 80564.4579665804, 117..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,2690,41428,164,Scholastic,4,"[4, 1643, 7870, 4143, 5456, 3624]","[0.0, 2022.5205749509694, 2100.808411088455, 2..."


In [32]:
books_database.to_csv('./data/BOOKSDB.csv',index=False)

In [33]:
class BookQuest:
    def __init__(self, dataframe, indices, distances):
        self.df = dataframe
        self.indices = indices
        self.distances = distances  # A list of lists with distances corresponding to book indices
        self.all_books_names = list(self.df["title"].values)
    
    def find_id(self, name):
        for index, string in enumerate(self.all_books_names):
            if re.search(name, string, re.IGNORECASE):  # Case-insensitive search
                return index
        return None  # Return None if not found

    def print_similar_books(self, query=None, top_n=5):
        if query:
            found_id = self.find_id(query)
            if found_id is not None:
                # Extract indices and distances
                similar_indices = self.indices[found_id][1:]
                similar_distances = self.distances[found_id][1:]
                
                # Combine indices and distances into a list of tuples and sort by distance
                recommendations = list(zip(similar_indices, similar_distances))
                recommendations.sort(key=lambda x: x[1])  # Sort by distance
                
                print(f"Books similar to '{self.all_books_names[found_id]}':")
                for i, (id, dist) in enumerate(recommendations[:top_n]):
                    book_title = self.df.iloc[id]["title"]
                    book_author = self.df.iloc[id]["author"] if 'author' in self.df.columns else "Unknown Author"
                    book_genre = self.df.iloc[id]["genre"] if 'genre' in self.df.columns else "Unknown Genre"
                    print(f"{i + 1}. {book_title} (Distance: {dist:.2f})")
            else:
                print(f"Book '{query}' not found in the dataset.")

In [34]:
recsys2 = BookQuest(books_database,books_database.indices, books_database.distance)
recsys2.print_similar_books("The Book Thief")

Books similar to 'The Book Thief':
1. The Giver (The Giver  #1) (Distance: 75554.62)
2. Little Women (Distance: 77615.77)
3. The Alchemist (Distance: 118974.47)
4. Jane Eyre (Distance: 122185.18)
5. Eat  Pray  Love (Distance: 159025.77)


In [35]:
recsys2.print_similar_books("The Catcher")

Books similar to 'The Catcher in the Rye':
1. Angels & Demons (Robert Langdon  #1) (Distance: 44317.67)
2. The Hobbit  or There and Back Again (Distance: 74563.38)
3. Harry Potter and the Prisoner of Azkaban (Harry Potter  #3) (Distance: 117725.89)
4. Harry Potter and the Chamber of Secrets (Harry Potter  #2) (Distance: 163366.58)
5. Harry Potter and the Order of the Phoenix (Harry Potter  #5) (Distance: 304260.77)


In [36]:
trial = books_data[['average_rating', 'ratings_count']]
data = np.asarray([np.asarray(trial['average_rating']), np.asarray(trial['ratings_count'])]).T
data

array([[4.570000e+00, 2.095690e+06],
       [4.490000e+00, 2.153167e+06],
       [4.420000e+00, 6.333000e+03],
       ...,
       [3.960000e+00, 8.200000e+02],
       [3.720000e+00, 7.690000e+02],
       [3.910000e+00, 1.130000e+02]])

In [37]:
# Assuming `data` is your dataset
X = data
distortions = []

# Compute distortions for each k
for k in range(2, 30):
    k_means = KMeans(n_clusters=k)
    k_means.fit(X)
    distortions.append(k_means.inertia_)

# Create the Plotly figure
fig = go.Figure()

# Add a line trace for the elbow curve
fig.add_trace(go.Scatter(
    x=list(range(2, 30)),
    y=distortions,
    mode='markers+lines',
    marker=dict(color='cyan', symbol='x', size=10),
    line=dict(color='cyan', width=2),
    name='Distortion',
    showlegend=False
))

# Find the elbow point (example: choosing k=6 as the elbow point)
# You can use methods like kneedle or visual inspection to find the best k
elbow_k = 6
elbow_distortion = distortions[elbow_k - 2]  # Adjust index to match the range used

# Add a vertical line and annotation for the elbow point
fig.add_trace(go.Scatter(
    x=[elbow_k, elbow_k],
    y=[0, elbow_distortion],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Elbow Point Line',
    showlegend=False
))
fig.add_trace(go.Scatter(
    x=[elbow_k],
    y=[elbow_distortion],
    mode='markers+text',
    marker=dict(color='red', size=10),
    text=['Elbow Point'],
    textposition='top right',
    name='Elbow Point',
    showlegend=False
))

# Update layout for the figure
fig.update_layout(
    title='Elbow Curve',
    xaxis_title='Number of Clusters (k)',
    yaxis_title='Distortion (Inertia)',
    plot_bgcolor='black',  # Background color of the plot area
    paper_bgcolor='black',  # Background color of the entire figure
    font=dict(color='white'),  # Font color
    width=800,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        showline=True,
        showgrid=True,
        zeroline=True,
        gridcolor='rgba(255, 255, 255, 0.3)'  # Grid line color with alpha
    ),
    yaxis=dict(
        showline=True,
        showgrid=True,
        zeroline=True,
        gridcolor='rgba(255, 255, 255, 0.3)'  # Grid line color with alpha
    )
)

# Show the figure
fig.show()

In [38]:
#Computing K means with K = 5, thus, taking it as 5 clusters
centroids, _ = kmeans(data, 6)

#assigning each sample to a cluster
#Vector Quantisation:

idx, _ = vq(data, centroids)

In [39]:
# Define colors for the clusters
colors = ['red', 'blue', 'yellow', 'magenta', 'green']
cluster_labels = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Cluster 5']

# Create the Plotly figure
fig = go.Figure()

# Add scatter plots for each cluster
for i in range(5):
    fig.add_trace(go.Scatter(
        x=data[idx == i, 0],
        y=data[idx == i, 1],
        mode='markers',
        marker=dict(color=colors[i], symbol='circle'),
        name=cluster_labels[i]
    ))

# Add scatter plot for centroids
fig.add_trace(go.Scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers',
    marker=dict(color='cyan', symbol='square', size=8),
    name='Centroids'
))

# Update layout for the figure
fig.update_layout(
    title='Clusters and Centroids',
    xaxis_title='X Coordinate',
    yaxis_title='Y Coordinate',
    plot_bgcolor='black',  # Background color of the plot area
    paper_bgcolor='black',  # Background color of the entire figure
    font=dict(color='white'),  # Font color
    width=800,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels,
    legend=dict(
        x=0.01,
        y=0.99,
        traceorder='normal',
        font=dict(size=12, color='white'),
        bordercolor='white',
        borderwidth=1
    )
)

# Show the figure
fig.show()