In [1]:
import re
import numpy  as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.colors as colors
from wordcloud import WordCloud
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the books CSV; on_bad_lines='skip' makes pandas drop malformed lines
# instead of raising a parser error.
books_data = pd.read_csv(
    "./data/books.csv",
    on_bad_lines='skip',
)
books_data.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [3]:
# Inspect the raw column names (the output shows '  num_pages' carrying leading spaces).
books_data.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [4]:
# Normalise the column names.
# Only two headers really change ('bookID' and the space-prefixed '  num_pages');
# every other column is mapped to itself so the mapping still lists all columns.
new_column_names = {
    **{
        name: name
        for name in (
            'title',
            'authors',
            'average_rating',
            'isbn',
            'isbn13',
            'language_code',
            'ratings_count',
            'text_reviews_count',
            'publication_date',
            'publisher',
        )
    },
    'bookID': 'book_id',
    '  num_pages': 'num_of_pages',
}

books_data = books_data.rename(columns=new_column_names)
books_data.head()

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [5]:
# Summarise the dtype and non-null count of every column.
books_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   book_id             11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7   num_of_pages        11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [6]:
# Report the dataset dimensions: shape[0] is the row count, shape[1] the column count.
print(f"dataset contains {books_data.shape[0]} rows and {books_data.shape[1]} columns")

dataset contains 11123 rows and 12 columns


In [31]:
# Which titles occur most often in the dataset?
# value_counts() sorts by frequency (descending), so the first 20 entries are the top 20.
top_20_frequent_books = books_data['title'].value_counts().head(20)
print(top_20_frequent_books)

title
The Iliad                     9
The Brothers Karamazov        9
The Odyssey                   8
Gulliver's Travels            8
'Salem's Lot                  8
Anna Karenina                 8
A Midsummer Night's Dream     7
The Picture of Dorian Gray    7
Sense and Sensibility         6
The Great Gatsby              6
Macbeth                       6
Treasure Island               6
Robinson Crusoe               6
Romeo and Juliet              6
Collected Stories             6
Jane Eyre                     6
The Secret Garden             6
The Histories                 6
The Scarlet Letter            6
Much Ado about Nothing        5
Name: count, dtype: int64


In [64]:
# Bar chart of the 20 most frequently occurring titles.

# Qualitative palette; the modulo below cycles through it, so colours repeat
# when there are more bars than palette entries.
color_palette = colors.qualitative.Plotly

# One colour per bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(top_20_frequent_books))]

# Vertical bars: titles on the x-axis, occurrence counts on the y-axis
fig = go.Figure(data=[go.Bar(
    x=top_20_frequent_books.index,
    y=top_20_frequent_books,
    orientation='v',
    marker=dict(color=bar_colors)  # Set the bar colors using the list of colors
)])

# Dark background with white font.
# The axis titles match the plotted data: x holds the titles, y holds the counts.
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Top 20 Books",
    xaxis_title="Titles",
    yaxis_title="Frequency",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    )
)
# Show the plot
fig.show()

In [26]:
### Which 20 books have received the most ratings?
# The ranking uses ratings_count (how many ratings a book has), not average_rating.
top_20_highest_rated = (
    books_data
    .sort_values(by='ratings_count', ascending=False)
    .head(20)
)
top_20_highest_rated

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher
10336,41865,Twilight (Twilight #1),Stephenie Meyer,3.59,0316015849,9780316015844,eng,501,4597666,94265,9/6/2006,Little Brown and Company
1697,5907,The Hobbit or There and Back Again,J.R.R. Tolkien,4.27,0618260307,9780618260300,eng,366,2530894,32871,8/15/2002,Houghton Mifflin
1462,5107,The Catcher in the Rye,J.D. Salinger,3.8,0316769177,9780316769174,eng,277,2457092,43499,1/30/2001,Back Bay Books
307,960,Angels & Demons (Robert Langdon #1),Dan Brown,3.89,1416524797,9781416524793,eng,736,2418736,21303,4/1/2006,Pocket Books
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4415,15881,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,0439064864,9780439064866,eng,341,2293963,34692,6/2/1999,Arthur A. Levine Books / Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
23,34,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien,4.36,0618346252,9780618346257,eng,398,2128944,13670,9/5/2003,Houghton Mifflin Harcourt
2114,7613,Animal Farm,George Orwell/Boris Grabnar/Peter Škerl,3.93,0452284244,9780452284241,eng,122,2111750,29677,5/6/2003,NAL
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.


In [29]:
# Bar chart of the 20 books with the largest number of ratings.

# Number of bars to colour, and the qualitative palette to draw the colours from
num_bars = len(top_20_highest_rated)
color_palette = colors.qualitative.Set3

# One colour per bar; the modulo cycles through the palette if it has fewer entries than bars
bar_colors = [color_palette[i % len(color_palette)] for i in range(num_bars)]

# Vertical bars: book titles on the x-axis, ratings_count on the y-axis
fig = go.Figure(data=[go.Bar(
    x=top_20_highest_rated['title'],
    y=top_20_highest_rated['ratings_count'],
    orientation='v',
    marker=dict(color=bar_colors)  # Set the bar colors using the list of colors
)])

# Dark background with white font.
# The axis titles match the plotted data: x holds the titles, y holds the rating counts.
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Top 20 Most Rated Books",
    xaxis_title="Book Title",
    yaxis_title="Number of Ratings",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    )
)

# Show the plot
fig.show()

In [57]:
## What is the language distribution of books?
# Count the books per language code and sort ascending by count, then keep the
# 10 most common codes. tail(10) selects them regardless of how many distinct
# codes exist, whereas a fixed positional slice only works for one exact count.
language_counts = books_data['language_code'].value_counts().sort_values().tail(10)
language_counts

language_code
grc        11
zho        14
mul        19
jpn        46
ger        99
fre       144
en-GB     214
spa       218
en-US    1408
eng      8908
Name: count, dtype: int64

In [59]:
# Horizontal bar chart of the number of books per language code.

# Qualitative palette; the modulo below cycles through it if there are more bars than colours
color_palette = colors.qualitative.Set3

# One colour per bar
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(language_counts))]

# Horizontal bars: counts run along the x-axis, language codes are listed on the y-axis
fig = go.Figure(data=[go.Bar(
    x=language_counts,
    y=language_counts.index,
    orientation='h',
    marker=dict(color=bar_colors)  # Set the bar colors using the color palette
)])

# Dark background with white font.
# The axis titles match the plotted data: x holds the counts, y holds the language codes.
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Distribution of Books for Language Codes",
    xaxis_title="Count",
    yaxis_title="Language Code",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    )
)


# Show the plot
fig.show()

In [47]:
## Which authors have the most books? (Top 20)
# Count the titles per 'authors' value, turn the result back into a two-column
# frame, and keep the 20 rows with the highest counts.
most_books = (
    books_data
    .groupby('authors')['title']
    .count()
    .reset_index()
    .sort_values(by='title', ascending=False)
    .head(20)
)
most_books

Unnamed: 0,authors,title
4747,P.G. Wodehouse,40
5786,Stephen King,40
5470,Rumiko Takahashi,39
4711,Orson Scott Card,35
46,Agatha Christie,33
4987,Piers Anthony,30
5528,Sandra Brown,29
4305,Mercedes Lackey,29
1401,Dick Francis,28
2786,James Patterson,23


In [65]:
# Bar chart of the 20 'authors' values with the most titles.

# Define the palette in this cell so it does not depend on a variable left over
# from whichever plotting cell happened to run before it.
color_palette = colors.qualitative.Set3

# One colour per bar; the modulo cycles through the palette if it has fewer entries than bars
bar_colors = [color_palette[i % len(color_palette)] for i in range(len(most_books))]

# Vertical bars: authors on the x-axis, number of titles on the y-axis
fig = go.Figure(data=[go.Bar(
    x=most_books['authors'],
    y=most_books['title'],
    orientation='v',
    marker=dict(color=bar_colors)  # Set the bar colors using a qualitative color palette
)])

# Dark background with white font.
# The axis titles match the plotted data: x holds the authors, y holds the book counts.
fig.update_layout(
    plot_bgcolor='rgb(17, 17, 17)',  # Dark background color
    paper_bgcolor='rgb(17, 17, 17)',  # Dark background color
    font=dict(color='white'),  # White font color
    title="Top 20 Authors with Most Books",
    xaxis_title="Authors",
    yaxis_title="Total Number of Books",
    width=1000,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
    xaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on x-axis
    ),
    yaxis=dict(
        gridcolor='rgba(255, 255, 255, 0.3)'  # White grid lines with 0.3 alpha on y-axis
    )
)

# Write each count just above its bar (y is offset by 1 so the text clears the bar top)
for author, book_count in zip(most_books['authors'], most_books['title']):
    fig.add_annotation(
        x=author,
        y=book_count + 1,
        text=str(book_count),
        font=dict(color='white', size=10),
        showarrow=False
    )

# Show the plot
fig.show()

In [79]:
# Average ratings as a float Series for the histogram below.
# (The info() output lists average_rating as float64 already, so the cast just yields a copy.)
rating = books_data.average_rating.astype(float)
rating

0        4.57
1        4.49
2        4.42
3        4.56
4        4.78
         ... 
11118    4.06
11119    4.08
11120    3.96
11121    3.72
11122    3.91
Name: average_rating, Length: 11123, dtype: float64

In [81]:
# Histogram of the average ratings, normalised with histnorm='density'
fig = px.histogram(rating, histnorm='density')

# Dark theme: titles and size first, then colours, then a faint white grid on both axes.
# The legend is hidden.
fig.update_layout(
    title="Distribution of Average Ratings",
    xaxis_title="Average Rating",
    yaxis_title="Density",
    width=800,   # pixels
    height=600,  # pixels
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font={'color': 'white'},
    xaxis={'gridcolor': 'rgba(255, 255, 255, 0.3)'},
    yaxis={'gridcolor': 'rgba(255, 255, 255, 0.3)'},
    showlegend=False,
)

fig.show()

In [82]:
def segregate(data):
    """Label every value of ``data.average_rating`` with its rating bucket.

    Buckets are [0, 1], (1, 2], (2, 3], (3, 4] and (4, 5]. Any value that falls
    in none of them (negative, above 5, or NaN, for which every comparison is
    False) gets the string "NaN".

    Parameters
    ----------
    data : object exposing an iterable ``average_rating`` attribute
        (in this notebook, the books DataFrame).

    Returns
    -------
    list of str
        One label per rating, in the original order.
    """
    # Inclusive upper bound of each bucket, in ascending order, with its label.
    bucket_labels = (
        (1, "Between 0 and 1"),
        (2, "Between 1 and 2"),
        (3, "Between 2 and 3"),
        (4, "Between 3 and 4"),
        (5, "Between 4 and 5"),
    )

    def label_for(score):
        # The first bucket whose upper bound covers the score wins; the lower
        # bound is implied by the preceding bucket having been rejected.
        if score >= 0:
            for upper_bound, label in bucket_labels:
                if score <= upper_bound:
                    return label
        return "NaN"

    return [label_for(score) for score in data.average_rating]

In [83]:
# Store each row's rating bucket (as returned by segregate) in a 'ratings_dist' column of books_data.
books_data['ratings_dist'] = segregate(books_data)
books_data.head()

Unnamed: 0,book_id,title,authors,average_rating,isbn,isbn13,language_code,num_of_pages,ratings_count,text_reviews_count,publication_date,publisher,ratings_dist
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,Between 4 and 5
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,Between 4 and 5
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,Between 4 and 5
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,Between 4 and 5
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,Between 4 and 5


In [85]:
# Count the rows per rating bucket once, then show the labels and the counts separately.
ratings_dist_counts = books_data['ratings_dist'].value_counts()
print(ratings_dist_counts.index)
print(ratings_dist_counts.values)

Index(['Between 3 and 4', 'Between 4 and 5', 'Between 2 and 3',
       'Between 0 and 1', 'Between 1 and 2'],
      dtype='object', name='ratings_dist')
[6285 4735   69   27    7]


In [88]:
## ratings_dist
# Donut chart showing how many books fall into each rating bucket.

# Count once and reuse for both the labels and the values
ratings_dist_counts = books_data['ratings_dist'].value_counts()
labels = ratings_dist_counts.index
values = ratings_dist_counts.values

# Named slice_colors rather than `colors`, so the imported plotly.colors module
# stays reachable under its alias and earlier plotting cells can be re-run.
slice_colors = ['lightblue', 'darkmagenta', 'coral', 'bisque', 'black']

fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    hole=0.5,  # hole in the middle turns the pie into a donut
    marker=dict(colors=slice_colors, line=dict(color='#000000', width=1)),
)])

# Dark background with white font; a pie trace has no x/y axes, so no grid settings are needed
fig.update_layout(
    title="Distribution of Ratings",
    plot_bgcolor='rgb(17, 17, 17)',
    paper_bgcolor='rgb(17, 17, 17)',
    font=dict(color='white'),
    width=600,  # Width of the plot in pixels
    height=600,  # Height of the plot in pixels
)

fig.show()