# The analysis of Goodreads dataset

In [77]:
import pandas as pd
import plotly.express as px

In [78]:
df = pd.read_csv("./books.csv", on_bad_lines="skip")
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


## Cleaning the data

### Remove spaces from column names

In [94]:
df = df.rename(columns=lambda column: column.strip())

### Check for rows with null or duplicated values

In [79]:
df.isna().values.any()
df.duplicated().values.any()

False

### Group same language values

In [80]:
df["language_code"] = df["language_code"].replace(["en-US", "en-GB", "en-CA"], "eng")

## Get 5 oldest books available on Goodreads

In [88]:
df.sort_values(by="publication_date").head()[["title", "authors", "publication_date"]]

Unnamed: 0,title,authors,publication_date
9371,Consider the Lilies,Iain Crichton Smith/Isobel Murray,1/1/1900
6488,On Duties (De Officiis),Marcus Tullius Cicero/Walter Miller,1/1/1913
6816,Agricola / Germania / Dialogue on Oratory,Tacitus/Maurice Hutton/Robert Maxwell Ogilvie/...,1/1/1914
435,History of the Peloponnesian War: Bk. 1-2,Thucydides/C.F. Smith,1/1/1919
7141,The Library 1 Books 1-3.9,Apollodorus/James George Frazer,1/1/1921


## Get 5 longest books available on Goodreads

In [98]:
df.sort_values(by="num_pages", ascending=False).head()[["title", "authors", "num_pages"]]

Unnamed: 0,title,authors,num_pages
6497,The Complete Aubrey/Maturin Novels (5 Volumes),Patrick O'Brian,6576
6802,The Second World War,Winston S. Churchill/John Keegan,4736
10906,Remembrance of Things Past (Boxed Set),Marcel Proust/C.K. Scott Moncrieff/Frederick A...,3400
6,Harry Potter Collection (Harry Potter #1-6),J.K. Rowling,3342
6822,Summa Theologica 5 Vols,Thomas Aquinas,3020


## Get the most represented authors

In [None]:
most_represented_authors = df["authors"].mode().to_list()
print(f"The most represented authors: {", ".join(most_represented_authors)}.")

The most represented authors: P.G. Wodehouse, Stephen King.


## Get 10 highest rated books

In [None]:
df.sort_values(by="average_rating", ascending=False).head(n=10)[["title", "authors", "average_rating"]]

Unnamed: 0,title,authors,average_rating
624,Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...,Aristophanes/F.W. Hall/W.M. Geldart,5.0
9893,His Princess Devotional: A Royal Encounter Wit...,Sheri Rose Shepherd,5.0
4788,The Diamond Color Meditation: Color Pathway to...,John Diamond,5.0
9324,Tyrannosaurus Wrecks (Stanley #1),Laura Driscoll/Alisa Klayman-Grodsky/Eric ...,5.0
9720,The Irish Anatomist: A Study of Flann O'Brien,Keith Donohue,5.0
4933,Bulgakov's the Master and Margarita: The Text ...,Elena N. Mahlow,5.0
6775,Delwau Duon: Peintiadau Nicholas Evans = Symph...,Nicholas Evans/Rhonda Evans,5.0
9282,Oliver Wendell Holmes in Paris: Medicine Theo...,William C. Dowling,5.0
786,Willem de Kooning: Late Paintings,Julie Sylvester/David Sylvester,5.0
4125,Zone of the Enders: The 2nd Runner Official St...,Tim Bogenn,5.0


## Get the highest rated authors

In [112]:
df[(df["average_rating"] == 5.0) & (df["authors"] != "NOT A BOOK")][["authors"]]

Unnamed: 0,authors
624,Aristophanes/F.W. Hall/W.M. Geldart
786,Julie Sylvester/David Sylvester
855,Tara MacCarthy
1243,Middlesex Borough Heritage Committee
4125,Tim Bogenn
4788,John Diamond
4933,Elena N. Mahlow
5023,Ian Martin/Katie Elliott
5647,Dennis Adler/R.L. Wilson
5648,R. McL. Wilson


## Get 10 most reviewed books

In [None]:
df.sort_values(by="ratings_count", ascending=False).head(n=10)[["title", "authors", "ratings_count"]]

Unnamed: 0,title,authors,ratings_count
10336,Twilight (Twilight #1),Stephenie Meyer,4597666
1697,The Hobbit or There and Back Again,J.R.R. Tolkien,2530894
1462,The Catcher in the Rye,J.D. Salinger,2457092
307,Angels & Demons (Robert Langdon #1),Dan Brown,2418736
3,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,2339585
4415,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,2293963
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,2153167
23,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien,2128944
2114,Animal Farm,George Orwell/Boris Grabnar/Peter Škerl,2111750
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,2095690


## Show the distribution of publishing houses

In [None]:
languages_chart = px.bar(x=df["language_code"].values, y=df["language_code"].index)
languages_chart.update_layout(xaxis_title="Languages", yaxis_title="Number of books")
languages_chart.show()

## Show the distribution of languages

In [None]:
publishers_chart = px.bar(x=df["publisher"].index, y=df["publisher"].values, orientation="h")
publishers_chart.update_layout(xaxis_title="Publishers", yaxis_title="Number of books")
publishers_chart.show()