## Goodreads


In [39]:
import pandas as pd
import altair as alt
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

# Configure Altair settings: lift the default cap on how many rows a chart may
# embed (5,000 per the Altair docs), so larger frames are not rejected with a
# MaxRowsError.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [40]:
# Read the "top 500 novels" library dataset directly from its GitHub source
GOODREADS_CSV_URL = 'https://raw.githubusercontent.com/melaniewalsh/responsible-datasets-in-context/main/datasets/top-500-novels/library_top_500.csv'
goodreads_df = pd.read_csv(GOODREADS_CSV_URL, encoding='utf-8')

# Leave the frame as the last expression so the notebook renders a preview
goodreads_df

Unnamed: 0,top_500_rank,title,author,pub_year,orig_lang,genre,author_birth,author_death,author_gender,author_primary_lang,...,gr_num_ratings,gr_num_reviews,gr_avg_rating_rank,gr_num_ratings_rank,oclc_owi,author_viaf,gr_url,wiki_url,pg_eng_url,pg_orig_url
0,1,Don Quixote,Miguel de Cervantes,1605,Spanish,action,1547,1616,male,spa,...,269435,12053,318,211,1.810748e+09,17220427,https://www.goodreads.com/book/show/3836.Don_Q...,https://en.wikipedia.org/wiki/Don_Quixote,https://www.gutenberg.org/cache/epub/996/pg996...,https://www.gutenberg.org/cache/epub/2000/pg20...
1,2,Alice's Adventures in Wonderland,Lewis Carroll,1865,English,fantasy,1832,1898,male,eng,...,561016,15380,172,133,1.156132e+10,66462036,https://www.goodreads.com/book/show/24213.Alic...,https://en.wikipedia.org/wiki/Alice%27s_Advent...,https://www.gutenberg.org/cache/epub/11/pg11.txt,
2,3,The Adventures of Huckleberry Finn,Mark Twain,1884,English,action,1835,1910,male,eng,...,1262480,19440,373,68,3.373178e+09,50566653,https://www.goodreads.com/book/show/2956.The_A...,https://en.wikipedia.org/wiki/Adventures_of_Hu...,https://www.gutenberg.org/cache/epub/76/pg76.txt,
3,4,The Adventures of Tom Sawyer,Mark Twain,1876,English,action,1835,1910,male,eng,...,931898,13603,301,88,3.373178e+09,50566653,https://www.goodreads.com/book/show/24583.The_...,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.gutenberg.org/cache/epub/74/pg74.txt,
4,5,Treasure Island,Robert Louis Stevenson,1883,English,action,1850,1894,male,eng,...,486155,16307,368,145,3.434000e+03,95207986,https://www.goodreads.com/book/show/295.Treasu...,https://en.wikipedia.org/wiki/Treasure_Island,https://www.gutenberg.org/cache/epub/120/pg120...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Stranger in a Strange Land,Robert A. Heinlein,1961,English,scifi,1907,1988,male,eng,...,311859,9961,310,190,7.894120e+05,12309757,,https://en.wikipedia.org/wiki/Stranger_in_a_St...,NA_not-pub-domain,
496,497,Vision in White,Nora Roberts,2009,English,romance,1965,ALIVE,female,eng,...,138445,4652,128,277,1.559638e+08,66448023,,https://en.wikipedia.org/wiki/Vision_in_White,NA_not-pub-domain,
497,498,The Whipping Boy,Sid Fleischman,1986,English,action,1920,2010,male,eng,...,27444,1623,476,445,4.415520e+08,66438084,,https://en.wikipedia.org/wiki/The_Whipping_Boy,NA_not-pub-domain,
498,499,Room,Emma Donoghue,2010,English,na,1969,ALIVE,female,eng,...,801989,50594,171,101,4.859780e+08,39539889,,https://en.wikipedia.org/wiki/Room_(novel),NA_not-pub-domain,


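Number of books by male and female authors (each row is a book, so an author with several novels on the list is counted once per novel):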

In [41]:
# Tally rows per author gender. value_counts() leaves out missing values by
# default, so rows without a recorded gender are not part of these totals.
goodreads_df["author_gender"].value_counts()

male      354
female    145
Name: author_gender, dtype: int64

Converting the `gr_num_ratings` column from comma-formatted strings to integers

In [42]:
# Strip thousands separators and cast to int.
# Going through astype(str) first keeps this cell re-runnable: once the column
# is already numeric, calling .str on it directly would raise AttributeError on
# a second execution. regex=False treats ',' as a literal character.
goodreads_df['gr_num_ratings'] = (
    goodreads_df['gr_num_ratings']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

Now we can view the 10 books with the largest `gr_num_ratings`, i.e. the books that have received the most ratings on Goodreads.

In [43]:
# Ten most-rated books; relies on gr_num_ratings already being numeric
top_10_rated_books = goodreads_df.nlargest(n=10, columns='gr_num_ratings')
top_10_rated_books

Unnamed: 0,top_500_rank,title,author,pub_year,orig_lang,genre,author_birth,author_death,author_gender,author_primary_lang,...,gr_num_ratings,gr_num_reviews,gr_avg_rating_rank,gr_num_ratings_rank,oclc_owi,author_viaf,gr_url,wiki_url,pg_eng_url,pg_orig_url
44,45,Harry Potter and the Sorcerer's Stone,J.K. Rowling,1997,English,fantasy,1965,ALIVE,female,eng,...,9977698,160913,9,1,298916.0,116796842,https://www.goodreads.com/book/show/42844155-h...,https://en.wikipedia.org/wiki/Harry_Potter_and...,NA_not-pub-domain,
172,173,The Hunger Games,Suzanne Collins,2008,English,scifi,1962,ALIVE,female,eng,...,8603378,216829,22,2,170720900.0,150638554,https://www.goodreads.com/book/show/2767052-th...,https://en.wikipedia.org/wiki/The_Hunger_Games...,NA_not-pub-domain,
131,132,Twilight,Stephenie Meyer,2005,English,romance,1973,ALIVE,female,eng,...,6536249,129360,461,3,2875569000.0,102313919,https://www.goodreads.com/book/show/41865.Twil...,https://en.wikipedia.org/wiki/Twilight_(Meyer_...,NA_not-pub-domain,
28,29,To Kill a Mockingbird,Harper Lee,1960,English,na,1926,2016,female,eng,...,6080747,116479,48,4,181435.0,12431460,https://www.goodreads.com/book/show/2657.To_Ki...,https://en.wikipedia.org/wiki/To_Kill_a_Mockin...,NA_not-pub-domain,
33,34,The Great Gatsby,F. Scott Fitzgerald,1925,English,na,1896,1940,male,eng,...,5198202,103746,298,5,85201.0,100254195,https://www.goodreads.com/book/show/4671.The_G...,https://en.wikipedia.org/wiki/The_Great_Gatsby,https://www.gutenberg.org/cache/epub/64317/pg6...,
302,303,The Fault In Our Stars,John Green,2012,English,romance,1977,ALIVE,male,eng,...,5106984,178231,113,6,1035140000.0,41229323,https://www.goodreads.com/book/show/11870085-t...,https://en.wikipedia.org/wiki/The_Fault_in_Our...,NA_not-pub-domain,
85,86,Nineteen Eighty-Four,George Orwell,1949,English,political,1903,1950,male,eng,...,4558358,113880,77,7,1908976000.0,95155403,https://www.goodreads.com/book/show/5471.Ninet...,https://en.wikipedia.org/wiki/Nineteen_Eighty-...,NA_not-pub-domain,
5,6,Pride and Prejudice,Jane Austen,1813,English,romance,1775,1817,female,eng,...,4239555,113241,34,8,1881837000.0,102333412,https://www.goodreads.com/book/show/1885.Pride...,https://en.wikipedia.org/wiki/Pride_and_Prejudice,https://www.gutenberg.org/cache/epub/1342/pg13...,
102,103,Harry Potter and the Prisoner of Azkaban,J.K. Rowling,1999,English,fantasy,1965,ALIVE,female,eng,...,4146575,83149,2,9,3770154000.0,116796842,https://www.goodreads.com/book/show/5.Harry_Po...,https://en.wikipedia.org/wiki/Harry_Potter_and...,NA_not-pub-domain,
17,18,"The Hobbit, or, There and Back Again",J.R.R. Tolkien,1937,English,fantasy,1892,1973,male,eng,...,3941429,70212,35,10,4061665000.0,95218067,https://www.goodreads.com/book/show/437049.The...,https://en.wikipedia.org/wiki/The_Hobbit,NA_not-pub-domain,


In [44]:
# Count books per genre as a tidy two-column frame: genre, count
genre_count = (
    goodreads_df['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

# Bar chart of the per-genre totals
bar_chart = (
    alt.Chart(genre_count)
    .mark_bar()
    .encode(x='genre', y='count')
    .properties(title='Number of Books by Genre')
)
bar_chart

In [45]:
# Initial Data Exploration

# Count empty cells in the Goodreads dataset: first per column, then overall
goodreads_nulls_per_column = goodreads_df.isnull().sum()
goodreads_missing_values = goodreads_nulls_per_column.sum()
print("Missing values in Goodreads dataset:", goodreads_missing_values)

Missing values in Goodreads dataset: 707


In [46]:
# Display the structure of the Goodreads dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print(): that would emit the report first and then the
# header followed by a stray "None".
print("Structure of Goodreads dataset:")
goodreads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   top_500_rank              500 non-null    int64  
 1   title                     500 non-null    object 
 2   author                    500 non-null    object 
 3   pub_year                  500 non-null    int64  
 4   orig_lang                 500 non-null    object 
 5   genre                     500 non-null    object 
 6   author_birth              499 non-null    object 
 7   author_death              496 non-null    object 
 8   author_gender             499 non-null    object 
 9   author_primary_lang       499 non-null    object 
 10  author_nationality        499 non-null    object 
 11  author_field_of_activity  329 non-null    object 
 12  author_occupation         458 non-null    object 
 13  oclc_holdings             495 non-null    float64
 14  oclc_ehold

In [47]:
# Display summary statistics for the Goodreads dataset.
# The header is printed on its own and the frame is left as the cell's last
# expression: print("...\n", frame) inserts a separator space that shifts the
# column-header row one character to the right, and plain-text printing wraps
# wide frames across several chunks.
goodreads_summary = goodreads_df.describe()
print("Summary statistics for Goodreads dataset:")
goodreads_summary

Summary statistics for Goodreads dataset:
        top_500_rank     pub_year  oclc_holdings  oclc_eholdings  \
count    500.000000   500.000000     495.000000      495.000000   
mean     250.500000  1932.132000   10401.438384     2218.593939   
std      144.481833    84.306546    5694.509003     3084.595715   
min        1.000000  1021.000000     996.000000       25.000000   
25%      125.750000  1900.000000    6800.000000      352.000000   
50%      250.500000  1948.000000    8318.000000      558.000000   
75%      375.250000  1989.000000   12184.000000     3223.000000   
max      500.000000  2015.000000   37702.000000    15545.000000   

       oclc_total_editions  oclc_holdings_rank  oclc_editions_rank  \
count           495.000000          495.000000          495.000000   
mean            896.642424          248.000000          248.000000   
std            1019.015572          143.038456          143.038456   
min              21.000000            1.000000            1.000000   
25%

## New York Times Best Sellers

- Tabular data from The New York Times fiction bestseller list, 1931–2020

In [48]:
# Load the NYT bestseller table from a CSV given as a relative path, so the
# file must be present in the notebook's working directory.
# NOTE(review): the .info() report further down prints the first column name
# with an extra leading character (" year"); confirm the CSV header and strip
# it if needed before selecting that column by name.
nyt_bestseller_df = pd.read_csv('nyt_bestseller.csv')

In [49]:
# Count empty cells in the New York Times Bestseller dataset: per column, then overall
nyt_nulls_per_column = nyt_bestseller_df.isnull().sum()
nyt_missing_values = nyt_nulls_per_column.sum()
print("Missing values in New York Times Bestseller dataset:", nyt_missing_values)

Missing values in New York Times Bestseller dataset: 10


In [50]:
# Display the structure of the New York Times Bestseller dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be passed to print(): that would emit the report first and then the
# header followed by a stray "None".
print("Structure of New York Times Bestseller dataset:")
nyt_bestseller_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60386 entries, 0 to 60385
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0    year     60386 non-null  int64 
 1   week      60386 non-null  object
 2   rank      60386 non-null  int64 
 3   title_id  60386 non-null  int64 
 4   title     60386 non-null  object
 5   author    60376 non-null  object
dtypes: int64(3), object(3)
memory usage: 2.8+ MB
Structure of New York Times Bestseller dataset:
 None


In [51]:
# Display summary statistics for the New York Times Bestseller dataset.
# The header is printed on its own and the frame is left as the cell's last
# expression: print("...\n", frame) inserts a separator space that shifts the
# column-header row one character to the right of the values beneath it.
nyt_summary = nyt_bestseller_df.describe()
print("Summary statistics for New York Times Bestseller dataset:")
nyt_summary

Summary statistics for New York Times Bestseller dataset:
                year          rank      title_id
count  60386.000000  60386.000000  60386.000000
mean    1980.291375      7.578329   3810.020154
std       24.440366      4.401472   2118.650219
min     1931.000000      1.000000      0.000000
25%     1958.000000      4.000000   2012.000000
50%     1982.000000      7.000000   3931.500000
75%     2001.000000     11.000000   5599.000000
max     2020.000000     18.000000   7430.000000


In [52]:
# NOTE(review): this chart is disabled; either delete the cell or restore it.
# As written it would draw one bar per distinct author -- presumably too many
# to read, TODO confirm -- so a top-N filter may be needed first.
# chart = alt.Chart(nyt_bestseller_df).mark_bar().encode(
#     x='author',
#     y='count()'
# ).properties(
#     title='Number of Bestsellers by Author'
# )
# chart

In [60]:
# Filter for soccer-themed books.
# Uses the nyt_bestseller_df loaded earlier in this section; pandas and altair
# are imported once at the top of the notebook, so neither the imports nor the
# CSV read are repeated here.
# The match is a case-insensitive substring test on the title only; rows with
# a missing title are treated as non-matches (na=False).
soccer_books = nyt_bestseller_df[
    nyt_bestseller_df['title'].str.contains('soccer', case=False, na=False)
]

# Display the filtered dataframe (empty when no title contains "soccer")
soccer_books

Unnamed: 0,year,week,rank,title_id,title,author


In [57]:
def count_bar_chart(data, x, chart_title, y='count()'):
    """Build a bar chart whose y-axis is titled "Number of Books".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame handed to ``alt.Chart``.
    x : alt.X
        Fully configured x-axis encoding (field, binning, sort, axis title).
    chart_title : str
        Title shown above the chart.
    y : str, default 'count()'
        Altair shorthand for the bar height. The default counts rows per x
        value; pass a column name when ``data`` is already aggregated.

    Returns
    -------
    alt.Chart
        The chart object; the caller decides when to display it.
    """
    return (
        alt.Chart(data)
        .mark_bar()
        .encode(x, alt.Y(y, title='Number of Books'))
        .properties(title=chart_title)
    )


# Visualization 1: Distribution of Goodreads Ratings
rating_distribution = count_bar_chart(
    goodreads_df,
    alt.X('gr_avg_rating:Q', bin=alt.Bin(maxbins=30), title='Goodreads Average Rating'),
    'Distribution of Goodreads Ratings',
)
rating_distribution.display()

# Analysis: This chart shows the distribution of average ratings for books on Goodreads.
# Most books have an average rating between 3.5 and 4.5, indicating that readers generally rate books positively.

# Visualization 2: Number of Books by Publication Year
books_by_year = count_bar_chart(
    goodreads_df,
    alt.X('pub_year:O', title='Publication Year'),
    'Number of Books by Publication Year',
)
books_by_year.display()

# Analysis: This chart shows how many of the novels in this dataset have each publication year.
# NOTE(review): the dataset is a ranked list of 500 novels, not a record of everything published,
# so a rise in recent years reflects which titles made the list; it cannot by itself show growth
# of the publishing industry or of self-publishing.

# Visualization 3: Top 10 Authors with Most Books
top_authors = goodreads_df['author'].value_counts().nlargest(10).reset_index()
top_authors.columns = ['author', 'count']

top_authors_chart = count_bar_chart(
    top_authors,
    alt.X('author', sort='-y', title='Author'),
    'Top 10 Authors with Most Books',
    y='count',  # counts are precomputed in top_authors
)
top_authors_chart.display()

# Analysis: This chart shows the 10 authors with the most titles in the dataset.
# It measures how often an author appears on this list, not how many books the author wrote overall.

# Visualization 4: Number of Books by Genre
# genre_count is the (genre, count) frame built in the earlier genre cell.
genre_chart = count_bar_chart(
    genre_count,
    alt.X('genre', sort='-y', title='Genre'),
    'Number of Books by Genre',
    y='count',  # counts are precomputed in genre_count
)
genre_chart.display()

# Analysis: This chart shows the number of books by genre.
# It provides insight into the most popular genres in the dataset, with 'na' (not available) being the most common,
# followed by 'history' and 'fantasy'.

# Visualization 5: Number of Books by Author Gender
gender_chart = count_bar_chart(
    goodreads_df,
    alt.X('author_gender', title='Author Gender'),
    'Number of Books by Author Gender',
)
gender_chart.display()

# Analysis: This chart shows the number of books by author gender.
# It reveals the gender distribution in the dataset, with more books by male authors than by female authors.