# Goodreads Books Dataset: Exploratory Data Analysis

In [1]:
# All notebook dependencies are imported once, here at the top.
import dask.dataframe as dd
import pandas as pd
import numpy as np
# `plt` must be the pyplot submodule: the top-level `matplotlib` package does
# not expose the plotting functions (plt.plot, plt.subplots, ...).
import matplotlib.pyplot as plt
import seaborn as sns
import pyarrow.parquet as pq

# Lift pandas' display truncation so every row and column of a displayed
# frame is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Books

In [None]:
# Open the books Parquet data as a Dask DataFrame and preview its first rows.
books_df = dd.read_parquet("data/new_books.parquet")
books_df.head(n=5)

In [None]:
# Show the dtype of every column in the books frame.
books_df.dtypes

In [None]:
# Number of rows in the books frame (Dask has to scan the data to answer this).
len(books_df)

## Book Works

In [2]:
# Open the book-works Parquet data as a Dask DataFrame and preview its first rows.
books_works_df = dd.read_parquet("data/books_works_df.parquet")
books_works_df.head(n=5)

Unnamed: 0,books_count,reviews_count,original_publication_month,default_description_language_code,text_reviews_count,best_book_id,original_publication_year,original_title,rating_dist,default_chaptering_book_id,original_publication_day,original_language_id,ratings_count,media_type,ratings_sum,work_id
0,1,6,8.0,,1,5333265,1984,W. C. Fields: A Life on Film,5:1|4:1|3:1|2:0|1:0|total:3,,,,3,book,12,5400751
1,22,10162,,,741,25717,2001,Good Harbor,5:517|4:1787|3:2763|2:966|1:196|total:6229,,,,6229,book,20150,1323437
2,2,268,,,7,7327624,1987,,5:49|4:58|3:26|2:5|1:3|total:141,,,,141,book,568,8948723
3,38,89252,7.0,,3504,6066819,2009,Best Friends Forever,5:9152|4:16855|3:19507|2:6210|1:1549|total:53273,,14.0,,53273,book,185670,6243154
4,2,49,,,5,287140,1990,Runic Astrology: Starcraft and Timekeeping in ...,5:6|4:1|3:3|2:3|1:2|total:15,,,,15,book,51,278577


In [3]:
# Show the dtype of every column in the book-works frame.
books_works_df.dtypes

books_count                                    int64
reviews_count                                  int64
original_publication_month           string[pyarrow]
default_description_language_code    string[pyarrow]
text_reviews_count                             int64
best_book_id                                   int64
original_publication_year            string[pyarrow]
original_title                       string[pyarrow]
rating_dist                          string[pyarrow]
default_chaptering_book_id           string[pyarrow]
original_publication_day             string[pyarrow]
original_language_id                 string[pyarrow]
ratings_count                                  int64
media_type                           string[pyarrow]
ratings_sum                                    int64
work_id                                        int64
dtype: object

In [None]:
# Number of rows (works) in the book-works frame (Dask has to scan the data to answer this).
len(books_works_df)

## Reducing Books Dataset based on Books Works

The books_works dataset has a best_book_id for each work. There are potentially many different books in the books dataset for the same work (different editions, languages, etc.).

We can use this to keep only the book whose book_id is a work's best_book_id, so that each work is represented by a single book in the books dataset.

In [None]:
# Pull the (work_id, best_book_id) pairs into memory and build a
# work_id -> best_book_id lookup. The intermediate frame has its own name so
# that `work_to_best_book` is only ever a dict, never a DataFrame.
work_best_pairs = books_works_df[['work_id', 'best_book_id']].compute()
work_to_best_book = dict(zip(work_best_pairs['work_id'], work_best_pairs['best_book_id']))

# Materialise the ids as a plain list before passing them to `isin`: a
# `dict.values()` view is neither a list nor a set and cannot be pickled,
# which matters whenever Dask has to tokenise or ship the argument.
best_book_ids = list(work_to_best_book.values())

# Keep only the rows whose book_id is the best_book_id of some work.
# NOTE(review): best_book_id is int64; this assumes books_df['book_id'] has a
# comparable integer dtype -- confirm, otherwise `isin` will match nothing.
books_df_best = books_df[books_df['book_id'].isin(best_book_ids)]

# Each len() call evaluates its Dask frame to count rows.
print(f"Original books dataset size: {len(books_df):,}")
print(f"Filtered books dataset size: {len(books_df_best):,}")


Then, we save this new dataset to `books_dedup.parquet`

In [19]:
# Read the schema of the source books file and pass it to the writer when
# saving the filtered frame (no index column, no _metadata file).
original_schema = pq.read_schema("data/new_books.parquet")

books_df_best.to_parquet(
    "data/books_dedup.parquet",
    engine="pyarrow",
    schema=original_schema,
    write_index=False,
    write_metadata_file=False,
)

In [21]:
# Read the deduplicated dataset back with Dask, materialise it with
# .compute(), and write the result to books_dedup_single.parquet without
# the index.
df1 = dd.read_parquet("data/books_dedup.parquet")
df1.compute().to_parquet(
    "data/books_dedup_single.parquet",
    index=False,
    engine="pyarrow",
)