In [16]:
import pandas as pd
import ast
import json

# Load the raw JSON files

In [34]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    # Decode explicitly as UTF-8 (the encoding JSON interchange mandates)
    # instead of relying on the platform's locale default, which differs
    # between machines and can garble or reject non-ASCII text.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


# One raw payload per year; later cells refer to these names directly.
json2019 = _load_json("../data/raw_data/2019.json")
json2020 = _load_json("../data/raw_data/2020.json")
json2021 = _load_json("../data/raw_data/2021.json")
json2022 = _load_json("../data/raw_data/2022.json")
json2023 = _load_json("../data/raw_data/2023.json")

# Extract the book information into a DataFrame

In [35]:
def get_bestsellers_info_df(data):
    """Flatten nested bestseller payloads into one row per listed book.

    Parameters
    ----------
    data : dict
        Mapping whose values each hold a ``'results'`` dict containing a
        ``'bestsellers_date'`` entry and a ``'lists'`` sequence. Every list
        entry carries ``'list_name_encoded'`` and a ``'books'`` sequence.

    Returns
    -------
    pandas.DataFrame
        One row per book with the columns ``title``, ``rank``, ``publisher``,
        ``weeks_on_list``, ``primary_isbn13``, ``list_name`` and
        ``bestsellers_date``. An empty ``data`` yields an empty frame.

    Raises
    ------
    KeyError
        If one of the keys read above is missing from a dict in the payload.
    """
    # Fields copied verbatim from each book entry, in output column order.
    per_book_keys = ("title", "rank", "publisher", "weeks_on_list", "primary_isbn13")

    def _iter_rows():
        # Walk payload -> lists -> books, yielding one flat record per book.
        for payload in data.values():
            results = payload['results']
            published_on = results['bestsellers_date']
            for listing in results['lists']:
                encoded_name = listing['list_name_encoded']
                for entry in listing['books']:
                    row = {key: entry[key] for key in per_book_keys}
                    # Context shared by every book of this list / payload.
                    row["list_name"] = encoded_name
                    row["bestsellers_date"] = published_on
                    yield row

    return pd.DataFrame(list(_iter_rows()))

In [37]:
# Flatten the 2019 payload and display the resulting (rows, columns) shape.
df_2019 = get_bestsellers_info_df(json2019)
df_2019.shape

(2490, 7)

In [38]:
# Flatten the 2020 payload and display the resulting (rows, columns) shape.
df_2020 = get_bestsellers_info_df(json2020)
df_2020.shape

(2760, 7)

In [40]:
# Flatten the 2021 payload and display the resulting (rows, columns) shape.
df_2021 = get_bestsellers_info_df(json2021)
df_2021.shape

(2760, 7)

In [43]:
# Flatten the 2022 payload and display the resulting (rows, columns) shape.
df_2022 = get_bestsellers_info_df(json2022)
df_2022.shape

(2760, 7)

In [44]:
# Flatten the 2023 payload and display the resulting (rows, columns) shape.
df_2023 = get_bestsellers_info_df(json2023)
df_2023.shape

(2760, 7)

# Create a new dataframe

In [45]:
# Stack the five yearly frames (2019 through 2023) into a single frame with a
# fresh 0..n-1 index, then display the combined (rows, columns) shape.
yearly_frames = [df_2019, df_2020, df_2021, df_2022, df_2023]
df = pd.concat(yearly_frames, ignore_index=True)
df.shape

(13530, 7)

In [48]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

title               object
rank                 int64
publisher           object
weeks_on_list        int64
primary_isbn13      object
list_name           object
bestsellers_date    object
dtype: object

In [46]:
# Count missing values per column.
df.isna().sum()

title               0
rank                0
publisher           0
weeks_on_list       0
primary_isbn13      0
list_name           0
bestsellers_date    0
dtype: int64

In [47]:
# Count rows that exactly repeat an earlier row across all columns.
df.duplicated().sum()

0

In [49]:
# Count distinct values per column.
df.nunique()

title               3011
rank                  15
publisher            412
weeks_on_list        545
primary_isbn13      4224
list_name             20
bestsellers_date      60
dtype: int64

# Export as a CSV file

In [50]:
# Write the combined table to CSV; index=False leaves the row index out of the file.
df.to_csv('../data/database/bestsellers.csv', index=False)

# Extract the lookup tables of book titles and of bestseller lists

In [51]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,title,rank,publisher,weeks_on_list,primary_isbn13,list_name,bestsellers_date
0,THE RECKONING,1,Doubleday,9,9780385544153,combined-print-and-e-book-fiction,2018-12-22
1,EVERY BREATH,2,Grand Central,10,9781538728529,combined-print-and-e-book-fiction,2018-12-22
2,FIRE AND BLOOD,3,Bantam,5,9781524796280,combined-print-and-e-book-fiction,2018-12-22


In [70]:
# Build the title lookup table: one row per distinct lower-cased title.
# Lower-casing happens before de-duplication, so titles that differ only in
# letter case share a single title_id.
title = df['title'].str.lower().unique()
title_df = pd.DataFrame(title, columns=['title'])
# IDs are 1-based and follow the order in which titles first appear in df.
title_df['title_id'] = range(1, len(title_df) + 1)
title_df

Unnamed: 0,title,title_id
0,the reckoning,1
1,every breath,2
2,fire and blood,3
3,where the crawdads sing,4
4,target: alex cross,5
...,...,...
3006,shooting iron,3007
3007,in a holidaze,3008
3008,collateral damage,3009
3009,falling stars,3010


In [71]:
# Write the title lookup table to CSV; index=False leaves the row index out of the file.
title_df.to_csv('../data/database/bestsellers_title.csv', index=False)

In [88]:
# Build the list lookup table: one row per distinct encoded list name.
lists = df['list_name'].unique()
lists_df = pd.DataFrame(lists, columns=['list_name'])
# IDs are 1-based and follow the order in which list names first appear in df.
lists_df['list_id'] = range(1, len(lists_df) + 1)
lists_df

Unnamed: 0,list_name,list_id
0,combined-print-and-e-book-fiction,1
1,combined-print-and-e-book-nonfiction,2
2,hardcover-fiction,3
3,hardcover-nonfiction,4
4,trade-fiction-paperback,5
5,paperback-nonfiction,6
6,advice-how-to-and-miscellaneous,7
7,childrens-middle-grade-hardcover,8
8,picture-books,9
9,series-books,10


In [89]:
# Write the list lookup table to CSV; index=False leaves the row index out of the file.
lists_df.to_csv('../data/database/lists_bestsellers.csv', index=False)

# Mapping the title_id and list_id

In [75]:
# Keep the original-case title in 'title2' and lower-case 'title' so it matches
# the lower-cased keys stored in title_df. The guard keeps this cell
# idempotent: without it, a second run would copy the already lower-cased
# 'title' over 'title2' and lose the original casing.
if 'title2' not in df.columns:
    df['title2'] = df['title']
df['title'] = df['title2'].str.lower()

In [78]:
# Attach title_id to every row by joining df and title_df on 'title'.
# how='left' keeps every row of df; validate='many_to_one' makes pandas raise
# MergeError if title_df ever holds the same title more than once, which would
# otherwise silently duplicate rows in the result.
df_merged = pd.merge(df, title_df, on='title', how='left', validate='many_to_one')

In [84]:
# List the merged frame's columns to confirm title_id was added.
df_merged.columns

Index(['title', 'rank', 'publisher', 'weeks_on_list', 'primary_isbn13',
       'list_name', 'bestsellers_date', 'title2', 'title_id'],
      dtype='object')

In [87]:
# Keep only the output columns, in their final order: the numeric title_id
# leads, the original-case title ('title2') is retained, and the lower-cased
# join key 'title' is dropped.
final_columns = [
    'title_id',
    'title2',
    'publisher',
    'rank',
    'weeks_on_list',
    'primary_isbn13',
    'list_name',
    'bestsellers_date',
]
df = df_merged[final_columns]
df

Unnamed: 0,title_id,title2,publisher,rank,weeks_on_list,primary_isbn13,list_name,bestsellers_date
0,1,THE RECKONING,Doubleday,1,9,9780385544153,combined-print-and-e-book-fiction,2018-12-22
1,2,EVERY BREATH,Grand Central,2,10,9781538728529,combined-print-and-e-book-fiction,2018-12-22
2,3,FIRE AND BLOOD,Bantam,3,5,9781524796280,combined-print-and-e-book-fiction,2018-12-22
3,4,WHERE THE CRAWDADS SING,Putnam,4,15,9780735219090,combined-print-and-e-book-fiction,2018-12-22
4,5,TARGET: ALEX CROSS,"Little, Brown",5,5,9780316273947,combined-print-and-e-book-fiction,2018-12-22
...,...,...,...,...,...,...,...,...
13525,2962,THE HAUNTING,Delacorte,6,0,9780593481516,young-adult-paperback-monthly,2023-11-18
13526,2904,A THOUSAND BOY KISSES,Bloom,7,0,9781728297088,young-adult-paperback-monthly,2023-11-18
13527,2522,THE WAY I USED TO BE,Margaret K. McElderry,8,0,9781481449366,young-adult-paperback-monthly,2023-11-18
13528,857,THE BOOK THIEF,Knopf,9,0,9780375842207,young-adult-paperback-monthly,2023-11-18
