In [72]:
!pip install isbnlib

zsh(36658) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
sh(36658) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
bash(36658) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
python(36658) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
Python(36658) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [73]:
# Prepare the data

In [74]:
import isbnlib
import pandas as pd
from skimpy import skim
import requests

In [75]:
isbn_df = pd.read_csv('updated_isbn_list.csv')

In [76]:
isbn_df.count()

ISBN-10    31008837
ISBN-13    31008837
dtype: int64

In [77]:
# check nan
isbn_df.isnull().sum()

ISBN-10    0
ISBN-13    0
dtype: int64

In [78]:
skim(isbn_df)

Randomly select 100 rows from the dataframe

In [79]:
isbn_df = isbn_df.sample(n=100)
isbn_10 = isbn_df['ISBN-10'].tolist()

In [80]:
# turn isbn13 into integer
isbn_df['ISBN-13'] = isbn_df['ISBN-13'].astype(int)
isbn_df['ISBN-10'] = isbn_df['ISBN-10'].astype(str)

In [81]:
isbn_df.head()

Unnamed: 0,ISBN-10,ISBN-13
11300673,1986981126,9781986981125
2143428,367021862,9780367021863
30552735,972263667,9780972263665
16664687,1523205210,9781523205219
8755069,2803432447,9782803432448


In [82]:
isbn_df.reset_index(drop=True, inplace=True)
skim(isbn_df)

## Fetch Metadata Using ISBNLib
Use the `isbnlib.meta()` function to fetch metadata for each ISBN. The function returns a dictionary with metadata information. If the ISBN is not found, the function returns an empty dictionary.

In [83]:
def fetch_metadata(isbn, service="goob"):
    try:
        metadata = isbnlib.meta(isbn, service=service)
        return metadata if metadata else {}
    except Exception as e:
        return {}

In [84]:
def normalize_metadata(isbnlib_df, services):
    for service in services:
        isbnlib_df[f"{service}_metadata"] = isbnlib_df['ISBN-10'].apply(lambda x: fetch_metadata(x, service=service))
        normalized = pd.json_normalize(isbnlib_df[f"{service}_metadata"])
        normalized.columns = [f"{service}_{col}" for col in normalized.columns]
        isbnlib_df = pd.concat([isbnlib_df, normalized], axis=1)
        if f"{service}_metadata" in isbnlib_df.columns:
            isbnlib_df.drop(columns=[f"{service}_metadata"], inplace=True)

    return isbnlib_df

In [85]:
isbnlib_df = isbn_df

In [86]:
services = ["goob", "openl","wiki"]
isbnlib_df = normalize_metadata(isbnlib_df, services)

In [87]:
isbnlib_df.head()

Unnamed: 0,ISBN-10,ISBN-13,goob_ISBN-13,goob_Title,goob_Authors,goob_Publisher,goob_Year,goob_Language,openl_ISBN-13,openl_Title,openl_Authors,openl_Publisher,openl_Year,openl_Language,wiki_ISBN-13,wiki_Title,wiki_Authors,wiki_Publisher,wiki_Year,wiki_Language
0,1986981126,9781986981125,9781986981125.0,No Quest For The Wicked,[Shanna Swendson],Createspace Independent Publishing Platform,2018.0,en,9781986981125,No Quest for the Wicked,[Shanna Swendson],CreateSpace Independent Publishing Platform,2018,,,,,,,
1,367021862,9780367021863,9780367021863.0,"Electromagnetism, Man And The Environment",[Joseph H. Battocletti],Routledge,2019.0,en,9780367021863,Electromagnetism Man and the Environment,[Joseph H. Battocletti],Taylor & Francis Group,2019,,9780367021863.0,Electromagnetism Man and the Environment,[Joseph H Battocletti],Routledge,2019.0,
2,972263667,9780972263665,9780972263665.0,Creating Documents With Business Objects XI - ...,[Robert D. Schmidt],Schmidt Ink Incorporated,2006.0,en,9780972263665,Business Objects XI - Web Intelligence XI Course,[Robert D. Schmidt],"Schmidt Ink, Inc.",2006,,,,,,,
3,1523205210,9781523205219,,,,,,,9781523205219,Searching for Stolen Love,[Kenneth Szulczyk],CreateSpace Independent Publishing Platform,2016,,,,,,,
4,2803432447,9782803432448,,,,,,,9782803432448,365 jeux et activités - pour les tout-petits,[Son Tyberg],Chantecler,1997,,,,,,,


In [88]:
skim(isbnlib_df)

it is a dictionary, so we need to convert it to a dataframe

## Google Books Data

In [89]:
def get_book_data(isbn):
    url = f"https://www.googleapis.com/books/v1/volumes?q=isbn:{isbn}"
    response = requests.get(url)
    data = response.json()

    if "items" in data:
        book = data["items"][0]["volumeInfo"]
        return {
            "ISBN": isbn,
            "Title": book.get("title", "N/A"),
            "Authors": ", ".join(book.get("authors", ["Unknown"])),
            "Published Date": book.get("publishedDate", "N/A"),
            "Description": book.get("description", "No description available"),
            "Page Count": book.get("pageCount", "N/A"),
            "Categories": ", ".join(book.get("categories", ["Unknown"])),
            "Thumbnail": book.get("imageLinks", {}).get("thumbnail", "")
        }
    else:
        return {"ISBN": isbn, "Title": "Not Found"}

In [90]:
books_data = [get_book_data(isbn) for isbn in isbn_10]

books_df = pd.DataFrame(books_data)

In [91]:
books_df.head()

Unnamed: 0,ISBN,Title,Authors,Published Date,Description,Page Count,Categories,Thumbnail
0,1986981126,No Quest for the Wicked,Shanna Swendson,2018-03-29,"A Questing We Will Go Now that the Magic, Spel...",254.0,Unknown,http://books.google.com/books/content?id=FE22t...
1,367021862,"Electromagnetism, Man and the Environment",Joseph H. Battocletti,2019-06-07,Electromagnetic pollution is the permeation of...,94.0,Electromagnetism,http://books.google.com/books/content?id=iOdxx...
2,972263667,Creating Documents with Business Objects XI,Robert D. Schmidt,2006-10-23,This course is designed to teach the basics th...,0.0,Business,http://books.google.com/books/content?id=CT49A...
3,1523205210,Not Found,,,,,,
4,2803432447,Not Found,,,,,,


In [92]:
books_df['Authors'].isnull().sum()

np.int64(51)

In [93]:
skim(books_df)

## Goodreads Data


In [94]:
import os
print(os.getcwd())

os.environ['MallocStackLogging'] = '0'


/Users/wuyuetong/Documents/ASCoR Intership/Invisible book


In [95]:
!mkdir classic_book_metadata

zsh(36908) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
mkdir(36908) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
mkdir: classic_book_metadata: File exists


In [96]:
# Select the first 100 rows of the 'ISBN-13' column and convert to a list
isbn_13 = isbn_df['ISBN-13'].head(100).tolist()
print(isbn_13)

[9781986981125, 9780367021863, 9780972263665, 9781523205219, 9782803432448, 9780906614006, 9781421256252, 9781502764553, 9781593331832, 9780787698522, 9780408718882, 9781515960225, 9781977872210, 9780684133980, 9781096813002, 9783832523718, 9780128188446, 9780343713911, 9781846685729, 9780673604675, 9780552143301, 9780395160312, 9780071615143, 9781848023826, 9789062100743, 9788757420616, 9783319055091, 9788416011667, 9781365601743, 9785439607112, 9781466600652, 9781798563816, 9780850458077, 9781517115678, 9780905180106, 9781580543798, 9781480010802, 9780970466709, 9781614759577, 9788408167389, 9780309567572, 9780865041813, 9781678716523, 9780732282653, 9783889978257, 9781546864929, 9781101132593, 9782879003191, 9781720946724, 9788427118775, 9788467272017, 9780750298452, 9785868840623, 9780132180177, 9789995327217, 9780333651575, 9788472454484, 9786072101609, 9781402511752, 9781465768773, 9780597713248, 9789640471685, 9780747814238, 9783354005143, 9789781259937, 9781639619467, 978172159

In [97]:
with open('cleaned_isbn_list.txt', 'w') as f:
    for isbn in isbn_13:
        f.write(f"{isbn}\n")

In [98]:
with open('cleaned_isbn_list.txt', 'r') as f:
    isbn_list = f.readlines()
print(isbn_list)
# change to integer
isbn_list = [int(isbn) for isbn in isbn_list]
with open('isbn_list.txt', 'w') as f:
    for isbn in isbn_list:
        f.write(f"{isbn}\n")

['9781986981125\n', '9780367021863\n', '9780972263665\n', '9781523205219\n', '9782803432448\n', '9780906614006\n', '9781421256252\n', '9781502764553\n', '9781593331832\n', '9780787698522\n', '9780408718882\n', '9781515960225\n', '9781977872210\n', '9780684133980\n', '9781096813002\n', '9783832523718\n', '9780128188446\n', '9780343713911\n', '9781846685729\n', '9780673604675\n', '9780552143301\n', '9780395160312\n', '9780071615143\n', '9781848023826\n', '9789062100743\n', '9788757420616\n', '9783319055091\n', '9788416011667\n', '9781365601743\n', '9785439607112\n', '9781466600652\n', '9781798563816\n', '9780850458077\n', '9781517115678\n', '9780905180106\n', '9781580543798\n', '9781480010802\n', '9780970466709\n', '9781614759577\n', '9788408167389\n', '9780309567572\n', '9780865041813\n', '9781678716523\n', '9780732282653\n', '9783889978257\n', '9781546864929\n', '9781101132593\n', '9782879003191\n', '9781720946724\n', '9788427118775\n', '9788467272017\n', '9780750298452\n', '9785868840

In [99]:
!python get_books.py --isbn_list_path isbn_list.txt --output_directory_path classic_book_metadata --format csv

zsh(36909) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
python(36909) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
Python(36909) MallocStackLogging: could not tag MSL-related memory as no_footprint, so those pages will be included in process footprint - (null)
ISBNs to scrape: ['9781986981125', '9780367021863', '9780972263665', '9781523205219', '9782803432448', '9780906614006', '9781421256252', '9781502764553', '9781593331832', '9780787698522', '9780408718882', '9781515960225', '9781977872210', '9780684133980', '9781096813002', '9783832523718', '9780128188446', '9780343713911', '9781846685729', '9780673604675', '9780552143301', '9780395160312', '9780071615143', '9781848023826', '9789062100743', '9788757420616', '9783319055091', '9788416011667', '9781365601743', '9785439607112', '9781466600652', '9781798563816', 

In [100]:
goodreads_df = pd.read_csv('classic_book_metadata/all_books.csv')

In [101]:
skim(goodreads_df)

## ISBNDB Data

In [102]:
import os
from dotenv import load_dotenv
import requests
import pandas as pd
from pandas import json_normalize
import requests as req

In [103]:
load_dotenv()
API_KEY = os.getenv('API_KEY')
BASE_URL = "https://api2.isbndb.com/book/"

In [104]:
h = {
    'Authorization': API_KEY
}

isbndb = []


In [105]:
isbn_df.head()

Unnamed: 0,ISBN-10,ISBN-13,goob_metadata
0,1986981126,9781986981125,"{'ISBN-13': '9781986981125', 'Title': 'No Ques..."
1,367021862,9780367021863,"{'ISBN-13': '9780367021863', 'Title': 'Electro..."
2,972263667,9780972263665,"{'ISBN-13': '9780972263665', 'Title': 'Creatin..."
3,1523205210,9781523205219,{}
4,2803432447,9782803432448,{}


In [110]:
isbn_df

Unnamed: 0,ISBN-10,ISBN-13,goob_metadata
0,1986981126,9781986981125,"{'ISBN-13': '9781986981125', 'Title': 'No Ques..."
1,0367021862,9780367021863,"{'ISBN-13': '9780367021863', 'Title': 'Electro..."
2,0972263667,9780972263665,"{'ISBN-13': '9780972263665', 'Title': 'Creatin..."
3,1523205210,9781523205219,{}
4,2803432447,9782803432448,{}
...,...,...,...
95,1016828411,9781016828413,"{'ISBN-13': '9781016828413', 'Title': 'An Intr..."
96,8494417630,9788494417634,{}
97,0078817447,9780078817441,"{'ISBN-13': '9780078817441', 'Title': 'PC Tool..."
98,1981020519,9781981020515,{}


In [112]:
isbn_13 = isbn_df['ISBN-13'].head(100).tolist()

In [120]:
isbn_10= isbn_df['ISBN-10'].head(100).tolist()

In [125]:
for isbn in isbn_13:
    isbn_str = str(isbn)
    resp = req.get("https://api2.isbndb.com/book/" + isbn_str, headers=h)
    if resp.status_code == 200:
        isbndb.append(resp.json())
    elif resp.status_code == 404:
        print(f"ISBN {isbn_str} not found. Error 404: {resp.text}")
    else:
        print(f"Error {resp.status_code}: {resp.text}")

In [127]:
isbn_data = json_normalize(isbndb, sep='_')
isbn_data.drop_duplicates(subset=['book_isbn'], inplace=True)
isbn_data

Unnamed: 0,book_publisher,book_synopsis,book_language,book_image,book_title_long,book_dimensions,book_dimensions_structured_length_unit,book_dimensions_structured_length_value,book_dimensions_structured_width_unit,book_dimensions_structured_width_value,...,book_msrp,book_binding,book_isbn,book_isbn10,book_edition,book_other_isbns,book_related_ePub,book_related_Hardcover,book_related_eBook,book_related_Paperback
0,CreateSpace Independent Publishing Platform,"A Questing We Will GoNow that the Magic, Spell...",en,https://images.isbndb.com/covers/1224920348289...,"No Quest for the Wicked (Enchanted, Inc.)","Height: 9 Inches, Length: 6 Inches, Weight: 0....",inches,6.00000,inches,0.580000,...,0.00,Paperback,1986981126,1986981126,,,,,,
1,Routledge,Electromagnetic pollution is the permeation of...,en,https://images.isbndb.com/covers/1815741348231...,Electromagnetism Man And The Environment,Weight: 0.99869404686 Pounds,,,,,...,0.00,Hardcover,0367021862,0367021862,1,"[{'isbn': '9780429727924', 'binding': 'print'}]",,,,
2,Schmidt Ink Inc,"Book by Schmidt, Robert D.",en,https://images.isbndb.com/covers/4044793482534...,Creating Documents with BusinessObjects XI: We...,"Height: 11 Inches, Length: 8.75 Inches, Weight...",inches,8.75000,inches,0.500000,...,99.99,Paperback,0972263667,0972263667,2nd,,,,,
3,CreateSpace Independent Publishing Platform,I’m a U.S. finance professor and was excited t...,en,https://images.isbndb.com/covers/8645493482730...,Searching for Stolen Love,"Height: 9 Inches, Length: 6 Inches, Weight: 0....",inches,6.00000,inches,0.470000,...,0.00,Paperback,1523205210,1523205210,,,,,,
4,Chantecler,,fr,https://images.isbndb.com/covers/3797543483186...,365 jeux et activites pour les tout-petits,"Height: 7.4409448743 Inches, Length: 4.7637795...",inches,4.76378,inches,0.944882,...,0.00,Paperback,2803432447,2803432447,CHANTECLER,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158,Stockport Historical Society,,en,https://images.isbndb.com/covers/7114103482510...,Portwood: A Collection of Memories,,,,,,...,0.00,Paperback,0905164709,0905164709,,,,,,
166,Legare Street Press,,fr,https://images.isbndb.com/covers/1429009348255...,Correspondance De Philippe Ii Sur Les Affaires...,"Height: 9.21 inches, Length: 6.14 inches, Weig...",inches,6.14000,inches,1.810000,...,0,Hardcover,1021034037,1021034037,,,,,,
171,Legare Street Press,,en,https://images.isbndb.com/covers/1136181348255...,The Poetical Works of Sir William Jones: Colla...,"Height: 9.21 inches, Length: 6.14 inches, Weig...",inches,6.14000,inches,0.810000,...,0,Hardcover,102074121X,102074121X,,,,,,
175,Legare Street Press,,en,https://images.isbndb.com/covers/319633482550.jpg,An Introduction to the Irish Language: In Thre...,"Height: 9.21 inches, Length: 6.14 inches, Weig...",inches,6.14000,inches,0.690000,...,0,Hardcover,1016828411,1016828411,,,,,,


In [129]:
# reset index
isbn_data.reset_index(drop=True, inplace=True)
skim(isbn_data)