# Getting started

In [1]:
import numpy as np

from common import CURRENT_DUMP_FILENAME, get_data_directory

from allisbns.dataset import CodeDataset, load_bencoded, unpack_data
from allisbns.isbn import (
    LAST_ISBN,
    MaskedISBN,
    get_prefix_bounds,
    normalize_isbn,
    validate_isbn,
)


%load_ext autoreload
%autoreload 2

## Load data

In [2]:
# Locate the current dump inside the project's data directory.
input_path = get_data_directory() / CURRENT_DUMP_FILENAME
# Display only the file name rather than the full path.
input_path.name

'aa_isbn13_codes_20251222T170326Z.benc.zst'

### From input file

In [3]:
# Read the "md5" collection straight from the dump file into a CodeDataset.
md5 = CodeDataset.from_file(input_path, collection="md5")
md5

CodeDataset(array([    6,     1,     9, ...,     1, 91739,     1],
      shape=(14754227,), dtype=int32), bounds=(978000000000, 979999468900))

### From unpacked data

In [4]:
# Decode the whole dump once so that individual collections can be picked
# from it by key afterwards.
with input_path.open("rb") as dump_file:
    input_data = load_bencoded(dump_file)
input_data.keys()

dict_keys([b'airitibooks', b'bloomsbury', b'cadal_ssno', b'cerlalc', b'chinese_architecture', b'duxiu_ssid', b'edsebk', b'gbooks', b'goodreads', b'hathi', b'huawen_library', b'ia', b'isbndb', b'isbngrp', b'kulturpass', b'libby', b'md5', b'motw', b'nexusstc', b'nexusstc_download', b'oclc', b'ol', b'ptpress', b'rgb', b'sciencereading', b'shukui', b'sklib', b'trantor', b'wanfang', b'zjjd'])

In [5]:
# Build the same dataset from the already-decoded dump. The dump's keys are
# bytes, so the collection is selected with b"md5".
md5 = CodeDataset(unpack_data(input_data[b"md5"]))
md5

CodeDataset(array([    6,     1,     9, ...,     1, 91739,     1],
      shape=(14754227,), dtype=int32), bounds=(978000000000, 979999468900))

## Expand dataset

In [6]:
# Bounds of the dataset as loaded; compare with the filled version below.
md5.bounds

ISBNBounds(start=978000000000, end=979999468900)

In [7]:
# Extend the dataset up to LAST_ISBN: the displayed result gains one extra
# trailing segment and its upper bound becomes 979999999999.
md5 = CodeDataset(unpack_data(input_data[b"md5"]), fill_to_isbn=LAST_ISBN)
md5

CodeDataset(array([     6,      1,      9, ...,  91739,      1, 531099],
      shape=(14754228,)), bounds=(978000000000, 979999999999))

## Reframe dataset

In [8]:
# Resolve the first and last ISBN covered by the registrant prefix, then crop
# the dataset to exactly that range.
start_isbn, end_isbn = get_prefix_bounds("978-2-36590")
cropped_registrant = md5.reframe(start_isbn, end_isbn)
cropped_registrant

CodeDataset(array([  1,  17,   1,  15,   1,  52,   1,   2,   1,  24,   1,   1,   1,
       882], dtype=int32), bounds=(978236590000, 978236590999))

In [9]:
# Reframing a freshly unpacked dataset up to LAST_ISBN displays the same codes
# and bounds as passing fill_to_isbn above. None presumably keeps the current
# start bound (TODO confirm against reframe's documentation).
CodeDataset(unpack_data(input_data[b"md5"])).reframe(None, LAST_ISBN)

CodeDataset(array([     6,      1,      9, ...,  91739,      1, 531099],
      shape=(14754228,), dtype=int32), bounds=(978000000000, 979999999999))

In [10]:
# The decoded dump is not used by any cell below; drop it from the namespace.
%xdel input_data

## Query and check ISBNs

In [11]:
# Strip the hyphens and keep the first 12 digits (dropping the check
# character); that integer is what the dataset is queried with.
some_isbn_string = "978-2-36590-117-X"
isbn_digits = some_isbn_string.replace("-", "")
some_isbn_number = int(isbn_digits[:12])
query_result = md5.query_isbn(some_isbn_number)
query_result

QueryResult(is_streak=True, segment_index=8655360, position_in_segment=0)

In [12]:
# Peek at the segment that was hit and the two segments that follow it.
md5.codes[query_result.segment_index : query_result.segment_index + 3]

array([   1, 1890,    1])

In [13]:
# The queried ISBN sits in a one-element segment (see the codes displayed
# above), so adding the length of the following segment plus one lands on the
# first ISBN of the segment after it.
# int(...) turns the NumPy scalar into a Python int first: adding a 12-digit
# Python int to an int32 scalar raises OverflowError on NumPy >= 2, which
# would break this cell for datasets whose codes are stored as int32.
next_segment_length = int(md5.codes[query_result.segment_index + 1])
end_isbn = some_isbn_number + next_segment_length + 1
# np.arange excludes its stop value, hence the + 1 to include end_isbn.
isbns_to_check = np.arange(some_isbn_number, end_isbn + 1)
print(isbns_to_check)

md5.check_isbns(isbns_to_check)

[978236590117 978236590118 978236590119 ... 978236592006 978236592007
 978236592008]


array([ True, False, False, ..., False, False,  True], shape=(1892,))

## Get and count filled ISBNs

In [14]:
# Materialize the filled ISBNs of the cropped registrant range as an array.
filled_isbns = cropped_registrant.get_filled_isbns()
filled_isbns

array([978236590000, 978236590018, 978236590034, 978236590087,
       978236590090, 978236590115, 978236590117])

In [15]:
# Count of filled ISBNs; agrees with the length of the array above.
cropped_registrant.count_filled_isbns()

7

In [16]:
# Crop to the whole 978-0 prefix for the timing comparison below.
cropped = md5.reframe(*get_prefix_bounds("978-0"))

In [17]:
# Compare materializing every filled ISBN and taking len() against the
# dedicated counter (10 loops per run for each).
%timeit -n 10 len(cropped.get_filled_isbns())
%timeit -n 10 cropped.count_filled_isbns()

232 ms ± 652 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


3.79 ms ± 31.6 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Unpack dataset

In [18]:
# Reframe to the range checked earlier and expand the codes into one boolean
# per ISBN; the displayed result matches the check_isbns output above.
md5.reframe(isbns_to_check[0], isbns_to_check[-1]).unpack_codes()

array([ True, False, False, ..., False, False,  True], shape=(1892,))

## Work with ISBNs

### Normalize and complete ISBNs

In [19]:
# Start again from the hyphenated string defined in the query section.
some_isbn_string

'978-2-36590-117-X'

In [20]:
# Parse the string into a CanonicalISBN: the hyphens are dropped and the
# check character is kept as given.
canonical_isbn = normalize_isbn(some_isbn_string)
canonical_isbn

CanonicalISBN(978236590117X)

In [21]:
# complete() shows the ISBN with the trailing X replaced by the valid check
# digit (8 for this ISBN).
canonical_isbn.complete()

CanonicalISBN(9782365901178)

### Convert to ISBN12

In [22]:
# Drop the check character to obtain the 12-digit integer form.
isbn12 = canonical_isbn.to_isbn12()
isbn12

978236590117

In [23]:
# Gives the same query result as the manually parsed number used earlier.
md5.query_isbn(isbn12)

QueryResult(is_streak=True, segment_index=8655360, position_in_segment=0)

### Masked ISBNs

In [24]:
# Split the canonical ISBN into its bookland, group, registrant, publication
# and check-digit parts.
masked_isbn = MaskedISBN.from_canonical(canonical_isbn)
masked_isbn

MaskedISBN(bookland='978', group='2', registrant='36590', publication='117', check_digit='X')

In [25]:
# Join the parts back together with hyphens.
masked_isbn.hyphenate()

'978-2-36590-117-X'

In [26]:
# Slicing works on parts, not characters: the first three parts give the
# hyphenated registrant prefix.
masked_isbn[:3]

'978-2-36590'

### Check and validate ISBNs

In [27]:
# Keep every part except the check digit and append a deliberately wrong one
# (0 instead of the valid 8).
isbn_with_bad_check_digit = normalize_isbn(f"{masked_isbn[:-1]}-0")
isbn_with_bad_check_digit

CanonicalISBN(9782365901170)

In [28]:
# return_reasons=True returns the failure reasons alongside the boolean verdict.
validate_isbn(isbn_with_bad_check_digit, return_reasons=True)

(False, [<InvalidISBNReason.BAD_CHECK_DIGIT: 'bad_check_digit'>])

In [29]:
# Several reasons can be reported at once: here both a bad check digit and a
# bad group.
validate_isbn(normalize_isbn("979-0-00000000-0"), return_reasons=True)

(False,
 [<InvalidISBNReason.BAD_CHECK_DIGIT: 'bad_check_digit'>,
  <InvalidISBNReason.BAD_GROUP: 'bad_group'>])