# Getting started

In [29]:
import numpy as np

from common import LATEST_DUMP_FILENAME

from allisbns.dataset import CodeDataset, load_bencoded, unpack_data
from allisbns.isbn import (
    LAST_ISBN,
    MaskedISBN,
    get_prefix_bounds,
    normalize_isbn,
    validate_isbn,
)


# Reload imported modules automatically before each cell executes, so edits to
# module sources are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data

In [30]:
# Path of the dump to read; the value is the constant imported from `common`.
input_path = LATEST_DUMP_FILENAME
input_path

'aa_isbn13_codes_20251118T170842Z.benc.zst'

### From input file

In [31]:
# Build the dataset for a single collection straight from the dump file.
md5 = CodeDataset.from_file(input_path, collection="md5")
md5

CodeDataset(array([    6,     1,     9, ...,     1, 91739,     1],
      shape=(14737375,), dtype=int32), bounds=(978000000000, 979999468900))

### From unpacked data

In [32]:
# Read the whole dump once; every collection can then be unpacked from it.
with open(input_path, mode="rb") as dump_file:
    input_data = load_bencoded(dump_file)
input_data.keys()

dict_keys([b'airitibooks', b'bloomsbury', b'cadal_ssno', b'cerlalc', b'chinese_architecture', b'duxiu_ssid', b'edsebk', b'gbooks', b'goodreads', b'hathi', b'huawen_library', b'ia', b'isbndb', b'isbngrp', b'kulturpass', b'libby', b'md5', b'nexusstc', b'nexusstc_download', b'oclc', b'ol', b'ptpress', b'rgb', b'sciencereading', b'shukui', b'sklib', b'trantor', b'wanfang', b'zjjd'])

In [33]:
# Same dataset as the `from_file` route, built from the already-loaded dump.
# Collection keys in the loaded dump are bytes, hence `b"md5"`.
md5 = CodeDataset(unpack_data(input_data[b"md5"]))
md5

CodeDataset(array([    6,     1,     9, ...,     1, 91739,     1],
      shape=(14737375,), dtype=int32), bounds=(978000000000, 979999468900))

## Expand dataset

In [34]:
# Show the (first, last) ISBN range covered before the dataset is expanded.
md5.bounds

(978000000000, 979999468900)

In [35]:
# Passing `fill_to_isbn` extends the dataset up to that ISBN: the codes gain
# one trailing segment and the upper bound moves to 979999999999.
md5 = CodeDataset(unpack_data(input_data[b"md5"]), fill_to_isbn=LAST_ISBN)
md5

CodeDataset(array([     6,      1,      9, ...,  91739,      1, 531099],
      shape=(14737376,)), bounds=(978000000000, 979999999999))

In [36]:
# Drop the raw dump from the namespace; the cells that follow only use `md5`.
%xdel input_data

## Crop dataset

In [37]:
# Restrict the dataset to a single registrant prefix. The prefix is turned
# into a (start, end) pair of 12-digit ISBN integers that `crop` accepts.
start_isbn, end_isbn = get_prefix_bounds("978-2-36590")
cropped_registrant = md5.crop(start_isbn, end_isbn)
cropped_registrant

CodeDataset(array([  1,  17,   1,  15,   1,  52,   1,   2,   1,  24,   1,   1,   1,
       882], dtype=int32), bounds=(978236590000, 978236590999))

## Query and check ISBNs

In [38]:
# Queries take the first 12 digits as an integer, so strip the hyphens and
# drop the check-digit position ("X" is a placeholder for it here).
some_isbn_string = "978-2-36590-117-X"
some_isbn_number = int(some_isbn_string.replace("-", "")[:12])
query_result = md5.query_isbn(some_isbn_number)
query_result

QueryResult(is_streak=True, segment_index=8652142, position_in_segment=0)

In [39]:
# Codes starting at the hit: the streak containing the queried ISBN, the gap
# that follows it, and the next streak.
md5.codes[query_result.segment_index : query_result.segment_index + 3]

array([   1, 1890,    1])

In [40]:
# The code right after the queried streak is the length of the gap that
# follows it; stepping over the whole gap plus one lands on the next filled
# ISBN.
# `int(...)` keeps the sum in Python integers: a 12-digit ISBN does not fit in
# int32, so adding an int32 code (the dtype of the unexpanded and cropped
# datasets) to it raises OverflowError under NumPy 2 promotion rules.
# NOTE: this reassigns `end_isbn`, previously the upper bound of the cropped
# registrant prefix.
end_isbn = some_isbn_number + int(md5.codes[query_result.segment_index + 1]) + 1
isbns_to_check = np.arange(some_isbn_number, end_isbn + 1)
print(isbns_to_check)

md5.check_isbns(isbns_to_check)

[978236590117 978236590118 978236590119 ... 978236592006 978236592007
 978236592008]


array([ True, False, False, ..., False, False,  True], shape=(1892,))

## Get and count filled ISBNs

In [41]:
# Materialize every ISBN present in the cropped range as an array of
# 12-digit integers.
filled_isbns = cropped_registrant.get_filled_isbns()
filled_isbns

array([978236590000, 978236590018, 978236590034, 978236590087,
       978236590090, 978236590115, 978236590117])

In [42]:
# Count only; agrees with the number of ISBNs listed by `get_filled_isbns()`.
cropped_registrant.count_filled_isbns()

7

In [43]:
# Crop to the whole "978-0" prefix; used for the timing comparison that follows.
cropped = md5.crop(*get_prefix_bounds("978-0"))

In [44]:
# Compare counting through the materialized array with the dedicated counter;
# in the recorded run the counter is far faster.
%timeit -n 10 len(cropped.get_filled_isbns())
%timeit -n 10 cropped.count_filled_isbns()

260 ms ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
4.48 ms ± 421 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Unpack dataset

In [45]:
# Expand the segment codes of the checked range into one boolean per ISBN;
# the result matches what `check_isbns` returned for the same range.
unpacked_codes = md5.crop(isbns_to_check[0], isbns_to_check[-1]).unpack_codes()
unpacked_codes

array([ True, False, False, ..., False, False,  True], shape=(1892,))

## Work with ISBNs

### Normalize and complete ISBNs

In [46]:
# The hyphenated input string reused throughout this section.
some_isbn_string

'978-2-36590-117-X'

In [47]:
# Normalizing strips the hyphens and wraps the value in a CanonicalISBN; the
# placeholder check digit is kept as-is.
canonical_isbn = normalize_isbn(some_isbn_string)
canonical_isbn

CanonicalISBN(978236590117X)

In [48]:
# Replace the placeholder check digit with the computed one.
canonical_isbn.complete()

CanonicalISBN(9782365901178)

### Convert to ISBN12

In [49]:
# ISBN12 is the ISBN without its check-digit position, shown as an integer.
isbn12 = canonical_isbn.to_isbn12()
isbn12

978236590117

In [50]:
# Same result as the query made with the hand-built `some_isbn_number`.
md5.query_isbn(isbn12)

QueryResult(is_streak=True, segment_index=8652142, position_in_segment=0)

### Masked ISBNs

In [51]:
# Split the ISBN into its bookland, group, registrant, publication and
# check-digit parts.
masked_isbn = MaskedISBN.from_canonical(canonical_isbn)
masked_isbn

MaskedISBN(bookland='978', group='2', registrant='36590', publication='117', check_digit='X')

In [52]:
# Join the parts back into a hyphenated string.
masked_isbn.hyphenate()

'978-2-36590-117-X'

In [53]:
# Slicing selects parts, not characters: the first three parts give the
# hyphenated registrant prefix.
masked_isbn[:3]

'978-2-36590'

### Check and validate ISBNs

In [54]:
# Keep every part except the check digit and append "0" instead, producing an
# ISBN whose check digit is wrong.
isbn_with_bad_check_digit = normalize_isbn(f"{masked_isbn[:-1]}-0")
isbn_with_bad_check_digit

CanonicalISBN(9782365901170)

In [55]:
# With `return_reasons=True` the result is an (is_valid, reasons) pair.
validate_isbn(isbn_with_bad_check_digit, return_reasons=True)

(False, [<InvalidISBNReason.BAD_CHECK_DIGIT: 'bad_check_digit'>])

In [56]:
# Several failure reasons can be reported for the same ISBN.
validate_isbn(normalize_isbn("979-0-00000000-0"), return_reasons=True)

(False,
 [<InvalidISBNReason.BAD_CHECK_DIGIT: 'bad_check_digit'>,
  <InvalidISBNReason.BAD_GROUP: 'bad_group'>])