# A Tutorial on Finding the Table of Contents in a Book

In [1]:
import PyPDF2

import json
import re
import os

from abridged.pdf import parallel_search_in_pdf
from abridged.utils.pdf import describe_pdf
from abridged.params import DATA_TESTS_PATH
from abridged.book import get_toc

In [2]:
# Worker count passed as `n_workers` to describe_pdf and parallel_search_in_pdf below.
N_WORKERS = 8

In [3]:
# Gather the names of all PDF files found directly inside the tests data directory.
test_books = []
for entry_name in os.listdir(DATA_TESTS_PATH):
    if entry_name.endswith('.pdf'):
        test_books.append(entry_name)

# Examples of Books

## Oxford Handbook of International Security

In [4]:
# Pick the handbook by name instead of by position: os.listdir() returns entries
# in arbitrary order, so test_books[0] is not guaranteed to be this file, and the
# search string and regex used in this section only apply to this book.
book = 'The Oxford Handbook of International Security.pdf'
assert book in test_books, f'{book!r} not found in {DATA_TESTS_PATH}'
book

'The Oxford Handbook of International Security.pdf'

In [5]:
# Resolve the selected book inside the tests data directory and open it with PyPDF2;
# `pdf` is reused later when extracting the table of contents.
book_path = os.path.join(DATA_TESTS_PATH, book)
pdf = PyPDF2.PdfReader(book_path)

# Print the summary produced by describe_pdf (called with n_workers=N_WORKERS).
pdf_description = describe_pdf(pdf_path=book_path, n_workers=N_WORKERS)
print(pdf_description)

100%|██████████| 8/8 [00:12<00:00,  1.53s/it]


This PDF file has 747 pages.
It contains 355112 words and 2470382 characters.
It has 569285 tokens according to the GPT-4o tokenizer.


#### Search ToC pages

In [6]:
# Text to look for in the PDF; the pages where it is found are used as `pages_toc` below.
search = 'Online ISBN: 9780191823329 Print ISBN: 9780198777854'
# Page bounds passed to the search as start_page / end_page.
search_since_page, search_until_page = 19, 700

In [7]:
# Search the book for the marker text between the configured page bounds.
search_kwargs = dict(
    pdf_path=book_path,
    search=search,
    start_page=search_since_page,
    end_page=search_until_page,
    n_workers=N_WORKERS,
)
pages_toc = parallel_search_in_pdf(**search_kwargs)

# Report how many pages matched, then list them.
print(f'Found {len(pages_toc)} pages for Table of Contents in {book}')
print(pages_toc)

100%|██████████| 8/8 [00:11<00:00,  1.48s/it]

Found 47 pages for Table of Contents in The Oxford Handbook of International Security.pdf
[18, 29, 43, 55, 70, 81, 96, 110, 127, 146, 162, 175, 189, 203, 219, 238, 252, 267, 282, 298, 314, 329, 344, 359, 375, 389, 404, 420, 429, 444, 458, 474, 487, 501, 515, 530, 545, 560, 574, 588, 601, 617, 631, 645, 659, 674, 687]





#### Check Pages with Images

**Note:** Page indices in `pdf` are 0-based, whereas `convert_from_path` numbers pages starting from 1, which is why the cell below uses `i+1`.

In [8]:
# NOTE(review): this cell is currently commented out. When enabled it would render
# each page listed in `pages_toc` as an image (shifting the index by one for
# `convert_from_path`) and print the page number. `convert_from_path` is not
# imported in the import cell at the top of the notebook (presumably it comes
# from pdf2image; confirm and add the import before re-enabling).
# images_toc = [
#     (i, convert_from_path(book_path, first_page=i+1, last_page=i+1)[0])
#     for i in pages_toc
# ]
# for image in images_toc:
#     print(f'Page {image[0]}')
#     # display(image[1])

#### Get Titles

In [9]:
# Regex handed to get_toc: a run of digits after "Oxford Handbooks", whitespace,
# then a second captured group.
# NOTE(review): as rendered here the second group's character class `[^]+)` has no
# closing bracket, which Python's `re` rejects as an unterminated character set.
# Presumably a non-printable delimiter character between `^` and `]` was lost when
# the notebook was exported; confirm against the original cell.
pattern = r"Oxford Handbooks(\d+)\s+([^]+)"

# Build the table of contents from the pages located by the search.
toc = get_toc(
    pdf=pdf,
    pages_toc=pages_toc,
    pattern=pattern,
    search_until_page=search_until_page,
)

# Header: entry count, book name and a separator line.
print(f'Table of Contents with {len(toc)} entries')
print(f'Book: {book}')
print('#'*80)

# One line per entry: chapter key, page span and title.
chapter_lines = [
    f'Chapter {chapter} [{info["since"]}p-{info["until"]}p]: {info["title"]}'
    for chapter, info in toc.items()
]
for chapter_line in chapter_lines:
    print(chapter_line)

Table of Contents with 46 entries
Book: The Oxford Handbook of International Security.pdf
################################################################################
Chapter 1 [18p-28p]: The Future of Security Studies
Chapter 2 [29p-42p]: Security and “Security Studies”: Conceptual Evolution and Historical Transformation
Chapter 3 [43p-54p]: Expertise and Practice: The Evolving Relationship between the Study and Practice of Security
Chapter 4 [55p-69p]: Feminist Security and Security Studies
Chapter 5 [70p-80p]: Critical Security Studies
Chapter 6 [81p-95p]: Realisms
Chapter 7 [96p-109p]: Constructivism
Chapter 8 [110p-126p]: Liberal Approaches
Chapter 10 [127p-145p]: Statistics and International Security
Chapter 11 [146p-161p]: Methods in Constructivist Approaches
Chapter 12 [162p-174p]: Methods in Critical Security Studies
Chapter 13 [175p-188p]: Game Theory and the Future of International Security
Chapter 14 [189p-202p]: Biology, Evolution, and International Security
Chapter 15

#### Save ToC

In [10]:
# Name the output after the book's file stem. os.path.splitext removes only the
# final extension, so a title that itself contains a period stays intact
# (book.split(".")[0] would cut it at the first period).
book_stem = os.path.splitext(book)[0]
filename = f'TOC-{book_stem}.json'
filepath = os.path.join(DATA_TESTS_PATH, filename)

# Write the ToC as indented UTF-8 JSON; ensure_ascii=False keeps non-ASCII
# characters in titles (e.g. curly quotes) unescaped in the file.
with open(filepath, 'w', encoding='utf-8') as file:
    json.dump(toc, file, indent=4, ensure_ascii=False)