### Guide to Add Book Summaries

In [None]:
#@markdown ### Input form for ISBN and processing flags.
#@markdown Fill in the ISBN of the PDF and select options for processing.

# ISBN-10 of the book; the save cell uses it to build the name of the JSON file.
isbn_ten = "9354990517" #@param {type:"string"}
# Passed to process_pdf as its `exclude_fluff` argument.
exclude_fluff = True #@param {type:"boolean"}
# Passed to process_pdf as its `remove_empty_chapters` argument.
# NOTE(review): the definitions cell also defines a function named
# `remove_empty_chapters`; running that cell rebinds this global name, so the
# checkbox value set here is overwritten before process_pdf is called.
remove_empty_chapters = True #@param {type:"boolean"}

#@markdown ---

In [None]:
# The code below processes the uploaded PDF according to the provided ISBN and flags.
# It extracts the chapters and generates a JSON file with the chapter contents.
import re
import os
import copy
import json
import pprint
import logging
from typing import Dict, Union
from pypdf import PdfReader
from google.colab import files


# Configure the root logger for the notebook: INFO and above, each line
# prefixed with its level name in square brackets.
# NOTE(review): per the stdlib docs, basicConfig does nothing when the root
# logger already has handlers - confirm it takes effect in this runtime.
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')


def bookmark_dict(
    bookmark_list, reader: PdfReader, use_labels: bool = False,
) -> Dict[Union[str, int], str]:
    """
    Flatten a (possibly nested) PDF outline into a single dictionary.

    Args:
        bookmark_list: ``reader.outline`` or one of its nested sub-lists.
        reader: The reader used to resolve each bookmark to a page.
        use_labels: If True, key entries of *this* level by page label;
            if False, key them by page index.

    Returns:
        A dictionary mapping page labels (or page indices) to bookmark titles.
        Later bookmarks overwrite earlier ones that resolve to the same key.

    Note:
        The recursive call does not forward ``use_labels``, so nested
        bookmarks are always keyed by page index even when the top level is
        keyed by label. ``propagate_name_to_part`` in this file checks for
        ``type_of_name == "int"``, so this difference is kept as is.
    """
    flat = {}
    for entry in bookmark_list:
        if isinstance(entry, list):
            # Nested outline level: merge its entries (keyed by page index).
            flat.update(bookmark_dict(entry, reader))
            continue

        page_index = reader.get_destination_page_number(entry)
        # The label is looked up even when it ends up unused.
        page_label = reader.page_labels[page_index]
        flat[page_label if use_labels else page_index] = entry.title
    return flat


def array_to_json_file(array, file_name):
    """
    Write ``array`` to ``file_name`` as UTF-8 JSON, indented by four spaces.

    Non-ASCII characters are written as-is rather than escaped. Any exception
    raised while opening or writing the file is logged at ERROR level and
    not re-raised, so the function always returns None.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    try:
        with open(file_name, mode='w', encoding='utf-8') as out:
            json.dump(array, out, **dump_options)
    except Exception as e:
        logging.error(f"Error saving JSON: {e}")


def construct_page_splits_array(reader, bms):
    """
    Build the list of split points: every bookmark key, in the order the
    mapping yields them, followed by the total number of pages in the reader.
    """
    page_count = len(reader.pages)
    return [*bms.keys(), page_count]


def construct_start_and_end_arrays(split_at_pages):
    """
    Turn a list of split points into consecutive ``(start, end)`` pairs.

    The first pair starts at the literal ``1``; every later pair starts where
    the previous one ended. Each pair is printed as it is produced.
    """
    splits = []
    previous = 1  # start value used for the very first pair
    for end in split_at_pages:
        start = previous
        print(f"Start: {start}, End: {end}")
        splits.append((start, end))
        previous = end
    return splits


def get_chapter(split, reader, bms):
    """
    Build the chapter record for one ``(start, end)`` split.

    The title is looked up in ``bms`` under ``start`` (empty string when
    absent). Text is extracted from ``reader.pages`` for every index in
    ``range(int(start), int(end))`` and concatenated without a separator.

    Returns:
        dict with keys 'name', 'contents' and 'type_of_name'.
    """
    start, end = split
    t = type(start)
    name = bms.get(start, '')
    print(f'Search for {start} as type {t} and found {name}')

    page_texts = [
        reader.pages[page_nb].extract_text()
        for page_nb in range(int(start), int(end))
    ]
    return {
        'name': name,
        'contents': ''.join(page_texts),
        # Stored as the type's name (a string), not the type object itself.
        'type_of_name': t.__name__,
    }


def count_words(text):
    """
    Return how many words ``text`` contains.

    A word is a maximal run of ``\\w`` characters delimited by word
    boundaries, so punctuation splits words ("don't" counts as two).

    Args:
        text (str): The text to scan.

    Returns:
        int: The number of matches found.
    """
    return sum(1 for _ in re.finditer(r'\b\w+\b', text))


def extract_pdf_chapters(pdf_file_path):
    """
    Read a PDF and return one chapter record per bookmark-delimited section.

    Bookmarks are flattened with ``bookmark_dict`` (top level keyed by page
    label), turned into ``(start, end)`` splits, and each split except the
    first is passed to ``get_chapter``. Every record additionally receives a
    zero-based 'sequence_index' and an empty 'part'.

    NOTE(review): the keys produced with ``use_labels=True`` are page labels,
    yet ``get_chapter`` converts them with ``int()`` and uses them as indices
    into ``reader.pages`` - confirm labels and indices coincide for the PDFs
    being processed.
    """
    reader = PdfReader(pdf_file_path)
    bms = bookmark_dict(reader.outline, reader, use_labels=True)
    print(bms.keys())
    print(bms.values())

    # Listing of the bookmarks, ordered by the right-aligned text of the key.
    ordered_bookmarks = sorted(bms.items(), key=lambda n: f"{str(n[0]):>5}")
    for page_nb, title in ordered_bookmarks:
        print(f"{page_nb:>3}: {title}")

    splits = construct_start_and_end_arrays(construct_page_splits_array(reader, bms))

    chapters = []
    # The first split (literal 1 up to the first bookmark) is skipped.
    for index, split in enumerate(splits[1:]):
        chapter = get_chapter(split, reader, bms)
        chapter.update(sequence_index=index, part='')
        chapters.append(chapter)
    return chapters


def exclude_fluff_from_request_bodies(json_data, min_words=1000):
    """
    Drop front/back-matter and very short entries from a list of chapters.

    Each entry is expected to be a dict shaped like::

        {
            "isbn_ten": isbn_ten,
            "name": chapter_name,
            "sequence_index": index,
            "contents": contents,
            "part": chapter_part,
        }

    Only "name" and "contents" are read here. For every entry, in order:

    1. a name containing "chapter" (case-insensitive) is always kept;
    2. a name containing any exclusion keyword as a substring is dropped;
    3. otherwise the entry is dropped when its contents have fewer than
       ``min_words`` words.

    An entry that cannot be evaluated (for example because a key is missing)
    is reported and kept; one bad entry never aborts the whole filter.

    Args:
        json_data: List of chapter dicts. It is not modified.
        min_words: Minimum word count for an entry that is neither an
            explicit chapter nor keyword fluff. Defaults to 1000.

    Returns:
        A deep copy of the entries that were kept, in their original order.
    """
    # NOTE(review): matching is by substring, so e.g. "cover" also matches
    # "Discovery" and "author" matches "Authority". Left unchanged because
    # it also catches titles such as "Endnotes"; confirm whether word-level
    # matching is wanted.
    exclude_keywords = (
        "acknowledgement",
        "acknowledgment",
        "reference",
        "appendix",
        "bibliography",
        "glossary",
        "copyright",
        "author's note",
        "note on",
        "publisher's note",
        "about the author",
        "list of collaborators",
        "notes",
        "praise for",
        "praise",
        "thanks",
        "cover",
        "index",
        "resources",
        "sources",
        "table of contents",
        "title page",
        "penguin books",
        "further readings",
        "illustration credits",
        "photo insert",
        "about the publisher",
        "author",
    )
    # A set keeps the membership test below O(1) per entry.
    excluded_indices = set()

    for index, request_body in enumerate(json_data):
        try:
            chapter_contents = request_body["contents"]
            chapter_name = request_body["name"].lower()

            # Anything explicitly labelled as a chapter is never filtered.
            if "chapter" in chapter_name:
                print(f"CHAPTER DOES NOT NEED TO BE FILTERED: {request_body['name']}")
                continue

            # Exclude chapters with exclusionary keywords
            if any(keyword in chapter_name for keyword in exclude_keywords):
                excluded_indices.add(index)
                print(f"\tEXCLUDED CHAPTER - KEYWORDS FLUFF: {request_body['name']}")
                continue

            # Exclude chapters that are not big enough (count computed once).
            word_count = count_words(chapter_contents)
            if word_count < min_words:
                excluded_indices.add(index)
                print(
                    f"\tEXCLUDED CHAPTER - TOO SMALL: {request_body['name']} ({word_count} words)")
            else:
                print(f"CHAPTER DOES NOT NEED TO BE FILTERED: {request_body['name']}")

        except Exception as e:
            # The entry may be the very thing that is malformed, so the
            # handler must not index into it unconditionally.
            if isinstance(request_body, dict):
                label = request_body.get("name", f"<entry {index}>")
            else:
                label = f"<entry {index}>"
            print(f"Error during chapter fluff filtering {label}: {str(e)}")

    included_request_bodies = [
        request_body for index, request_body in enumerate(json_data)
        if index not in excluded_indices
    ]

    print("\nCHAPTERS INCLUDED:")
    for request_body in included_request_bodies:
        if isinstance(request_body, dict) and "name" in request_body:
            print(request_body["name"])
        else:
            print("Error: 'name' key is missing in some request bodies.")

    # Deep copy so callers can mutate the result without touching json_data.
    return copy.deepcopy(included_request_bodies)


def analyze_raw_extraction(data):
    """
    Summarise a list of chapter records before and after fluff filtering.

    Prints the word count of every chapter in ``data``, runs
    ``exclude_fluff_from_request_bodies`` on it, and returns a dict with the
    chapter and word totals for both the full and the filtered list, plus a
    newline-terminated "name -- N words" line per filtered chapter.
    """
    chapter_count = len(data)

    total_word_count = 0
    for chapter in data:
        words = count_words(chapter['contents'])
        print(f"{chapter['name']} has {words} words")
        total_word_count += words

    filtered_data = exclude_fluff_from_request_bodies(data)
    filtered_chapter_count = len(filtered_data)

    filtered_total_word_count = 0
    summary_lines = []
    for chapter in filtered_data:
        words = count_words(chapter['contents'])
        filtered_total_word_count += words
        summary_lines.append(f"{chapter['name']} -- {words} words\n")

    return {
        'chapter_count': chapter_count,
        'total_word_count': total_word_count,
        "number_of_excluded_chapters": chapter_count - filtered_chapter_count,
        "filtered_chapter_count": filtered_chapter_count,
        "filtered_total_word_count": filtered_total_word_count,
        "filtered_chapters_with_count": ''.join(summary_lines)
    }


def propagate_name_to_part(arr):
    """
    Copy the name of each empty-contents entry into the 'part' of the entries
    that follow it.

    An entry whose "contents" is the empty string becomes the current part
    (replacing any previous one). Every later non-empty entry whose
    "type_of_name" equals "int" gets that name written to its "part" key.
    The list is modified in place and also returned.
    """
    current_part = None
    for chapter in arr:
        if chapter["contents"] == "":
            # Start, or restart, propagation from this entry.
            current_part = chapter["name"]
            continue
        if current_part is not None and chapter["type_of_name"] == "int":
            chapter["part"] = current_part
    return arr


def remove_empty_chapters(arr):
    """Return a new list without the entries whose "contents" is the empty string."""
    kept = []
    for chapter in arr:
        if chapter["contents"] != "":
            kept.append(chapter)
    return kept


def re_sequence_chapters(arr):
    """Set each entry's "sequence_index" to its position in ``arr`` (in place) and return ``arr``."""
    for position, chapter in enumerate(arr):
        chapter["sequence_index"] = position
    return arr


def inject_isbn_to_chapters(arr, isbn_ten):
    """Store ``isbn_ten`` under the "isbn" key of every entry (in place) and return ``arr``."""
    for chapter in arr:
        chapter["isbn"] = isbn_ten
    return arr


def remove_type_of_name_helper(arr):
    """Delete the "type_of_name" key from every entry that has it (in place) and return ``arr``."""
    for chapter in arr:
        chapter.pop("type_of_name", None)
    return arr


def process_pdf(pdf_file_name, exclude_fluff=True, remove_empty_chapters=True, isbn_ten=None):
    """
    Extract the chapters of a bookmarked PDF and normalise them for export.

    Steps, in order: extract chapters from the bookmarks, optionally filter
    fluff, propagate part names, optionally drop empty chapters, renumber
    "sequence_index", add the "isbn" key and remove the "type_of_name"
    helper key. A summary of the raw (unfiltered) extraction is
    pretty-printed before returning.

    Args:
        pdf_file_name: Path of the PDF to read.
        exclude_fluff: When truthy, filter the chapters with
            ``exclude_fluff_from_request_bodies``.
        remove_empty_chapters: When truthy, drop chapters whose "contents"
            is the empty string. Only its truthiness is used: this parameter
            shadows the module-level function of the same name, so the
            filtering is done inline instead of calling through the name.
        isbn_ten: Value stored under each chapter's "isbn" key. When None
            (the default) it is derived from the file name: directory and
            extension are removed, as is a trailing " (N)" suffix such as
            the one added when the same file is uploaded more than once.

    Returns:
        The list of processed chapter dicts.

    Raises:
        Exception: any error raised while processing is logged and re-raised.
    """
    try:
        if isbn_ten is None:
            file_stem = os.path.splitext(os.path.basename(pdf_file_name))[0]
            # "9354990517 (3)" -> "9354990517"
            isbn_ten = re.sub(r"\s*\(\d+\)$", "", file_stem)
        logging.info(f"Processing PDF: {pdf_file_name}")

        raw_chapters = extract_pdf_chapters(pdf_file_name)

        chapters = raw_chapters
        if exclude_fluff:
            chapters = exclude_fluff_from_request_bodies(chapters)
        # NOTE(review): entries with empty contents that were already removed
        # by the fluff filter can no longer act as part headings here -
        # confirm this ordering is intended.
        chapters = propagate_name_to_part(chapters)

        if remove_empty_chapters:
            # Inline on purpose: inside this function the name
            # `remove_empty_chapters` is the flag, not the helper function.
            chapters = [chapter for chapter in chapters if chapter["contents"] != ""]

        chapters = re_sequence_chapters(chapters)
        chapters = inject_isbn_to_chapters(chapters, isbn_ten)
        chapters = remove_type_of_name_helper(chapters)

        # Statistics are computed on the raw extraction so the "excluded"
        # figures describe what the fluff filter actually removes.
        results = analyze_raw_extraction(raw_chapters)
        pprint.pprint(results)

        return chapters
    except Exception as e:
        logging.error(f"Error processing {pdf_file_name}: {e}")
        raise



In [None]:
# Upload your PDF file here. Click the "Choose Files" button and select the PDF from your device.

print("Please upload your PDF file.")
uploaded = files.upload()

# An empty result means no file was selected; fail with a clear message
# instead of an IndexError on the lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_file_name = next(iter(uploaded.keys()))

Please upload your PDF file.


Saving 9354990517.pdf to 9354990517 (3).pdf


In [None]:
# PDF Processing and Chapters Data Generation
# The flags come from the form cell. If the definitions cell ran after the
# form cell, the global `remove_empty_chapters` refers to the function of that
# name (a truthy value) rather than to the checkbox value.
try:
    chapters_data = process_pdf(pdf_file_name, exclude_fluff, remove_empty_chapters)
except Exception as e:
    # process_pdf has already logged the error before re-raising it; here it
    # is logged again and swallowed, so `chapters_data` is left unassigned
    # (or keeps the value from a previous run of this cell).
    logging.error(f"Error processing PDF: {e}")


dict_keys(['3', '5', '9', '12', '15', '20', '22', '25', '29', '33', '39', '41', '42', '46', '51', '55', '56', '59', '60', '63', '65', '71', '72', '73', '76', '80', '87'])
dict_values(['PREMIER CHAPITRE', 'CHAPITRE II', 'CHAPITRE III', 'CHAPITRE IV', 'CHAPITRE V', 'CHAPITRE VI', 'CHAPITRE VII', 'CHAPITRE VIII', 'CHAPITRE IX', 'CHAPITRE X', 'CHAPITRE XI', 'CHAPITRE XII', 'CHAPITRE XIII', 'CHAPITRE XIV', 'CHAPITRE XV', 'CHAPITRE XVI', 'CHAPITRE XVII', 'CHAPITRE XVIII', 'CHAPITRE XIX', 'CHAPITRE XX', 'CHAPITRE XXI', 'CHAPITRE XXII', 'CHAPITRE XXIII', 'CHAPITRE XXIV', 'CHAPITRE XXV', 'CHAPITRE XXVI', 'CHAPITRE XXVII'])
  3: PREMIER CHAPITRE
  5: CHAPITRE II
  9: CHAPITRE III
 12: CHAPITRE IV
 15: CHAPITRE V
 20: CHAPITRE VI
 22: CHAPITRE VII
 25: CHAPITRE VIII
 29: CHAPITRE IX
 33: CHAPITRE X
 39: CHAPITRE XI
 41: CHAPITRE XII
 42: CHAPITRE XIII
 46: CHAPITRE XIV
 51: CHAPITRE XV
 55: CHAPITRE XVI
 56: CHAPITRE XVII
 59: CHAPITRE XVIII
 60: CHAPITRE XIX
 63: CHAPITRE XX
 65: CHAPITRE XXI
 7

In [None]:
# Save and Download JSON
# Click the download link that will appear after running this cell.
try:
    json_filename = f"{isbn_ten}_chapters.json"
    # Written as UTF-8 with ensure_ascii=False, the same settings as
    # array_to_json_file, so accented characters in chapter text stay
    # readable instead of being turned into \uXXXX escapes.
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(chapters_data, json_file, ensure_ascii=False, indent=4)

    files.download(json_filename)  # Make sure 'files' is imported from google.colab
except Exception as e:
    logging.error(f"Error saving or downloading JSON: {e}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>