Scans entire directory for .json files
- Expects to find .json file next to .pdf of the same name
- Expects .json file with the following structure:
```
{
    "book": <title>,
    "number_of_chapters": <number of chapters in book: string>,
    "number_of_appendices": <number of appendices in book: string>,
    "number_of_extras": <number of extras in book: string>,
    "chapters": {
        "1": {
            "title": <chapter-title: string>,
            "start": <page where chapter starts (per pdf, not print): string>,
            "end": <page where chapter ends (per pdf, not print): string>
        },
        .
        .
        .
    },
    "appendices": {
        "1": {
            "title": <appendix-title: string>,
            "start": <page where appendix starts (per pdf, not print): string>,
            "end": <page where appendix ends (per pdf, not print): string>
        },
        .
        .
        .
    },
    "extras": {
        "1": {
            "title": <misc./extra-title: string>,
            "start": <page where extra content starts (per pdf, not print): string>,
            "end": <page where extra content ends (per pdf, not print): string>
        },
        .
        .
        .
    }
}
```
- N.B. avoid spaces in titles, whether from pdf to be read or pdf chapter to be written

In [1]:
import pikepdf
import json
import shutil

from pathlib import Path
from collections import namedtuple

In [2]:
directory = '.'
directory_path = Path(directory)

In [3]:
p = directory_path.glob('**/*.json')
files = [x for x in p if x.is_file()]

In [4]:
def strip_extension(title: str) -> str:
    if '.pdf' in title:
        return title[:title.find('.pdf')]
    elif '.json' in title:
        return title[:title.find('.json')]
    else:
        return title

In [5]:
def get_folder_and_file(pathlib_file):
    return pathlib_file.parents[0], strip_extension(pathlib_file.name)

In [6]:
data_dictionaries = []
FileObj = namedtuple("FileObj", "folder name data")
for file in files:
    parent_dir, file_name = get_folder_and_file(file)
    with open(file, 'r') as json_file:
        data_dictionaries.append(FileObj(parent_dir, file_name, json.load(json_file)))

In [7]:
def pdf_slice_to_disk(content_type, pdf, destination_folder, instruction_dict):
    num_entries = instruction_dict[f'number_of_{content_type}']
    if num_entries > 0:
        for i in range(1, num_entries + 1):
            print(instruction_dict[content_type][str(i)])
            dst = pikepdf.Pdf.new()
            for j in range(int(instruction_dict[content_type][str(i)]['start']) - 1, int(instruction_dict[content_type][str(i)]['end'])):
                dst.pages.append(pdf.pages[j])
            if content_type == 'chapters':
                dst.save(destination_folder / f'{i}.{instruction_dict[content_type][str(i)]["title"]}.pdf')
            else:
                if content_type == 'appendices':
                    prefix = 'A'
                else:
                    prefix = 'E'
                dst.save(destination_folder / f'{prefix}{i}.{instruction_dict[content_type][str(i)]["title"]}.pdf')

In [8]:
content_type_list = ['chapters', 'appendices', 'extras']

In [9]:
def process_book(current_file_tuple):
    destination_folder = current_file_tuple.folder / current_file_tuple.name
    parent_folder = current_file_tuple.folder.parents[0] / current_file_tuple.name
    if (not destination_folder.exists()) and (not parent_folder.exists()):
        destination_folder.mkdir(parents=True, exist_ok=True)
        current_book = current_file_tuple.folder / current_file_tuple.data['book']
        instruction_dict = current_file_tuple.data
        with pikepdf.open(current_book) as pdf:
            num_pages = len(pdf.pages)
            for cont_typ in content_type_list:
                pdf_slice_to_disk(cont_typ, pdf, destination_folder, instruction_dict)
        # move json
        json_path = current_file_tuple.folder / f'{current_file_tuple.name}.json'
        shutil.move(json_path, destination_folder / f'{current_file_tuple.name}.json' )
        # move pdf
        shutil.move(current_book, destination_folder / f'{current_file_tuple.name}.pdf' )

In [10]:
for data_tuple in data_dictionaries:
    process_book(data_tuple)

FileNotFoundError: [Errno 2] No such file or directory: 'Discrete-Math/Discrete_mathematics_and_its_applications/.ipynb_checkpoints/Discrete_mathematics_and_its_applications.pdf'