In [1]:
import os
# use pdftoppm command to convert pdf to image

def pdf2image(pdf_path, save_path, start, end):
    """
        Convert from pdf to image, start and end are page number
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        # remove
        for file in os.listdir(save_path):
            os.remove(os.path.join(save_path, file))

    filename = os.path.basename(pdf_path).split(".")[0]
    save_image = os.path.join(save_path, filename)
    cmd = "pdftoppm -png -f {} -l {} {} {}".format(start, end, pdf_path, save_image)
    os.system(cmd)


In [None]:
def show_images(image_path):
    """
        Show images in image_path inline
    """
    from PIL import Image
    import matplotlib.pyplot as plt
    from IPython.display import display

    # Load the images
    image_paths = [os.path.join(image_path, path) for path in os.listdir(image_path)]
    images = [Image.open(path) for path in image_paths]

    # Create a row of subplots
    fig, axes = plt.subplots(1, len(images), figsize=(15, 5))

    # Display each image in a subplot
    for i, ax in enumerate(axes):
        ax.imshow(images[i])
        ax.axis('off')

    # Show the row of images
    plt.show()


In [1]:
# show drop down list and choose to get the results
# math: unknown/yes/no
# field: unknown/algebra/calculus/geometry/statistics/trigonometry
# level: unknown/yes/no
# type: unknown/competition/exam/textbook/workbook


class Info:
    def __init__(self) -> None:
        self.math = 'unknown'
        self.field = 'unknown'
        self.level = 'unknown'
        self.type = 'unknown'
    
    def dict(self):
        return {
            'math': self.math,
            'field': self.field,
            'level': self.level,
            'type': self.type,
        }

from ipywidgets import interact, Dropdown, Button, Output, VBox, Image
from IPython.display import clear_output

def get_dropdown(options,desp):
    """
        Get results from the drop down list
    """

    # Define a dropdown widget
    dropdown = Dropdown(
        options=options,
        value='unknown',
        description=desp,
        disabled=False,
    )

    return dropdown


In [2]:
def convert_and_open(base_path, pdf_name, tmp_path):
    """
        Convert pdf to image and open it
    """
    pdf_path = os.path.join(base_path, pdf_name)
    image_path = tmp_path
    pdf2image(pdf_path, image_path, 10, 20)

    # open all images
    return [os.path.join(image_path, path) for path in os.listdir(image_path)]

In [4]:
def main():
    # get the available files from recheck_index.json
    import json
    with open("recheck_index.json", "r", encoding="utf-8") as json_file:
        pdf_files_list = json.load(json_file)
    # get the pdf files path that are not removed and is marked math
    pdf_files_list = [pdf_file["path"] for pdf_file in pdf_files_list if not pdf_file["clean"]["remove"] and pdf_file["mark"]["math"] == "yes"]

    # save to /data/xukp/tmp
    base_path = "/data/xukp"
    save_path = "/data/xukp/tmp"

    infos = []

    # convert pdf to image, show and mark
    # first
    id = 0
    pdf_name = pdf_files_list[id]
    images = convert_and_open(base_path, pdf_name, save_path)
    # use hbox to show
    display(HBox([Image(filename=image) for image in images]))
    math_dropdown = get_dropdown(['unknown','yes','no'],'math')
    field_dropdown = get_dropdown(['unknown','algebra','calculus','geometry','statistics','trigonometry'],'field')
    level_dropdown = get_dropdown(['unknown','yes','no'],'level')
    type_dropdown = get_dropdown(['unknown','competition','exam','textbook','workbook'],'type')

    info = Info()

    output = Output()

    button = Button(
        description='OK',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click me',
        icon='check' # (FontAwesome names without the `fa-` prefix)
    )

    def on_button_clicked(b):
        # Display the message within the output widget.
        with output:
            info.math = math_dropdown.value
            info.field = field_dropdown.value
            info.level = level_dropdown.value
            info.type = type_dropdown.value
            clear_output(wait=True)
            infos.append(info.dict().copy())
            # to next pdf
            id += 1
            if id < len(pdf_files_list):
                pdf_name = pdf_files_list[id]
                images = convert_and_open(base_path, pdf_name, save_path)
                # use hbox to show
                display(HBox([Image(filename=image) for image in images]))
                math_dropdown.value = 'unknown'
                field_dropdown.value = 'unknown'
                level_dropdown.value = 'unknown'
                type_dropdown.value = 'unknown'
            else:
                print("All done!")
                print(infos)
                # save to man_index.json
                with open("man_index.json", "r", encoding="utf-8") as json_file:
                    json.dump(infos, json_file, indent=4, ensure_ascii=False)


    button.on_click(on_button_clicked)

    display(VBox([math_dropdown, field_dropdown, level_dropdown, type_dropdown, button, output]))

main()        

KeyError: 'remove'