In [1]:
import os
# use pdftoppm command to convert pdf to image

def pdf2image(pdf_path, save_path, start, end):
    """
        Convert from pdf to image, start and end are page number
    """
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    else:
        # remove
        for file in os.listdir(save_path):
            os.remove(os.path.join(save_path, file))

    filename = os.path.basename(pdf_path).split(".")[0]
    save_image = os.path.join(save_path, filename)
    cmd = "pdftoppm -png -f {} -l {} \"{}\" \"{}\" > /dev/null 2>&1".format(start, end, pdf_path, save_image)
    os.system(cmd)


In [2]:
def show_images(image_path):
    """
        Show images in image_path inline
    """
    from PIL import Image
    import matplotlib.pyplot as plt
    from IPython.display import display

    # Load the images
    image_paths = [os.path.join(image_path, path) for path in os.listdir(image_path)]
    images = [Image.open(path) for path in image_paths]

    # Create a row of subplots
    fig, axes = plt.subplots(1, len(images), figsize=(15, 5))

    # Display each image in a subplot
    for i, ax in enumerate(axes):
        ax.imshow(images[i])
        ax.axis('off')

    # Show the row of images
    plt.show()


In [10]:
# show drop down list and choose to get the results
# math: unknown/yes/no
# field: unknown/algebra/calculus/geometry/statistics/trigonometry
# level: unknown/yes/no
# type: unknown/competition/exam/textbook/workbook


class Info:
    def __init__(self) -> None:
        self.math = 'unknown'
        self.field = 'unknown'
        self.level = 'unknown'
        self.type = 'unknown'
        self.clear = 'unknown'
    
    def dict(self):
        return {
            'math': self.math,
            'field': self.field,
            'level': self.level,
            'type': self.type,
            'clear': self.clear,
        }

from ipywidgets import interact, Dropdown, Button, Output, VBox, Image
from IPython.display import clear_output

def get_dropdown(options,desp):
    """
        Get results from the drop down list
    """

    # Define a dropdown widget
    dropdown = Dropdown(
        options=options,
        value='unknown',
        description=desp,
        disabled=False,
    )

    return dropdown


In [4]:
def convert_and_open(base_path, pdf_name, tmp_path):
    """
        Convert pdf to image and open it
    """
    from PIL import Image
    pdf_path = os.path.join(base_path, pdf_name)
    image_path = tmp_path
    print("\rconverting {}".format(pdf_name))
    pdf2image(pdf_path, image_path, 20, 23)
    print("\rdone")

    # open all images
    return [Image.open(os.path.join(image_path, path)) for path in os.listdir(image_path)]

In [13]:
from ipywidgets import HBox

infos = []
file_id = 0

import json, os
if os.path.exists('man_index.json'):
    with open('man_index.json', 'r', encoding='utf-8') as f:
        infos = json.load(f)
        file_id = len(infos)

def main():
    # get the available files from recheck_index.json
    import json
    with open("recheck_index.json", "r", encoding="utf-8") as json_file:
        pdf_files_list = json.load(json_file)
    # get the pdf files path that are not removed and is marked math
    pdf_files_list = [pdf_file["path"] for pdf_file in pdf_files_list if not pdf_file["clean"]["rm"] and pdf_file["mark"]["math"] == "yes"]

    # save to /data/xukp/tmp
    base_path = "/data/xukp"
    save_path = "/data/xukp/tmp"

    # convert pdf to image, show and mark
    # first
    global file_id
    pdf_name = pdf_files_list[file_id]
    images = convert_and_open(base_path, pdf_name, save_path)
    # use hbox to show
    from ipywidgets import Image
    display(file_id)
    hbox = HBox([Image(value=image._repr_png_(), format='png', width=500, height=500) for image in images])
    display(hbox)
    math_dropdown = get_dropdown(['unknown','yes','no'],'math')
    field_dropdown = get_dropdown(['unknown','algebra','calculus','geometry','statistics','trigonometry'],'field')
    level_dropdown = get_dropdown(['unknown','yes','no'],'level')
    type_dropdown = get_dropdown(['unknown','competition','exam','textbook','workbook'],'type')
    clear_dropdown = get_dropdown(['unknown','yes','no'],'clear')

    info = Info()

    output = Output()

    button = Button(
        description='OK',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click me',
        icon='check' # (FontAwesome names without the `fa-` prefix)
    )

    def on_button_clicked(b):
        # Display the message within the output widget.
        with output:
            global file_id
            info.math = math_dropdown.value
            info.field = field_dropdown.value
            info.level = level_dropdown.value
            info.type = type_dropdown.value
            info.clear = clear_dropdown.value
            clear_output(wait=True)
            infos.append(info.dict().copy())
            infos[-1]['path'] = pdf_files_list[file_id]
            # save to man_index.json
            with open("man_index.json", "w", encoding="utf-8") as json_file:
                json.dump(infos, json_file, indent=4, ensure_ascii=False)
            # to next pdf
            file_id = file_id + 1
            if file_id < len(pdf_files_list):
                pdf_name = pdf_files_list[file_id]
                images = convert_and_open(base_path, pdf_name, save_path)
                # use hbox to show
                display(file_id)
                display(HBox([Image(value=image._repr_png_(), format='png', width=500, height=500) for image in images]))
                math_dropdown.value = 'yes'
                field_dropdown.value = 'unknown'
                level_dropdown.value = 'unknown'
                type_dropdown.value = 'unknown'
                clear_dropdown.value = 'yes'
            else:
                print("All done!")
                print(infos)


    button.on_click(on_button_clicked)

    display(VBox([math_dropdown, field_dropdown, level_dropdown, type_dropdown, clear_dropdown, button, output]))

main()        

converting libgen/act_math/Cynthia Johnson - 500 ACT Math Questions to Know by Test Day-McGraw-Hill Education (2013).pdf
done


0

HBox(children=(Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04\x0e\x00\x00\x05\xd0\x08\x02\x00\x…

VBox(children=(Dropdown(description='math', options=('unknown', 'yes', 'no'), value='unknown'), Dropdown(descr…