In [17]:
# Tool 1: PDFMiner
# Can extract each image's name and bounding box
# For example, Image page_7_R33 is an image on page 7. 
#   The dict printed afterwards contains the bounding box information.
#   NOTE: Please do not rely on this code to extract images. It can only extract a small percentage of images.
# It can also extract each chunk of text and bounding box.
# For example, here are two text boxes.
#     Text page_4: 2 dimensions and 6 clusters, as shown in Fig. 1(a). We imple-
#     (cid:0)
#     ment the proposed U-k-means clustering algorithm for the
#     data set of Fig. 1(a) in which it obtains the correct number
#     c∗ = 6 of clusters with AR=1.00, as shown in Fig. 1(f),
#     after 11 iterations. These validity indices of CH, SW, DB,
#     Gap statistic, DNo, DNg, and DNs are shown in Table 1.
#     All indices give the correct number c∗ = 6 of clusters,
#     except DNg.
#      (bbox: (297.07915, 574.0578647999996, 538.2939210020002, 667.7077647999997))
#
#     Text page_4: FIGURE 1. (a) Original data set; (b)-(e) Processes of the U-k-means
#     after 1, 2, 4, and 9; (f) Convergent results.
#      (bbox: (297.07890000000003, 214.23653, 516.4383308796, 231.21739000000002))
#
# The first text box discusses about an image. The second text box is the caption of an image.
# Based on this caption and its bounding box, we can figure out which image it is for.

import os
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTImage, LTAnno, LTChar, LTText, LTTextBox, LTTextLine, LTFigure, LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

filepath = 'datasets/Unsupervised_K_Means_Clustering.pdf' #'datasets/Clustering_aggregation.pdf'
output_folder = 'datasets/test'

def determine_image_type(stream_first_4_bytes):
    """Find the image file type based on the magic number comparison of the first 4 (or 2) bytes"""
    file_type = None
    bytes_as_hex = stream_first_4_bytes.hex()
    if bytes_as_hex.startswith('ffd8'):
        file_type = '.jpeg'
    elif bytes_as_hex == '89504e47':
        file_type = '.png'
    elif bytes_as_hex == '47494638':
        file_type = '.gif'
    elif bytes_as_hex.startswith('424d'):
        file_type = '.bmp'
    return file_type


def save_image(lt_image, page_number, images_folder):
    """Try to save the image data from this LTImage object, and return the file name, if successful"""
    result = None
    if lt_image.stream:
        file_stream = lt_image.stream.get_rawdata()
        if file_stream:
            file_ext = determine_image_type(file_stream[:4])
            if file_ext:
                file_name = f"{page_number}_{lt_image.name}{file_ext}"
                if write_file(images_folder, file_name, file_stream, flags='wb'):
                    result = file_name
    return result

def write_file(folder, filename, filedata, flags='w'):
    """Write the file data to the folder and filename combination
    (flags: 'w' for text, 'wb' for binary)"""
    result = False
    if os.path.isdir(folder):
        try:
            file_obj = open(os.path.join(folder, filename), flags)
            file_obj.write(filedata)
            file_obj.close()
            result = True
        except IOError:
            pass
    return result

# Function to parse the layout object
def parse_layout(layout, page_num):
    """Function to recursively parse the layout tree."""
    for lt_obj in layout:
        if isinstance(lt_obj, LTImage):
            # Extracted image
            print(f"Image page_{page_num}_{lt_obj.name}: {lt_obj.__dict__}")
            save_image(lt_obj, page_num, output_folder)
        elif isinstance(lt_obj, LTFigure):
            # Recursively parse figures
            parse_layout(lt_obj, page_num)
        elif isinstance(lt_obj, (LTTextBox, LTTextLine)):
            print(f"Text page_{page_num}: {lt_obj.get_text()} (bbox: {lt_obj.bbox})")
            

# Open a PDF file
file = open(filepath, 'rb')
parser = PDFParser(file)
document = PDFDocument(parser)

if not document.is_extractable:
    raise Exception("Document is not extractable")

# Create a PDF resource manager object
rsrcmgr = PDFResourceManager()

# Set parameters for analysis.
laparams = LAParams()

# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

for page_num, page in enumerate(PDFPage.create_pages(document)):
    interpreter.process_page(page)
    # receive the LTPage object for this page
    layout = device.get_result()
    parse_layout(layout, page_num)


Text page_0: Received April 5, 2020, accepted April 16, 2020. Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.
 (bbox: (36.1699, 711.5704999999999, 411.6854112059998, 718.5445))
Text page_0: Digital Object Identifier 10.1109/ACCESS.2020.2988796
 (bbox: (36.1699, 699.06617, 175.57110916399998, 705.04417))
Text page_0: Unsupervised K-Means Clustering Algorithm
 (bbox: (36.1699, 649.411876, 416.92943983599986, 671.329876))
Text page_0: KRISTINA P. SINAGA AND MIIN-SHEN YANG
Department of Applied Mathematics, Chung Yuan Christian University, Taoyuan City 32023, Taiwan
 (bbox: (36.1699, 612.1216499999999, 302.88333871750007, 628.9232499999999))
Text page_0: Corresponding author: Miin-Shen Yang (msyang@math.cycu.edu.tw)
 (bbox: (36.169953, 599.8429219999999, 250.0831805427, 607.513922))
Text page_0: This work was supported in part by the Ministry of Science and Technology, Taiwan, under Grant MOST 107-2118-M-033-002-MY2.
 (bbox: (36.169953, 584.968722, 448.46405920101

Text page_3: and
 (bbox: (108.99806299999999, 707.727066, 123.38463499999999, 717.690066))
Text page_3: corresponding
 (bbox: (133.50704299999998, 707.727066, 190.50536599999998, 717.690066))
Text page_3: z∗
ik
 (bbox: (200.617163, 705.133642, 209.959147, 719.766488))
Text page_3: need
 (bbox: (221.176963, 707.727066, 239.987107, 717.690066))
Text page_3: to
 (bbox: (250.10941537, 707.727066, 257.86062936999997, 717.690066))
Text page_3: be
 (bbox: (267.97307437, 707.727066, 277.37814636999997, 717.690066))
Text page_3: and
 (bbox: (297.07892999999996, 707.727246, 311.46550199999996, 717.690246))
Text page_3: K. P. Sinaga, M.-S. Yang: U-k-means Clustering Algorithm
 (bbox: (362.677, 749.9575, 538.2894962460001, 756.9315))
Text page_3: proportion α∗
k
re-normalized by
 (bbox: (36.1699, 695.772366, 104.50601062899999, 719.766488))
Text page_3: k = α∗
α∗
k
 (bbox: (110.99296299999999, 667.6937819999999, 146.12901799999997, 682.3266279999999))
Text page_3: ik = z∗
z∗
ik
 (bbox: (111.081131

Text page_4: K. P. Sinaga, M.-S. Yang: U-k-means Clustering Algorithm
 (bbox: (36.1699, 749.9575, 211.7823683500001, 756.9315))
Text page_4: P
 (bbox: (45.989709999999974, 616.209555, 56.40104499999997, 626.172555))
Text page_4: P
 (bbox: (54.19393999999996, 592.2997150000001, 64.60527499999996, 602.2627150000001))
Text page_4: P
 (bbox: (201.39079999999998, 628.1646549999999, 211.802135, 638.1276549999999))
Text page_4: n
i=1
 (bbox: (187.43572, 645.6036609999999, 199.23305, 660.6617789999999))
Text page_4: of another clustering algorithm that had also used the
idea from the EM algorithm by Yang et al. [31]. This
is the robust-learning fuzzy c-means (RL-FCM) proposed
by Yang and Nataliani [33]. In Yang and Nataliani [33],
they gave the RL-FCM objective function J (U, α, A) =
n
c
c
k=1 µik kxi − ak k2 − r1
k=1 µik ln αk +
i=1
n
c
c
k=1 µik ln µik − r3n
k=1 αk ln αk with µik , not
r2
i=1
P
P
P
binary variables, but fuzzy c-memberships with 0 ≤ µik ≤ 1
P
P
c
k=1 µik = 1 to indicate fuzzy

Text page_5: TABLE 1. Validity index values of CH, SW, DB, Gap-stat, DNo, DNg, and
DNs for the data set of Fig. 1(a).
 (bbox: (36.1699, 701.49353, 266.8983051518, 718.47439))
Text page_5: K. P. Sinaga, M.-S. Yang: U-k-means Clustering Algorithm
 (bbox: (362.677, 749.9575, 538.2894962460001, 756.9315))
Text page_5: FIGURE 3. 14-cluster dataset; (b) Final results from U-k-means.
 (bbox: (297.079, 596.1445, 503.8154083146, 604.75739))
Text page_5: TABLE 3. Results of the seven validity indices.
 (bbox: (297.079, 571.1005, 448.7627381699999, 579.71339))
Text page_5: results of the U-k-means, R-EM, C-FS, k-means with the true
cluster number c = 14, X-means, and RL-FCM algorithms.
Note that C-FS, k-means with the true number of clusters, and
X-means algorithms are dependent of initials or parameter
selection, and so we consider their average AR (AV-AR)
under different initials or parameter selection. From Table 4,
it is seen that the proposed U-k-means, R-EM, and RL-FCM
clustering algorithms

Text page_8: K. P. Sinaga, M.-S. Yang: U-k-means Clustering Algorithm
 (bbox: (36.1699, 749.9575, 211.7823683500001, 756.9315))
Text page_8: TABLE 10. Clustering results from various algorithms for different real data sets with the best results in boldface.
 (bbox: (36.1699, 709.8615, 407.652844714, 718.47439))
Text page_8: TABLE 11. Descriptions of the six medical data sets used in Example 6.
 (bbox: (36.1699, 509.14750000000004, 269.637461428, 517.76039))
Text page_8: TABLE 12. Results from various algorithms for the six medical data sets with the best results in boldface.
 (bbox: (36.1699, 342.74150000000003, 381.21455716799994, 351.35439))
Text page_8: factorization technique. We also conduct experiments to com-
pare the proposed U-k-means with R-EM, C-FS, k-means
with the true c, k-means+Gap-stat, X-means, and RL-FCM.
The results are shown in Table 12. For C-FS, k-means
with the true c, k-means+Gap-stat and X-means, we make
experiments with 25 different initializations, and report

In [18]:
# Tool 2: PyMuPDF (or fitz)
# It can extract images from each page of a pdf, and save as files
# Here are seven images extracted from a page:
#     page_num: 7
#     num_images: 7
#     (97, 0, 266, 275, 8, 'Indexed', '', 'R97', 'FlateDecode', 0)
#     Image idx= 0 xref= 97
#     page.get_xobjects= []
#     (95, 0, 247, 262, 8, 'Indexed', '', 'R95', 'FlateDecode', 0)
#     Image idx= 1 xref= 95
#     page.get_xobjects= []
#     (94, 0, 253, 268, 8, 'Indexed', '', 'R94', 'FlateDecode', 0)
#     Image idx= 2 xref= 94
#     page.get_xobjects= []
#     (93, 0, 252, 268, 8, 'Indexed', '', 'R93', 'FlateDecode', 0)
#     Image idx= 3 xref= 93
#     page.get_xobjects= []
#     (92, 0, 256, 268, 8, 'Indexed', '', 'R92', 'FlateDecode', 0)
#     Image idx= 4 xref= 92
#     page.get_xobjects= []
#     (90, 0, 249, 263, 8, 'Indexed', '', 'R90', 'FlateDecode', 0)
#     Image idx= 5 xref= 90
#     page.get_xobjects= []
#     (33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
#     Image idx= 6 xref= 33
#
# Please note the name of each image is stored a certain position in the tuple of metadata.
# For example, 'R97' is the name of an image, which can be matched with the bounding boxes
# extracted by PDFMiner.

import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_path):
    # Open the PDF
    doc = fitz.open(pdf_path)

    # Iterate through each page
    for page_num in range(len(doc)):
        print("page_num:", page_num)
        page = doc.load_page(page_num)

        # Get the list of image dictionaries on the page
        image_list = page.get_images(full=True)
        print("num_images:", len(image_list))

        # Iterate through each image
        for img_index, img in enumerate(image_list):
            print(img)
            xref = img[0]
            name = img[7]
            print("Image idx=", img_index, "xref=", xref)

            # Extract the image
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Save the image
            image_filename = f"image_{page_num}_{name}.png"
            with open(output_path + '/' + image_filename, "wb") as img_file:
                img_file.write(image_bytes)

            # Find the image's position on the page
            # This involves searching through the page's XObjects
            print("page.get_xobjects=", page.get_xobjects())
            for xobj in page.get_xobjects():
                xobj_page = doc.extract_xobject(xobj[1])
                if xref in xobj_page.get_images(full=True):
                    rect = fitz.Rect(xobj_page.rect)
                    print(f"Image {image_filename} extracted from page {page_num}, location: {rect}")

    doc.close()

# Use the function with your PDF
extract_images_from_pdf(filepath, output_folder)


page_num: 0
num_images: 1
(8, 0, 1313, 320, 8, 'DeviceRGB', '', 'R8', 'FlateDecode', 0)
Image idx= 0 xref= 8
page.get_xobjects= []
page_num: 1
num_images: 1
(33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
Image idx= 0 xref= 33
page.get_xobjects= []
page_num: 2
num_images: 1
(33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
Image idx= 0 xref= 33
page.get_xobjects= []
page_num: 3
num_images: 1
(33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
Image idx= 0 xref= 33
page.get_xobjects= []
page_num: 4
num_images: 1
(33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
Image idx= 0 xref= 33
page.get_xobjects= []
page_num: 5
num_images: 1
(33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
Image idx= 0 xref= 33
page.get_xobjects= []
page_num: 6
num_images: 1
(33, 0, 377, 64, 8, 'Indexed', '', 'R33', 'FlateDecode', 0)
Image idx= 0 xref= 33
page.get_xobjects= []
page_num: 7
num_images: 7
(97, 0, 266, 275, 8, 'Indexed', '', 'R97', 'FlateDecode', 0)
Im