In [7]:
from pdf2image import convert_from_path
from pdf2image.exceptions import (
     PDFInfoNotInstalledError,
     PDFPageCountError,
     PDFSyntaxError
     )
import pdfplumber
from PIL import Image, ImageDraw

from collections import defaultdict, Counter
import gc
import decimal
import pandas as pd
from tqdm.notebook import tqdm
import os
import glob
from math import ceil
import numpy as np
import re
import random

In [8]:
def extract_parallel_text(source_pdf_path, config_dict):    
    # create plumber_pdf_obj
    pdf_file = pdfplumber.open(source_pdf_path)

    # create folder for images with bbox
    company_file_name = os.path.splitext(os.path.basename(source_pdf_path))[0]
    image_dir = os.path.join(company_file_name,"images")
    os.makedirs(image_dir, exist_ok=True) 
    #convert pdf to image
    
    
    pages = []
    left_lang = []
    right_lang = []
    
    try:
        len_pages = len(pdf_file.pages)
    except Exception as e:
        #no pages
        print(e)
        return None 
    
    #for each page, image
    for i in tqdm(range(len_pages)):
#         print(f"page {i}")
        image = convert_from_path(source_pdf_path, first_page=i+1, last_page=i+1)[0] 
        pdf_page = pdf_file.pages[i]
        image_with_bbox, extracted_text_pairs = split_x_split_y(pdf_page, image, config_dict)
        image_with_bbox.save(os.path.join(image_dir, f"{company_file_name}_page{i}.jpg"))

        for extracted_text_pair in extracted_text_pairs:
            pages.append(i)
            left_lang.append(extracted_text_pair[0])
            right_lang.append(extracted_text_pair[1])

    df_fn_excel = os.path.join(company_file_name, f"{company_file_name}_df.xlsx")
    df_fn_csv = os.path.join(company_file_name, f"{company_file_name}_df.tsv")
    result_df = pd.DataFrame({"page":pages, "left_lang": left_lang, "right_lang":right_lang})
    result_df['left_lang'] =result_df['left_lang'].astype(str).str.replace("\n", " ").str.replace("\t", " ").str.strip()
    result_df['right_lang'] =result_df['right_lang'].astype(str).str.replace("\n", " ").str.replace("\t", " ").str.strip()
    #
    try:
        result_df.to_excel(df_fn_excel)
        result_df.to_csv(df_fn_csv, sep='\t')
        
    except Exception as e:
        print(e)
        print("attempting to save in tsv")
        
        result_df.to_csv(df_fn_csv, sep='\t')
        
    return result_df


## Segregation using X-Y cutting


In [9]:
def get_most_common_gap(gaps):
    rounded_gaps = []
    for gap in gaps:
        rounded_gaps.append(ceil(gap['gap']/10) * 10)
    
    counter = Counter(rounded_gaps)

#     print(f"most_common_gap {counter.most_common(1)[0][0]}")
    return counter.most_common(1)[0][0]

def im_draw_lines(left_right_bbox, image, pdf_page):
    draw = ImageDraw.Draw(image)
    image_array_height, image_array_width  = np.array(image).shape[0:2]

    pdf_height =float(pdf_page.height)
    pdf_width = float(pdf_page.width)
    
    color = random.choice(['red','green'])
    
    left_box_x0 = float(left_right_bbox[0]['left']) /pdf_width*image_array_width
    left_box_y0 = float(left_right_bbox[0]['top'])/pdf_height*image_array_height-20
    left_box_x1 = float(left_right_bbox[0]['right']) /pdf_width*image_array_width
    left_box_y1 = float(left_right_bbox[0]['bottom'])/pdf_height*image_array_height
    
    right_box_x0 = float(left_right_bbox[1]['left'])/pdf_width*image_array_width
    right_box_y0 = float(left_right_bbox[1]['top'])/pdf_height*image_array_height-20
    right_box_x1 = float(left_right_bbox[1]['right'])/pdf_width*image_array_width
    right_box_y1 = float(left_right_bbox[1]['bottom'])/pdf_height*image_array_height

    
    draw.rectangle([left_box_x0,left_box_y0,left_box_x1,left_box_y1], outline=color, width=4)
    draw.rectangle([right_box_x0,right_box_y0,right_box_x1,right_box_y1], outline=color, width=4)
    draw.line([left_box_x1,(left_box_y0+left_box_y1)/2, right_box_x0, (right_box_y0+right_box_y1)/2], width =3, fill=color,)

#     height_top = height_top/pdf_height * image_array_height
#     height_btm = height_btm/pdf_height * image_array_height
#     y_split = y_split/pdf_width * image_array_width

#     draw.line([0,height_top,image_array_width,height_top], width =3, fill=200)
#     draw.line([0,height_btm,image_array_width,height_btm], width =3, fill=200)
#     draw.line([y_split,height_top,y_split,height_btm], width =3, fill=200)

    return image

def is_finance_table(text):
    pattern = r"(\s{3,}[)(\d\.\,]+\s{3,})|(\([\d\.\,\s]{3,}\))"
    if re.search(pattern, text) is not None:
        return True
    return False

def is_majority_number(text):
    digits = re.findall(r"\d", text)
    character = re.findall(r"[a-zA-Z]", text)
    return len(digits) >= (len(character)*0.2)


def get_x_tolerance(text_cropped, config_dict):
    chars = text_cropped.chars
    if len(chars) < 15:
        return config_dict['max_extract_text_x_tolerance']
    
    gaps = [config_dict['max_extract_text_x_tolerance']]
    for i in range(len(chars)-1):
        gap = float(chars[i+1]['x0'])- float(chars[i]['x1'])
        if gap >= config_dict['min_extract_text_x_tolerance']:
            gaps.append(gap-0.1)
    
    return min(gaps)
    
    

def get_text_bbox(pdf_page, config_dict):
    words = pdf_page.extract_words(extra_attrs=["upright"])
    words = [word for word in words if word['upright']]
    
    top = words[0]['top']
    bottom = words[0]['bottom']
    left = words[0]['x0']
    right = words[0]['x1']
    
    for word in words[1:]:
        if word['top'] < top:
            top = word['top']
        if word['bottom'] > bottom:
            bottom = word['bottom']
        if word['x0'] < left:
            left = word['x0']
        if word['x1'] > right:
            right = word['x1']
    
    text_cropped =  pdf_page.crop((left,
                                  top,
                                  right,
                                  bottom)) 
    
    calculated_x_tolerance = get_x_tolerance(text_cropped, config_dict)
    bbox_text = text_cropped.extract_text(x_tolerance = calculated_x_tolerance)
    
    #sometimes the first char which is space will cause some words to be split eg. The to T he
#     print([char['text'] for char in text_cropped.chars[:20]])
    if text_cropped.chars[0]['text'] == " ":
        bbox_text = bbox_text.replace(" ", "", 1) #only replace first

    
    return {"top":top, "bottom":bottom, "left":left, "right":right}, bbox_text
    

def split_x(pdf_page, config_dict):
    page_height = pdf_page.height
    page_height_scaled = round(page_height*100)
#     print(f"page_height {page_height_scaled}")
    height_array = np.zeros(page_height_scaled, dtype=np.int)

    words = pdf_page.extract_words(extra_attrs=["upright"])
    words = [word for word in words if word['upright']]
    
    if len(words) == 0:
        return []

    #forming the 1-d array
    for word in words:
        start_index = round(word['top']*100)
        end_index = round(word['bottom']*100)

        height_array[start_index: end_index] = 1

    gaps = []
    counter =0 
    start_of_gap_index = 0 

    first_text_height = float(words[0]['top'])-0.5
    bottom_of_page =  float(page_height)-10

    for i in range(len(height_array)):
        if height_array[i] == 0:
            counter += 1
        elif counter !=0:
            gaps.append({"start":i-counter,"end":i,"gap":counter})
            start_of_gap_index = i
            counter = 0

    spacing_gap = max(get_most_common_gap(gaps), float(config_dict['abs_min_vertical_spacing_gap'] * page_height_scaled))

    spliting_heights = []

    for i in range(1, len(gaps)):
        "ignore white space at the top and bottom"
        if gaps[i]['gap'] > (config_dict['min_vertical_spacing_gap']*spacing_gap):
            spliting_heights.append((gaps[i]["start"] + gaps[i]["end"])/200.0 )

    spliting_heights = [first_text_height] + spliting_heights + [bottom_of_page]
    return spliting_heights


def split_y(height_top, height_btm, pdf_page, config_dict):


    page_width = pdf_page.width

    min_gap = page_width * decimal.Decimal(config_dict['min_horizontal_spacing_gap']) * 100

    min_x_gap_location = page_width * decimal.Decimal(config_dict['central_split_left_boundary']) * 100 
    max_x_gap_location = page_width * decimal.Decimal(config_dict['central_split_right_boundary']) * 100 
    mid_point = float(page_width) *0.5 * 100

    cropped_page = pdf_page.crop((decimal.Decimal("0.0"),
                                  decimal.Decimal(height_top),
                                  pdf_page.width,
                                  decimal.Decimal(height_btm)))
    
    if cropped_page.extract_table() is not None:
#         print("cropped pdf is a table!")
        return None, None
    
    #flatten into 1-array
    page_width_scaled = round(page_width*100)
    width_array = np.zeros(page_width_scaled, dtype=np.int)

    words = cropped_page.extract_words(extra_attrs=["upright"])
    words = [word for word in words if word['upright']]
    
    #forming the 1-d array
    for word in words:
        start_index = round(word['x0']*100)
        end_index = round(word['x1']*100)

        width_array[start_index: end_index] = 1

    gaps = []
    counter =0 
    start_of_gap_index = 0 

    for i in range(len(width_array)):
        if width_array[i] == 0:
            counter += 1
        elif counter !=0:
            gaps.append({"start":i-counter,"end":i,"gap":counter})
            start_of_gap_index = i
            counter = 0
    
    gap_candidates = []

    for gap in gaps:
        if ((gap['gap']) > min_gap) and (gap['start'] > min_x_gap_location) and (gap['start'] < max_x_gap_location) and (gap['end'] > min_x_gap_location) and (gap['end'] < max_x_gap_location):
            gap_candidates.append(gap)            
            
            
    if len(gap_candidates) > 0 :  
        best_gap = gap_candidates[0]
        best_distance_to_centre = abs(mid_point - (best_gap['start'] + best_gap['end'])/2)

        if len(gap_candidates) >1:
            for gap_candidate in gap_candidates[1:]:
                distance_to_centre = abs(mid_point-(gap_candidate["start"] + gap_candidate["end"])/2)
                if  distance_to_centre < best_distance_to_centre:
                    best_gap = gap_candidate
                    best_distance_to_centre = distance_to_centre


        spliting_x = (best_gap["start"] + best_gap["end"])/200.0    
        left_crop =  pdf_page.crop((decimal.Decimal("0.0"),
                                    decimal.Decimal(height_top),
                                    decimal.Decimal(spliting_x),
                                    decimal.Decimal(height_btm)))

        right_crop = pdf_page.crop((decimal.Decimal(spliting_x),
                                    decimal.Decimal(height_top),
                                    pdf_page.width,
                                    decimal.Decimal(height_btm)))


#         left_text =left_crop.extract_text()
#         right_text = right_crop.extract_text()

        minimum_text_span = page_width * decimal.Decimal(config_dict['minimum_textspan']) 
        left_text_bbox, left_text = get_text_bbox(left_crop, config_dict)
        right_text_bbox, right_text = get_text_bbox(right_crop, config_dict)
        left_text_span = left_text_bbox['right'] -left_text_bbox['left']
        right_text_span = right_text_bbox['right'] -right_text_bbox['left']


        if (is_finance_table(left_text) or 
            is_finance_table(right_text) or 
            is_majority_number(left_text) or 
            is_majority_number(right_text) or 
            (left_text_span < minimum_text_span) or
            (right_text_span < minimum_text_span) or
            (left_text_span < decimal.Decimal(0.5) * right_text_span) or
            (left_text_span  > decimal.Decimal(2) * right_text_span) 
           ):


            #                 print("ignoring finance table/ majority digits like bbox")
            return None, None

        return (left_text_bbox,right_text_bbox) , (left_text, right_text)

#.strip().strip("\n").strip().strip("\n")
    return None, None



def split_x_split_y(pdf_page, image, config_dict):
    tmp_image = image.copy()
    extracted_tables = pdf_page.extract_table()
    extracted_texts = []

    # if extracted_tables is not None:
    #     print("Page contains tables, skipping")
    # else:
    try:
        spliting_heights = split_x(pdf_page, config_dict)

        for i in range(len(spliting_heights)-1):
            height_top  = spliting_heights[i]
            height_btm  = spliting_heights[i+1]
            left_right_bbox, extracted_text = split_y(height_top, height_btm, pdf_page, config_dict)
            if left_right_bbox is not None:
                tmp_image = im_draw_lines(left_right_bbox, tmp_image, pdf_page)
                extracted_texts.append(extracted_text)
    except Exception as e:
        print(e)
        print("if value error, Wrong orientation, skipping")


    return tmp_image, extracted_texts



In [10]:
source_pdf_dir = r"data\annual_report\bahasa_2"

file_paths =glob.glob(os.path.join(source_pdf_dir,"*.pdf"))


class CONFIG:
    bahasa = dict(
        min_vertical_spacing_gap = 1.75, # 1.75 times the natura spacing gap will result in a horizontal split
        abs_min_vertical_spacing_gap = 0.003,
        min_horizontal_spacing_gap = 0.02, # 2% of page width
        central_split_left_boundary = 0.35, # the vertical split must locate on the right of 35% of the page width
        central_split_right_boundary = 0.65, # the vertical split must locate on the right of 65% of the page width
        minimum_textspan = 0.15, # 15% of page width
        max_extract_text_x_tolerance = 3, #default is 3
        min_extract_text_x_tolerance = 1.6
    )
        

print(file_paths)    

['data\\annual_report\\bahasa_2\\ACES_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\ACST_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\ADES_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\AKKU_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\AKRA_Annual_Report_2019.pdf', 'data\\annual_report\\bahasa_2\\APIC_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\ARKA_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\ASDF_Annual_Report_2019.pdf', 'data\\annual_report\\bahasa_2\\ASDM_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\ASII_Annual_Report_2019.pdf', 'data\\annual_report\\bahasa_2\\ASMI_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\ASSA_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\AUTO_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\BABP_Annual_Report 2019_revisi.pdf', 'data\\annual_report\\bahasa_2\\BAJA_Annual_Report 2019.pdf', 'data\\annual_report\\bahasa_2\\BALI_Annual_Report_2019.pdf', 

In [11]:
for source_pdf_path in file_paths:
    result_df = extract_parallel_text(source_pdf_path, CONFIG.bahasa)
    gc.collect()

  0%|          | 0/93 [00:00<?, ?it/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/564 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

Bounding box (Decimal('0.0'), Decimal('60.14099999999999823785401531495153903961181640625'), Decimal('595.320'), Decimal('101.7450000000000045474735088646411895751953125')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('60.14099999999999823785401531495153903961181640625'), Decimal('595.320'), Decimal('101.7450000000000045474735088646411895751953125')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('60.14099999999999823785401531495153903961181640625'), Decimal('595.320'), Decimal('102.0450000000000017053025658242404460906982421875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error,

Bounding box (Decimal('0.0'), Decimal('57.50099999999999766941982670687139034271240234375'), Decimal('595.320'), Decimal('99.0450000000000017053025658242404460906982421875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('57.50099999999999766941982670687139034271240234375'), Decimal('595.320'), Decimal('99.0450000000000017053025658242404460906982421875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('57.50099999999999766941982670687139034271240234375'), Decimal('595.320'), Decimal('99.0450000000000017053025658242404460906982421875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value err

Bounding box (Decimal('0.0'), Decimal('57.50099999999999766941982670687139034271240234375'), Decimal('595.320'), Decimal('99.0450000000000017053025658242404460906982421875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('57.50099999999999766941982670687139034271240234375'), Decimal('595.320'), Decimal('99.0450000000000017053025658242404460906982421875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('56.88600000000000278532752417959272861480712890625'), Decimal('595.320'), Decimal('95.7999999999999971578290569595992565155029296875')) is not fully within parent page bounding box (Decimal('-11.225'), Decimal('11.282'), Decimal('584.095'), Decimal('853.202'))
if value err

  0%|          | 0/182 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/508 [00:00<?, ?it/s]

  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/332 [00:00<?, ?it/s]

  0%|          | 0/370 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/251 [00:00<?, ?it/s]

  0%|          | 0/123 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/165 [00:00<?, ?it/s]

  0%|          | 0/148 [00:00<?, ?it/s]

  0%|          | 0/235 [00:00<?, ?it/s]

Bounding box (Decimal('0.0'), Decimal('-3.428999999999999825917029738775454461574554443359375'), Decimal('595.260'), Decimal('59.30499999999999971578290569595992565155029296875')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.260'), Decimal('841.860'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('-3.428999999999999825917029738775454461574554443359375'), Decimal('595.260'), Decimal('238.854999999999989768184605054557323455810546875')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.260'), Decimal('841.860'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('-3.80900000000000016342482922482304275035858154296875'), Decimal('595.260'), Decimal('60.19500000000000028421709430404007434844970703125')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.260'), Decimal('841.860'))
if value

Bounding box (Decimal('0.0'), Decimal('-3.80900000000000016342482922482304275035858154296875'), Decimal('595.260'), Decimal('61.35499999999999687361196265555918216705322265625')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.260'), Decimal('841.860'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('-3.80900000000000016342482922482304275035858154296875'), Decimal('595.260'), Decimal('61.27499999999999857891452847979962825775146484375')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.260'), Decimal('841.860'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('-3.80900000000000016342482922482304275035858154296875'), Decimal('595.260'), Decimal('61.35499999999999687361196265555918216705322265625')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.260'), Decimal('841.860'))
if value 

  0%|          | 0/434 [00:00<?, ?it/s]

  0%|          | 0/301 [00:00<?, ?it/s]

  0%|          | 0/505 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

  0%|          | 0/373 [00:00<?, ?it/s]

  0%|          | 0/432 [00:00<?, ?it/s]

  0%|          | 0/768 [00:00<?, ?it/s]

  0%|          | 0/133 [00:00<?, ?it/s]

  0%|          | 0/298 [00:00<?, ?it/s]

Bounding box (Decimal('0.0'), Decimal('-36.695999999999997953636921010911464691162109375'), Decimal('595.276'), Decimal('831.8899999999999863575794734060764312744140625')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.276'), Decimal('841.890'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('-36.71399999999999863575794734060764312744140625'), Decimal('595.276'), Decimal('831.8899999999999863575794734060764312744140625')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.276'), Decimal('841.890'))
if value error, Wrong orientation, skipping
Bounding box (Decimal('0.0'), Decimal('-36.71399999999999863575794734060764312744140625'), Decimal('595.276'), Decimal('831.8899999999999863575794734060764312744140625')) is not fully within parent page bounding box (Decimal('0.000'), Decimal('0.000'), Decimal('595.276'), Decimal('841.890'))
if value error, Wrong orientatio

  0%|          | 0/154 [00:00<?, ?it/s]