In [None]:
from bs4 import BeautifulSoup
import os
import re

In [None]:
# Step 0 - Directory structure
# Place the source PDFs under the directory pdfs/ before running the notebook.
# The two commands below create the directories written to by later steps:
# xmls/ receives the XML produced in Step 1 and txts/ receives the text written in Step 5.
# `-p` lets mkdir succeed when the directory already exists, so the cell can be re-run.
!mkdir -p xmls
!mkdir -p txts

In [None]:
# Step 1 - Convert PDFs to XML
# Installs pdfminer, then runs pdf2txt.py (presumably provided by that package)
# to write an XML rendering of pdfs/English.pdf to xmls/English.xml.
# NOTE(review): the document name is hard-coded in the command below; it must be
# kept in sync by hand with the doc_name assigned in Step 2.
# NOTE(review): the pdfminer version is not pinned, and `%pip` may be preferable
# to `!pip` so that the install targets the kernel's environment - confirm.
!pip install pdfminer
!pdf2txt.py -o xmls/English.xml -t xml pdfs/English.pdf

In [None]:
# Step 2 - Read the XML to memory
# doc_name selects both the XML read here and the text file written in Step 5.
doc_name = "English"
xml_file = "xmls/" + doc_name + ".xml"
# Decode explicitly as UTF-8 instead of relying on the platform default encoding:
# the translations handled by this notebook contain non-ASCII text, which a
# non-UTF-8 locale default would fail on or silently mis-decode.
# NOTE(review): assumes pdf2txt.py wrote the file with its default UTF-8 codec
# (Step 1 passes no codec option) - confirm.
with open(xml_file, 'r', encoding='utf-8') as f:
    xml_content = f.read()

In [None]:
# Step 3 - Enter document specific parameters. This should be done manually by inspecting PDF and XML.

# The list of pages to skip in the document. It is different for each translation.
# Step 4 compares these numbers against the integer "id" attribute of every <page> element.
# Exactly one bad_page_nos assignment should be left uncommented.
# Turkish
#bad_page_nos=[1,2,3,4,5,8,9,12,14,16,17,21,25,33,34,57,58,79,80,81,82,83,84,85]

# Chinese
#bad_page_nos=[1, 2, 3, 5, 7, 10, 11, 12, 15, 18, 41, 42, 58, 59, 60, 61, 62, 63, 64]

# English, French, Italian, Spanish, Indonesian, Dutch, German, Serbian, Japanese
bad_page_nos=[1, 2, 3, 5, 7, 10, 11, 12, 15, 18, 25, 44, 45, 62, 63, 64, 65, 66, 67, 68, 69, 70]

# Document specific formatting parameters
# font_set: list of font sizes to scrape, given as strings because they are compared
#   directly with the "size" attribute text in the XML. Text that is not one of these
#   sizes is skipped (titles, footnotes etc.)
# new_line_at_line_end: flag to start a new line after text lines that do not span the whole page width
# line_end_mark_left: if a line ends at or before this x position, it signals a paragraph end
#   (used for left pages, i.e. pages with an even id in Step 5)
# line_end_mark_right: if a line ends at or before this x position, it signals a paragraph end
#   (used for right pages, i.e. pages with an odd id in Step 5)
# inbetween_line_space: space between lines that belong to the same paragraph. Any space above
#   this threshold counts as a paragraph break.
#
# NOTE(review): Step 5 reads inbetween_line_space unconditionally, and reads the two
# line_end_mark_* values whenever new_line_at_line_end is True. The Japanese preset below
# does not list inbetween_line_space, so a value still has to be supplied when using it.

# Zhejiang COVID-19 handbook parameters
# English, French, Italian, Spanish, Indonesian, Dutch, German
font_set = ["9.723", "8.334", "9.317"]
new_line_at_line_end = True
line_end_mark_left = 337.0
line_end_mark_right = 355.0
inbetween_line_space = 0.3

# Serbian
# font_set = ["7.665", "6.570"]
# new_line_at_line_end = False
# inbetween_line_space = 2.35

# Japanese
# font_set = ["22.848"]
# new_line_at_line_end = True
# line_end_mark_left = 353.0
# line_end_mark_right = 355.0

# Chinese
# font_set = ["11.112"]
# new_line_at_line_end = True
# line_end_mark_left = 300.0
# line_end_mark_right = 350.0
# inbetween_line_space = 0.9

In [None]:
# Step 4 - Parse XML with BeautifulSoup
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
# "html.parser" ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(xml_content, "html.parser")
# find_all is the current spelling of the deprecated findAll alias and matches
# the calls used in Step 5.
pages = soup.find_all("page")
# Keep only the pages whose numeric id is not listed in bad_page_nos (Step 3).
good_pages = [page for page in pages if int(page.attrs["id"]) not in bad_page_nos]
print("Good pages", len(good_pages))

In [None]:
# Text cleaning procedures
# Pre-compiled patterns shared by the cleaning helpers in this cell.
end_hyphen = re.compile('-$')
w_spaces = re.compile(' +')
multinewline = re.compile('\n+')

def clean_paragraph(text):
    """Normalise whitespace in a paragraph.

    Runs of spaces are collapsed to a single space first, then runs of
    newlines are collapsed to a single newline. The result is returned;
    the input string is not modified.
    """
    single_spaced = w_spaces.sub(' ', text)
    return multinewline.sub('\n', single_spaced)

def clean_line(text):
    """Clean one extracted text line so it can be appended to a paragraph.

    Steps, in order:
      1. Replace typographic ligature characters (and the "(cid:31)"
         placeholder, which this document uses for "ffi") with plain letters.
      2. If the line ends with a hyphen, drop that hyphen so the broken word
         joins the next line directly; otherwise append a single space as the
         separator to the next line.
      3. Strip leading whitespace.

    An empty input yields an empty string instead of raising IndexError.

    Parameters:
        text (str): raw text of a single line.

    Returns:
        str: the cleaned line.
    """
    # (raw, replacement) pairs, applied in order.
    ligatures = (
        ('\ufb01', 'fi'),    # LATIN SMALL LIGATURE FI
        ('\ufb03', 'ffi'),   # LATIN SMALL LIGATURE FFI
        ('\ufb04', 'ffl'),   # LATIN SMALL LIGATURE FFL
        ('\ufb02', 'fl'),    # LATIN SMALL LIGATURE FL
        ('\ufb00', 'ff'),    # LATIN SMALL LIGATURE FF
        ('(cid:31)', 'ffi'),
    )

    cleaned = text
    for raw, plain in ligatures:
        cleaned = cleaned.replace(raw, plain)

    # endswith() is safe on an empty string, unlike indexing with [-1].
    if cleaned.endswith("-"):
        # End-of-line hyphenation: remove the hyphen and add no separator.
        cleaned = cleaned[:-1]
    else:
        cleaned += " "

    return cleaned.lstrip()

In [None]:
# Step 5 - Parse text from XML. This section prints positions in the document where a
# page ends in the middle of a paragraph. Such a paragraph is automatically split in two
# and should be joined manually in the output text.
#
# For every kept page the loop walks textbox -> textline -> text elements, keeps the
# characters whose "size" attribute is listed in font_set, and decides where to insert
# newlines from the geometry of the previous line:
#   * if new_line_at_line_end is set and the previous line ended at or before the
#     page's line_end_mark, a newline is inserted (short line => paragraph end);
#   * if the gap between the previous line's bottom and this line's top exceeds
#     inbetween_line_space, a newline is inserted (wide gap => paragraph break).
# The cleaned text of each page is appended to txts/<doc_name>.txt.

no_page = 0
no_paragraphs = 0
last_line_x_end = 0
last_text_size = 0
last_line_y_bottom = 0.0
size_change=False

# Geometry of the most recently seen sized character. Initialised here so that a
# text line without any sized character at the very start of the document cannot
# raise NameError, and so that re-running the cell never reuses values left over
# from a previous run.
line_y_top = 0.0
line_y_bottom = 0.0
line_x_end = 0.0

# Write explicitly as UTF-8: the handled translations contain non-ASCII text that
# a platform default encoding may be unable to encode.
with open('txts/' + doc_name + '.txt', 'w', encoding='utf-8') as f:

    for page in good_pages:
        page_content = ""
        # Reset per page, so the first kept line of a page is always compared
        # against x end 0.0 and bottom 0.0.
        last_line_y_bottom = 0.0
        last_line_x_end = 0.0

        page_id = int(page.attrs["id"])
        # line_end_mark is only assigned when new_line_at_line_end is set; the
        # short-circuit `and` further down keeps it from being read otherwise.
        if new_line_at_line_end:
            if page_id % 2 == 0:
                line_end_mark = line_end_mark_left
            else:
                line_end_mark = line_end_mark_right

        for textbox in page.find_all("textbox"):
            paragraph = ""
            for textline in textbox.find_all("textline"):
                line = ""
                for text in textline.find_all("text"):
                    if text.has_attr("size"):
                        if text.attrs["size"] in font_set:  # keep normal body text sizes only
                            line += text.text
                        # Geometry is tracked for every sized character, including
                        # those whose size is not in font_set; after the loop it
                        # holds the values of the last sized character of the line.
                        # bbox fields are read as index 1 = bottom, 2 = x end, 3 = top.
                        bbox = text.attrs["bbox"].split(",")
                        line_y_top = float(bbox[3])
                        line_y_bottom = float(bbox[1])
                        line_x_end = float(bbox[2])

                if line and not line.isspace():
                    cleaned_line = clean_line(line)

                    if cleaned_line and not cleaned_line.isspace():
                        line_break = last_line_y_bottom - line_y_top

                        # Previous line stopped early: treat it as a paragraph end.
                        if new_line_at_line_end and last_line_x_end <= line_end_mark:
                            paragraph += "\n"

                        # Gap to the previous line is wider than normal line spacing.
                        if line_break > inbetween_line_space:
                            paragraph += "\n"

                        paragraph += cleaned_line

                # Updated for every text line, including skipped ones.
                last_line_y_bottom = line_y_bottom
                last_line_x_end = line_x_end

            cleaned_paragraph = clean_paragraph(paragraph)
            if cleaned_paragraph and not cleaned_paragraph.isspace():
                page_content += cleaned_paragraph

                no_paragraphs += 1

        # no_page counts kept pages in order; it is not the page id from the XML.
        no_page += 1
        if page_content and not page_content.isspace():
            f.write(page_content)
            # The second-to-last character is inspected because clean_line normally
            # leaves a trailing space; a letter there means the page stopped
            # mid-sentence. The length guard and the slice avoid IndexError on
            # pages with fewer than three characters.
            if len(page_content) >= 2 and page_content[-2].isalpha():
                print("page " + str(no_page) + "|" + "|".join(page_content[-3:]) + "|")
                print(page_content[-50:])

print('no_parags', no_paragraphs)