In [20]:
# https://stackabuse.com/text-translation-with-google-translate-api-in-python/
import io, os, json
import googletrans
from os import listdir
from os.path import isfile, join
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def extract_text_by_page(pdf_path):
    """Yield the extracted text of a PDF, one string per page.

    This is a generator: the file is opened lazily on the first ``next()``
    call and stays open until the generator is exhausted or closed.

    Args:
        pdf_path: Path of the PDF file to read (opened in binary mode).

    Yields:
        str: The text pdfminer's ``TextConverter`` produced for one page.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
        Any exception pdfminer raises while parsing propagates unchanged.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close the per-page handles *before* yielding so they are
                # released even if processing fails or the consumer stops
                # iterating early (previously they were closed only when the
                # generator was resumed after the yield).
                converter.close()
                fake_file_handle.close()

            yield text
    
def extract_text(pdf_path, extracted_text):
    """Append the text of every page of ``pdf_path`` to ``extracted_text``.

    Args:
        pdf_path: Path of the PDF file to read.
        extracted_text: List that receives one string per page; it is
            modified in place.

    Returns:
        The same ``extracted_text`` list that was passed in.
    """
    # The generator is consumed page by page, so pages extracted before a
    # parsing error are still present in the list.
    extracted_text.extend(extract_text_by_page(pdf_path))
    return extracted_text

In [21]:
def check_folder_exist(file_path):
    """Report what, if anything, exists at ``file_path``.

    Args:
        file_path: Path to inspect.

    Returns:
        ``False`` when nothing exists at the path, the string
        ``"is_a_file"`` when it is a regular file, and ``True`` for any
        other existing entry (i.e. a directory).  Note that
        ``"is_a_file"`` is truthy, so a plain truth test only answers
        "does something exist here?".
    """
    # Nothing at this path at all.
    if not os.path.exists(file_path):
        return False

    # Something exists; tell files apart from directories.
    if os.path.isfile(file_path):
        return "is_a_file"

    return True

def create_new_folder(file_path):
    """Create the directory ``file_path`` if nothing exists there yet.

    Missing parent directories are created as well, so a nested output
    path can be created in one call.  If anything (a file or a directory)
    already exists at ``file_path`` the function does nothing.

    Args:
        file_path: Path of the directory to create.
    """
    # Same condition as a falsy check_folder_exist(): nothing at this path.
    if os.path.exists(file_path):
        return

    # makedirs (not mkdir) so a missing parent does not raise
    # FileNotFoundError; exist_ok covers a concurrent creation.
    os.makedirs(file_path, exist_ok=True)
    print(file_path + " has been created. ")

In [22]:
def extract_title_date(extracted_text_test):
    """Pull a date and a title out of the first page of extracted text.

    The first page is split on single spaces (empty tokens dropped) and
    scanned for two marker tokens:

    * a token containing ``"Datum"``: the three tokens that follow are
      joined with ``-`` and stored under ``"Date"``;
    * a token containing ``"Documentat"``: all tokens that follow are
      joined with spaces and stored under ``"Title"``.

    When a marker occurs more than once, the last occurrence wins.

    Args:
        extracted_text_test: Sequence of page texts; only element 0 is used.

    Returns:
        dict: Always contains ``"Title"`` (an empty list when no title
        marker was found) and contains ``"Date"`` only when a date marker
        followed by three tokens was found.
    """
    extracted_title_date_value = dict()

    # A document without any page has no cover page to inspect.
    if not extracted_text_test:
        extracted_title_date_value.update({"Title": []})
        return extracted_title_date_value

    splited_cover_page = [string for string in extracted_text_test[0].split(' ') if string != ""]

    title = []
    for position, word in enumerate(splited_cover_page):
        if "Datum" in word:
            date_parts = splited_cover_page[position + 1:position + 4]
            # Only accept a complete date; a marker too close to the end of
            # the page used to raise IndexError.
            if len(date_parts) == 3:
                extracted_title_date_value.update({"Date": '-'.join(date_parts)})

        if "Documentat" in word:
            title = ' '.join(splited_cover_page[position + 1:])

    extracted_title_date_value.update({"Title": title})

    return extracted_title_date_value

In [1]:
import pdfminer
# Importing the package alone does not guarantee its submodules are loaded as
# attributes; import the ones referenced in the except clauses explicitly.
import pdfminer.pdfdocument
import pdfminer.psparser

if __name__ == '__main__':

    # Input: one sub-folder per category below ./MicrodataFiles.
    # Output: ./ExtractVariableTitle/<category>/title_<pdf name>.json
    input_root = join(os.getcwd(), "MicrodataFiles")
    output_root = join(os.getcwd(), "ExtractVariableTitle")

    # os.walk yields the root folder first; every later entry is a sub-folder.
    category_folders = [x[0] for x in os.walk(input_root)]

    for category_folder in category_folders[1:]:

        extracted_pdfs = [f for f in listdir(category_folder) if isfile(join(category_folder, f))]
        if not extracted_pdfs:
            continue

        # Make sure the output root exists before creating the category folder.
        create_new_folder(output_root)
        output_folder_path = join(output_root, os.path.basename(category_folder))
        create_new_folder(output_folder_path)

        for each_pdf in extracted_pdfs:

            input_pdf = join(category_folder, each_pdf)
            output_file_path = join(output_folder_path,
                                    "title_%s.json" % os.path.splitext(each_pdf)[0])

            # Skip documents whose result file was written by an earlier run.
            if check_folder_exist(output_file_path):
                continue

            try:
                extracted_text = extract_text(input_pdf, [])
                extracted_title_date_value = extract_title_date(extracted_text)
            except pdfminer.psparser.PSEOF:
                print('Unexpected EOF')
                print("%s" % input_pdf)
                continue
            except pdfminer.pdfdocument.PDFSyntaxError:
                print('Is this really a PDF?')
                print("%s" % input_pdf)
                continue

            # "Title" is always present in the result (possibly empty), so a
            # length check cannot detect a missing title; test both fields.
            if ("Date" not in extracted_title_date_value
                    or not extracted_title_date_value.get("Title")):
                print("Title and Date info is not complete!")
                print("%s" % input_pdf)

            with open(output_file_path, 'w') as output_file:
                output_file.write(json.dumps(extracted_title_date_value))
                    
                    