In [1]:
# !pip install googletrans
# !pip install pdfminer
# !pip install pdfminer.six

In [1]:
# https://stackabuse.com/text-translation-with-google-translate-api-in-python/
import io, os, json
import googletrans
from os import listdir
from os.path import isfile, join
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def extract_text_by_page(pdf_path):
    """Yield the extracted text of each page of a PDF, one string per page.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file; it is opened in binary mode.

    Yields
    ------
    str
        Whatever pdfminer's ``TextConverter`` wrote for the current page.

    Notes
    -----
    A fresh resource manager, text buffer and converter are created for every
    page. They are released in a ``finally`` block *before* the text is
    yielded, so they are closed even when ``process_page`` raises or when the
    consumer abandons the generator part-way through.
    """
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            resource_manager = PDFResourceManager()
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager, converter)
                page_interpreter.process_page(page)
                # Read the buffer while it is still open.
                text = fake_file_handle.getvalue()
            finally:
                # close open handles, whether or not the page could be processed
                converter.close()
                fake_file_handle.close()

            yield text
    
def extract_text(pdf_path, extracted_text):
    """Append the text of every page of `pdf_path` to `extracted_text`.

    The list passed in is modified in place and the same list object is
    returned.
    """
    extracted_text.extend(extract_text_by_page(pdf_path))
    return extracted_text

In [2]:
def getFirstCapital(string):
    """Return the index of the first uppercase character in `string`.

    Returns ``False`` when `string` contains no uppercase character. Note that
    ``False == 0`` in Python, so callers comparing the result with ``0`` or
    using it as a slice start treat "not found" like position 0.
    """
    return next(
        (index for index, character in enumerate(string) if character.isupper()),
        False,
    )

def getCapitalNumber(string):
    """Return the number of uppercase characters in `string`.

    The result is always an ``int``. The counter used to be initialised with
    ``False``, so a string without capitals produced the boolean ``False``
    instead of ``0``. Both compare equal, so existing comparisons such as
    ``< 2`` and ``== 1`` behave as before.
    """
    return sum(1 for letter in string if letter.isupper())

In [52]:
def extract_variable_name_page(splited_content, variable_page_list):
    """Collect ``name -> page`` pairs from a tokenised table of contents.

    Tokens are read pairwise starting at index 1: the token at an odd index is
    taken as a variable name and the token right after it as its page number.

    Parameters
    ----------
    splited_content : list
        Tokens of the contents section.
    variable_page_list : dict
        Mapping that is updated in place with the pairs found.

    Returns
    -------
    dict
        The same `variable_page_list` object.

    Notes
    -----
    Scanning stops in one of two ways:

    * a name with fewer than two uppercase characters was recorded (it is
      still stored, then the scan ends);
    * the token after a name is not an integer. The tokens from the name up to
      (not including) the next all-digit token are then joined with spaces and
      stored as one multi-word name with that number as its page.

    Only a failed integer conversion (``ValueError``/``TypeError``) triggers
    the multi-word fallback. The previous bare ``except`` also swallowed
    unrelated errors such as ``KeyboardInterrupt``.
    """
    # A name needs a following token, hence the upper bound of len - 1.
    for item in range(1, len(splited_content) - 1, 2):
        variable_name = splited_content[item]
        try:
            variable_page = int(splited_content[item + 1])
        except (ValueError, TypeError):
            # The name spans several tokens: look for the page number that ends it.
            for after_item in range(item + 1, len(splited_content)):
                candidate = splited_content[after_item]
                if candidate.isdigit():
                    end_variable = ' '.join(splited_content[item:after_item])
                    variable_page_list.update({end_variable: int(candidate)})
                    # The multi-word entry is always the last one recorded.
                    return variable_page_list
            # No page number found after this token; try the next pair.
            continue

        variable_page_list.update({variable_name: variable_page})

        if getCapitalNumber(variable_name) < 2:
            break

    return variable_page_list


def extract_variable_info(extracted_text):
    """Build the ``name -> page`` mapping for one document.

    Parameters
    ----------
    extracted_text : iterable of str
        Page texts, one string per page.

    Returns
    -------
    dict
        First key is always ``'Beschikbare bestand'`` (a text note, or ``None``
        when the phrase does not occur). The remaining entries are whatever
        ``extract_variable_name_page`` adds.

    Raises
    ------
    UnboundLocalError
        When no page contains "Toelichting op de variabelen": ``content`` is
        then never assigned. The ``__main__`` loop of this file catches this
        exception type explicitly, so the behaviour is relied upon.
    """

    ### Extract Beschikbare bestand information ###
    variable_page_list = dict()

    # Only the first page that mentions the phrase is used.
    for each_page in extracted_text:
        if "Beschikbare bestand" in each_page:
            # Text after the first occurrence (up to the next occurrence, if any).
            important_note = each_page.split("Beschikbare bestand")[1]
            # Keep from the first uppercase character up to 16 characters before the end.
            # getFirstCapital returns False when there is no capital, which slices from 0.
            # NOTE(review): the -16 presumably trims a fixed-length trailer - confirm.
            variable_page_list.update({'Beschikbare bestand':important_note[getFirstCapital(important_note):-16]})
            break

    # Nothing stored means the phrase was not found on any page.
    if not variable_page_list:
        variable_page_list.update({'Beschikbare bestand':None})
        print("This file does not have Beschikbare bestand. ")


    ### Extract variable list from content ###
    # First page containing the heading: take the text after it, with periods turned into spaces.
    for each_page in extracted_text:
        if "Toelichting op de variabelen" in each_page:
            content = each_page.split('Toelichting op de variabelen', 1)[1].replace('.',' ')
            break

    # Raises UnboundLocalError here when the heading was never found (see docstring).
    splited_content = [string for string in content.split(' ') if string != ""]
    variable_page_list = extract_variable_name_page(splited_content, variable_page_list)



    ### METHOD 2 to extract variable names and pages ###
    # Fallback, used only when method 1 added nothing beyond 'Beschikbare bestand'.
    if len(variable_page_list) < 2:
        new_splited_content = []
        # Unlike method 1, every page containing the heading is scanned.
        for each_page in extracted_text:
            if "Toelichting op de variabelen" in each_page:
                # Split the text after the heading on periods.
                for string in each_page.split('Toelichting op de variabelen', 1)[1].split('.'):
                    if string != "" and len(string)>3:
                        # Digits among the first three characters form the page number.
                        page_num_str = str()
                        for character in range(0, 3):
                            if string[character].isdigit():
                                page_num_str = page_num_str + string[character]
                        if page_num_str:
                            # Appended as: page number (int), then the piece with all digits removed.
                            new_splited_content.append(int(page_num_str))
                            new_splited_content.append(''.join(i for i in string if not i.isdigit()))

        variable_page_list = extract_variable_name_page(new_splited_content, variable_page_list)

    return variable_page_list

In [53]:
def combine_all_variable_page(variable_page_list, extracted_text):
    """Concatenate the pages that hold the variable descriptions into one string.

    The page range runs from the page of the second key of
    `variable_page_list` (minus one, used as a list index) up to, but not
    including, the page of the last key.

    * Start page: the text before "Toelichting op de variabelen", minus its
      last 4 characters, is remembered as the page header; the text after the
      heading starts the result.
    * Last page of the range: if it contains the last key, the text before
      that key, minus its last 9 characters, is appended.
    * Pages in between: the header is removed and the text from the first
      uppercase character onwards is appended.

    Finally the header and form-feed characters are removed from the result.

    Raises
    ------
    UnboundLocalError
        When the start page lacks the heading (after printing a message) or
        the page range is empty, because the header/result locals were never
        assigned. The ``__main__`` loop of this file catches this type.
    IndexError
        When fewer than two keys exist or a page index is out of range.
    """
    heading = "Toelichting op de variabelen"

    names = list(variable_page_list.keys())
    first_page = int(variable_page_list[names[1]]) - 1
    stop_page = int(variable_page_list[names[-1]])
    closing_name = names[-1]

    for page_index in range(first_page, stop_page):
        page_text = extracted_text[page_index]

        if page_index == first_page:
            if heading in page_text:
                pieces = page_text.split(heading)
                # Header of the first page; reused to strip headers from later pages.
                page_header = pieces[0][:-4]
                all_variables_description = pieces[1]
            else:
                # The locals stay unassigned on purpose: the next use raises
                # UnboundLocalError, which the caller treats as a failed file.
                print("Variable description does not start with the start page.")
            continue

        # The page number is still part of the head after this replacement.
        headerless_text = page_text.replace(page_header, '')

        if page_index == stop_page - 1:
            # Keep only what precedes the closing entry on the final page.
            if closing_name in page_text:
                all_variables_description = (
                    all_variables_description + page_text.split(closing_name)[0][:-9]
                )
        else:
            all_variables_description = (
                all_variables_description
                + headerless_text[getFirstCapital(headerless_text):]
            )

    # The final page was appended with its header intact; strip it along with form feeds.
    return all_variables_description.replace(page_header, '').replace('\x0c', '')

In [54]:
### if the next word starts with a capital letter then it's a sentence after variable name ###
def check_true_variable(all_variables_description_words_level, word_index):
    """Decide whether the word at `word_index` is a real variable heading.

    The word counts as a variable when the word after it starts with an
    uppercase character and contains exactly one uppercase character.

    Parameters
    ----------
    all_variables_description_words_level : list of str
        The description text split into words.
    word_index : int
        Position of the candidate variable name.

    Returns
    -------
    int or str
        `word_index` when the candidate qualifies, otherwise the sentinel
        string ``"Not a variable"``.

    Notes
    -----
    A candidate that is the very last word has no following word and is
    reported as ``"Not a variable"``. Previously this raised ``IndexError``.
    """
    next_index = word_index + 1
    if next_index >= len(all_variables_description_words_level):
        # Nothing follows the candidate, so it cannot introduce a description.
        return "Not a variable"

    next_word = all_variables_description_words_level[next_index]
    if getFirstCapital(next_word) == 0 and getCapitalNumber(next_word) == 1: ## there is a sentence after the variable ##
        return word_index
    return "Not a variable"

In [55]:
def make_variable_dictionary(all_variables_description, input_variable_page_list):
    """Split the combined description text into one entry per variable.

    Parameters
    ----------
    all_variables_description : str
        Concatenated description text of all variables.
    input_variable_page_list : dict
        ``name -> page`` mapping; must contain the key ``'Beschikbare bestand'``.

    Returns
    -------
    dict
        ``'Beschikbare bestand'`` plus, for every variable that was located,
        its description text (``None`` when the end of the description could
        not be located).

    Notes
    -----
    The first and last keys are never looked up as variables themselves; the
    last key only serves as the end marker of the second-to-last one.
    """
    not_found = "Not a variable"

    each_variable_description = {
        'Beschikbare bestand': input_variable_page_list['Beschikbare bestand']
    }

    ### for the no-space problem e.g., variablecontext (no space between variable and following text)
    original_names = list(input_variable_page_list.keys())
    print(original_names)

    # Every key except the first gets its spaces removed, and each occurrence in
    # the text is padded with spaces so it becomes a separate word.
    # NOTE(review): the original condition also tested `var == len(keys)`, which a
    # range index can never equal; possibly `len(keys) - 1` was intended - confirm.
    variable_page_list = {}
    for position, original_name in enumerate(original_names):
        page = input_variable_page_list[original_name]
        if position == 0:
            variable_page_list[original_name] = page
            continue

        compact_name = original_name.replace(" ", "")
        all_variables_description = all_variables_description.replace(
            original_name, ' ' + compact_name + ' '
        )
        variable_page_list[compact_name] = page

    words = [word for word in all_variables_description.split(' ') if word != ""]
    names = list(variable_page_list.keys())
    final_variable_index = len(names) - 2

    for variable_index in range(1, len(names) - 1):
        current_name = names[variable_index]
        following_name = names[variable_index + 1]
        pre_variable_pos = not_found
        post_variable_pos = not_found

        for word_index, word in enumerate(words):
            if pre_variable_pos == not_found:
                ### still looking for the variable name itself ###
                if word == current_name:
                    pre_variable_pos = check_true_variable(words, word_index)
                continue

            # Start found: look for where the next variable begins.
            if word == following_name:
                post_variable_pos = check_true_variable(words, word_index)
                if post_variable_pos != not_found:
                    break
            elif variable_index == final_variable_index:
                # The final variable runs to the end of the text.
                post_variable_pos = len(words) - 1
                break

        ### write each variable and its description ###
        if pre_variable_pos == not_found:
            print("Did not find (start) %s variable." % current_name)
        elif post_variable_pos == not_found:
            print("Did not find (end) %s variable." % following_name)
            each_variable_description[words[pre_variable_pos]] = None
        else:
            each_variable_description[words[pre_variable_pos]] = ' '.join(
                words[pre_variable_pos + 1:post_variable_pos - 1]
            )

    return each_variable_description

In [56]:
def check_folder_exist(file_path):
    """Classify `file_path` on disk.

    Returns
    -------
    bool or str
        ``False`` when nothing exists at the path, the string ``"is_a_file"``
        when it is a regular file, and ``True`` for anything else that exists
        (treated as a directory).
    """
    # Nothing at this location.
    if not os.path.exists(file_path):
        return False

    # Something exists: tell files apart from everything else.
    if os.path.isfile(file_path):
        return "is_a_file"

    return True

def create_new_folder(file_path):
    """Create the directory `file_path` if nothing exists there yet.

    Missing parent directories are created as well; ``os.mkdir`` used to fail
    with ``FileNotFoundError`` when the parent was absent. A message is
    printed only when the directory is actually created. An existing file or
    directory at the path is left untouched.
    """
    # Same condition as `not check_folder_exist(file_path)`: that helper is
    # falsy only when the path does not exist.
    if not os.path.exists(file_path):
        # exist_ok tolerates the directory appearing between the check and the call.
        os.makedirs(file_path, exist_ok=True)
        print(file_path + " has been created. ")

In [1]:
import pdfminer
if __name__ == '__main__':
    # Batch driver: for every sub-folder of ./MicrodataFiles/, extract the
    # variable descriptions of each file and write them as JSON to
    # ./ExtractFiles/<category>/keyInfo_<file name>.json.
    # Files whose output already exists are skipped, so the cell can be re-run.

    category_folders = [x[0] for x in os.walk(os.getcwd() + "/MicrodataFiles/")]
    count_fails_total = 0
    count_files_total = 0

    # Index 0 is the MicrodataFiles root itself, so start at 1.
    for each in range(1, len(category_folders)):
        count_fails = 0
        count_files = 0

        # basename instead of split('/') so the folder name is also right when
        # os.walk joins nested paths with a different separator.
        category_name = os.path.basename(category_folders[each])

        extracted_pdfs = [f for f in listdir(category_folders[each]) if isfile(join(category_folders[each], f))]

        for each_pdf in extracted_pdfs:

            input_pdf = category_folders[each] + '/' + each_pdf
            count_files = count_files + 1
            count_files_total = count_files_total + 1

            output_folder_path = os.getcwd() + "/ExtractFiles/" + category_name
            create_new_folder(output_folder_path)
            # NOTE(review): [:-4] assumes a four-character extension such as ".pdf" - confirm.
            output_label = category_name + "/keyInfo_%s.json" %(each_pdf[:-4])
            output_file_path = output_folder_path + "/keyInfo_%s.json" %(each_pdf[:-4])

            if not check_folder_exist(output_file_path):
                try:

                    extracted_text = []
                    extracted_text = extract_text(input_pdf, extracted_text)

                    variable_page_list = extract_variable_info(extracted_text)

                    all_variables_description = combine_all_variable_page(variable_page_list, extracted_text)

                    each_variable_description = make_variable_dictionary(all_variables_description, variable_page_list)

                    with open(output_file_path, 'w') as output_file:
                        output_file.write(json.dumps(each_variable_description))

                    print(output_label, " generated! ")

                # Known failure modes: a missing section surfaces as
                # UnboundLocalError, unexpected layout as IndexError, and an
                # unreadable PDF as one of the pdfminer errors. Each is reported
                # and counted; anything else still propagates.
                except (UnboundLocalError,
                        IndexError,
                        pdfminer.pdfdocument.PDFSyntaxError,
                        pdfminer.psparser.PSEOF) as failure:
                    print("-----> %s %s " %(type(failure).__name__, output_label))
                    count_fails = count_fails + 1
                    count_fails_total = count_fails_total + 1

        print("In this catogory, there are %s files."%str(count_files))
        print("Has %s failed variable extraction." %str(count_fails))
        print("************ %s *******************" %category_name )
        print()

    print("<----- In total, there are %s files. -----> \n" %str(count_files_total))
    print("<----- %s files failed to be extracted. --> \n" %str(count_fails_total))