# Data acquisition from .pdf files

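This notebook extracts the text and metadata (document type, date, signer and sheet numbers) from the .pdf files in the dataset2/falencias dataset and saves them to a .csv file.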

Requirements:

*   textract - Enables easier extraction of text from .pdf files
*   pandas - Provides dataframes for storage, and conversion to .csv

The next two cells are only needed when running the notebook on the Google Colaboratory platform, with the datasets stored in Google Drive. Skip them when running locally.

In [None]:
!pip install textract

Setting up connection to Google Drive

In [None]:
# Mount Google Drive at /content/drive so the shared-drive folders are reachable
from google.colab import drive
drive.mount('/content/drive')

# Folder where the logs and the resulting .csv are written
PATH_save = '/content/drive/Shared drives/NLP Jurídico - Lab. Ciência de Dados'
# dataset2 location in Google Drive (one sub-directory per lawsuit)
PATH_falencias = '/content/drive/Shared drives/NLP Jurídico - Lab. Ciência de Dados/Dados PDFs falencias'

The next cell is only needed when running the notebook locally, with the datasets available on a local disk. It overrides the Google Drive paths above, so skip it when running on Colaboratory.

In [6]:
# Folder where the logs and the resulting .csv are written
PATH_save = '/run/media/raktanaka/KINGSTON'
# Local copy of dataset2 (one sub-directory per lawsuit)
PATH_falencias = '/run/media/raktanaka/KINGSTON/dataset2'

In [2]:
import os
import csv
import textract
import unicodedata
import re

In [3]:
def _parse_sheet_range(file_name):
    """Return the ``(first, last)`` sheet numbers encoded in a file name.

    Every closed ``(...)`` group is inspected from left to right and the first
    one containing ``pag`` is used, e.g. ``(pag 12 - 15)``, ``(pag. 12 - 15)``
    or ``(pag 7)``. Groups without ``pag`` (such as ``(AR)``) are skipped.
    Returns ``(0, 0)`` when the name has no such group.

    Raises:
        ValueError: a group contains ``pag`` but no parsable sheet number.
    """
    for group in re.findall(r'\(([^()]*)\)', file_name):
        if 'pag' not in group:
            continue
        match = re.fullmatch(r'\s*pag\.?\s*(\d+)\s*(?:-\s*(\d+)\s*)?', group)
        if match is None:
            raise ValueError('unparsable page group: (' + group + ')')
        first = int(match.group(1))
        # A single number means the document starts and ends on the same sheet
        last = int(match.group(2)) if match.group(2) else first
        return first, last
    return 0, 0


def _extract_signature_date(text):
    """Return the 10 characters following the first date marker found, or ''."""
    for marker in ('protocolado em', 'liberado nos autos em'):
        marker_i = text.find(marker)
        if marker_i != -1:
            # Skip the marker and the single separator character after it
            date_i = marker_i + len(marker) + 1
            return text[date_i:date_i + 10]  # dd/mm/yyyy
    return ''


def _extract_signer_name(text):
    """Return the text between 'assinado digitalmente por ' and the next comma, or ''."""
    marker = 'assinado digitalmente por'
    marker_i = text.find(marker)
    if marker_i == -1:
        return ''
    comma_i = text.find(',', marker_i)
    if comma_i == -1:
        return ''
    return text[marker_i + len(marker) + 1:comma_i]


def _strip_boilerplate(text):
    """Remove the digital-signature verification notices from the extracted text."""
    # Raw strings avoid the invalid "\." escape warning; "A-za-z" was a typo
    # for "A-Za-z" (the former also matches "[", "\", "]", "^", "_" and "`").
    patterns = (
        r'(para conferir o original)(.+?)(e código [0-9A-Za-z]+\.)',
        r'(este documento é cópia do original).+?(às [0-9]{2}:[0-9]{2} , sob o número [A-Za-z0-9]+\.)',
        r'(este documento é cópia do original).+?(liberado nos autos em [0-9]{2}/[0-9]{2}/[0-9]{4} às [0-9]{2}:[0-9]{2} . )',
    )
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text


def GetPDFData(suit, path_pdf, log_succ, log_fail):
    """Extract the text and metadata of one PDF file into a CSV row.

    The document type and the sheet range come from the file name
    (``<type> (...) (pag <first> - <last>).pdf``); the text, signature date and
    signer name come from the PDF content read with ``textract``.

    Args:
        suit: Lawsuit identifier, stored in the row and written to the logs.
        path_pdf: Path of the file to process.
        log_succ: Writable text stream; receives ``"<suit> <file name>\\n"``
            when the file is processed.
        log_fail: Writable text stream; receives the same line when the file is
            not a ``.pdf`` or any step fails.

    Returns:
        A dict with the keys ``n_processo``, ``tipo_documento``, ``string``,
        ``data_doc``, ``assinado_por``, ``n_folha_inicio`` and ``n_folha_fim``,
        or an empty dict when the file could not be processed.
    """
    full_file_name = os.path.split(path_pdf)[1]
    row_dict = {}

    try:
        # Only .pdf files can be processed; anything else is logged as a failure
        if not (len(full_file_name) > 4 and full_file_name.endswith('.pdf')):
            raise ValueError('not a .pdf file: ' + full_file_name)

        # The document type is whatever precedes the first "(...)" group; when
        # there is none, the file name without its extension is used.
        paren_i = full_file_name.find('(')
        if paren_i != -1:
            doc_type = full_file_name[:paren_i].rstrip()
        else:
            doc_type = os.path.splitext(full_file_name)[0]
        # Normalizes graphic accentuation
        doc_type = unicodedata.normalize('NFC', doc_type).lower()

        # Gets the initial and final page from the file name
        begin_sheet_no, end_sheet_no = _parse_sheet_range(full_file_name)

        # Extracts the string from the PDF
        text = textract.process(path_pdf).decode("utf-8")
        text = unicodedata.normalize('NFC', text).lower()
        text = text.replace('\n', '')

        # Date and signer are read before the boilerplate is stripped, because
        # the markers they rely on are part of the removed notices.
        signature_date = _extract_signature_date(text)
        signer_name = _extract_signer_name(text)
        text = _strip_boilerplate(text)

        row = {'n_processo': suit, 'tipo_documento': doc_type,
               'string': text, 'data_doc': signature_date, 'assinado_por': signer_name,
               'n_folha_inicio': begin_sheet_no, 'n_folha_fim': end_sheet_no}

        log_succ.write(suit + ' ' + full_file_name + '\n')
        row_dict = row

    except Exception:
        # "except Exception" lets KeyboardInterrupt/SystemExit stop a long run,
        # while any per-file problem is still only logged.
        log_fail.write(suit + ' ' + full_file_name + '\n')

    return row_dict

In [4]:
def walk_dirs(root, dirs):
    """Recursively list the files under ``os.path.join(root, dirs)``.

    Args:
        root: Parent directory.
        dirs: Name of the directory, relative to ``root``, to list.

    Returns:
        A list of file paths relative to that directory: its own files first,
        followed by the files of each subdirectory prefixed with the
        subdirectory name. Empty when the directory cannot be listed.
    """
    path = os.path.join(root, dirs)
    # os.walk yields nothing for a missing/unreadable directory; the default
    # keeps next() from raising StopIteration in that case.
    subroot, subdirs, files = next(os.walk(path), (path, [], []))

    for each_dir in subdirs:
        files.extend(os.path.join(each_dir, f)
                     for f in walk_dirs(subroot, each_dir))

    return files
            


In [8]:
# Columns of the csv with pdf data (same keys as the rows built by GetPDFData)
csv_columns = ['n_processo', 'tipo_documento', 'string', 'data_doc', 'assinado_por', 'n_folha_inicio', 'n_folha_fim']

# Fail early with a clear message instead of an opaque StopIteration below
if not os.path.isdir(PATH_falencias):
    raise FileNotFoundError('Dataset directory not found: ' + PATH_falencias)

# Files: success and failure logs, and the csv with pdf data.
# The context managers close all three files even if processing raises;
# newline='' is what the csv module requires of the file it writes to.
with open(os.path.join(PATH_save, 'pdf_success.log'), 'w', encoding='utf-8') as log_succ, \
        open(os.path.join(PATH_save, 'pdf_error.log'), 'w', encoding='utf-8') as log_fail, \
        open(os.path.join(PATH_save, 'csv_pdf.csv'), 'w', encoding='utf-8', newline='') as csv_pdf:
    writer = csv.DictWriter(csv_pdf, fieldnames=csv_columns)
    writer.writeheader()

    # Get all processes = directories
    root, dirs, files = next(os.walk(PATH_falencias))
    dirs.sort()

    for suit in dirs:
        suit_path = os.path.join(root, suit)
        # The default keeps an unreadable lawsuit directory from aborting the run
        p_root, p_dirs, p_files = next(os.walk(suit_path), (suit_path, [], []))
        # If has subdirectories, gets the pdfs inside
        for each_dir in p_dirs:
            # Skips hidden or system directories
            if each_dir.startswith('.') or each_dir.startswith('System'):
                continue
            subfiles = walk_dirs(p_root, each_dir)
            p_files.extend(os.path.join(each_dir, f) for f in subfiles)

        # With the list of all pdfs, starts processing the data
        p_files.sort()
        for pdf in p_files:
            # Skips hidden or system files; the check uses the base name so
            # that files inside subdirectories are filtered as well
            base_name = os.path.basename(pdf)
            if base_name.startswith('.') or base_name.startswith('System'):
                continue
            row_dict = GetPDFData(suit, os.path.join(p_root, pdf), log_succ, log_fail)
            if row_dict:
                writer.writerow(row_dict)