# Data acquisition from .pdf files

This notebook extracts the text and metadata of the PDF files in `dataset2/falencias` and stores them in a CSV file.

Requirements:

*   textract - Enables easier extraction of text from .pdf files
*   pandas - Provides dataframes for storage, and conversion to .csv

The next cells are only needed when running the notebook on the Google Colaboratory platform, with the datasets stored in Google Drive.

In [None]:
!pip install textract

Setting up connection to Google Drive

In [None]:
# Google Drive is only mounted when the Colab runtime is available, so running
# every cell on a local machine skips this setup instead of failing on the import.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available - skipping the Google Drive setup')
else:
    drive.mount('/content/drive')

    # dataset2 location in Google Drive
    PATH_falencias = '/content/drive/Shared drives/NLP Jurídico - Lab. Ciência de Dados/Dados PDFs falencias'
    PATH_save = '/content/drive/Shared drives/NLP Jurídico - Lab. Ciência de Dados'

The next cell is only needed when running the notebook locally, with the datasets available on a local disk.

In [1]:
import sys

# Local dataset and output locations. They are only applied outside Google
# Colab, so that running every cell on Colab does not replace the Google Drive
# paths with paths that do not exist there.
if 'google.colab' not in sys.modules:
    PATH_falencias = '/run/media/raktanaka/KINGSTON/dataset2/falenciaspdf'
    PATH_save = '/run/media/raktanaka/KINGSTON'

In [189]:
import os
import csv
import textract
import unicodedata
import re

In [190]:
# Parenthesised chunks of a file name, e.g. "(AR)" or "(pag 12 - 34)".
_PAREN_GROUP_RE = re.compile(r'\(([^()]*)\)')
# Page marker at the start of such a chunk: "pag 12", "pag. 12 - 34", "pags 12-34".
_PAGE_RANGE_RE = re.compile(r'pag(?:inas?|s)?\.?\s*(\d+)(?:\s*-\s*(\d+))?')

# Phrases that precede the date in the extracted text, in order of preference.
_DATE_MARKERS = ("protocolado em", "liberado nos autos em")
_DATE_LENGTH = 10  # the 10 characters after the marker are taken as the date
_SIGNER_MARKER = "assinado digitalmente por"

# Boilerplate removed from the extracted text. Raw strings keep "\." a regex
# escape instead of an invalid string escape.
# NOTE(review): "A-z" in the first pattern also admits [ \ ] ^ _ ` ; it is kept
# unchanged so that exactly the same spans are removed as before.
_BOILERPLATE_RES = (
    re.compile(r'(para conferir o original)(.+?)(e código [0-9A-za-z]+\.)'),
    re.compile(r'(este documento é cópia do original).+?(às [0-9]{2}:[0-9]{2} , sob o número [A-Za-z0-9]+\.)'),
    re.compile(r'(este documento é cópia do original).+?(liberado nos autos em [0-9]{2}/[0-9]{2}/[0-9]{4} às [0-9]{2}:[0-9]{2} . )'),
)


def _parse_document_type(file_name):
    """Returns the lower-cased, NFC-normalized text before the first "(" (or the
    name without extension when there is no parenthesis)."""
    paren = file_name.find('(')
    if paren != -1:
        label = file_name[:paren].rstrip()
    else:
        label = os.path.splitext(file_name)[0]
    return unicodedata.normalize('NFC', label).lower()


def _parse_page_range(file_name):
    """Returns (first_page, last_page) from the first "(pag ...)" chunk, or (0, 0)."""
    # Names such as "Bla (AR) (pag 1 - 2).pdf" carry other chunks before the pages,
    # so every parenthesised chunk is inspected until one holds a page marker.
    for group in _PAREN_GROUP_RE.findall(file_name):
        match = _PAGE_RANGE_RE.match(group.strip())
        if match:
            first_page = int(match.group(1))
            last_page = int(match.group(2)) if match.group(2) else first_page
            return first_page, last_page
    return 0, 0


def _find_signature_date(text):
    """Returns the 10 characters that follow the first date marker found, or ''."""
    for marker in _DATE_MARKERS:
        position = text.find(marker)
        if position != -1:
            start = position + len(marker) + 1  # skips the marker and one separator
            return text[start:start + _DATE_LENGTH]
    return ''


def _find_signer(text):
    """Returns the text between the signer marker and the next comma, or ''."""
    position = text.find(_SIGNER_MARKER)
    if position == -1:
        return ''
    comma = text.find(',', position)
    if comma == -1:
        return ''
    return text[position + len(_SIGNER_MARKER) + 1:comma]


def GetPDFData(processo, path_pdf, log_succ, log_fail):
    """Extracts the text and metadata of one PDF file.

    Parameters:
        processo: process identifier (string); stored in the record and used
            as prefix of the log lines.
        path_pdf: path of the file to process.
        log_succ: writable text stream; receives "<processo> <file name>" for
            every document that was processed.
        log_fail: writable text stream; receives the same line for every
            document that could not be processed.

    Returns:
        A dict with the keys 'n_processo', 'tipo_documento', 'string',
        'data_doc', 'assinado_por', 'n_folha_inicio' and 'n_folha_fim', or an
        empty dict when the file is not a PDF or its processing failed.
        The page numbers are 0 when the file name has no "(pag ...)" chunk.
    """
    arquivo = os.path.split(path_pdf)[1]
    dic = {}

    try:
        # Files that are not PDFs are reported through the failure log.
        if not (len(arquivo) > 4 and arquivo.lower().endswith('.pdf')):
            raise ValueError('not a PDF file: ' + arquivo)

        tipo_documento = _parse_document_type(arquivo)
        n_folha_inicio, n_folha_fim = _parse_page_range(arquivo)

        # Extracts the string from the PDF
        text = textract.process(path_pdf).decode("utf-8")
        # Normalizes graphic accentuation, case and line breaks
        text = unicodedata.normalize('NFC', text).lower().replace('\n', '')

        # The date and signer must be read before the boilerplate is removed
        data_doc = _find_signature_date(text)
        nome_assinador = _find_signer(text)
        for pattern in _BOILERPLATE_RES:
            text = pattern.sub('', text)

        dic = {'n_processo': processo, 'tipo_documento': tipo_documento,
               'string': text, 'data_doc': data_doc, 'assinado_por': nome_assinador,
               'n_folha_inicio': n_folha_inicio, 'n_folha_fim': n_folha_fim}
        log_succ.write(processo + ' ' + arquivo + '\n')

    except Exception:
        # Best effort: one bad document must not stop the whole extraction, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        dic = {}
        log_fail.write(processo + ' ' + arquivo + '\n')

    return dic


In [191]:
def WalkDirs(root, dirs):
    """Lists every file below the directory `dirs` located inside `root`.

    Parameters:
        root: path of the parent directory.
        dirs: name of the directory (inside `root`) to be listed.

    Returns:
        A list with the files found directly in the directory followed by
        the files of its subdirectories, each one as a path relative to the
        listed directory. The list is empty when the directory is missing
        or cannot be read.
    """
    path = os.path.join(root, dirs)
    # os.walk yields nothing for a missing/unreadable directory; fall back to
    # an empty listing instead of letting StopIteration abort the whole crawl.
    subroot, subdirs, files = next(os.walk(path), (path, [], []))

    for each_dir in subdirs:
        files.extend(os.path.join(each_dir, f) for f in WalkDirs(subroot, each_dir))

    return files
            


In [203]:
# Files: success and failure logs, and the csv with pdf data
log_succ = open(os.path.join(PATH_save, 'pdf_success.log'), 'w')
log_fail = open(os.path.join(PATH_save, 'pdf_error.log'), 'w')
csv_pdf = open(os.path.join(PATH_save, 'csv_pdf.csv'), 'w')
csv_columns = ['n_processo', 'tipo_documento', 'string', 'data_doc', 'assinado_por', 'n_folha_inicio', 'n_folha_fim']
writer = csv.DictWriter(csv_pdf, fieldnames=csv_columns)
writer.writeheader()

# Get all processes = directories
root, dirs, files = next(os.walk(PATH_falencias))
dirs.sort()

for processo in dirs:
    p_root, p_dirs, p_files = next(os.walk(os.path.join(root, processo)))
    # If has subbdirectories, gets the pdfs inside
    if p_dirs:
        for each_dir in p_dirs:
            # Skips hidden or system directories
            if not each_dir.startswith('.') and not each_dir.startswith('System'):
                subfiles = WalkDirs(p_root, each_dir)
                subfiles = [os.path.join(each_dir, f) for f in subfiles]
                p_files.extend(subfiles)

    # With the list of all pdfs, starts processing the data
    p_files.sort()
    for pdf in p_files:
        # Skips hidden or system files
        if not pdf.startswith('.') and not pdf.startswith('System'):
            dic = GetPDFData(processo, os.path.join(p_root, pdf), log_succ, log_fail)
            writer.writerow(dic)

log_succ.close()
log_fail.close()
csv_pdf.close()