# Data acquisition from .pdf files

This notebook extracts the text and the page range of every PDF file in the dataset2/falencias dataset and saves the result as CSV files.

Requirements:

*   textract - Enables easier extraction of text from .pdf files
*   pandas - Merges the per-process CSV files into a single CSV file

The next cells are needed to run the notebook inside the Google Colaboratory platform, with the datasets in Google Drive

In [None]:
!pip install textract

Setting up connection to Google Drive

In [None]:
# google.colab is provided by the Google Colaboratory runtime
from google.colab import drive
# Mounts Google Drive so that its contents are reachable under /content/drive
drive.mount('/content/drive')

# dataset2 location in Google Drive (one sub-directory per process)
PATH_falencias = '/content/drive/Shared drives/NLP Jurídico - Lab. Ciência de Dados/Dados PDFs falencias'
# Directory where the generated CSV files and the error log are written
PATH_save = '/content/drive/Shared drives/NLP Jurídico - Lab. Ciência de Dados'

The next cell is needed to run the notebook locally, with the datasets available on a local disk

In [37]:
import sys

# The local paths are only applied outside Google Colaboratory. Without this
# guard, running every cell in Colab would silently replace the Google Drive
# paths defined by the Colab setup cell with paths that do not exist there.
if 'google.colab' not in sys.modules:
    # dataset2 location in the local disk (one sub-directory per process)
    PATH_falencias = '/home/raktanaka/USP/falenciaspdf'
    # Directory where the generated CSV files and the error log are written
    PATH_save = '/home/raktanaka/USP/'

In [3]:
import os
import csv
import textract
import unicodedata
import re
import pandas as pd

In [4]:
# Page marker inside one "(...)" group of a file name, e.g. "pag 12",
# "pag 12 - 15" or "pag. 12-15". Group 1 is the first sheet, group 2 (optional)
# is the last sheet.
_SHEET_RANGE_RE = re.compile(r'pag\.?\s*(\d+)(?:\s*-\s*(\d+))?\s*$')


def _parse_pdf_file_name(full_file_name):
    """Split a PDF file name into document type and sheet range.

    The expected layout is "<document type> (...) (pag <first> - <last>).pdf",
    where any number of unrelated "(...)" groups (such as "(AR)") may appear
    before the page marker and "- <last>" is optional.

    Returns:
        (doc_type, begin_sheet_no, end_sheet_no). The sheet numbers are 0 when
        the file name has no page marker.

    Raises:
        ValueError: a "(...pag...)" group exists but none holds readable
            sheet numbers.
    """
    first_parenthesis_i = full_file_name.find('(')
    if first_parenthesis_i == -1:
        # No parenthesis at all: the whole name (minus extension) is the type.
        return os.path.splitext(full_file_name)[0], 0, 0

    doc_type = full_file_name[:first_parenthesis_i].rstrip()

    # Only balanced "(...)" groups are inspected, so an unclosed parenthesis
    # can neither be mistaken for a page marker nor stall the parsing.
    marker_seen = False
    for group in re.findall(r'\(([^()]*)\)', full_file_name):
        if 'pag' not in group:
            continue
        marker_seen = True
        match = _SHEET_RANGE_RE.search(group)
        if match:
            begin_sheet_no = int(match.group(1))
            # A single page number means the document starts and ends on it.
            end_sheet_no = int(match.group(2)) if match.group(2) else begin_sheet_no
            return doc_type, begin_sheet_no, end_sheet_no

    if marker_seen:
        raise ValueError('unreadable page marker in ' + repr(full_file_name))
    return doc_type, 0, 0


def get_pdf_data(suit, path_pdf, log_fail):
    """Extract the text and file-name metadata of one PDF file.

    Args:
        suit: process identifier, stored in the 'n_processo' column.
        path_pdf: path of the file to read.
        log_fail: writable text stream; one "<suit> <file name>" line is
            written to it for every file that could not be processed.

    Returns:
        A dict with the keys 'n_processo', 'tipo_documento', 'string',
        'n_folha_inicio' and 'n_folha_fim', or an empty dict when the file is
        not a .pdf, its name cannot be parsed or the text extraction fails.
    """
    full_file_name = os.path.split(path_pdf)[1]

    try:
        if not (len(full_file_name) > 4 and full_file_name.endswith('.pdf')):
            raise ValueError('not a .pdf file: ' + repr(full_file_name))

        doc_type, begin_sheet_no, end_sheet_no = _parse_pdf_file_name(full_file_name)
        text = textract.process(path_pdf).decode('utf-8')
    except Exception:
        # Best effort: a file that cannot be processed is logged and skipped.
        # Exception (not a bare except) lets Ctrl-C interrupt a long run.
        log_fail.write(suit + ' ' + full_file_name + '\n')
        return {}

    row_dict = {'n_processo': suit, 'tipo_documento': doc_type,
                'string': text, 'n_folha_inicio': begin_sheet_no,
                'n_folha_fim': end_sheet_no}
    return row_dict

In [5]:
def walk_dirs(root, dirs):
    """List every file below the directory `dirs` located inside `root`.

    Args:
        root: parent directory.
        dirs: name of the directory, relative to `root`, to be listed.

    Returns:
        A list of file paths relative to `root/dirs`: the files directly in
        that directory first, followed by the files of each sub-directory
        (found recursively) prefixed with the sub-directory name.
    """
    base = os.path.join(root, dirs)
    # Only the first os.walk() result is needed: the listing of `base` itself.
    current, children, collected = next(os.walk(base))

    for child in children:
        nested = walk_dirs(current, child)
        collected.extend(os.path.join(child, name) for name in nested)

    return collected
            


In [6]:
# Natural sort, so the pdf files are ordered alphanumerically:
# Documento1, Documento2, Documento10
# And not
# Documento1, Documento10, Documento2
def NaturalSort(l):
    """Return a new list with the strings of `l` in natural (alphanumeric) order.

    Runs of ASCII digits are compared as integers and everything else as
    text, so 'Documento2' comes before 'Documento10'.
    """
    def alphanum_key(key):
        # The capturing group makes re.split keep the digit runs in the result
        return [int(chunk) if chunk.isdigit() else chunk
                for chunk in re.split('([0-9]+)', key)]

    return sorted(l, key=alphanum_key)

In [17]:
csv_columns = ['n_processo', 'tipo_documento', 'string', 'n_folha_inicio', 'n_folha_fim']


def _is_hidden_or_system(rel_path):
    """Return True if any component of `rel_path` starts with '.' or 'System'."""
    return any(part.startswith(('.', 'System'))
               for part in rel_path.split(os.sep))


# Each direct sub-directory of PATH_falencias holds the PDFs of one process
root, dirs, files = next(os.walk(PATH_falencias))
dirs = NaturalSort(dirs)

# The failure log is opened in append mode, so it keeps the entries of
# previous runs; the `with` block closes it even if the extraction is aborted.
with open(os.path.join(PATH_save, 'pdf_error.log'), 'a', encoding='utf-8') as log_fail:
    for suit in dirs:
        csv_single_name = os.path.join(PATH_save, suit + '.csv')
        # 'w' instead of 'a': the header is written on every run, so appending
        # would leave duplicated header lines and rows after a re-run.
        with open(csv_single_name, 'w', newline='', encoding='utf-8') as csv_single:
            csv_writer_single = csv.DictWriter(csv_single, fieldnames=csv_columns)
            csv_writer_single.writeheader()
            p_root, p_dirs, p_files = next(os.walk(os.path.join(root, suit)))

            # If the process has subdirectories, gets the pdfs inside them
            for each_dir in NaturalSort(p_dirs):
                # Skips hidden or system directories
                if _is_hidden_or_system(each_dir):
                    continue
                p_files.extend(os.path.join(each_dir, f)
                               for f in walk_dirs(p_root, each_dir))

            # With the list of all pdfs, starts processing the data
            p_files = NaturalSort(p_files)
            for pdf in p_files:
                # Skips hidden or system entries at any depth: `pdf` may be a
                # relative path such as 'subdir/.hidden.pdf'
                if _is_hidden_or_system(pdf):
                    continue
                row_dict = get_pdf_data(suit, os.path.join(p_root, pdf), log_fail)
                if row_dict:
                    csv_writer_single.writerow(row_dict)


In [39]:
csv_combined_name = 'dados_pdf_falencias.csv'

# The extraction step writes one '<process directory name>.csv' per process
# into PATH_save, so the process directories found in PATH_falencias tell
# exactly which files have to be merged.
process_names = [name for name in os.listdir(PATH_falencias)
                 if os.path.isdir(os.path.join(PATH_falencias, name))]
csv_parts = [name + '.csv' for name in NaturalSort(process_names)]

csv_list = []
for each in csv_parts:
    csv_path = os.path.join(PATH_save, each)
    try:
        csv_list.append(pd.read_csv(csv_path))
    except (OSError, UnicodeDecodeError,
            pd.errors.EmptyDataError, pd.errors.ParserError) as error:
        # Best effort: an unreadable part is reported and left out of the merge
        print('Skipping {}: {}'.format(csv_path, error))

if not csv_list:
    raise ValueError('No per-process CSV file could be read from ' + PATH_save)

csv_combined = pd.concat(csv_list)
csv_combined.to_csv(os.path.join(PATH_save, csv_combined_name), index=False)
