In [0]:
#!pip install PyPDF2
#or
#!conda install -c conda-forge -y PyPDF2

In [0]:
import csv
import io
import logging
import os
import re

import PyPDF2
import requests

## Logging

In [0]:
from logging.handlers import TimedRotatingFileHandler
logger = logging.getLogger('read_pdf_pypdf2')
logger.setLevel(logging.DEBUG)

trfh = TimedRotatingFileHandler('votings/pdf_extract.log', when="midnight", interval=1)
trfh.suffix = "%Y%m%d"
trfh.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(pathname)s:%(lineno)s - %(name)s - %(message)s')
trfh.setFormatter(formatter)

logger.addHandler(trfh)

In [0]:
#pdf_path = 'http://orka.sejm.gov.pl/Glos8.nsf/nazwa/86_1/$file/glos_87_1.pdf'

## Functions

In [0]:
def get_text_from_pdf_url(pdf_path):
    '''
    Gets textual info about a single voting
    '''
    text = ''
    response = requests.get(pdf_path)
    response.raise_for_status()        

    with io.BytesIO(response.content) as open_pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(open_pdf_file)
        num_pages = pdf_reader.getNumPages()

        for page in pdf_reader.pages:
            text += page.extractText()

    return text

In [0]:

def re_get_first_group( pattern, text, default='NA' ):
    '''
    Search `text` for `pattern` and return the first capture group of the
    first match; return `default` when the pattern does not match at all.
    '''
    match = re.search(pattern, text)
    if match is None:
        return default
    return match.group(1)

def get_voting_info_from_text(text):
    '''
    Gets info about a single voting based on textual representation.

    Each field is pulled out of `text` with a regular expression; a field whose
    pattern does not match is set to 'NA'.

    Returns:
        dict with keys, in this order: 'cadence', 'sitting_id',
        'voting_number', 'voting_date', 'voting_time', 'resolution_number'.
        'voting_number' is an int when it can be parsed, otherwise 'NA'.
        The key is always present so that every CSV built from this dict has
        the same columns.
    '''
    voting_info = dict()

    voting_info['cadence'] = re_get_first_group(r'Sejm RP (.*?) kaden', text)
    voting_info['sitting_id'] = re_get_first_group(r'POSIEDZENIE (.*?)\.', text)

    raw_voting_number = re_get_first_group(r'osowanie nr (.*?) ', text)
    try:
        voting_info['voting_number'] = int(raw_voting_number)
    except ValueError:
        # Either the pattern did not match ('NA') or the captured token is not
        # a number; fall back to the same placeholder the other fields use.
        voting_info['voting_number'] = 'NA'
        logger.debug('Got problem when obtaining voting_number', exc_info=1)

    voting_info['voting_date'] = re_get_first_group(r'\(([0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]) ', text)
    voting_info['voting_time'] = re_get_first_group(r' ([0-9][0-9]\:[0-9][0-9]\:[0-9][0-9])', text)
    voting_info['resolution_number'] = re_get_first_group(r'druku nr (.*?) ', text)

    return voting_info

In [0]:
def get_votes_from_text( text ):
    '''
    Gets detailed votes for a single voting out from a textual representation.

    The text is split on single spaces and scanned token by token:
      * a vote token closes the deputy name accumulated so far and emits a
        (party, name, vote) tuple;
      * the token 'G!OSOWA!O' marks a party summary line; the party is the
        second-to-last token accumulated before it;
      * any other token is appended to the name being accumulated, after which
        the boilerplate patterns (PDF watermark, per-party summary line) are
        stripped from it.

    Returns:
        list of (party, deputy_name, vote) tuples in order of appearance.

    Raises:
        IndexError: 'G!OSOWA!O' appears with fewer than two accumulated tokens
            before it.
    '''
    votes = []
    party = ''
    name = ''

    # Compiled once per call; raw strings avoid invalid-escape warnings.
    ignores = [
        re.compile(r'PDF stworzony przez wersj. demonstracyjn. pdfFactory Pro www.pdffactory.com [0-9]'),
        re.compile(r'.+ \(\d+\) - \d+ ZA - \d+ PRZECIW - \d+ WSTRZYM. - \d+ NIE G!OS. - \d+'),
    ]
    # The summary line above counts four categories (ZA / PRZECIW / WSTRZYM. /
    # NIE G!OS.), so abstentions need a token as well; without one an abstaining
    # deputy's name would be glued onto the next deputy's name.
    # NOTE(review): 'ws.' is assumed to be the abstention abbreviation used in
    # the PDFs - confirm against a voting that has abstentions.
    vote_words = {'za', 'pr.', 'ws.', 'ng.'}

    for txt in text.split(' '):
        if txt in vote_words:
            votes.append((party, name, txt))
            name = ''
        elif txt == 'G!OSOWA!O':
            # name looks like '<...> <party> (<count>)' at this point
            party = name.split(' ')[-2]
        else:
            if name != '':
                name += ' '
            name += txt

            # Strip boilerplate as soon as it is complete, so it never leaks
            # into a deputy name.
            for ignore in ignores:
                name = ignore.sub('', name)

    return votes

In [0]:
def store_votes_to_csv( voting_info, votes, csv_file_path ):
    '''
    Serialize single voting votes to csv.

    Parameters:
        voting_info: dict of voting-level fields; its keys become the leading
            columns and its values are repeated on every row.
        votes: iterable of (party, deputy, vote) sequences.
        csv_file_path: destination path; an existing file is overwritten.

    The file is written as UTF-8 and opened with newline='' as the csv module
    requires, so no blank line appears between rows on platforms that
    translate line endings.
    '''
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:

        votes_writer = csv.DictWriter(
            csv_file,
            delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL,
            fieldnames=list(voting_info) + ['party', 'deputy', 'vote']
        )

        votes_writer.writeheader()
        votes_writer.writerows(
            {**voting_info, 'party': vote[0], 'deputy': vote[1], 'vote': vote[2]}
            for vote in votes
        )
    

## The final function

In [0]:
def store_votings_to_csvs(
    cadence=8,
    init_sitting_id=1,
    last_sitting_id=100,
    init_voting_number=1,
    csv_file_pattern='voting-{cadence}_{sitting_id}_{voting_number}.csv'
):
    '''
    Process votings for given cadence and sitting range, storing each voting
    as a separate csv.

    Parameters:
        cadence: value substituted into the PDF URL and the csv file name.
        init_sitting_id: first sitting to process (inclusive).
        last_sitting_id: last sitting to process (inclusive).
        init_voting_number: voting number to start from in the FIRST sitting
            only (good for incremental loads); every later sitting starts
            from voting 1 so none of its votings are skipped.
        csv_file_pattern: str.format template for the output path; receives
            `cadence`, `sitting_id` and `voting_number`.

    For each sitting, voting numbers are tried one after another until
    fetching the PDF fails. An HTTP error ends the sitting quietly; any other
    download error also ends it and is logged as a warning. A failure while
    parsing or writing one voting is logged and the next voting is tried.
    '''
    logger.info(f'Start processing for: {cadence}, {init_sitting_id}, {last_sitting_id}, {init_voting_number}')

    pdf_path_pattern = (
        'http://orka.sejm.gov.pl/Glos{cadence}.nsf/nazwa/{sitting_id}_{voting_number}/$file/glos_{sitting_id}_{voting_number}.pdf'
    )

    for sitting_id in range(init_sitting_id, last_sitting_id + 1):
        # Resume point applies to the first sitting only.
        voting_number = init_voting_number if sitting_id == init_sitting_id else 1
        is_first_voting = True

        while True:
            pdf_path = pdf_path_pattern.format(cadence=cadence, sitting_id=sitting_id, voting_number=voting_number)

            fetched = False
            try:
                text = get_text_from_pdf_url(pdf_path)
                fetched = True
            except requests.exceptions.HTTPError:
                # Treated as "no such voting": end of this sitting.
                pass
            except Exception:
                logger.warning(
                    f'Failed to retrieve pdf for {cadence}_{sitting_id}_{voting_number}.\n{pdf_path}',
                    exc_info=1)

            if not fetched:
                if is_first_voting:
                    logger.debug(f'Sitting {cadence}_{sitting_id} does not exist.')
                else:
                    logger.debug(f'Voting {voting_number} out of bound for {cadence}_{sitting_id} sitting.')
                break

            is_first_voting = False

            try:
                csv_path = csv_file_pattern.format(
                    cadence=cadence, sitting_id=sitting_id, voting_number=voting_number)
                store_votes_to_csv(
                    get_voting_info_from_text(text),
                    get_votes_from_text(text),
                    csv_path
                )
            except Exception:
                logger.warning(f'Failed to process pdf for {cadence}_{sitting_id}_{voting_number}.', exc_info=1)
            else:
                logger.info(f'Processed pdf for {cadence}_{sitting_id}_{voting_number}.')
            finally:
                # Move on whether or not this voting could be processed.
                voting_number += 1


## Invocation example

In [0]:
# Cadence 8, sitting 86 only, starting from voting 40; one csv per voting.
store_votings_to_csvs(
    cadence=8,
    init_sitting_id=86,
    last_sitting_id=86,
    init_voting_number=40,
    csv_file_pattern='drive/My Drive/currents/votings/voting-{cadence}_{sitting_id}_{voting_number}.csv',
)

In [19]:

# Show the kernel's working directory; the relative paths used above
# ('votings/...', 'drive/My Drive/...') resolve against it.
# Requires `import os` in the notebook's import cell.
os.getcwd()

'/data/notebooks/data/politics'

In [21]:
import glob

# Collect every path in the data directory whose name ends in "sql".
sql_glob_pattern = "/data/notebooks/data/politics/*sql"
txtfiles = list(glob.glob(sql_glob_pattern))

txtfiles

['/data/notebooks/data/politics/ddl.sql']