In [22]:
import os
import re
import pandas as pd
import cv2
import numpy as np
import layoutparser as lp
from tqdm import tqdm  

In [35]:
# Initialize the Google Cloud Vision (GCV) OCR agent used by process_image.
# Requires a Google Cloud Vision credentials file named "gcv_credentials.json"
# in the notebook's working directory; keep that file out of version control.
# NOTE(review): 'por' is passed as the language hint. GCV language hints are
# normally BCP-47 codes (e.g. 'pt') -- confirm this value is honored.
ocr_agent2 = lp.GCVAgent.with_credential(
    "gcv_credentials.json",
    languages=['por'])

In [43]:
# function to group text blocks by vertical distance
def group_blocks_by_distance(blocks, distance_th):
    """Split a sequence of text blocks into groups separated by vertical gaps.

    The blocks are processed in the order given; they are NOT re-sorted, so
    the caller must supply them in reading (top-to-bottom) order. A new group
    starts whenever the gap between a block and its predecessor exceeds
    ``distance_th``.

    Parameters
    ----------
    blocks : lp.Layout (or any sliceable sequence of blocks)
        Each block must expose ``coordinates`` indexable at positions 1 and 3
        (presumably ``(x_1, y_1, x_2, y_2)`` -- confirm against layoutparser).
    distance_th : number
        Gap threshold; a gap strictly greater than this starts a new group.

    Returns
    -------
    list of lp.Layout
        One Layout per group, in input order. An empty input yields an empty
        list (rather than a single empty group, which callers would trip over
        when indexing ``group[0]``).
    """
    # Nothing to group: avoid fabricating a lone empty Layout.
    if len(blocks) == 0:
        return []

    # Gap between each block's coordinates[1] and the previous block's
    # coordinates[3]; negative values mean the two blocks overlap vertically.
    gaps = np.array([
        b2.coordinates[1] - b1.coordinates[3]
        for b1, b2 in zip(blocks, blocks[1:])
    ])
    # The first block has no predecessor; a zero gap keeps it in group 0.
    gaps = np.append([0], gaps)

    # Running count of "gap exceeded" events == group index of each block.
    group_ids = (gaps > distance_th).cumsum()

    # cumsum of booleans is non-decreasing, so the last id is the largest.
    grouped = [lp.Layout([]) for _ in range(int(group_ids[-1]) + 1)]
    for gid, blk in zip(group_ids, blocks):
        grouped[gid].append(blk)
    # Return each group as a Layout object
    return grouped

def _align_groups(left, right, height_th, sep):
    """Pair two ordered lists of block groups by the y-position of their first block.

    Walks both lists in step. When the first blocks of the current groups are
    within ``height_th`` of each other they form one row; otherwise the group
    that sits higher is emitted alone with an empty string for the other side.
    The walk stops as soon as either list is exhausted (remaining groups in
    the other list are not emitted). Group texts are joined with ``sep``.
    """
    rows = []
    i = j = 0
    while i < len(left) and j < len(right):
        left_y = left[i][0].coordinates[1]
        right_y = right[j][0].coordinates[1]
        left_txt = sep.join(left[i].get_texts())
        right_txt = sep.join(right[j].get_texts())
        if abs(left_y - right_y) < height_th:
            i += 1
            j += 1
        elif left_y < right_y:
            # Left group has no counterpart on this line.
            i += 1
            right_txt = ''
        else:
            # Right group has no counterpart on this line.
            j += 1
            left_txt = ''
        rows.append([left_txt, right_txt])
    return rows


def _drop_empty_groups(groups):
    """Remove groups that contain no blocks so ``group[0]`` is always valid."""
    return [g for g in groups if len(g) > 0]


def process_image(image_path, height_th=50):
    """OCR one scanned results page and return its table as a DataFrame.

    The image is contrast-boosted, sent to the module-level ``ocr_agent2``
    and the word-level text is split into four fixed pixel regions
    (position, name, party, votes). The regions are grouped into lines and
    aligned pairwise by vertical position.

    Parameters
    ----------
    image_path : str
        Path to the page image. Municipality and year are parsed from the
        file name, which is expected to look like ``<digits><name>.pdf...``
        directly after a path separator.
    height_th : int, optional
        Maximum vertical distance (in pixels) between two groups for them to
        be treated as the same table row. Defaults to 50.

    Returns
    -------
    pandas.DataFrame
        Columns ``position, name, party, votes, flag, municipality, year``.
        ``flag`` is 0 for parsed rows. If fewer than two aligned rows are
        found, a single blank row with ``flag`` = 1 is returned instead so the
        page can be reviewed manually.

    Raises
    ------
    OSError
        If the image cannot be loaded from ``image_path``.
    """
    # load img and improve contrast
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path!r}")
    img = cv2.convertScaleAbs(img, alpha=1.3)

    # OCR
    res = ocr_agent2.detect(img, return_response=True)
    text = ocr_agent2.gather_full_text_annotation(
        res, agg_level=lp.GCVFeatureType.WORD
    )

    # define column bounds for position, names, party and votes
    # (hard-coded pixel regions; they assume every scan shares one layout)
    pos = text.filter_by(lp.Rectangle(x_1=200, y_1=700, x_2=400, y_2=1880))
    names = text.filter_by(lp.Rectangle(x_1=350, y_1=800, x_2=1200, y_2=1880))
    party = text.filter_by(lp.Rectangle(x_1=1300, y_1=800, x_2=1750, y_2=1850))
    nums = text.filter_by(lp.Rectangle(x_1=1850, y_1=800, x_2=2450, y_2=1850))

    # group and align first two columns (words joined with spaces)
    pos_groups = _drop_empty_groups(group_blocks_by_distance(pos, 0))
    name_groups = _drop_empty_groups(group_blocks_by_distance(names, 0))
    res1 = _align_groups(pos_groups, name_groups, height_th, ' ')

    # next two columns: the first group of each is skipped (presumably a
    # header line -- confirm against the scans); words joined without spaces
    party_groups = _drop_empty_groups(group_blocks_by_distance(party, 0)[1:])
    vote_groups = _drop_empty_groups(group_blocks_by_distance(nums, 0)[1:])
    res2 = _align_groups(party_groups, vote_groups, height_th, '')

    # duplicating rows in second result to match lengths: every party/votes
    # row after the first is repeated twice (presumably one ticket spans two
    # name lines, candidate and running mate -- confirm against the scans)
    if res2:
        adj2 = [res2[0]] + [item for sub in res2[1:] for item in (sub, sub)]
    else:
        adj2 = res2
    # combine (zip truncates to the shorter of the two lists)
    combined = [row1 + row2 for row1, row2 in zip(res1, adj2)]

    # convert to df; the first combined row is discarded
    if len(combined) > 1:
        df = pd.DataFrame(
            combined[1:],
            columns=["position", "name", "party", "votes"]
        )
        df['flag'] = 0
    else:
        # Nothing usable was extracted: emit a blank, flagged placeholder row.
        df = pd.DataFrame(
            columns=["position", "name", "party", "votes", "flag"]
        )
        df.loc[0] = ['', '', '', '', 1]

    # extract metadata from filename; normalise the OS path separator to '/'
    # so the patterns also match paths built with os.path.join on Windows
    normalized_path = image_path.replace(os.sep, '/')
    muni = re.findall(r'/\d+(.+)\.pdf', normalized_path)
    df['municipality'] = muni[0] if muni else ''
    yr = re.findall(r'/(\d+)', normalized_path)
    df['year'] = yr[0] if yr else ''

    return df

In [44]:
folder = 'Pictures_1972'

# Select the image files up front so the progress bar counts only the pages
# that are actually sent to OCR (not every entry in the directory).
image_paths = [
    os.path.join(folder, fname)
    for fname in sorted(os.listdir(folder))
    if fname.lower().endswith(('.png', '.jpg'))
]
if not image_paths:
    # pd.concat on an empty list fails with an unhelpful
    # "No objects to concatenate"; report the real problem instead.
    raise FileNotFoundError(f"No .png/.jpg images found in {folder!r}")

dfs = []
for path in tqdm(image_paths, desc="Processing images"):
    dfs.append(process_image(path))

# Final concatenated DataFrame
final_df = pd.concat(dfs, ignore_index=True)


Processing images: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 233/233 [07:15<00:00,  1.87s/it]


In [45]:
# Preview the first rows to sanity-check the OCR column alignment.
final_df.head()

Unnamed: 0,position,name,party,votes,flag,municipality,year
0,,ALVES . DA . ANUNCIAÇÃO ...,MDB.,2.045......,0,agudo,1972
1,VP -,GERALDO LOSEKANN,MDB.,2.045......,0,agudo,1972
2,Þ,HILDOR MAX LOSEKANN,ARENA,1.745,0,agudo,1972
3,VP -,RUI FERREIRA FEHN,ARENA,1.745,0,agudo,1972
4,-,,ARENA,.1.051,0,agudo,1972


In [46]:
# Save to CSV. Create the target directory first: DataFrame.to_csv does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('output', exist_ok=True)
final_df.to_csv('output/combined_output.csv', index=False)
