In [54]:
pip install pymupdf



In [55]:
import pandas as pd
import fitz
import numpy as np

In [56]:
def extract_word_positions(pdf_path, page_num):
    """Extract the words of one PDF page together with their bounding boxes.

    Lines whose writing direction has a zero cosine (i.e. text rotated by
    90° or 270°) are redacted from the in-memory page before the words are
    read, so they do not show up in the result. The document is never saved,
    so the file on disk is left untouched.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open.
    page_num : int
        Zero-based index of the page to load.

    Returns
    -------
    df_words : pandas.DataFrame
        One row per word with columns ``page_num, x0, y0, x1, y1, word_text,
        block_no, line_no, word_no``.
    page_width : list
        Single-element list holding the width of the loaded page.
    """
    columns = ["page_num", "x0", "y0", "x1", "y1", "word_text", "block_no", "line_no", "word_no"]

    # The context manager guarantees the document is closed even if
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)

        text_dict = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)
        for block in text_dict["blocks"]:
            for line in block["lines"]:
                # line["dir"] is the writing direction as (cosine, sine);
                # a zero cosine means the line runs vertically.
                if line["dir"][0] == 0:
                    page.add_redact_annot(line["bbox"])
        page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)

        page_width = [page.rect.width]
        # Each entry is (x0, y0, x1, y1, word_text, block_no, line_no, word_no).
        words = page.get_text("words")

    word_data = [[page_num, *word] for word in words]
    df_words = pd.DataFrame(word_data, columns=columns)
    return df_words, page_width

def merge_words_into_lines(df_words, page_widths, y_threshold=5):
    """Group words into visual lines and flag lines confined to the left half.

    Words are sorted by ``page_num``, ``y0`` and ``x0``. A word joins the
    current line when it is on the same page as the previous word and its
    ``y0`` differs from the previous word's ``y0`` by at most ``y_threshold``;
    otherwise it starts a new line. Lines are numbered from 0 in that order.

    Parameters
    ----------
    df_words : pandas.DataFrame
        Needs the columns ``page_num, x0, y0, x1, y1, word_text``.
    page_widths : sequence
        Page widths; only the first entry is used (the caller in this
        notebook passes a single-element list for the page being processed).
    y_threshold : float, default 5
        Maximum ``y0`` difference between consecutive words of one line.

    Returns
    -------
    df_lines : pandas.DataFrame
        One row per word with columns ``line_num, x0, y0, x1, y1, word_text``.
    flagged_lines : list
        Line numbers whose words all start left of half the page width.
    """
    df_words = df_words.sort_values(by=["page_num", "y0", "x0"]).reset_index(drop=True)

    # Pass 1: split the sorted words into lines.
    lines = []
    current_line = []
    for _, row in df_words.iterrows():
        if current_line:
            prev_word = current_line[-1]
            same_line = (
                row["page_num"] == prev_word["page_num"]
                and abs(row["y0"] - prev_word["y0"]) <= y_threshold
            )
            if not same_line:
                lines.append(current_line)
                current_line = []
        current_line.append(row)
    if current_line:
        lines.append(current_line)

    # Pass 2: flatten the lines and flag the ones that stay in the left half.
    line_data = []
    flagged_lines = []
    if lines:
        # Loop-invariant: five tenths of the (single) page width.
        left_half_limit = 5 * (page_widths[0] / 10)
        for line_num, words in enumerate(lines):
            if all(word["x0"] < left_half_limit for word in words):
                flagged_lines.append(line_num)
            for word in words:
                line_data.append(
                    [line_num, word["x0"], word["y0"], word["x1"], word["y1"], word["word_text"]]
                )

    df_lines = pd.DataFrame(line_data, columns=["line_num", "x0", "y0", "x1", "y1", "word_text"])
    return df_lines, flagged_lines

def find_lines_with_large_x_diff(df_lines, threshold=50):
    """Return the line numbers that contain a word gap larger than ``threshold``.

    Parameters
    ----------
    df_lines : pandas.DataFrame
        Frame with ``line_num`` and ``distance`` columns (in this notebook,
        the output of ``calculate_distances``).
    threshold : float, default 50
        A gap must be strictly greater than this value to count.

    Returns
    -------
    numpy.ndarray
        Unique ``line_num`` values, in order of first appearance. Empty when
        the frame has no ``distance`` column, which is what an empty,
        column-less frame looks like when no line held two or more words.
    """
    if "distance" not in df_lines.columns:
        # Nothing to compare: report no lines instead of raising KeyError.
        return pd.Series([], dtype="int64").unique()

    has_large_gap = df_lines["distance"] > threshold
    return df_lines.loc[has_large_gap, "line_num"].unique()

# Function to calculate distances between words in the same line
def calculate_distances(df):
    """Compute the horizontal gap between neighbouring words of each line.

    Within every ``line_num`` group the words are ordered by ``x0``; for each
    adjacent pair the gap is the right word's ``x0`` minus the left word's
    ``x1``, rounded to the nearest integer.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``line_num, x0, x1, word_text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``line_num, word1, word2, distance``, one row per adjacent
        word pair. The columns are present even when there are no pairs, so
        callers can always select ``distance``.
    """
    columns = ['line_num', 'word1', 'word2', 'distance']
    distances = []

    for line_num, group in df.groupby('line_num'):
        group = group.sort_values('x0')
        starts = group['x0'].tolist()
        ends = group['x1'].tolist()
        texts = group['word_text'].tolist()

        # Pair every word with the one immediately to its left.
        for idx in range(1, len(texts)):
            distances.append({
                'line_num': line_num,
                'word1': texts[idx - 1],
                'word2': texts[idx],
                'distance': round(starts[idx] - ends[idx - 1]),
            })

    return pd.DataFrame(distances, columns=columns)

def union_of_lists(list1, list2):
    """Return a list holding every distinct element of ``list1`` and ``list2``.

    The result comes from a set, so duplicates are dropped and the element
    order is not guaranteed.
    """
    merged = set(list1).union(set(list2))
    return list(merged)

def consecutive_sublists(lst, min_length=3):
    """Split ``lst`` into runs of consecutive integers and keep the long ones.

    The values are sorted first. A run continues while each value is exactly
    one more than the previous one; runs with fewer than ``min_length``
    members are discarded.

    Returns a list of runs, each run being a list of values.
    """
    runs = []
    run = []

    for value in sorted(lst):
        # A break in the +1 sequence closes the current run.
        if run and value != run[-1] + 1:
            if len(run) >= min_length:
                runs.append(run)
            run = []
        run.append(value)

    # The last run is still open when the loop ends.
    if len(run) >= min_length:
        runs.append(run)

    return runs

def check_y_diff_within_threshold_text_classification(df, sublist, threshold=45):
    """Check that the lines in ``sublist`` are vertically close to each other.

    The ``y0`` of the first row of every selected line is collected. While
    some gap between consecutive ``y0`` values exceeds ``threshold``, the
    value at the first offending position is removed and the gaps are
    recomputed.

    Returns ``True`` as soon as no gap exceeds ``threshold``, and ``False``
    once a removal leaves two or fewer values.
    """
    selected = df[df['line_num'].isin(sublist)]
    line_heads = selected.groupby('line_num').first().reset_index()
    y0_values = line_heads['y0'].values

    while True:
        gaps = abs(pd.Series(y0_values).diff().dropna()).tolist()
        # Repeat the first gap at the front so gap i lines up with value i.
        gaps = pd.Series(gaps[:1] + gaps)

        too_large = gaps[gaps > threshold].index
        if too_large.empty:
            return True

        # Drop the value at the first position whose gap is too large.
        y0_values = list(y0_values)
        del y0_values[too_large[0]]

        if len(y0_values) <= 2:
            return False

def get_y0(df, sublist):
    """Return the vertical extent covered by the lines in ``sublist``.

    Only the first row of each selected line is considered. The result is
    ``(y0 of the first line, y1 of the last line)``, with lines ordered by
    ``line_num``.
    """
    in_sublist = df['line_num'].isin(sublist)
    line_heads = df[in_sublist].groupby('line_num').first().reset_index()

    tops = line_heads['y0'].values
    bottoms = line_heads['y1'].values
    return (tops[0], bottoms[-1])

def get_pdf_page_dimensions(pdf_path, page_number = 5):
    """Return the ``(width, height)`` of one page of a PDF.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to open.
    page_number : int, default 5
        Zero-based index of the page to measure.

    Returns
    -------
    tuple
        ``(width, height)`` taken from the page rectangle.
    """
    # Open via a context manager so the file handle is released on exit.
    with fitz.open(pdf_path) as pdf_document:
        page_rect = pdf_document.load_page(page_number).rect
        return page_rect.width, page_rect.height

def calculate_overlap(region, range_header, range_footnote):
    """Return how much of ``region`` lies in the header or footnote band, in percent.

    All three arguments are ``(start, end)`` pairs on the same axis. The
    overlap with each band is computed separately and the larger one is
    expressed as a percentage of the region's own length.

    Note: a region whose start equals its end has zero length and is not
    guarded against here.
    """
    region_start, region_end = region

    def _shared_length(band):
        # Length of the intersection of the region with one band (never negative).
        band_start, band_end = band
        return max(0, min(region_end, band_end) - max(region_start, band_start))

    header_overlap = _shared_length(range_header)
    footnote_overlap = _shared_length(range_footnote)
    largest_overlap = max(header_overlap, footnote_overlap)

    return (largest_overlap / (region_end - region_start)) * 100

def filter_regions(regions, range_header, range_footnote, threshold=50):
    """Keep the regions that do not sit mostly inside the header or footnote band.

    A region is kept when its overlap percentage, as reported by
    ``calculate_overlap``, is at most ``threshold``. The input order is
    preserved.
    """
    return [
        region
        for region in regions
        if calculate_overlap(region, range_header, range_footnote) <= threshold
    ]


def pymupdf_extract_blocks_with_coords(page, adjusted_bbox_matrix):
    """Collect the words inside a bounding box and merge touching words into blocks.

    ``adjusted_bbox_matrix[0]`` supplies ``x0, y0, x1, y1`` of the box. A word
    from ``page.get_text("words")`` is considered when its left edge lies
    strictly between the box's x limits and its vertical midpoint lies
    strictly between the box's y limits. A considered word is appended to the
    current block when it starts less than one unit to the right of the
    previous word's right edge and the two vertical midpoints differ by less
    than one unit; otherwise it opens a new block.

    Returns five parallel lists: block texts (words joined by a space), then
    the ``x0`` / ``y0`` of each block's first word and the ``x1`` / ``y1`` of
    each block's last word.
    """
    bbox = adjusted_bbox_matrix[0]
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]

    groups = []
    pending = []
    for word in page.get_text("words"):
        if not (left < word[0] < right and top < (word[1] + word[3]) / 2 < bottom):
            continue  # outside the box: ignored, the open block stays open

        if pending:
            last = pending[-1]
            gap_x = word[0] - last[2]
            offset_y = abs((word[1] + word[3]) - (last[1] + last[3])) / 2
            if not (0 < gap_x < 1 and offset_y < 1):
                # Not adjacent to the previous word: close the open block.
                groups.append(pending)
                pending = []
        pending.append(word)

    # Close whatever block is still open after the last word.
    if pending:
        groups.append(pending)

    text_blocks = [' '.join([w[4] for w in group]) for group in groups]
    x0_list = [group[0][0] for group in groups]
    y0_list = [group[0][1] for group in groups]
    x1_list = [group[-1][2] for group in groups]
    y1_list = [group[-1][3] for group in groups]
    return text_blocks, x0_list, y0_list, x1_list, y1_list

def find_unique_lines_row(df):
    """Cluster the ``Y0`` values of ``df`` into distinct row positions.

    The mean box height (``Y1 - Y0``) divided by 1.6 is used as the
    sensitivity. Walking through ``Y0`` in frame order, a value becomes a new
    row position unless it is closer than the sensitivity to a position that
    was already kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Y0`` and ``Y1``.

    Returns
    -------
    list
        The kept ``Y0`` values in order of first appearance; empty for an
        empty frame.
    """
    if df.empty:
        # No boxes means no rows; also avoids dividing by zero below.
        return []

    heights = (df['Y1'] - df['Y0']).tolist()
    sensitivity = (sum(heights) / len(heights)) / 1.6

    h_lines = []
    for y0 in df['Y0']:
        if not any(abs(y0 - kept) < sensitivity for kept in h_lines):
            h_lines.append(y0)

    return h_lines

def find_unique_lines_column(df):
    """Cluster the ``X0`` values of ``df`` into distinct column positions.

    Half of the mean box width (``X1 - X0``) is used as the sensitivity.
    Walking through ``X0`` in frame order, a value becomes a new column
    position unless it is closer than the sensitivity to a position that was
    already kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``X0`` and ``X1``.

    Returns
    -------
    list
        The kept ``X0`` values in order of first appearance; empty for an
        empty frame.
    """
    if df.empty:
        # No boxes means no columns; also avoids dividing by zero below.
        return []

    widths = (df['X1'] - df['X0']).tolist()
    sensitivity = (sum(widths) / len(widths)) / 2

    v_lines = []
    for x0 in df['X0']:
        if not any(abs(x0 - kept) < sensitivity for kept in v_lines):
            v_lines.append(x0)

    return v_lines

def find_segment(midpoint, lines):
    """Return the index of the first entry of ``lines`` that is greater than ``midpoint``.

    When no entry is greater, ``len(lines)`` is returned, which denotes the
    last segment.
    """
    first_greater = (idx for idx, boundary in enumerate(lines) if midpoint < boundary)
    return next(first_greater, len(lines))

def assign_row_col_numbers(df, row_lines, column_lines):
    """Add midpoint and row/column index columns to ``df`` and return it.

    ``df`` is modified in place: ``MidX`` / ``MidY`` hold the centre of each
    box, and ``RowNumber`` / ``ColumnNumber`` hold the segment that
    ``find_segment`` reports for that centre against ``row_lines`` and
    ``column_lines`` respectively.
    """
    def row_of(mid_y):
        return find_segment(mid_y, row_lines)

    def column_of(mid_x):
        return find_segment(mid_x, column_lines)

    # Box centres.
    df['MidX'] = (df['X0'] + df['X1']) / 2
    df['MidY'] = (df['Y0'] + df['Y1']) / 2

    # Segment indices derived from the centres.
    df['RowNumber'] = df['MidY'].apply(row_of)
    df['ColumnNumber'] = df['MidX'].apply(column_of)

    return df

In [57]:
pdf_path = "enter_your_pdf_path_here"

In [65]:
# List the page numbers of the pages that contain tables. Page numbers are zero-based.
page_numbers = [0]

In [66]:
# Extract one DataFrame per detected table region on every requested page.
main_dfs = []

for number_page in page_numbers:
    print(number_page)
    print(type(number_page))
    number_page = int(number_page)
    print(type(number_page))

    df_words, page_widths = extract_word_positions(pdf_path, page_num=number_page)
    df_lines, flagged_lines = merge_words_into_lines(df_words, page_widths)

    # Candidate table lines: lines kept to the left half of the page, plus
    # lines that contain a large horizontal gap between two words.
    distances_df = calculate_distances(df_lines)
    text_lines = find_lines_with_large_x_diff(distances_df)
    result = sorted(union_of_lists(flagged_lines, list(text_lines)))

    # Build a new list rather than deleting from the list being iterated:
    # deleting in place makes the loop skip the element after each removal.
    result_sublists = [
        sublist
        for sublist in consecutive_sublists(result)
        if check_y_diff_within_threshold_text_classification(df_lines, sublist, threshold=60)
    ]

    # The top and bottom tenth of the page are treated as header / footnote.
    width, height = get_pdf_page_dimensions(pdf_path, page_number=number_page)
    footnote_start = height - height / 10
    footnote_end = height
    range_footnote = [footnote_start, footnote_end]
    range_header = [0, height / 10]

    result_sublist_bbox = [get_y0(df_lines, sublist) for sublist in result_sublists]

    new_result_sublist_bbox = filter_regions(result_sublist_bbox, range_header, range_footnote)
    new_result_sublist_bbox = sorted(new_result_sublist_bbox, key=lambda x: x[0], reverse=False)

    # Open the document once per page (not once per region) and make sure it
    # is closed again afterwards.
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(number_page)

        for region in new_result_sublist_bbox:
            # Horizontal limits are 4% and 96% of the page width.
            x0, y0, x1, y1 = page_widths[0] * 0.04, region[0], page_widths[0] * 0.96, region[1]
            adjusted_bbox_matrix = [[x0, y0, x1, y1]]

            text_blocks, x0_list, y0_list, x1_list, y1_list = pymupdf_extract_blocks_with_coords(
                page, adjusted_bbox_matrix
            )
            new_df = pd.DataFrame({
                'Text': text_blocks,
                'X0': x0_list,
                'Y0': y0_list,
                'X1': x1_list,
                'Y1': y1_list,
            })
            if new_df.empty:
                # No words fell inside this region, so there is no table to build.
                continue

            df_new = new_df.copy()
            row_lines = sorted(find_unique_lines_row(df_new))
            column_lines = sorted(find_unique_lines_column(df_new))

            updated_df = assign_row_col_numbers(df_new, row_lines, column_lines)

            # Concatenate the texts that land in the same (row, column) cell.
            grouped = updated_df.groupby(['RowNumber', 'ColumnNumber'])['Text'].agg(' '.join).reset_index()

            # Pivot to a row x column grid; cells without text become ''.
            mapped_df = grouped.pivot(index='RowNumber', columns='ColumnNumber', values='Text')
            mapped_df = mapped_df.fillna('')

            # Use the first row as the header, then drop it from the body.
            new_header = mapped_df.iloc[0:1].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=0)
            mapped_df.columns = new_header
            mapped_df = mapped_df.iloc[1:].reset_index(drop=True)

            main_dfs.append(mapped_df)





0
<class 'int'>
<class 'int'>


In [67]:
#main_dfs[0]

Unnamed: 0,No,Firstname,Lastname,Idno,Umusaruro,Agaciro,Unnamed: 7
0,1.0,ABAKORANABUSHAKE,ITSINDA,1111111111111.0,983.0,289985,
1,2.0,AHORUKOMEYE,J PAUL,1199380161305001.0,523.0,154285,
2,3.0,AYINKAMIYE,CHRISTINE,1196370012352041.0,607.0,179065,
3,4.0,AYINKAMIYE,FRANCOISE,1197670021505022.0,481.0,141895,
4,5.0,AYINKAMIYE,MEDIATRICE,1198870031986046.0,130.0,38350,
5,6.0,BIMENYIMANA,JEAN,1195480006633081.0,593.0,174935,
6,7.0,BIMENYIMANA,MARC,1195880009341078.0,606.0,178770,
7,8.0,BIMENYIMANA,TELESPHORE,1195480006920099.0,536.0,158120,
8,9.0,BIZIMANA,AIMABLE,1198680034667064.0,208.0,61360,
9,10.0,BIZIMANA,ALEXIS,1197480100533036.0,331.0,97645,
