In [17]:
import os

import pandas as pd


def get_book_df(book_dir):
    """Load every ``.tsv`` file found directly in ``book_dir`` into one DataFrame.

    Files are visited in sorted filename order. Each file is read as a
    tab-separated table whose first row is the header, and a ``page`` column
    is added holding the part of the filename before its first dot.

    Args:
        book_dir: Path of the directory that contains the ``.tsv`` files.

    Returns:
        A single DataFrame with the rows of all files stacked under a fresh
        0..n-1 index. An empty DataFrame is returned when the directory
        contains no ``.tsv`` files.
    """
    page_frames = []
    for filename in sorted(os.listdir(book_dir)):
        if not filename.endswith('.tsv'):
            continue
        page_df = pd.read_csv(os.path.join(book_dir, filename), sep='\t', header=0)
        page_df['page'] = filename.split('.')[0]
        page_frames.append(page_df)

    # pd.concat raises on an empty list, so keep the "no files" result explicit.
    if not page_frames:
        return pd.DataFrame()

    # Concatenate once at the end: growing a frame inside the loop re-copies
    # all previously loaded rows for every file, and seeding the result with
    # an empty DataFrame relies on deprecated concat behaviour in pandas 2.x.
    return pd.concat(page_frames, ignore_index=True)


In [18]:
# Both result folders live under the same base directory; each keeps its
# per-page TSV files in a 'tsvs' sub-folder.
book_base_dir = '../data/dag_results/pdf_processing/dysche_zhyg.pdf'

# Load each folder and immediately discard rows that contain any missing value.
book_df_1 = get_book_df(
    os.path.join(book_base_dir, 'kbd_0.009_4360_66700', 'tsvs')
).dropna()
book_df_2 = get_book_df(
    os.path.join(book_base_dir, 'kbd_0.229_2995_10800', 'tsvs')
).dropna()

In [41]:
# Write the describe() summary of each book frame to its own tab-separated file.
for _frame, _summary_path in (
    (book_df_1, 'book_df_1_describe.tsv'),
    (book_df_2, 'book_df_2_describe.tsv'),
):
    _frame.describe().to_csv(_summary_path, sep='\t')

In [27]:
# Columns whose values must all be equal for a row of book_df_1 to be paired
# with a row of book_df_2.
join_by = [
    'page',
    'text',
    'level',
    'page_num',
    'block_num',
    'par_num',
    'line_num',
]

# Inner join: only rows present in both frames survive. Columns that are not
# join keys but exist in both frames get the '_1' / '_2' suffixes.
inner_df = book_df_1.merge(book_df_2, how='inner', on=join_by, suffixes=('_1', '_2'))

In [28]:
# Narrow the inner-join result to the listed columns, then save it as TSV.
inner_show_cols = ['text', 'conf_1', 'conf_2']
inner_df = inner_df.loc[:, inner_show_cols]
inner_df.to_csv('inner_df.tsv', sep='\t')

In [37]:
left_show_cols = ['page', 'text', 'conf_1', 'conf_2']

# Left-join book_df_1 against book_df_2, then keep only the rows whose
# 'conf_2' is missing (presumably the rows that found no match in book_df_2),
# restricted to the columns listed above.
left_df = (
    book_df_1
    .merge(book_df_2, how='left', on=join_by, suffixes=('_1', '_2'))
    .loc[lambda merged: merged['conf_2'].isna(), left_show_cols]
)
left_df.to_csv('left_df.tsv', sep='\t')

In [38]:
right_show_cols = ['page', 'text', 'conf_1', 'conf_2']

# Right-join book_df_1 against book_df_2, then keep only the rows whose
# 'conf_1' is missing (presumably the rows that found no match in book_df_1),
# restricted to the columns listed above.
right_df = (
    book_df_1
    .merge(book_df_2, how='right', on=join_by, suffixes=('_1', '_2'))
    .loc[lambda merged: merged['conf_1'].isna(), right_show_cols]
)
right_df.to_csv('right_df.tsv', sep='\t')

In [40]:
# Keep only the rows whose 'conf' value is strictly below the threshold and
# save each filtered frame to a TSV file named after that threshold.
conf_threshold = 80

filtered_book_df_1 = book_df_1.loc[book_df_1['conf'].lt(conf_threshold)]
filtered_book_df_1.to_csv(f'filtered_{conf_threshold}_book_df_1.tsv', sep='\t')

filtered_book_df_2 = book_df_2.loc[book_df_2['conf'].lt(conf_threshold)]
filtered_book_df_2.to_csv(f'filtered_{conf_threshold}_book_df_2.tsv', sep='\t')