### Importy i ustawienia

In [1]:
import fitz
import os
import pickle
import pandas as pd
import ourfuncs as funk

In [2]:
# Input and output locations used by the cells below.
FOLDER_PATH = 'Doc/'  # folder with the source documents
PICKLE_PATH = 'Saved/textdata.pkl'  # target of funk.save_data for the extracted rows
CSV_PATH    = 'Saved/textdata.csv'  # CSV copy of the same rows

# Thresholds read by extract_paragraphs_from_page when it merges text blocks.
BLOCK_MIN_SENTENCES = 4  # an accumulated block with fewer counted sentences is glued onto the previous block
BLOCK_MAX_SENTENCES = 5  # once this many sentences are accumulated, the next counted sentence starts a new block
BLOCK_MIN_WORDS     = 20  # an accumulated block with fewer words keeps absorbing the next PDF block
SENTENCE_MIN_WORDS  = 3  # '.'-separated fragments with fewer words are kept as text but not counted as sentences

### Funkcje

In [3]:
def extract_paragraphs_from_page(page, min_sentences=None, max_sentences=None,
                                 min_words=None, sentence_min_words=None):
    """Regroup the text blocks of one page into blocks of a few sentences each.

    The page's blocks are read with ``page.get_text("blocks")`` and element 4
    of every block tuple is used as its text. Each text is split on ``'.'``;
    fragments with at least ``sentence_min_words`` words count as sentences,
    shorter fragments are kept as plain text without being counted. Sentences
    are accumulated until a block is long enough to be emitted.

    Args:
        page: Object exposing ``get_text("blocks")``.
        min_sentences: Minimum counted sentences for a stand-alone block.
            Defaults to ``BLOCK_MIN_SENTENCES``.
        max_sentences: Number of counted sentences after which the next
            sentence starts a new block. Defaults to ``BLOCK_MAX_SENTENCES``.
        min_words: Minimum words before an accumulated block is emitted at the
            end of a source block. Defaults to ``BLOCK_MIN_WORDS``.
        sentence_min_words: Minimum words for a fragment to count as a
            sentence. Defaults to ``SENTENCE_MIN_WORDS``.

    Returns:
        list[str]: The merged blocks in reading order. Text that never reaches
        the thresholds is appended to the last block or, when the page
        produced no block at all, returned as a single block instead of being
        dropped.
    """
    # Any threshold not passed explicitly comes from the notebook settings.
    if min_sentences is None:
        min_sentences = BLOCK_MIN_SENTENCES
    if max_sentences is None:
        max_sentences = BLOCK_MAX_SENTENCES
    if min_words is None:
        min_words = BLOCK_MIN_WORDS
    if sentence_min_words is None:
        sentence_min_words = SENTENCE_MIN_WORDS

    blocks = page.get_text("blocks")
    paragraphs = [block[4].strip() for block in blocks if block[4].strip()]

    merged_paragraphs = []
    temp_paragraph = ""
    temp_word_count = 0
    temp_sentence_count = 0

    for paragraph in paragraphs:
        sentences = [part.strip() for part in paragraph.split('.') if part.strip()]
        for sentence in sentences:
            words = sentence.split()
            if len(words) < sentence_min_words:
                # Too short to count as a sentence: keep the words, no period.
                temp_paragraph += sentence + " "
                temp_word_count += len(words)
            elif temp_sentence_count < max_sentences:
                temp_paragraph += sentence + ". "
                temp_sentence_count += 1
                temp_word_count += len(words)
            else:
                # Sentence limit reached: emit the block and start a new one.
                merged_paragraphs.append(temp_paragraph.strip())
                temp_paragraph = sentence + ". "
                temp_sentence_count = 1
                temp_word_count = len(words)

        if temp_word_count < min_words:
            # Too short so far: keep accumulating into the next source block.
            continue

        block_text = temp_paragraph.strip()
        if temp_sentence_count < min_sentences and merged_paragraphs:
            # Enough words but too few sentences: glue onto the previous block.
            merged_paragraphs[-1] += " " + block_text
        else:
            merged_paragraphs.append(block_text)
        temp_paragraph = ""
        temp_word_count = 0
        temp_sentence_count = 0

    # Anything still pending here is below the word minimum (otherwise it would
    # have been emitted above), so it can never qualify as a full block.
    leftover = temp_paragraph.strip()
    if leftover:
        if merged_paragraphs:
            merged_paragraphs[-1] += " " + leftover
        else:
            # Previously this text was silently discarded, so short pages
            # yielded no rows at all.
            merged_paragraphs.append(leftover)

    return merged_paragraphs



def extract_and_clean_text_from_folder(folder_path):
    """Extract whitespace-normalized text blocks from every PDF in a folder.

    Args:
        folder_path: Directory to scan; only names ending in ``.pdf``
            (case-insensitive) are processed.

    Returns:
        list[list]: One row per extracted block, in the form
        ``[document_name, page_number, block_number, word_count, text]``.
        Page and block numbers start at 1; block numbers restart on each page.
    """
    all_pdf_data = []
    # list_pdf_filenames prints the listing and returns the PDF names, so its
    # result is reused instead of scanning and filtering the directory again.
    for filename in list_pdf_filenames(folder_path):
        pdf_path = os.path.join(folder_path, filename)
        # The context manager closes the document even if a page fails to
        # parse; a trailing doc.close() would be skipped on an exception.
        with fitz.open(pdf_path) as doc:
            for page_index in range(len(doc)):
                page = doc.load_page(page_index)
                paragraphs = extract_paragraphs_from_page(page)
                for para_num, paragraph in enumerate(paragraphs, start=1):
                    # Collapse newlines and repeated whitespace to single spaces.
                    text = ' '.join(paragraph.split())
                    word_count = len(text.split())
                    all_pdf_data.append([filename, page_index + 1, para_num, word_count, text])
    return all_pdf_data


def list_pdf_filenames(folder_path):
    """Print a numbered listing of the PDF files in a folder and return their names.

    Args:
        folder_path: Directory to scan with ``os.listdir``.

    Returns:
        list[str]: Names ending in ``.pdf`` (case-insensitive), in the order
        ``os.listdir`` reported them.
    """
    pdf_filenames = []
    for entry in os.listdir(folder_path):
        if entry.lower().endswith('.pdf'):
            pdf_filenames.append(entry)

    print('Documents in folder:')
    for position, name in enumerate(pdf_filenames, start=1):
        print(f'\t {position} : {name}')
    return pdf_filenames


### Rozbicie PDF na tekst

In [4]:
# Extract every PDF in the folder into rows and wrap them in a DataFrame.
all_pdf_text_list   = extract_and_clean_text_from_folder(FOLDER_PATH)
all_pdf_text_df     = pd.DataFrame(
    all_pdf_text_list,
    columns=['Document', 'Page', 'Block', "WordCount", 'Text'],
)

# CSV copy (easier to open for a quick look); utf-8-sig writes a UTF-8 BOM.
all_pdf_text_df.to_csv(CSV_PATH, index=False, encoding='utf-8-sig')

# Save the raw list (other scripts will read it back).
funk.save_data(all_pdf_text_list, PICKLE_PATH)

Documents in folder:
	 1 : DDDD & DD.pdf
	 2 : Latające Taksówki - Wdrażanie Innowacji  - Projekt (Lange, 67619).pdf
	 3 : Ustawa.pdf
Data saved to Saved/textdata.pkl


### Sprawdzenie rozbijania na tekst

In [5]:
# Per-document word-count summary: smallest, largest and mean block size.
word_count_stats = (
    all_pdf_text_df
    .groupby('Document')['WordCount']
    .agg(['min', 'max', 'mean'])
    .reset_index()
    .rename(columns={'min': 'MinWordCount', 'max': 'MaxWordCount', 'mean': 'AvgWordCount'})
)

# Figures used to fine-tune the text export.
# Despite its name, blocks_per_document has one row per (Document, Page) with
# the number of blocks on that page.
blocks_per_document = (
    all_pdf_text_df
    .groupby(['Document', 'Page'])
    .size()
    .reset_index(name='Blocks')
)
# Median number of blocks per page, for each document.
median_blocks_per_document = (
    blocks_per_document
    .groupby('Document')['Blocks']
    .median()
    .reset_index(name='MedianBlocks')
)
# Largest number of blocks on a single page (a maximum, not a total).
total_blocks_per_document = (
    blocks_per_document
    .groupby('Document')['Blocks']
    .max()
    .reset_index(name='MaxBlocks')
)

final_stats = (
    word_count_stats
    .merge(median_blocks_per_document, on='Document')
    .merge(total_blocks_per_document, on='Document')
)
final_stats

Unnamed: 0,Document,MinWordCount,MaxWordCount,AvgWordCount,MedianBlocks,MaxBlocks
0,DDDD & DD.pdf,22,125,66.105263,3.0,4
1,Latające Taksówki - Wdrażanie Innowacji - Pro...,20,225,89.555556,2.0,5
2,Ustawa.pdf,22,304,113.714286,2.0,4


In [6]:
# leniwe auto odpalenie excela z wynikami
#os.system(f"start EXCEL.EXE \"{CSV_PATH}\"")