# Extracting images

In [21]:
import fitz 
import pandas as pd
import os

def extract_images_from_pdf(pdf_path, output_folder):
    """Extract every image referenced by a PDF's pages into ``output_folder``.

    Each image is written as ``page_<page>_img_<index>.png`` (both numbers
    are 1-based) and recorded in the returned table.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory that receives the image files; it is
            created when it does not exist yet.

    Returns:
        pandas.DataFrame with the columns "Image Name" and "Page Number"
        (1-based), one row per written file. The columns are present even
        when no image was found.
    """
    os.makedirs(output_folder, exist_ok=True)
    image_data = []

    # The context manager closes the document even when extraction or a
    # file write raises part-way through.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each image entry is its xref
                base_image = pdf_document.extract_image(xref)

                # NOTE(review): the bytes are written exactly as returned by
                # extract_image(); if its "ext" entry is not "png" the file
                # content will not match the ".png" suffix. The suffix is kept
                # because the follow-up cell selects files by "*.png" name.
                image_name = f"page_{page_num}_img_{img_index}.png"
                image_path = os.path.join(output_folder, image_name)

                with open(image_path, "wb") as img_file:
                    img_file.write(base_image["image"])

                image_data.append(
                    {"Image Name": image_name, "Page Number": page_num}
                )

    # Explicit columns keep the schema stable for an image-free PDF.
    return pd.DataFrame(image_data, columns=["Image Name", "Page Number"])

# Machine-specific absolute path of the source PDF; adjust for your environment.
pdf_path = r"C:\projects\cps\cp_pfo\train_dataset_train_data_Sila\train_data_Sila\data.pdf"  # Path to your PDF file
output_folder = "images"  # Folder to save images
# Write the images to disk and keep the name/page table in `df`.
df = extract_images_from_pdf(pdf_path, output_folder)


            Image Name  Page Number
0     page_1_img_1.png            1
1     page_1_img_2.png            1
2     page_2_img_1.png            2
3     page_2_img_2.png            2
4     page_2_img_3.png            2
..                 ...          ...
343  page_56_img_3.png           56
344  page_56_img_4.png           56
345  page_57_img_1.png           57
346  page_57_img_2.png           57
347  page_58_img_1.png           58

[348 rows x 2 columns]


In [22]:
import os
import pandas as pd
import re

def create_dataframe_from_images(folder_path):
    """Build a name/page table from the image files found in ``folder_path``.

    Only entries whose whole name has the form
    ``page_<page>_img_<index>.png`` are used; everything else in the folder
    is ignored.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        pandas.DataFrame with the columns "Image Name" and "Page Number"
        (int), ordered by page number and then by image index. The columns
        are present even when no file matches.
    """
    # fullmatch() rejects names that merely start with the pattern,
    # e.g. "page_1_img_1.png.bak.png".
    name_pattern = re.compile(r"page_(\d+)_img_(\d+)\.png")

    found = []
    for filename in os.listdir(folder_path):
        match = name_pattern.fullmatch(filename)
        if match:
            found.append((int(match.group(1)), int(match.group(2)), filename))

    # os.listdir() order is arbitrary; sort numerically so the result is
    # deterministic and page 10 comes after page 2.
    found.sort()

    data = [
        {"Image Name": filename, "Page Number": page_number}
        for page_number, _, filename in found
    ]
    # Explicit columns so callers can sort/select by them on an empty folder.
    return pd.DataFrame(data, columns=["Image Name", "Page Number"])

folder_path = "images"  # same folder name the extraction cell uses as output_folder
df_img = create_dataframe_from_images(folder_path)
# Order the table by page number, renumber the index, and save it without the index column.
df_img.sort_values(by='Page Number').reset_index(drop=True).to_csv("image_pages.csv", index=False)

# Extract Text and Info

In [324]:
from docx import Document
import pandas as pd

def parse_word_to_dataframe(doc_path):
    """Flatten a Word document into one row per body paragraph.

    Paragraphs styled "Heading 1" .. "Heading 4" update the running heading
    path; setting a level clears every deeper level. Paragraphs styled
    "Caption" are collected and attached to the next body paragraph. Every
    other non-blank paragraph becomes a row.

    Args:
        doc_path: Path of the Word file, passed to ``Document``.

    Returns:
        pandas.DataFrame with the columns "text", "level_1" .. "level_4" and
        "figures" (the pending captions joined with "; ", or "" if none).
    """
    heading_slot = {"Heading 1": 0, "Heading 2": 1, "Heading 3": 2, "Heading 4": 3}
    level_keys = ("level_1", "level_2", "level_3", "level_4")

    heading_path = [""] * len(level_keys)  # current heading hierarchy
    pending_captions = []  # figure captions waiting for the next body paragraph
    rows = []

    for paragraph in Document(doc_path).paragraphs:
        content = paragraph.text.strip()
        if not content:
            continue  # blank paragraphs contribute nothing

        style_name = paragraph.style.name

        slot = heading_slot.get(style_name)
        if slot is not None:
            # A new heading replaces its own level and clears all deeper ones.
            heading_path[slot] = content
            heading_path[slot + 1:] = [""] * (len(heading_path) - slot - 1)
            continue

        if style_name == "Caption":
            # Figure caption: hold it until the next body paragraph.
            pending_captions.append(content)
            continue

        # Body paragraph: emit a row under the current heading path.
        row = {"text": content}
        row.update(zip(level_keys, heading_path))
        row["figures"] = "; ".join(pending_captions)
        rows.append(row)
        pending_captions = []  # captions are consumed by this row

    return pd.DataFrame(rows)

# Example call
doc_path = r"C:\projects\cps\cp_pfo\train_dataset_train_data_Sila\train_data_Sila\data.docx"  # machine-specific absolute path; adjust for your environment
df_1 = parse_word_to_dataframe(doc_path)

In [325]:
figure_counter = 1  # next figure number to hand out; advanced by process_figure


def process_figure(figure):
    """Renumber a figure caption with the next sequential figure number.

    Everything up to and including the first "." of ``figure`` is replaced
    by "Рисунок <n>." where <n> is the current value of the module-level
    ``figure_counter``; the remainder is stripped of surrounding whitespace.
    The counter advances by one for every non-empty caption processed.

    Args:
        figure: Caption text; an empty/falsy value is passed through as "".

    Returns:
        The renumbered caption, or "" for an empty/falsy input.

    Raises:
        IndexError: If a non-empty ``figure`` contains no "." (the counter
            is left unchanged in that case).
    """
    global figure_counter

    if not figure:
        return ""

    # Keep only the part after the first dot as the figure title.
    title = figure.split('.', 1)[1].strip()
    number = figure_counter
    figure_counter = number + 1
    return f"Рисунок {number}.{title}"

# Renumber the non-empty caption cells in row order; empty cells stay "".
# NOTE(review): a cell holding several captions joined by "; " is renumbered as a single caption — confirm this is intended.
df_1['figures'] = df_1['figures'].apply(process_figure)

In [326]:
# Rows 2..88 of df_1 are presumably the table-of-contents lines of this
# particular document, shaped like "<number> <title>\t<page>" — the slice
# bounds are hard-coded; TODO confirm when the input document changes.
pages = df_1[2:89].text.tolist()

head2page= {}  # section title -> page number

for item in pages:
    section_page = int(item.split('\t')[-1])  # text after the last tab, parsed as an integer
    section_number, section_title = item.split('\t')[0].split(' ', 1)  # text before the first tab, split at its first space
    head2page[section_title] = section_page

page2head = {v:k for k, v in head2page.items()}  # inverse map; if several titles share a page, the last one wins

In [327]:
# Same hard-coded row slice as used for `pages` — presumably the
# table-of-contents lines; TODO confirm when the input document changes.
levels = df_1[2:89].text.tolist()

num2head = {}  # section number (kept as a string) -> section title

for item in levels:
    section_number, section_title = item.split('\t')[0].split(' ', 1)  # text before the first tab, split at its first space
    num2head[section_number] = section_title

head2num = {v:k for k, v in num2head.items()}  # inverse map; if a title repeats, the last number wins

In [331]:
def last_non_empty(row):
    """Return the deepest non-empty heading of ``row``.

    The keys "level_4", "level_3", "level_2" and "level_1" are checked in
    that order and the first truthy value is returned.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            four level keys.

    Returns:
        The first truthy level value, or None when all four are empty.
    """
    deepest_first = ('level_4', 'level_3', 'level_2', 'level_1')
    return next((row[key] for key in deepest_first if row[key]), None)

# Deepest non-empty heading of every row (None when the row has no heading at all).
df_1['non_empty_level'] = df_1.apply(last_non_empty, axis=1)
# Look the heading up in head2page; headings missing from that dict get no page number here.
df_1['page_num'] = df_1['non_empty_level'].map(head2page)


# Run id: increases by one each time the heading differs from the previous row's,
# so consecutive rows under the same heading share a group.
group = (df_1['non_empty_level'] != df_1['non_empty_level'].shift()).cumsum()

# Collapse every run into one chunk: texts joined with spaces, heading levels
# and page number taken from the run's first value.
grouped_df = df_1.groupby(group).agg({
    'text': ' '.join,
    'level_1': 'first',
    'level_2': 'first',
    'level_3': 'first',
    'level_4': 'first',
    'figures': lambda x: [fig for fig in x if fig],  # list of the non-empty values
    'page_num': 'first'
}).reset_index(drop=True)

# Drop the first 89 groups — presumably front matter / table of contents of
# this particular document; the offset is hard-coded (TODO confirm).
chunks = grouped_df[89:].reset_index(drop=True)
# Chunks without a page number inherit the page of the preceding chunk.
chunks['page_num'] = chunks['page_num'].ffill()

In [333]:
def paste_level_num(text, head2num=head2num):
    """Prefix a heading title with its section number.

    Args:
        text: Heading title to look up.
        head2num: Mapping from title to section number; defaults to the
            module-level ``head2num`` as it exists when this function is
            defined.

    Returns:
        "<number> <text>"; titles missing from the mapping (including the
        empty string) get the number 0, e.g. "0 " for an empty title.
    """
    return "{} {}".format(head2num.get(text, 0), text)

In [334]:
# Prefix every heading level with its section number from head2num.
# NOTE(review): empty levels and titles missing from head2num become "0 <title>" — confirm this is intended.
chunks['level_1'] = chunks['level_1'].apply(paste_level_num)
chunks['level_2'] = chunks['level_2'].apply(paste_level_num)
chunks['level_3'] = chunks['level_3'].apply(paste_level_num)
chunks['level_4'] = chunks['level_4'].apply(paste_level_num)

In [336]:
# Save the chunk table without the index column.
chunks.to_csv("chunked.csv", index=False)