# Lösung Lab 04: Unstrukturierte Daten

### Basisaufgabe

In [None]:
from docx import Document
from pypdf import PdfReader

# --- Word processing ---
# Scan the application document for the applicant line and keep the name that
# follows the label. If several paragraphs contain the label, the last one wins.
label = "Antragsteller:"
doc = Document('antrag.docx')
applicant_name = None

print("--- Word Analysis ---")
for para in doc.paragraphs:
    if label in para.text:
        # Partition on the label itself rather than on the first ":" so that
        # a colon earlier in the paragraph, or inside the name, neither
        # selects the wrong fragment nor truncates the value.
        applicant_name = para.text.partition(label)[2].strip()
        print(f"Applicant found: {applicant_name}")

# Make a missing applicant line visible instead of silently leaving None.
if applicant_name is None:
    print("Warning: No applicant found in document.")

# --- PDF check ---
# Look for the salary keyword in the text extracted from the first page of
# the bank statement and report the outcome.
print("\n--- PDF Analysis ---")
reader = PdfReader("bank_statement.pdf")
page = reader.pages[0]
text_content = page.extract_text()

has_salary_keyword = "Gehalt" in text_content
print(
    "Success: Proof of Salary found in PDF."
    if has_salary_keyword
    else "Warning: No Salary keyword found."
)

### Bonus-Herausforderung

In [None]:
import pdfplumber
from docx import Document

# 1. Extract table from PDF
# Collect every table row of the first page whose description mentions "Gehalt".
salary_data = []

with pdfplumber.open("bank_statement.pdf") as pdf:
    first_page = pdf.pages[0]

    # extract_table returns a list of lists (rows -> columns); it is falsy
    # when no table was detected, which the `or []` below covers.
    table_rows = first_page.extract_table()

for row in table_rows or []:
    # row[0] is Date, row[1] is Description.
    # Skip the header row and rows too short to have a description column.
    if not row or len(row) < 2 or row[0] == "Date":
        continue

    # Cells without text may come back as None; treat them as empty so the
    # substring test cannot raise TypeError.
    description = row[1] or ""
    if "Gehalt" in description:
        salary_data.append(row)

print(f"Extracted Data: {salary_data}")

# 2. Generate Word report with table
# Build a new document with a heading, the applicant line and, if salary rows
# were extracted, a three-column table listing them.
report = Document()
report.add_heading('Kredit-Prüfbericht', 0)

# applicant_name comes from the Word-analysis cell above; it is None when no
# applicant line was found there.
report.add_paragraph(f'Prüfung für Antragsteller: {applicant_name}')

if salary_data:
    report.add_heading('Gefundene Gehaltseingänge', level=2)

    # Start with the header row only; one row is appended per salary entry.
    table = report.add_table(rows=1, cols=3)
    table.style = 'Table Grid'

    # Set header
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Datum'
    hdr_cells[1].text = 'Text'
    hdr_cells[2].text = 'Betrag'

    # Fill data
    for entry in salary_data:
        row_cells = table.add_row().cells
        # zip stops at the shorter side: rows with fewer than three columns
        # leave the remaining cells empty instead of raising IndexError, and
        # extra columns are ignored as before.
        for cell, value in zip(row_cells, entry):
            # Extracted cells may be None; write an empty string rather than
            # handing a non-string to the cell.
            cell.text = "" if value is None else str(value)

report.save('report_genehmigung.docx')
print("Report 'report_genehmigung.docx' generated successfully.")