# PPTX → Markdown Notebook Generator (v3)

텍스트 조합 품질을 개선하여 폰트 변경으로 쪼개진 문구를 문장 단위로 복원합니다.

In [19]:
# PPT → Markdown 변환기 (v3: 텍스트 조합 품질 향상)
#
# 개선 포인트:
#  - paragraph 수준으로 run을 병합하여 문장 단위 복원
#  - a:br(줄바꿈) 반영, 공백 정리, soft hyphen 제거
#  - 기본 bullet 접두사 처리
#  - 나머지(이미지/테이블/수식/헤더) 로직은 v2와 동일
#
# 사용법:
#  1) pptx_path, out_ipynb 지정
#  2) 셀 실행 → 새 노트북 생성

import os, mimetypes, base64, zipfile, re
from typing import List, Dict, Optional
from xml.etree import ElementTree as ET
import nbformat
from nbformat.v4 import new_notebook, new_markdown_cell

def _safe_text(s: Optional[str]) -> str:
    if type(s) != str:
        return ""
    return (s or "").strip()

def _smooth_join(parts: List[str]) -> str:
    buf, prev = [], ""
    for cur in parts:
        if cur is None:
            continue
        cur = cur.replace("\u00ad", "")
        if not prev:
            buf.append(cur)
        else:
            need_space = False
            if prev and cur:
                if prev[-1].isalnum() and cur[0].isalnum():
                    need_space = True
                if cur[0] in ".,;:!?)]}":
                    need_space = False
                if prev[-1].isspace() or cur[0].isspace():
                    need_space = False
            if need_space:
                buf.append(" ")
            buf.append(cur)
        prev = cur
    text = "".join(buf)
    text = re.sub(r"\s+([,.;:!?])", r"\1", text)
    text = re.sub(r"\s{2,}", " ", text)
    return text

def _image_to_data_uri(data: bytes, ext: str) -> str:
    import mimetypes, base64
    ctype = mimetypes.types_map.get(ext.lower(), "application/octet-stream")
    b64 = base64.b64encode(data).decode("ascii")
    return f"data:{ctype};base64,{b64}"

def _collect_with_python_pptx(pptx_path: str) -> List[Dict]:
    """Extract per-slide content from a PPTX file via the python-pptx package.

    Args:
        pptx_path: Path of the presentation to read.

    Returns:
        One dict per slide with the keys ``index`` (0-based), ``title``,
        ``paragraphs``, ``images`` (dicts with ``data`` and ``ext``),
        ``tables`` (lists of rows of cell strings) and ``equations`` (always
        an empty list here). An empty list is returned when python-pptx
        cannot be imported.
    """
    try:
        from pptx import Presentation
        from pptx.enum.shapes import MSO_SHAPE_TYPE
    except Exception:
        return []

    def _joined_runs(paragraph) -> str:
        # Merge the runs of one paragraph back into a single line of text.
        return _smooth_join([run.text or "" for run in paragraph.runs])

    def _has_text(shape) -> bool:
        return bool(getattr(shape, "has_text_frame", False) and shape.has_text_frame)

    def _nonempty_lines(text_frame) -> list:
        return [line for line in map(_joined_runs, text_frame.paragraphs) if line]

    collected = []
    presentation = Presentation(pptx_path)
    for slide_no, slide in enumerate(presentation.slides):
        # Title: prefer the title placeholder, otherwise fall back to the
        # first non-empty line of the first shape that carries any text.
        heading = None
        if getattr(slide.shapes, "title", None):
            heading = _safe_text(slide.shapes.title.text)
        if not heading:
            for shape in slide.shapes:
                if not _has_text(shape):
                    continue
                candidates = _nonempty_lines(shape.text_frame)
                if candidates:
                    heading = candidates[0][:120]
                    break

        texts, pictures, grids = [], [], []
        for shape in slide.shapes:
            if _has_text(shape):
                for paragraph in shape.text_frame.paragraphs:
                    line = _joined_runs(paragraph)
                    if not line:
                        continue
                    try:
                        # NOTE(review): the prefix is only applied when the
                        # paragraph object exposes a non-None `bullet`
                        # attribute - confirm python-pptx provides one.
                        bulleted = (
                            paragraph.level is not None
                            and getattr(paragraph, "bullet", None) is not None
                        )
                        if bulleted:
                            line = "  " * int(paragraph.level) + "- " + line
                    except Exception:
                        pass
                    texts.append(line)

            if getattr(shape, "shape_type", None) == MSO_SHAPE_TYPE.PICTURE:
                try:
                    picture = shape.image
                    pictures.append(
                        {"data": picture.blob, "ext": getattr(picture, "ext", None) or ".png"}
                    )
                except Exception:
                    pass

            if getattr(shape, "has_table", False):
                try:
                    grid = []
                    for row in shape.table.rows:
                        grid.append(
                            ["\n".join(_nonempty_lines(cell.text_frame)) for cell in row.cells]
                        )
                    if grid:
                        grids.append(grid)
                except Exception:
                    # Best effort: a table that cannot be read is dropped whole.
                    pass

        collected.append({
            "index": slide_no,
            "title": heading or "",
            "paragraphs": texts,
            "images": pictures,
            "tables": grids,
            "equations": []
        })
    return collected

def _xml_paragraph_text(p, ns: Dict[str, str]) -> str:
    """Return the cleaned text of one ``a:p`` element (runs joined, ``a:br`` as newline)."""
    run_tag = f"{{{ns['a']}}}r"
    br_tag = f"{{{ns['a']}}}br"
    parts = []
    for node in p:
        if node.tag == run_tag:
            t = node.find(".//a:t", ns)
            parts.append(t.text if (t is not None and t.text) else "")
        elif node.tag == br_tag:
            parts.append("\n")
    return _safe_text(_smooth_join(parts))


def _collect_with_zip(pptx_path: str) -> List[Dict]:
    """Extract per-slide content by reading the PPTX package XML directly.

    Used when python-pptx is unavailable, and as the source of equation text.

    Args:
        pptx_path: Path of the presentation (a ZIP package) to read.

    Returns:
        One dict per ``ppt/slides/slide*.xml`` part, ordered by the digits in
        the part name, with the keys ``index`` (0-based), ``title`` (str),
        ``paragraphs``, ``images`` (dicts with ``data`` and ``ext``),
        ``tables`` (lists of rows of cell strings) and ``equations``.

    Raises:
        zipfile.BadZipFile: If *pptx_path* is not a readable ZIP archive.
    """
    ns = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    }
    rel_ns = {"r": "http://schemas.openxmlformats.org/package/2006/relationships"}
    slides_out = []
    with zipfile.ZipFile(pptx_path, "r") as zf:
        names = zf.namelist()
        # Membership is tested several times per slide; build the set once.
        members = set(names)
        slide_names = sorted(
            (n for n in names if n.startswith("ppt/slides/slide") and n.endswith(".xml")),
            key=lambda x: int("".join(filter(str.isdigit, x)) or "0"),
        )
        for idx, slide_name in enumerate(slide_names):
            root = ET.fromstring(zf.read(slide_name))

            # Title: first line of the first shape whose leading paragraph
            # has any text, capped at 120 characters. Always a plain string.
            title_text = ""
            for sp in root.findall(".//p:sp", ns):
                first_p = sp.find(".//a:p", ns)
                if first_p is None:
                    continue
                lines = _xml_paragraph_text(first_p, ns).splitlines()
                if lines:
                    title_text = lines[0].strip()[:120]
                    break

            paragraphs = []
            for p in root.findall(".//a:p", ns):
                para = _xml_paragraph_text(p, ns)
                if not para:
                    continue
                has_bullet = (
                    p.find(".//a:buChar", ns) is not None
                    or p.find(".//a:buAutoNum", ns) is not None
                )
                if has_bullet:
                    para = f"- {para}"
                paragraphs.append(para)

            images = []
            rels_name = slide_name.replace("slides/", "slides/_rels/") + ".rels"
            if rels_name in members:
                rels_xml = ET.fromstring(zf.read(rels_name))
                for rel in rels_xml.findall(".//r:Relationship", rel_ns):
                    target = rel.attrib.get("Target", "")
                    rtype = rel.attrib.get("Type", "")
                    if not ("image" in rtype or target.startswith("../media")):
                        continue
                    # Relationship targets are relative to ppt/slides/.
                    if target.startswith("../"):
                        media_path = "ppt/" + target.replace("../", "")
                    else:
                        media_path = "ppt/slides/" + target
                    if "ppt/media/" not in media_path:
                        candidate = "ppt/media/" + os.path.basename(target)
                        if candidate in members:
                            media_path = candidate
                    if media_path in members:
                        ext = os.path.splitext(media_path)[1] or ".png"
                        images.append({"data": zf.read(media_path), "ext": ext})

            tables = []
            for tbl in root.findall(".//a:tbl", ns):
                rows = []
                for tr in tbl.findall(".//a:tr", ns):
                    row_vals = []
                    for tc in tr.findall("a:tc", ns):
                        cell_lines = [
                            line
                            for line in (
                                _xml_paragraph_text(p, ns) for p in tc.findall(".//a:p", ns)
                            )
                            if line
                        ]
                        row_vals.append("\n".join(cell_lines))
                    if row_vals:
                        rows.append(row_vals)
                if rows:
                    tables.append(rows)

            # Only m:oMath is collected: m:oMathPara is merely a wrapper
            # around m:oMath elements, so collecting both would list every
            # wrapped equation twice.
            equations = []
            for mnode in root.findall(".//m:oMath", ns):
                toks = [mt.text for mt in mnode.findall(".//m:t", ns) if mt.text]
                eq = _safe_text("".join(toks))
                if eq:
                    equations.append(eq)

            slides_out.append({
                "index": idx,
                "title": title_text,
                "paragraphs": paragraphs,
                "images": images,
                "tables": tables,
                "equations": equations
            })
    return slides_out

def collect_pptx_contents(pptx_path: str) -> List[Dict]:
    """Collect slide contents, preferring python-pptx over raw XML parsing.

    Args:
        pptx_path: Path of the presentation to read.

    Returns:
        The slide dicts from ``_collect_with_python_pptx`` with their
        ``equations`` filled in from ``_collect_with_zip`` when possible, or
        the ``_collect_with_zip`` result when the former yields nothing.

    Raises:
        FileNotFoundError: If *pptx_path* does not exist.
    """
    if not os.path.exists(pptx_path):
        raise FileNotFoundError(f"PPTX not found: {pptx_path}")

    primary = _collect_with_python_pptx(pptx_path)
    if not primary:
        return _collect_with_zip(pptx_path)

    # Equations are only extracted by the XML reader; merging them in is best
    # effort, so a failure there leaves the primary result untouched.
    try:
        for rich, raw in zip(primary, _collect_with_zip(pptx_path)):
            rich["equations"] = raw.get("equations", [])
    except Exception:
        pass
    return primary

def _table_to_markdown(rows: List[List[str]]) -> str:
    if not rows:
        return ""
    max_cols = max(len(r) for r in rows)
    norm = [r + [""] * (max_cols - len(r)) for r in rows]
    header = norm[0]
    body = norm[1:] if len(norm) > 1 else []
    md = []
    md.append("| " + " | ".join(h if h else " " for h in header) + " |")
    md.append("| " + " | ".join("---" for _ in header) + " |")
    for r in body:
        md.append("| " + " | ".join(c if c else " " for c in r) + " |")
    return "\n".join(md)

def slides_to_markdown_cells(slides: List[Dict]) -> List[str]:
    """Turn collected slide dicts into Markdown cell sources.

    For each slide a ``# <title>`` cell is emitted whenever the title differs
    from the previously emitted one, followed by a cell holding the slide
    heading, its paragraphs as list items, its tables, its equations and its
    images (embedded as data URIs).

    Args:
        slides: Slide dicts with the keys ``title``, ``paragraphs``,
            ``tables``, ``equations`` and ``images``.

    Returns:
        The Markdown source of every cell, in order.
    """
    cells = []
    last_heading = None
    for num, slide in enumerate(slides, start=1):
        heading = _safe_text(slide.get("title") or f"Slide {num}")
        if heading and heading != last_heading:
            cells.append(f"# {heading}")
            last_heading = heading

        shown = heading if heading else "(Untitled)"
        lines = [f"### Slide {num}: {shown}", ""]

        for text in slide.get("paragraphs", []):
            # Paragraphs that already look like list items are kept verbatim.
            already_listed = text.lstrip().startswith(("-", "*", "+"))
            lines.append(text if already_listed else f"- {text}")

        for t_no, table in enumerate(slide.get("tables", []), start=1):
            lines.extend(["", f"**Table {num}.{t_no}**", _table_to_markdown(table)])

        for e_no, formula in enumerate(slide.get("equations", []), start=1):
            if formula:
                lines.extend(["", f"**Equation {num}.{e_no}**", f"$$\n{formula}\n$$"])

        pictures = slide.get("images")
        if pictures:
            lines.append("")
            for p_no, picture in enumerate(pictures, start=1):
                suffix = picture.get("ext") or ".png"
                uri = _image_to_data_uri(picture.get("data", b""), suffix)
                lines.append(f"![slide{num}-image{p_no}]({uri})")

        cells.append("\n".join(lines).strip() + "\n")
    return cells

def build_output_notebook(pptx_path: str, out_ipynb: str) -> str:
    """Convert a PPTX file into a notebook made of Markdown cells.

    Args:
        pptx_path: Path of the presentation to convert.
        out_ipynb: Path the generated notebook is written to.

    Returns:
        *out_ipynb*, unchanged.
    """
    header = f"""# PPT → Markdown (v3: Better Text Joining + Tables + Equations)

**Source**: `{os.path.basename(pptx_path)}`

개선 사항:
- **폰트/스타일로 분리된 run들을 문장 단위로 매끄럽게 연결**
- 줄바꿈(br) 보존, 불필요한 공백 정리, 소프트 하이픈 제거
- bullet(기본) 감지 후 접두사 적용
- 기존 이미지/테이블/수식/헤더 로직 그대로 유지
"""
    cell_sources = slides_to_markdown_cells(collect_pptx_contents(pptx_path))

    notebook = new_notebook()
    # The summary header comes first, then one cell per generated source.
    notebook.cells.append(new_markdown_cell(header.strip()))
    notebook.cells.extend(new_markdown_cell(source) for source in cell_sources)

    with open(out_ipynb, "w", encoding="utf-8") as handle:
        nbformat.write(notebook, handle)
    return out_ipynb

# ===== User settings =====
# Default input presentation and output notebook paths.
pptx_path = "/mnt/data/sample.pptx"
out_ipynb = "/mnt/data/output_from_ppt_v3.ipynb"

# Example run
# build_output_notebook(pptx_path, out_ipynb)
# print("Saved:", out_ipynb)

In [20]:
# ===== User settings =====
pptx_path = "[1차시] LV3_1-1 데이터의 이해_스크립트.pptx"   # change this to the path of the PPTX to convert
out_ipynb = "output_from_ppt.ipynb"

# Example run: builds the notebook and returns the output path
build_output_notebook(pptx_path, out_ipynb)
# print("Saved:", out_ipynb)

'output_from_ppt.ipynb'