# 노트북 파일을 내가 원하는 형태로 분해해보기
- 전체를 markdown 으로 만들어보기
- markdown Cell 만 뽑아보기
- code Cell 만 뽑아보기

In [1]:
import base64
import json
import os
import re
from pathlib import Path
from urllib.parse import urlparse

import requests
from dotenv import load_dotenv
from markdownify import markdownify


# Populate the process environment via python-dotenv before reading settings.
load_dotenv()

# Both settings are read from the environment; os.getenv returns None when a
# variable is not set.
NOTEBOOK_FILE_PATH = os.getenv("NOTEBOOK_FILE_PATH")
OUT_DIR = os.getenv("OUT_DIR")

# NOTE(review): Path(None) raises TypeError, so an unset NOTEBOOK_FILE_PATH or
# OUT_DIR fails on the lines below with a message that does not name the variable.
notebook_path = Path(NOTEBOOK_FILE_PATH)

# Root directory for every generated file; created (with parents) if missing.
out_dir = Path(OUT_DIR)
out_dir.mkdir(exist_ok=True, parents=True)

# Relative directory name used inside the generated Markdown image links.
img_dir = Path("images")

# Directory on disk where extracted/downloaded images are written.
img_out_dir = out_dir / img_dir
img_out_dir.mkdir(exist_ok=True, parents=True)

In [2]:
def load_notebook(file_path: Path) -> dict:
    """Load a Jupyter notebook file and return its parsed JSON content.

    Args:
        file_path: Path to an existing file with the ``.ipynb`` suffix.

    Returns:
        The notebook document as parsed by ``json.load``.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file or
            does not have the ``.ipynb`` suffix.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not file_path.is_file() or file_path.suffix != ".ipynb":
        raise FileNotFoundError("노트북 파일을 열 수 없습니다.")

    # Notebook files are UTF-8 JSON; relying on the locale default encoding
    # breaks non-ASCII content on platforms whose default is not UTF-8.
    with open(file_path, "r", encoding="utf-8") as notebook_file:
        return json.load(notebook_file)

In [3]:
class MarkdownImageLocalizer:
    """
    Rewrite Markdown image references so they point at local files.

    Images given as ``http(s)`` URLs are downloaded and images embedded as
    base64 ``data:`` URIs are decoded; both are stored in the module-level
    ``img_out_dir`` and the Markdown is rewritten to a relative link under
    the module-level ``img_dir``.
    """

    PATTERN_URL_IMAGE = r"!\[(.*?)\]\((https?://[^)]+)\)"
    PATTERN_BASE64_IMAGE = r"!\[(.*?)\]\(data:image\/(\w+);base64,(.*?)\)"

    # Seconds to wait for a remote image; without a timeout a stalled server
    # would block the whole conversion indefinitely.
    REQUEST_TIMEOUT = 10
    # File stem used when no usable name can be derived from the input.
    DEFAULT_IMAGE_STEM = "image"

    @staticmethod
    def _safe_stem(text: str) -> str:
        """Return ``text`` reduced to characters that are safe in a file name."""
        # Collapse anything that is not a word character, dot or dash (path
        # separators, spaces, quotes, ...) so the name cannot escape the image
        # directory or break the Markdown link.
        stem = re.sub(r"[^\w.-]+", "_", text).strip("._")
        return stem or MarkdownImageLocalizer.DEFAULT_IMAGE_STEM

    @staticmethod
    def _local_link(alt_text: str, file_name: str) -> str:
        """Build the Markdown image syntax for a file stored under ``img_dir``."""
        # as_posix() keeps forward slashes even on Windows, and the link is
        # relative so it resolves next to the generated Markdown file.
        return f"![{alt_text}]({(img_dir / file_name).as_posix()})"

    @staticmethod
    def _replace_image_url_image(match) -> str:
        """
        Download the image behind a URL and return Markdown pointing at the copy.

        If the image cannot be downloaded (network error, timeout or a
        non-200 response) the original Markdown is returned unchanged.
        """
        alt_text = match.group(1)
        image_url = match.group(2)

        # A URL such as "https://host/" has no file name; fall back to a default
        # instead of failing on an empty path component.
        file_name = MarkdownImageLocalizer._safe_stem(
            Path(urlparse(image_url).path).name
        )
        if not Path(file_name).suffix:
            file_name += ".jpg"

        try:
            res = requests.get(
                image_url, timeout=MarkdownImageLocalizer.REQUEST_TIMEOUT
            )
        except requests.RequestException:
            return match.group(0)
        if res.status_code != 200:  # keep the remote URL when download fails
            return match.group(0)

        # Write only after the full body arrived so a failed transfer never
        # leaves a truncated image behind.
        (img_out_dir / file_name).write_bytes(res.content)

        return MarkdownImageLocalizer._local_link(alt_text, file_name)

    @staticmethod
    def _replace_base64_image(match) -> str:
        """
        Decode a base64-embedded image to a local file and return Markdown
        pointing at it.

        The file name is derived from the alt text; malformed base64 data
        leaves the original Markdown unchanged.
        """
        alt_text = match.group(1)
        img_ext = match.group(2)
        base64_data = match.group(3)

        try:
            image_bytes = base64.b64decode(base64_data)
        except ValueError:  # binascii.Error is a ValueError subclass
            return match.group(0)

        file_name = f"{MarkdownImageLocalizer._safe_stem(alt_text)}.{img_ext}"
        (img_out_dir / file_name).write_bytes(image_bytes)

        return MarkdownImageLocalizer._local_link(alt_text, file_name)

    def process_md_contents(self, md_contents: list[str]) -> list[str]:
        """
        Return the Markdown lines with every image reference localized.

        A trailing "\\n" entry is appended so consecutive cells stay separated
        when the results are concatenated.
        """
        processed_mds = []
        for content in md_contents:
            # base64 images first: their data URIs never match the URL pattern,
            # and the rewritten relative links are not matched by it either.
            line = re.sub(
                self.PATTERN_BASE64_IMAGE, self._replace_base64_image, content
            )
            line = re.sub(self.PATTERN_URL_IMAGE, self._replace_image_url_image, line)
            processed_mds.append(line)
        processed_mds.append("\n")
        return processed_mds

In [4]:
def convert_html_to_table(html_contents: list[str]) -> list[str]:
    """Convert HTML fragments to Markdown with ``markdownify``.

    The fragments are joined with newlines before conversion, and a blank-line
    separator entry follows the converted text in the returned list.
    """
    joined_html = "\n".join(html_contents)
    markdown_text = markdownify(joined_html)
    return [markdown_text, "\n\n"]

In [5]:
def _as_lines(value) -> list[str]:
    """Normalize a notebook multi-line field (a str or a list of str) to a list."""
    return [value] if isinstance(value, str) else list(value)


def _fenced(lines: list[str]) -> list[str]:
    """Wrap ``lines`` in a Markdown code fence."""
    body = list(lines)
    # The closing fence must start on its own line, otherwise it is rendered
    # as part of the output text.
    if body and not body[-1].endswith("\n"):
        body[-1] += "\n"
    return ["```\n", *body, "```\n"]


def parse_outputs_of_cell(outputs: list[dict]) -> list:
    """Render the outputs of a code cell as Markdown fragments.

    ``execute_result`` outputs are rendered from their ``text/html``
    representation when present (via ``convert_html_to_table``) and otherwise
    from ``text/plain`` as a fenced block. ``stdout`` streams are rendered as
    fenced blocks. Every other output is ignored.

    Args:
        outputs: The ``outputs`` list of a notebook code cell.

    Returns:
        A list of Markdown strings beginning with the "실행결과" header, or an
        empty list when nothing was rendered.
    """
    rendered = []
    for output in outputs or []:
        output_type = output.get("output_type")
        if output_type == "execute_result":
            data = output.get("data", {})
            if "text/html" in data:
                rendered += convert_html_to_table(_as_lines(data["text/html"]))
            elif "text/plain" in data:
                # Plain results (numbers, strings, reprs) carry no HTML form.
                rendered += _fenced(_as_lines(data["text/plain"]))
        elif output_type == "stream" and output.get("name") == "stdout":
            rendered += _fenced(_as_lines(output.get("text", [])))

    # Emit the header only when there is something to show beneath it.
    if not rendered:
        return []
    return ["실행결과\n\n", *rendered]

In [6]:
def parse_only_md(notebook_dict: dict) -> list[list[str]]:
    """Collect the markdown cells of a notebook with images localized.

    Returns one list of Markdown lines per markdown cell, in notebook order;
    cells of any other type are skipped.
    """
    localizer = MarkdownImageLocalizer()
    markdown_cells = (
        cell for cell in notebook_dict["cells"] if cell["cell_type"] == "markdown"
    )
    return [localizer.process_md_contents(cell["source"]) for cell in markdown_cells]

In [7]:
def parse_only_code(notebook_dict: dict) -> list[list[str]]:
    """Collect the code cells of a notebook as fenced Markdown blocks.

    Each code cell contributes a ``python`` fenced block built from its
    source, followed by its rendered outputs when there are any. Cells of any
    other type are skipped.
    """
    blocks = []
    code_cells = (
        cell for cell in notebook_dict["cells"] if cell["cell_type"] == "code"
    )
    for code_cell in code_cells:
        fenced_source = ["```python\n"] + code_cell["source"] + ["\n```\n"]
        blocks.append(fenced_source)

        rendered_outputs = parse_outputs_of_cell(code_cell["outputs"])
        if rendered_outputs:
            blocks.append(rendered_outputs)
    return blocks

In [8]:
def parse_notebook_all(notebook_dict: dict) -> list[list[str]]:
    """Convert every markdown and code cell of a notebook to Markdown.

    Markdown cells have their images localized. Code cells become a labelled
    ``python`` fenced block followed by their rendered outputs when there are
    any. Cells of other types are skipped.
    """
    localizer = MarkdownImageLocalizer()
    sections = []
    for cell in notebook_dict["cells"]:
        kind, lines = cell["cell_type"], cell["source"]

        if kind == "markdown":
            sections.append(localizer.process_md_contents(lines))
            continue
        if kind != "code":
            continue

        sections.append(["실행코드\n\n```python\n"] + lines + ["\n```\n"])
        rendered_outputs = parse_outputs_of_cell(cell["outputs"])
        if rendered_outputs:
            sections.append(rendered_outputs)
    return sections

In [9]:
def save_as_md(output_path: Path, mds_list: list[list[str]]):
    """Write groups of Markdown fragments to ``output_path``.

    Args:
        output_path: Destination file; it is created or overwritten.
        mds_list: Groups of strings written back to back in order. No
            separators are added, so fragments carry their own newlines.
    """
    # The generated Markdown contains non-ASCII text, so the encoding is fixed
    # to UTF-8 instead of depending on the platform's locale default.
    with open(output_path, "w", encoding="utf-8") as f:
        for mds in mds_list:
            f.writelines(mds)

In [10]:
# Parse the notebook once; each export below reuses this dict.
notebook = load_notebook(notebook_path)

In [11]:
# Export markdown and code cells (with rendered outputs) to all.md.
save_as_md(out_dir / "all.md", parse_notebook_all(notebook))

In [12]:
# Export only the markdown cells to mds.md.
save_as_md(out_dir / "mds.md", parse_only_md(notebook))

In [13]:
# Export only the code cells (with rendered outputs) to cods.md.
# NOTE(review): "cods.md" looks like a typo for "codes.md" — confirm before renaming.
save_as_md(out_dir / "cods.md", parse_only_code(notebook))