## Visualise each chunk and its size

### 1. We create nodes as usual first

In [1]:
from src.file_reader import FileReader, process_md, LOGGER
from src.settings import MD_DIR_PATH, PROCESSED_DIR_PATH, CHUNK_SIZE, tokenizer
from pathlib import Path


# Keep only regular files with a ".md" suffix. Filtering up front means the
# "No MD files found" branch also fires when the directory contains nothing but
# other file types, and a sub-directory named "*.md" is never passed to open().
# Sorting makes the processing (and log) order reproducible across runs.
md_paths = sorted(
    path
    for path in Path(MD_DIR_PATH).iterdir()
    if path.is_file() and path.suffix == ".md"
)
if md_paths:
    try:
        processed_dir = Path(PROCESSED_DIR_PATH)
        # Create the output directory if needed so the first write cannot fail
        # just because the folder is missing.
        processed_dir.mkdir(parents=True, exist_ok=True)
        for filepath in md_paths:
            LOGGER.info(filepath)
            md = filepath.read_text(encoding="utf-8")
            processed_md = process_md(md)
            filename = filepath.name
            LOGGER.info(filename)
            # The processed copy keeps the source file's name.
            processed_md_path = str(processed_dir.joinpath(filename))
            with open(processed_md_path, "w", encoding="utf-8") as f:
                f.write(processed_md)

    except Exception:
        # Log the traceback, then re-raise so the cell still fails visibly.
        LOGGER.exception("Markdown document parsing failed")
        raise

    LOGGER.info("Parsing and saving MD files completed succesfully")
else:
    LOGGER.info("No MD files found")

# Load the chunks from whatever is in the processed directory.
chunks = FileReader(input_dir=PROCESSED_DIR_PATH).load_data()

2025-02-25 11:34:39 - src - INFO - md_files/dillinger.md
2025-02-25 11:34:39 - src - INFO - dillinger.md
2025-02-25 11:34:39 - src - INFO - Parsing and saving MD files completed succesfully


### 2. Print each chunk with its metadata and token count, and check whether it fits within the size limit

In [2]:
import tiktoken

# Encoder used to count the tokens of each chunk and of its metadata.
tok = tiktoken.encoding_for_model(tokenizer)
prev_metadata = ""
start = True
for chunk in chunks:
    # Render the metadata as "key: value" lines so its token cost can be measured.
    metadata_str: str = "\n".join(
        f"{key}: {value}" for key, value in chunk.metadata.items()
    )
    token_length = len(tok.encode(chunk.text))
    # Budget is the chunk size minus the metadata tokens, scaled by 1.4.
    # NOTE(review): 1.4 is not explained here; it equals the 'chunk_overlap'
    # value shown in the printed metadata — confirm the intended meaning.
    token_budget = (CHUNK_SIZE - len(tok.encode(metadata_str))) * 1.4
    inside_limit = token_length < token_budget

    # A double "=" rule marks the point where the metadata differs from the
    # previous chunk's; the very first chunk never gets one.
    if not start and prev_metadata != chunk.metadata:
        print(100 * "=")
        print(100 * "=")

    print(chunk.metadata)
    print(f"Length of tokens: {token_length}")
    print(f"Is chunk inside token limit?: {inside_limit}")
    print(chunk.text)
    print(100 * "-")

    start = False
    prev_metadata = chunk.metadata

{'doc_name': 'dillinger', 'doc_type': '.md', 'chunk_size': 256, 'chunk_overlap': 1.4}
Length of tokens: 129
Is chunk inside token limit?: True
# dillinger
## *The Last Markdown Editor, Ever*

[![N;Solid](https://cldup.com/dTxpPi9lDf.thumb.png)](https://nodesource.com/products/nsolid)

[![Build Status](https://travis-ci.org/joemccann/dillinger.svg?branch=master)](https://travis-ci.org/joemccann/dillinger)

Dillinger is a cloud-enabled, mobile-ready, offline-storage compatible,
AngularJS-powered HTML5 Markdown editor.

* Type some Markdown on the left
* See HTML in the right
* ✨Magic ✨
----------------------------------------------------------------------------------------------------
{'doc_name': 'dillinger', 'doc_type': '.md', 'chunk_size': 256, 'chunk_overlap': 1.4}
Length of tokens: 227
Is chunk inside token limit?: True
# dillinger
## *The Last Markdown Editor, Ever*

# dillinger
## Features

# dillinger
## *The Last Markdown Editor, Ever*

* Import a HTML file and watch it magicall