In [None]:
import os
# Switch the working directory so that the relative paths used later in this
# notebook (e.g. "misc/token_length" and "data/processed_envato/...") resolve
# against this directory.
os.chdir("/home/vecglypher/codes/svg_glyph_llm/")
# Bare trailing expression: the new working directory is shown as the cell output.
os.getcwd()

In [None]:
from pathlib import Path

# Base directory under which the tokenizer checkpoint and the SFT datasets
# referenced below are located. Built as a Path in one step so the name never
# holds a plain string.
storage_base_dir = Path("/home/vecglypher/mnt/")

In [None]:
from transformers import AutoTokenizer
from rich.progress import Progress, track
import json
from tqdm import tqdm
from concurrent.futures import as_completed, ProcessPoolExecutor
import pandas as pd
import matplotlib.pyplot as plt
# Global matplotlib setting: use DejaVu Sans as the font family for every
# figure created after this cell runs.
plt.rcParams["font.family"] = "DejaVu Sans"   # or any font installed

In [None]:
# Directory passed to AutoTokenizer; it lives under the shared storage base.
model_name_or_path = Path(storage_base_dir, "workspace/hf_downloads/Qwen/Qwen3-4B")
# The tokenizer is loaded once here and reused by every statistics cell below.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [None]:
# Separator marker; stat_token() strips it from `content_str` before measuring
# the content length.
WORD_SEP = "<|SEP|>"
additional_special_tokens = [WORD_SEP]

# Register the separator with the tokenizer. The
# replace_additional_special_tokens=False flag is passed so the call is not
# asked to replace the tokenizer's existing additional special tokens.
tokenizer.add_special_tokens(
    dict(additional_special_tokens=additional_special_tokens),
    replace_additional_special_tokens=False,
)
print(f"tokenizer.special_tokens_map: {tokenizer.special_tokens_map}")

In [None]:
def load_data_list(jsonl_dir, verbose=False):
    """Load every record from the ``*.jsonl`` files directly inside ``jsonl_dir``.

    Args:
        jsonl_dir: Directory (``str`` or ``Path``) searched, non-recursively,
            for ``*.jsonl`` files.
        verbose: When True, additionally tokenize the first record with the
            notebook-level ``tokenizer`` and print the raw and round-tripped
            (encode -> decode) input/output strings as a sanity check.

    Returns:
        A list with one parsed JSON value per non-blank line, ordered by file
        name and then by line order within each file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    jsonl_dir = Path(jsonl_dir)
    # glob() yields entries in an arbitrary, filesystem-dependent order; sort so
    # the returned list is reproducible across runs and machines.
    jsonl_list = sorted(jsonl_dir.glob("*.jsonl"))
    print(f"number of jsonl files: {len(jsonl_list)}")

    data_list = []
    for jsonl in jsonl_list:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(jsonl, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. an extra newline at the end of a
                # file); json.loads() would raise on them.
                if not line.strip():
                    continue
                data_list.append(json.loads(line))

    if not verbose:
        return data_list

    if not data_list:
        # Nothing to validate; avoid an IndexError on data_list[0].
        print("no records found; skipping tokenizer round-trip check")
        return data_list

    # Sanity check on the first record only: encode then decode and print both
    # versions so tokenizer round-trip differences are visible.
    data = data_list[0]

    instruction = data["instruction"]
    system = data["system"]
    output = data["output"]

    input_str = f"{instruction} {system}"
    output_str = output

    input_token = tokenizer(input_str, add_special_tokens=True)["input_ids"]
    output_token = tokenizer(output_str, add_special_tokens=True)["input_ids"]

    decode_input_str = tokenizer.decode(input_token, skip_special_tokens=False)
    decode_output_str = tokenizer.decode(output_token, skip_special_tokens=False)
    print(f"[[input_str]] {input_str}")
    print(f"[[decode_str]] {decode_input_str}")

    print(f"[[output_str]] {output_str}")
    print(f"[[decode_str]] {decode_output_str}")

    return data_list

In [None]:
def stat_token(data, tokenizer):
    """Measure character and token lengths for a single record.

    Args:
        data: Mapping with the keys ``"instruction"``, ``"system"``,
            ``"output"`` and ``"metadata"``. ``"metadata"`` is a JSON string
            whose decoded object has a ``"content_str"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True)``
            whose result exposes the token ids under ``"input_ids"``.

    Returns:
        A dict with ``input_str_len``, ``input_token_len``, ``output_str_len``,
        ``output_token_len`` and ``content_len``. The input text is
        ``"<instruction> <system>"``; ``content_len`` is the length of
        ``content_str`` with every ``WORD_SEP`` occurrence removed.
    """
    prompt_text = f"{data['instruction']} {data['system']}"
    target_text = data["output"]

    # Tokenize the prompt first, then the target.
    prompt_ids, target_ids = (
        tokenizer(text, add_special_tokens=True)["input_ids"]
        for text in (prompt_text, target_text)
    )

    # The separator marker is not part of the content, so drop it before counting.
    content = json.loads(data["metadata"])["content_str"].replace(WORD_SEP, '')

    return dict(
        input_str_len=len(prompt_text),
        input_token_len=len(prompt_ids),
        output_str_len=len(target_text),
        output_token_len=len(target_ids),
        content_len=len(content),
    )

def batch_stat_token(data_list, tokenizer):
    """Apply ``stat_token`` to every record of ``data_list``.

    Returns:
        A list of the per-record statistics dicts, in the same order as
        ``data_list``.
    """
    return [stat_token(record, tokenizer) for record in data_list]

In [None]:
def stat_token_from_data_list(data_list, tokenizer, num_workers=20, batch_size=10000):
    """Compute per-record length statistics in parallel worker processes.

    ``data_list`` is cut into consecutive slices of ``batch_size`` records and
    each slice is handed to ``batch_stat_token`` in a ``ProcessPoolExecutor``.

    Args:
        data_list: Sequence of records accepted by ``stat_token``.
        tokenizer: Tokenizer forwarded to every worker. It is passed as an
            argument of each submitted task, so it must be picklable.
        num_workers: Maximum number of worker processes (default 20).
        batch_size: Number of records per submitted task (default 10000).

    Returns:
        A list of statistics dicts in the same order as ``data_list``, i.e.
        element ``i`` describes ``data_list[i]``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any exception raised inside a worker is re-raised here by
            ``Future.result()``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not data_list:
        # Nothing to do; don't spin up a process pool for empty input.
        return []

    token_len_list = []
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Slicing past the end of the list is safe, so no explicit end clamp.
        futures = [
            executor.submit(
                batch_stat_token, data_list[start_idx:start_idx + batch_size], tokenizer
            )
            for start_idx in range(0, len(data_list), batch_size)
        ]

        # Collect in submission order (not completion order) so the output rows
        # stay aligned with data_list. Batches still run concurrently; only the
        # gathering is ordered.
        for future in futures:
            token_len_list.extend(future.result())
    return token_len_list

In [None]:
def _plot_length_row(str_ax, token_ax, token_len_df, str_col, token_col, quantiles):
    """Draw one row: string-length histogram (left) and token-length histogram (right).

    The token-length panel gets one dashed vertical line per requested
    quantile. Both panels then receive the same x/y limits, starting at 0.
    """
    str_ax.hist(token_len_df[str_col], bins=100)
    str_ax.set_title(str_col, fontdict={"fontsize": 10})
    token_ax.hist(token_len_df[token_col], bins=100)
    token_ax.set_title(token_col, fontdict={"fontsize": 10})

    quantile_values = token_len_df[token_col].quantile(quantiles)
    for q, val in zip(quantiles, quantile_values):
        # round() instead of int(): q * 100 is a float and could land just
        # below the intended integer percentage.
        token_ax.axvline(
            x=val, color='r', linestyle='--', alpha=0.7, linewidth=0.8,
            label=f'{round(q * 100)}% quantile: {int(val)}',
        )
    token_ax.legend(fontsize=8)

    # Use the larger of the two auto-scaled upper limits for both panels.
    y_max = max(str_ax.get_ylim()[1], token_ax.get_ylim()[1])
    x_max = max(str_ax.get_xlim()[1], token_ax.get_xlim()[1])
    for panel in (str_ax, token_ax):
        panel.set_ylim(0, y_max)
        panel.set_xlim(0, x_max)


def plot_hist(token_len_df, data_name, output_dir="misc/token_length"):
    """Plot, save and show the length distributions of one dataset.

    Layout (2 x 3 grid):
        row 0: ``input_str_len`` histogram, ``input_token_len`` histogram with
               quantile lines, unused panel (hidden);
        row 1: ``output_str_len`` histogram, ``output_token_len`` histogram
               with quantile lines, scatter of ``content_len`` against
               ``output_token_len``.

    Args:
        token_len_df: DataFrame with the columns ``input_str_len``,
            ``input_token_len``, ``output_str_len``, ``output_token_len`` and
            ``content_len``.
        data_name: Used in the figure title and as the PNG file stem.
        output_dir: Directory the PNG is written to (created if missing).
            Defaults to ``misc/token_length`` relative to the working directory.

    Raises:
        ValueError: If ``token_len_df`` has no rows.
    """
    if len(token_len_df) == 0:
        # Quantiles of an empty column are NaN and int(NaN) fails; fail early
        # with a clear message instead.
        raise ValueError("token_len_df is empty; nothing to plot")

    fig, ax = plt.subplots(figsize=(12, 7), nrows=2, ncols=3)
    try:
        _plot_length_row(
            ax[0][0], ax[0][1], token_len_df,
            "input_str_len", "input_token_len",
            quantiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
        )
        # The output row additionally marks the maximum (100% quantile).
        _plot_length_row(
            ax[1][0], ax[1][1], token_len_df,
            "output_str_len", "output_token_len",
            quantiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0],
        )

        # Scatter plot of content_len vs output_token_len.
        scatter_ax = ax[1][2]
        scatter_ax.scatter(token_len_df["content_len"], token_len_df["output_token_len"], s=0.5)

        for panel in ax.flatten():
            panel.set_xlabel("length", fontsize=8)
            panel.set_ylabel("count", fontsize=8)
            panel.tick_params(axis='both', which='major', labelsize=8)

        # The scatter panel has its own axis meaning; override the generic labels.
        scatter_ax.set_xlabel("content_len")
        scatter_ax.set_ylabel("output_token_len")
        scatter_ax.set_xlim(0, scatter_ax.get_xlim()[-1])
        scatter_ax.set_ylim(0, scatter_ax.get_ylim()[-1])

        # Nothing is drawn in the top-right panel; hide it rather than showing
        # an empty axes labelled "length"/"count".
        ax[0][2].set_axis_off()

        fig_title = f"token/string length distribution {data_name}"
        fig.suptitle(fig_title)

        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{data_name}.png"
        # Save through the figure object, not pyplot's implicit current figure.
        fig.savefig(output_path, dpi=300, bbox_inches="tight")
        print(f"save to {output_path}")
        plt.show()
    finally:
        # This function is called once per dataset in a loop; release the
        # figure so open figures do not accumulate.
        plt.close(fig)


In [None]:
# Sub-directories (one per split) of the dataset directory to analyse.
pairs = [
    "ood_font_family",
    "train_font_family",
]
# Alternative dataset location, kept for reference:
# jsonl_base_dir = storage_base_dir / "workspace/svg_glyph_llm/data/processed/sft/250814-oxford_5000-100_fonts-apply_word_sep"
# Relative path: resolved against the current working directory.
jsonl_base_dir = "data/processed_envato/sft/250903-envato-alphanumeric"
for data_name in pairs:
    jsonl_dir = Path(jsonl_base_dir) / data_name
    print(f"jsonl_dir: {jsonl_dir}")
    data_list = load_data_list(jsonl_dir)
    if not data_list:
        # An empty split would yield a column-less DataFrame and make
        # plot_hist fail on the missing columns.
        print(f"no records found in {jsonl_dir}; skipping")
        continue

    token_len_list = stat_token_from_data_list(data_list, tokenizer)

    token_len_df = pd.DataFrame(token_len_list)
    # A bare `token_len_df.head()` is only rendered when it is the last
    # expression of a cell; inside a loop it must be printed explicitly.
    print(token_len_df.head())

    plot_title = f"{Path(jsonl_base_dir).name}-{data_name}"
    plot_hist(token_len_df, plot_title)


In [None]:
# Sub-directories (one per split) of the dataset directory to analyse.
pairs = [
    "ood_font_family",
    "train_font_family",
]
# Alternative dataset location, kept for reference:
# jsonl_base_dir = storage_base_dir / "workspace/svg_glyph_llm/data/processed/sft/250814-oxford_5000-100_fonts-apply_word_sep"
jsonl_base_dir = storage_base_dir / "workspace/svg_glyph_llm/data/processed/sft/250903-alphanumeric/"
for data_name in pairs:
    jsonl_dir = Path(jsonl_base_dir) / data_name
    print(f"jsonl_dir: {jsonl_dir}")
    data_list = load_data_list(jsonl_dir)
    if not data_list:
        # An empty split would yield a column-less DataFrame and make
        # plot_hist fail on the missing columns.
        print(f"no records found in {jsonl_dir}; skipping")
        continue

    token_len_list = stat_token_from_data_list(data_list, tokenizer)

    token_len_df = pd.DataFrame(token_len_list)
    # A bare `token_len_df.head()` is only rendered when it is the last
    # expression of a cell; inside a loop it must be printed explicitly.
    print(token_len_df.head())

    plot_title = f"{Path(jsonl_base_dir).name}-{data_name}"
    plot_hist(token_len_df, plot_title)

In [None]:
# Sub-directories (one per split) of the dataset directory to analyse.
pairs = [
    "train-alphanumeric",
    "ood_test-alphanumeric",
    "train-sample_100",
    "ood_test-sample_30-contents_600",
]
jsonl_base_dir = storage_base_dir / "workspace/svg_glyph_llm/data/processed/sft/250903-oxford_5000-100_fonts-apply_word_sep"
for data_name in pairs:
    jsonl_dir = Path(jsonl_base_dir) / data_name
    print(f"jsonl_dir: {jsonl_dir}")
    data_list = load_data_list(jsonl_dir)
    if not data_list:
        # An empty split would yield a column-less DataFrame and make
        # plot_hist fail on the missing columns.
        print(f"no records found in {jsonl_dir}; skipping")
        continue

    token_len_list = stat_token_from_data_list(data_list, tokenizer)

    token_len_df = pd.DataFrame(token_len_list)
    # A bare `token_len_df.head()` is only rendered when it is the last
    # expression of a cell; inside a loop it must be printed explicitly.
    print(token_len_df.head())

    plot_title = f"{Path(jsonl_base_dir).name}-{data_name}"
    plot_hist(token_len_df, plot_title)
