In [1]:
from modelscope import SwinModel, SwinConfig
from modelscope import AutoImageProcessor

encoder_config = SwinConfig.from_pretrained(
    "microsoft/swin-tiny-patch4-window7-224"
)

encoder = SwinModel.from_pretrained(
    "microsoft/swin-tiny-patch4-window7-224",
    config=encoder_config
)
feature_extractor = AutoImageProcessor.from_pretrained(
    "microsoft/swin-tiny-patch4-window7-224"
)

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/microsoft/swin-tiny-patch4-window7-224
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/microsoft/swin-tiny-patch4-window7-224
Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/microsoft/swin-tiny-patch4-window7-224


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
from modelscope import AutoTokenizer, VisionEncoderDecoderModel, AutoImageProcessor

tokenizer = AutoTokenizer.from_pretrained(
    "MixTeX/MixTex-ZhEn-Latex-OCR",
    use_fast=False
)

Downloading Model from https://www.modelscope.cn to directory: /root/.cache/modelscope/hub/models/MixTeX/MixTex-ZhEn-Latex-OCR


In [3]:
from modelscope import BertConfig, BertLMHeadModel

decoder_config = {
    "vocab_size": tokenizer.vocab_size,
    "d_model": 512,
    "num_hidden_layers": 6,
    "num_attention_heads": 8,
    "dim_feedforward": 2048,
    "max_position_embeddings": 320,
    "activation_function": "gelu",
}

decoder = BertLMHeadModel(
    BertConfig(
        **decoder_config,
        is_decoder=True,
        add_cross_attention=True
    )
)

In [4]:
# 模型拼装
max_length = 296

model = VisionEncoderDecoderModel(
    encoder=encoder,
    decoder=decoder
)

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.max_length = 296

model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
model.generation_config.eos_token_id = tokenizer.sep_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.max_length = 296

# resize embedding
new_vocab_size = len(tokenizer)

model.decoder.resize_token_embeddings(new_vocab_size)
model.config.vocab_size = new_vocab_size
model.generation_config.vocab_size = new_vocab_size

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
import numpy as np
# 定义工具函数
def normalize(s: str) -> str:
    # 与论文一致：仅去除空白，不做结构等价
    return s.replace(" ", "").replace("\n", "")


def compute_metrics(eval_preds):
    pred_ids, label_ids = eval_preds

    # Token Accuracy
    correct = 0
    total = 0
    for p, l in zip(pred_ids, label_ids):
        for pi, li in zip(p, l):
            if li == -100:
                continue
            total += 1
            if pi == li:
                correct += 1
    token_acc = correct / max(total, 1)

    # Exact Match
    em_correct = 0
    for p, l in zip(pred_ids, label_ids):
        pred = tokenizer.decode(
            p,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        gt = tokenizer.decode(
            np.where(l != -100, l, tokenizer.pad_token_id),
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )
        if normalize(pred) == normalize(gt):
            em_correct += 1

    em = em_correct / len(pred_ids)

    return {
        "exact_match": em,
        "token_accuracy": token_acc,
    }

In [6]:
from datasets import load_dataset
from PIL import Image
import torch
from torch.utils.data import Dataset

# raw_dataset = load_dataset("MixTex/Pseudo-Latex-ZhEn-1")

raw_dataset = load_dataset(
    "parquet",
    data_files="pseudo_latex_train.parquet",
    # split="train",
)

class MixTexDataset(Dataset):
    def __init__(
        self,
        hf_dataset_split,
        tokenizer,
        feature_extractor,
        max_length=296,
    ):
        self.data = hf_dataset_split
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]

        image = sample["image"].convert("RGB")
        text = sample["text"]

        pixel_values = self.feature_extractor(
            image, return_tensors="pt"
        ).pixel_values.squeeze(0)

        labels = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).input_ids.squeeze(0)

        # pad -> -100
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "pixel_values": pixel_values,
            "labels": labels,
        }

split = raw_dataset["train"].train_test_split(
    test_size=0.01,
    seed=42,
)

train_dataset = MixTexDataset(
    split["train"],
    tokenizer,
    feature_extractor,
)

val_dataset = MixTexDataset(
    split["test"],
    tokenizer,
    feature_extractor,
)

In [7]:
# 第一阶段训练，先冻结encoder
from modelscope import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator

data_collator = default_data_collator

training_args = Seq2SeqTrainingArguments(
    output_dir="./mixtex_ckpt",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="steps",
    logging_steps=200,
    eval_steps=1000,
    save_steps=1000,
    num_train_epochs=5,
    learning_rate=1e-4,
    warmup_steps=500,
    fp16=True,
    predict_with_generate=True,
    generation_max_length=296,
    generation_num_beams=1,   # greedy，与论文一致
    report_to="none",
    save_total_limit=2,
)

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

trainer = Seq2SeqTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
)

# Freeze encoder
for p in model.encoder.parameters():
    p.requires_grad = False

trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss,Exact Match,Token Accuracy
1000,5.3046,5.26084,0.0,0.034989
2000,4.9784,4.926052,0.0,0.036357
3000,4.7775,4.764875,0.0,0.037317
4000,4.6894,4.626085,0.0,0.035331
5000,4.5693,4.529797,0.0,0.035453
6000,4.4689,4.446381,0.0,0.036169
7000,4.4497,4.379902,0.0,0.038867
8000,4.3692,4.332084,0.0,0.038891
9000,4.3339,4.281874,0.0,0.037916
10000,4.2688,4.247337,0.0,0.040446




TrainOutput(global_step=36425, training_loss=4.189468571879558, metrics={'train_runtime': 19884.2888, 'train_samples_per_second': 29.31, 'train_steps_per_second': 1.832, 'total_flos': 4.466184087843717e+19, 'train_loss': 4.189468571879558, 'epoch': 5.0})