In [15]:
import os
import pickle
import torch
import pandas as pd

from tqdm import tqdm
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import f1_score, accuracy_score
import datasets
import sys
sys.path.append("/home/xucong24/Compiler")
from src.model.tokenizer import Inst2VecTokenizer
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


In [16]:
data_folder = '/home/xucong24/Compiler/datasets/POJ104Dataset'
out_folder = '/home/xucong24/Compiler/work_dirs/modernbert_inst2vec_for_classifyapp'
model_path = "/home/xucong24/Compiler/work_dirs/inst2vec_poj104_modernbert/20250825_063731/final_model"
tokenizer_path = "/home/xucong24/Compiler/work_dirs/inst2vec_poj104_modernbert/20250825_063731/final_model"

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

## 数据预处理

In [6]:
# 加载分词器
print("Loading tokenizer...")
tokenizer = Inst2VecTokenizer.from_pretrained(tokenizer_path)

Loading tokenizer...


In [4]:
dataset = datasets.load_from_disk(data_folder)
dataset

DatasetDict({
    train: Dataset({
        features: ['llvm', 'label'],
        num_rows: 221344
    })
    test: Dataset({
        features: ['llvm', 'label'],
        num_rows: 9227
    })
    val: Dataset({
        features: ['llvm', 'label'],
        num_rows: 9155
    })
})

In [25]:
def tokenize_func(examples):
    tokenized_data = tokenizer(
        examples['llvm'], 
        padding=True, 
        truncation=True, 
        max_length=512, 
        return_tensors="pt"
    )
    
    # 将 label 从 1-based 映射到 0-based  这里要求batch=false
    labels = int(examples['label']) - 1
    tokenized_data['labels'] = labels
    return tokenized_data

tokenized_dataset = dataset.map(
    tokenize_func,
    batched=False,
    num_proc=32,
    remove_columns=dataset['train'].column_names
)

Map (num_proc=32):   0%|          | 0/221344 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/9227 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/9155 [00:00<?, ? examples/s]

## 加载模型

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, 
    num_labels=104
)
model

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at /home/xucong24/Compiler/work_dirs/inst2vec_poj104_modernbert/20250825_063731/final_model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(8569, 768, padding_idx=8565)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      )


## 模型训练

In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
training_args = TrainingArguments(
    output_dir=out_folder,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=out_folder,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# 创建训练器
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['val'],
    eval_dataset=tokenized_dataset['val'],
    compute_metrics=compute_metrics,
)

# 开始训练
trainer.train()

# 保存模型
trainer.save_model(out_folder)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6541,0.447803,0.884544,0.879137,0.905913,0.884544
