In [40]:
import sys
from pathlib import Path
sys.path.append("/home/xucong24/Compiler")

from transformers import (
    EncoderDecoderConfig,
    EncoderDecoderModel,
    AutoConfig,
    GPT2Config,
    GPT2LMHeadModel
)
import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import time
import os

In [41]:
from src.model import (
    Inst2VecTokenizer,
    OptiSeqTokenizer
)

# load tokenizer
encoder_tokenizer_path = "/home/xucong24/Compiler/checkpoints/Inst2VecTokenizer"
decoder_tokenizer_path = "/home/xucong24/Compiler/checkpoints/OptiSeqTokenizer"

encoder_tokenizer = Inst2VecTokenizer.from_pretrained(encoder_tokenizer_path)
decoder_tokenizer = OptiSeqTokenizer.from_pretrained(decoder_tokenizer_path)

decoder_tokenizer.vocab_size, decoder_tokenizer.bos_token_id, decoder_tokenizer.eos_token_id, decoder_tokenizer.pad_token_id

(128, 126, 127, 125)

In [1]:
from transformers import (
    GPT2Config,
    AutoModel
)

config = {
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 6,
    "bos_token_id": 126,
    "eos_token_id": 127,
    "vocab_size": 128,
    "add_cross_attention": True,
    # "architectures": [
    #     "GPT2LMHeadModel"
    # ],
    # "task_specific_params": {
    #     "text-generation": {
    #     "do_sample": True,
    #     "max_length": 50
    #     }
    # },
}

decoder_config = GPT2Config(**config)
encoder = AutoModel.from_pretrained("/home/xucong24/Compiler/checkpoints/modernbert_poj104_mlm")
decoder = GPT2LMHeadModel(decoder_config)

NameError: name 'GPT2LMHeadModel' is not defined

In [53]:
from transformers import (
    EncoderDecoderConfig,
    AutoConfig,
    PretrainedConfig
)

class PassformerConfig(EncoderDecoderConfig):

    model_type = "passformer"

    def __init__(
        self,
        fusion_method: str = "add",  # "concat", "add", "cross_attention"
        **kwargs
    ):
        super().__init__(**kwargs)
        self.fusion_method = fusion_method
        self.autophase_dim = 56
        self.decoder_start_token_id = 126
        self.pad_token_id = 125
        self.vocab_size = 128


In [63]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import time
import os
from typing import Optional, Tuple, Union, List
from transformers import (
    PreTrainedModel
)

class AutophaseProjection(nn.Module):
    """Project Autophase features to encoder/decoder hidden dimension."""
    
    def __init__(self, autophase_dim: int, hidden_dim: int):
        super().__init__()
        # self.proj = nn.Sequential(
        #     nn.Linear(autophase_dim, hidden_dim),
        #     nn.LayerNorm(hidden_dim),
        #     nn.GELU(),
        # )

        immediate_dim = hidden_dim // 2

        self.proj = nn.Sequential(
            nn.Linear(autophase_dim, immediate_dim),
            nn.LayerNorm(immediate_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(immediate_dim, immediate_dim),
            nn.LayerNorm(immediate_dim),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(immediate_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
        )
    
    
    def forward(self, autophase: torch.Tensor) -> torch.Tensor:
        return self.proj(autophase).unsqueeze(1)


class AutophaseCrossAttention(nn.Module):
    """Cross-attention module for Autophase feature fusion."""
    
    def __init__(self, hidden_dim: int, num_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        self.cross_attn = nn.MultiheadAttention(
            hidden_dim, num_heads, dropout=dropout, batch_first=True
        )
        self.norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(
        self, 
        hidden_states: torch.Tensor, 
        autophase_emb: torch.Tensor
    ) -> torch.Tensor:
        attn_output, _ = self.cross_attn(
            query=hidden_states,
            key=autophase_emb,
            value=autophase_emb,
        )
        return self.norm(hidden_states + self.dropout(attn_output))

# ============================================================================
# Main Model
# ============================================================================

class PassformerModel(EncoderDecoderModel):
    """
        >>> from src.model import PassformerModel, PassformerConfig, Inst2VecTokenizer, OptiSeqTokenizer
        >>> from transformers import AutoModel, AutoConfig, GPT2LMHeadModel, GPT2Config
        >>> 
        >>> # 步骤1: 加载 tokenizers（必需）
        >>> encoder_tokenizer = Inst2VecTokenizer.from_pretrained("path/to/encoder_tokenizer")
        >>> decoder_tokenizer = OptiSeqTokenizer.from_pretrained("path/to/decoder_tokenizer")
        >>> 
        >>> # 步骤2: 创建 encoder 和 decoder
        >>> encoder = AutoModel.from_pretrained("path/to/encoder_model")
        >>> decoder_config = GPT2Config(
        ...     vocab_size=decoder_tokenizer.vocab_size,
        ...     n_embd=768,
        ...     n_layer=6,
        ...     n_head=12,
        ... )
        >>> decoder = GPT2LMHeadModel(decoder_config)

        >>> # 步骤4: 创建模型
        >>> model = PassformerModel(encoder=encoder, decoder=decoder, fusion_method='add')
        >>> 
        >>> # 步骤5: 使用 tokenizer 进行编码
        >>> llvm_ir = "define void @main() { ... }"
        >>> encoder_inputs = encoder_tokenizer(llvm_ir, max_length=1024, return_tensors="pt")
        >>> 
        >>> opti_seq = "mem2reg instcombine gvn"
        >>> decoder_inputs = decoder_tokenizer(opti_seq, max_length=256, return_tensors="pt")
        >>> 
        >>> # 模型前向传播
        >>> outputs = model(
        ...     input_ids=encoder_inputs["input_ids"],
        ...     attention_mask=encoder_inputs["attention_mask"],
        ...     decoder_input_ids=decoder_inputs["input_ids"],
        ...     decoder_attention_mask=decoder_inputs["attention_mask"],
        ...     labels=decoder_inputs["input_ids"],
        ...     autophase=autophase_features,  # [batch, 56]
        ... )
    """

    config: PassformerConfig
    
    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
        fusion_method: Optional[str] = None,
        **kwargs
    ):

        super().__init__(encoder=encoder, decoder=decoder, config=config, **kwargs)

        config_dict = self.config.to_dict()
        if fusion_method is not None:
            config_dict['fusion_method'] = fusion_method
        
        self.config = PassformerConfig(**config_dict)

        # 获取 hidden sizes
        encoder_hidden_size = self.encoder.config.hidden_size
        decoder_hidden_size = self.decoder.config.n_embd
        
        if self.config.fusion_method == "concat":
            # 在 encoder 输出后拼接
            self.autophase_proj = AutophaseProjection(
                self.config.autophase_dim, encoder_hidden_size
            )
        elif self.config.fusion_method == "add":
            # 加到 encoder 输出
            self.autophase_proj = AutophaseProjection(
                self.config.autophase_dim, encoder_hidden_size
            )
        elif self.config.fusion_method == "cross_attention":
            # Cross-attention 融合
            self.autophase_proj = AutophaseProjection(
                self.config.autophase_dim, encoder_hidden_size
            )
            self.autophase_cross_attn = AutophaseCrossAttention(
                encoder_hidden_size, 
                num_heads=8
            )
        elif self.config.fusion_method is not None and self.config.fusion_method != "none":
            raise ValueError(f"Unknown fusion method: {self.config.fusion_method}")
    
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        autophase: Optional[torch.FloatTensor] = None,
        **kwargs,
    ):
        # 处理 Autophase 融合
        if autophase is not None:
            if encoder_outputs is None:
                encoder_outputs = self.encoder(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                    return_dict=True,
                )
            
            # 融合 autophase 特征
            encoder_hidden_states = encoder_outputs.last_hidden_state
            fused_hidden_states, updated_attention_mask = self._fuse_autophase(
                encoder_hidden_states, attention_mask, autophase
            )
            
            # 更新 encoder_outputs
            encoder_outputs.last_hidden_state = fused_hidden_states
            attention_mask = updated_attention_mask
        
        # 调用父类的 forward 方法
        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_inputs_embeds=decoder_inputs_embeds,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )
    
    def generate(
        self,
        inputs=None,
        generation_config=None,
        logits_processor=None,
        stopping_criteria=None,
        prefix_allowed_tokens_fn=None,
        synced_gpus=False,
        autophase=None,
        **kwargs,
    ):
        """
        Generate optimization sequences with Autophase support.
        
        Args:
            inputs: Encoder input token IDs [batch, seq_len]
            autophase: Autophase features [batch, autophase_dim]
            ... (其他标准 generate 参数)
        
        Returns:
            Generated token IDs [batch, gen_len]
        """

        if self.config.fusion_method != "none" and autophase is None:
            raise ValueError("Autophase features are required for generation when fusion method is not 'none'")
            
        # Autophase 融合
        if autophase is not None:
            # 获取 encoder 输入
            if kwargs.get("attention_mask") is None:
                attention_mask = torch.ones_like(inputs)
            else:
                attention_mask = kwargs.get("attention_mask")
            
            # 编码并融合 autophase
            encoder_outputs = self.encoder(
                input_ids=inputs,
                attention_mask=attention_mask,
                return_dict=True,
            )
            encoder_hidden_states = encoder_outputs.last_hidden_state
            fused_hidden_states, updated_attention_mask = self._fuse_autophase(
                encoder_hidden_states, attention_mask, autophase
            )
            
            # 更新 encoder_outputs
            encoder_outputs.last_hidden_state = fused_hidden_states
            kwargs["encoder_outputs"] = encoder_outputs
            kwargs["attention_mask"] = updated_attention_mask
        
        # 调用父类的 generate 方法
        return super().generate(
            inputs=inputs,
            generation_config=generation_config,
            logits_processor=logits_processor,
            stopping_criteria=stopping_criteria,
            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
            synced_gpus=synced_gpus,
            **kwargs,
        )

    
    def _fuse_autophase(
        self,
        encoder_hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        autophase: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size = encoder_hidden_states.size(0)
        device = encoder_hidden_states.device

        if autophase.dtype != torch.float32 and autophase.dtype != torch.float16:
            autophase = autophase.float()
        
        
        if self.config.fusion_method == "concat":
            # Project autophase and concatenate as extra token
            autophase_emb = self.autophase_proj(autophase)  # [batch, 1, hidden]
            fused = torch.cat([autophase_emb, encoder_hidden_states], dim=1)
            
            # Update attention mask
            autophase_mask = torch.ones(batch_size, 1, device=device, dtype=attention_mask.dtype)
            attention_mask = torch.cat([autophase_mask, attention_mask], dim=1)
            
            return fused, attention_mask
        
        elif self.config.fusion_method == "add":
            # Add autophase embedding to all positions]
            autophase_emb = self.autophase_proj(autophase)  # [batch, 1, hidden]
            fused = encoder_hidden_states + autophase_emb
            return fused, attention_mask
        
        elif self.config.fusion_method == "cross_attention":
            # Use cross-attention to fuse
            autophase_emb = self.autophase_proj(autophase)  # [batch, 1, hidden]
            fused = self.autophase_cross_attn(encoder_hidden_states, autophase_emb)
            return fused, attention_mask
        
        else:
            return encoder_hidden_states, attention_mask
    

model = PassformerModel(encoder=encoder, decoder=decoder, fusion_method='add')

In [55]:
model.config

PassformerConfig {
  "autophase_dim": 56,
  "decoder": {
    "activation_function": "gelu_new",
    "add_cross_attention": true,
    "architectures": [
      "GPT2LMHeadModel"
    ],
    "attn_pdrop": 0.1,
    "bos_token_id": 126,
    "embd_pdrop": 0.1,
    "eos_token_id": 127,
    "initializer_range": 0.02,
    "is_decoder": true,
    "layer_norm_epsilon": 1e-05,
    "model_type": "gpt2",
    "n_embd": 768,
    "n_head": 12,
    "n_inner": null,
    "n_layer": 6,
    "n_positions": 1024,
    "reorder_and_upcast_attn": false,
    "resid_pdrop": 0.1,
    "scale_attn_by_inverse_layer_idx": false,
    "scale_attn_weights": true,
    "summary_activation": null,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": true,
    "summary_type": "cls_index",
    "summary_use_proj": true,
    "task_specific_params": {
      "text-generation": {
        "do_sample": true,
        "max_length": 50
      }
    },
    "use_cache": true,
    "vocab_size": 128
  },
  "decoder_start_token_id":

In [56]:
model

PassformerModel(
  (encoder): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(8569, 768, padding_idx=8565)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      )
      (1-21): 21 x

In [57]:
from datasets import load_from_disk
from src.data import create_tokenize_fn

dataset = load_from_disk("/home/xucong24/Compiler/datasets/ga_llvm_37k")
dataset_selected = dataset.select(range(5))
    
# Tokenize 数据集
tokenize_fn = create_tokenize_fn(
    encoder_tokenizer,
    decoder_tokenizer,
    128,
    32,
    include_autophase=True
)

remove_columns = [
    'Benchmark', 'CpuInfo', 'IrInstructionCountO0',
    'IrInstructionCountO3', 'IrInstructionCountOz',
    'InstCount', 'Reward', 'LLVM_IR', 'Commandline', 'Autophase'
]
existing_columns = set(dataset.column_names)
remove_columns = [col for col in remove_columns if col in existing_columns]

tokenized_data = dataset_selected.map(
    tokenize_fn,
    batched=False,
    remove_columns=remove_columns,
    num_proc=32,
    desc="Tokenizing"
)

tokenized_data = tokenized_data.train_test_split(test_size=0.2, seed=42)

Loading dataset from disk:   0%|          | 0/36 [00:00<?, ?it/s]

num_proc must be <= 5. Reducing num_proc to 5 for dataset of size 5.


In [58]:
sample = tokenized_data['test'][0]
input_ids = torch.tensor(sample['input_ids'], dtype=torch.long).unsqueeze(0)
attention_mask = torch.tensor(sample['attention_mask'], dtype=torch.long).unsqueeze(0)
autophase = torch.tensor(sample['autophase'], dtype=torch.float32).unsqueeze(0)
labels = torch.tensor(sample['labels'], dtype=torch.long).unsqueeze(0)

input_ids, attention_mask, autophase, labels

(tensor([[8566,    5, 8564,    8, 8564,    8, 8564, 8564, 8564, 8564,   22,   46,
          8564,  289,  289, 8564,  289, 4062,  364,  311,  594,  293, 3032,  364,
           295,  198,  204, 8564,  597,  600,  213, 8564,  198,  204, 8564,  364,
           216, 8564,  231,  231, 8564, 4065,  200,  622, 1013,  295, 8564,   79,
           252,  295, 8564,   79,  188,  961,  194,  295, 8564,  293, 3032,  481,
            77,  263,   77,  289,  289,  289,  289,  293, 3032,  293, 3032,  364,
           293, 3032,  293, 3032,  364,  216, 8564,  295,  756,  204, 8564,  295,
           311,  364,  216, 8564,  295,  425,  204, 8564,  295,  547,  634,  204,
          8564,  295,  632,  883,  311,  216, 8564,  295,  632,  311,  216, 8564,
           216, 8564,  295,  431,  311,  216, 8564,  295,  295,  252, 8564,  311,
           216, 8564,  295,  431,  311,  216, 8564,  293]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [59]:
logits = model(
    input_ids=input_ids, 
    attention_mask=attention_mask, 
    autophase=autophase, 
    labels=labels
)
logits

  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Seq2SeqLMOutput(loss=tensor(4.8353, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.5258, -0.1056,  0.2954,  ...,  0.0416,  0.2911,  0.7247],
         [ 0.8534, -0.8265,  0.1497,  ..., -0.2997,  0.4178,  1.2306],
         [ 0.3849, -0.3823, -0.0124,  ..., -0.0802, -0.3761,  0.9375],
         ...,
         [ 0.2424, -0.6945, -0.2562,  ...,  0.0376,  0.1115,  1.3001],
         [ 0.6673, -0.2131, -0.1713,  ..., -0.7453,  0.0637,  1.1720],
         [ 0.4946,  0.1214,  0.0904,  ..., -0.5475, -0.3116,  1.1780]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=EncoderDecoderCache(self_attention_cache=DynamicCache(layers=[DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer]), cross_attention_cache=DynamicCache(layers=[DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer, DynamicLayer])), decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=tensor([[[-8.5094e-01,  9.8385e-01, -1.8286e+00,  .

In [70]:
model.generate(inputs=input_ids, autophase=autophase)

Setting `pad_token_id` to `eos_token_id`:127 for open-end generation.


tensor([[126,  84,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,  57,
          57,  57,  57,  57,  57,  24,  24]])

In [68]:
from transformers import Trainer, TrainingArguments


training_args = TrainingArguments(
    num_train_epochs=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test']
)

trainer.train()


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


Step,Training Loss


TrainOutput(global_step=2, training_loss=4.693825721740723, metrics={'train_runtime': 5.8725, 'train_samples_per_second': 1.362, 'train_steps_per_second': 0.341, 'total_flos': 1029195694080.0, 'train_loss': 4.693825721740723, 'epoch': 2.0})

In [69]:
model.save_pretrained("passformer_v1")

In [71]:
model = PassformerModel.from_pretrained("passformer_v1")

In [None]:
from datasets import load_from_disk
data = load_from_disk("/home/xucong24/Compiler/datasets/ga_llvm_37k")
data = data.train_test_split(test_size=0.1, seed=42)
data.save_to_disk("/home/xucong24/Compiler/datasets/ga_llvm_37k_splited")


Loading dataset from disk:   0%|          | 0/36 [00:00<?, ?it/s]

Saving the dataset (0/33 shards):   0%|          | 0/32916 [00:00<?, ? examples/s]