In [5]:
import sys
import os
import math
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm

from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

import os
import logging
from datetime import datetime
import argparse
import yaml
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    logging as hf_logging
)
from datasets import load_from_disk
from sklearn.model_selection import StratifiedKFold

In [6]:
# Directory of the saved ModernBERT checkpoint that the notebook loads
model_path = "/home/xucong24/Compiler/work_dirs/modernbert_poj104_mlm_train/20250923_141929/final_model"

# Load the tokenizer stored in that checkpoint directory
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading tokenizer...


In [7]:
print("Initializing model...")
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load the masked-LM checkpoint, then keep only its `.model` submodule
# (shown in the next cell's output as a ModernBertModel) and move it to `device`.
modernbert = AutoModelForMaskedLM.from_pretrained(model_path)
modernbert = modernbert.model
modernbert = modernbert.to(device)
# Total number of parameters in the retained submodule.
n_params = sum(p.numel() for p in modernbert.parameters())
print(f"{n_params=:.4e}")

Initializing model...
Using device: cuda
n_params=1.4901e+08


In [8]:
# Display the module tree of the loaded encoder.
modernbert

ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, e

In [9]:
# Read one sample LLVM IR file to try the tokenizer and encoder on.
with open("/home/xucong24/Compiler/datasets/poj104/ir_test/86/21.ll", "r") as f:
    llvm = f.read()
# Tokenize it as a batch of one, padded/truncated to exactly 512 tokens, as PyTorch tensors.
inputs = tokenizer([llvm], return_tensors="pt", max_length=512, padding='max_length', truncation=True)  
inputs

{'input_ids': tensor([[50281,    28, 30073,  1838,   426, 17882,  8658,  1506,    16, 22559,
         15453,    16,  2437,  1419,  1212,    64,  3211,    16,  2566,    16,
          2691,    16,  1797,    15, 10134,    15, 14161,     8,   187,  7831,
          2856,   267,  5038,   426,   346,    70,    14,    78,    27,    70,
            14,    74,  1540,    27,  1540,    14,    71,  1438,    27,  8196,
            14,    79,    25,    27,  1036,    27,  1237,    27,  1540,    14,
            52,  8196,     3,   187,  7831, 16260,   426,   346,    89,  2691,
            64,  1540,    14, 29469,    14, 13217,    14, 26497,     3,   187,
           187, 28764,  2437,    15,  8400,  1450,  3783,    64,  4793,  1450,
         10828,     3,   426,  1511,   551,   891,    25,   748,   187,   187,
            33,    64,    59,   998,    45,    25,   876,   900,  4478,   426,
          4812,  4156,  2462,     3,  2437,    15,  8400,  1450,  3783,    64,
          4793,  1450, 10828,     3,  

In [10]:
# Move every tokenizer output tensor to the selected device
inputs = {k: v.to(device) for k, v in inputs.items()}

# Switch the model to evaluation mode
modernbert.eval()

# Run inference without gradient tracking; ** unpacks the dict into keyword arguments
with torch.no_grad():
    outputs = modernbert(**inputs, output_hidden_states=True, return_dict=True)


In [11]:
# Shape of the first model output; the cell output shows (1, 512, 768) for this input.
outputs[0].shape

torch.Size([1, 512, 768])

In [12]:
# Same tensor with its size-1 dimensions removed (here the batch axis).
outputs[0].squeeze().shape

torch.Size([512, 768])

In [13]:
# --- Configuration for the DevMap (heterogeneous device mapping) experiment ---
data_folder = '/home/xucong24/Compiler/datasets/devmap'   # root containing cgo17-*.csv and kernels_ir/
platform = 'all'
num_epochs = 50
batch_size = 64
dense_layer_size = 32
print_summary = False
out_folder = '/home/xucong24/Compiler/work_dirs/modernbert_for_devmap'   # checkpoints are written below this folder
num_classes = 2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Display names used in the result tables.
platform2str = {
    "amd": "AMD Tahiti 7970",
    "nvidia": "NVIDIA GTX 970"
}

# Create the output folder if needed; exist_ok avoids the check-then-create race
# and matches how the per-fold model folder is created later in the notebook.
os.makedirs(out_folder, exist_ok=True)
assert platform in ['all', 'amd', 'nvidia'], \
    'Choose device among: all, amd, nvidia'

In [14]:
# Load the DevMap data
def load_data(data_path, platform):
    """Read one DevMap platform table and embed each kernel's LLVM IR with ModernBERT.

    Uses the notebook-level ``tokenizer``, ``modernbert`` and ``device``.

    Args:
        data_path: Dataset root. Must contain ``cgo17-<platform>.csv`` and a
            ``kernels_ir/`` folder holding the ``.ll`` files.
        platform: Suffix selecting which ``cgo17-<platform>.csv`` to read.

    Returns:
        Tuple ``(seqs, aux_in, label, df)``:
        * ``seqs``: list with one CPU tensor per row whose ``.ll`` file exists
          (the encoder's first output with the batch axis removed).
        * ``aux_in``: array with the ``transfer`` and ``wgsize`` columns side by side.
        * ``label``: 0/1 array, 1 where ``oracle == "GPU"``.
        * ``df``: the table restricted to the rows that were encoded, so all four
          return values are aligned row by row.
    """
    # Load runtime data
    df = pd.read_csv(data_path + "/cgo17-{}.csv".format(platform), index_col=0)
    print('--- Read data from', data_path)

    # Kernel identifier: "<benchmark>_<dataset>", or just "<benchmark>" for the default dataset.
    df["bench_data"] = (
        df.loc[df["dataset"] != "default", "benchmark"]
        + str("_")
        + df.loc[df["dataset"] != "default", "dataset"]
    )
    df.loc[df["dataset"] == "default", "bench_data"] = df.loc[
        df["dataset"] == "default", "benchmark"
    ]

    # Path of each kernel's LLVM IR file
    df["bench_data_path"] = data_path + '/kernels_ir/' + df["bench_data"] + str(".ll")

    # ModernBERT encoding of every IR file
    input_files = df["bench_data_path"].values
    num_files = len(input_files)
    print('--- Preparing to read', num_files, 'input files from folder', data_path + '/kernels_ir/')
    seqs = list()

    # Make sure the encoder is in eval mode here rather than relying on an
    # earlier cell having been executed.
    modernbert.eval()

    # Walk over the files, read the IR and encode it; remember the row
    # positions whose file is missing.
    bad = []
    with torch.no_grad():
        for i in tqdm(range(num_files), desc='Encoding files'):
            file = input_files[i]
            if not os.path.exists(file):
                bad.append(i)
                continue

            with open(file) as f:
                ir = f.read()

            encoded = tokenizer([ir], return_tensors="pt", max_length=512, padding='max_length', truncation=True)
            encoded = {k: v.to(device) for k, v in encoded.items()}
            outputs = modernbert(**encoded, output_hidden_states=True, return_dict=True)

            # squeeze(0) removes only the batch axis.
            seqs.append(outputs[0].squeeze(0).to('cpu'))
    print(bad)

    # `bad` contains row POSITIONS, while DataFrame.drop works on index LABELS.
    # Select the surviving rows by position so the table stays aligned with
    # `seqs` whatever the CSV index looks like.
    missing = set(bad)
    df = df.iloc[[pos for pos in range(num_files) if pos not in missing]]

    # aux data
    aux_in = np.array([
        df["transfer"].values,
        df["wgsize"].values,
    ]).T

    # Labels
    label = np.array([1 if x == "GPU" else 0 for x in df["oracle"].values])

    return seqs, aux_in, label, df
    
class DevMapDataset(Dataset):
    """Index-aligned view over three parallel containers.

    Item ``i`` is the triple ``(sequences[i], aux_in[i], y[i])``; the dataset
    length is the length of ``y``.
    """

    def __init__(self, sequences, aux_in, y):
        super().__init__()
        self.sequences, self.aux_in, self.y = sequences, aux_in, y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return self.sequences[index], self.aux_in[index], self.y[index]

In [15]:
# Network definition
class DevMapLSTM(nn.Module):
    """Bidirectional LSTM classifier with an auxiliary-feature head.

    ``forward`` returns two sets of logits:
    * ``final_output``: from the LSTM readout concatenated with the two
      auxiliary features, batch-normalised, then projected by ``fc2``.
    * ``lang_output``: from the LSTM readout alone, projected by ``fc``
      (width taken from the notebook-level ``num_classes``).

    NOTE(review): the readout is the LSTM output at the last time step
    (``[:, -1, :]``). Callers in this notebook pad inputs to a fixed length, so
    that position may be padding - confirm this is intended.
    """

    def __init__(self, embedding_dim, num_layers, dropout):
        super().__init__()
        readout_dim = embedding_dim * 2   # forward + backward directions
        fused_dim = readout_dim + 2       # plus the two auxiliary inputs

        # Sub-modules are created in the same order as before so that parameter
        # initialisation and state_dict layout are unchanged.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=embedding_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Linear(readout_dim, num_classes)
        self.batch_norm = nn.BatchNorm1d(fused_dim)
        self.fc2 = nn.Linear(fused_dim, 2)

    def forward(self, x, aux_input):
        hidden_states, _ = self.lstm(x)
        readout = hidden_states[:, -1, :]

        # Sequence-only logits.
        lang_output = self.fc(readout)

        # Logits from auxiliary features + sequence readout.
        fused = self.batch_norm(torch.cat((aux_input, readout), dim=1))
        final_output = self.fc2(fused)
        return final_output, lang_output


In [16]:
def escape_suite_name(g: str) -> str:
    """Return the display name of the benchmark suite encoded in ``g``.

    Only the text before the first ``-`` is inspected:
    ``amd``/``nvidia`` -> upper-cased plus ``" SDK"``; ``npb``/``shoc`` ->
    upper-cased; ``parboil``/``polybench``/``rodinia`` -> capitalised.

    Raises:
        LookupError: if the prefix is none of the known suites.
    """
    suite = g.split('-')[0]
    if suite in ("amd", "nvidia"):
        return suite.upper() + " SDK"
    if suite in ("npb", "shoc"):
        return suite.upper()
    if suite in ("parboil", "polybench", "rodinia"):
        return suite.capitalize()
    raise LookupError

def escape_benchmark_name(g: str) -> str:
    """Build a ``"<Suite>.<benchmark>"`` label from a dash-separated identifier.

    ``<Suite>`` is the first word of ``escape_suite_name`` applied to the first
    dash-separated field; ``<benchmark>`` is the second-to-last field.
    """
    fields = g.split('-')
    suite_word = escape_suite_name(fields[0]).split()[0]
    return suite_word + "." + fields[-2]

def eval_model(model, loader):
    """Score ``model`` on every batch of ``loader`` without gradient tracking.

    The model is put in eval mode. Each batch is a ``(sequences, aux_input,
    labels)`` triple that is moved to the notebook-level ``device``; the
    prediction is the argmax of the model's first output.

    Returns:
        ``(accuracy, pred_list, label_list)`` where ``accuracy`` is the number
        of correct predictions divided by ``len(loader.dataset)`` and the two
        lists hold predictions and labels in iteration order.
    """
    model.eval()
    n_correct = 0
    pred_list, label_list = [], []
    with torch.no_grad():
        for batch in loader:
            sequences, aux_input, labels = (t.to(device) for t in batch)
            logits, _ = model(sequences, aux_input)
            preds = logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            pred_list += preds.tolist()
            label_list += batch[2].tolist()

    return n_correct / len(loader.dataset), pred_list, label_list

def train_model(model, train_loader, test_loader,  criterion, optimizer, num_epochs, model_path):
    """Train ``model`` and leave it holding the weights of its best-scoring epoch.

    Each epoch runs one pass over ``train_loader`` and then scores the model on
    ``test_loader`` with ``eval_model``. Whenever that score improves, the
    state dict is saved to ``model_path``; after the last epoch the saved
    weights are loaded back into ``model``.

    Args:
        model: Module called as ``model(sequences, aux_input)`` and returning
            ``(outputs, lang_outputs)`` logits.
        train_loader: Yields ``(sequences, aux_input, labels)`` batches for training.
        test_loader: Loader passed to ``eval_model`` after every epoch.
        criterion: Loss applied to both sets of logits against ``labels``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of training epochs.
        model_path: File the best state dict is written to and restored from.

    NOTE(review): the checkpoint is selected on ``test_loader``; if that is
    also the fold used for reporting, the reported accuracy is optimistic.
    """
    # Start below any attainable accuracy so the first epoch always writes a
    # checkpoint. Starting at 0 with a strict ">" left no file behind when the
    # eval accuracy stayed at 0, and the final load then failed.
    best_eval_acc = -1.0
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct = 0
        model.train()

        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
        for idx, batch in enumerate(progress_bar):
            sequences, aux_input, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()
            outputs, lang_outputs = model(sequences, aux_input)

            # Total loss: main-head loss plus the sequence-only head's loss weighted by 0.2.
            loss = criterion(outputs, labels) + 0.2 * criterion(lang_outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()

            # Running mean of the batch losses seen so far in this epoch.
            progress_bar.set_postfix(loss=epoch_loss / (idx + 1))

        accuracy = correct / len(train_loader.dataset)
        eval_acc, _, _ = eval_model(model, test_loader)
        print(f"epoch {epoch+1}/{num_epochs}, loss: {epoch_loss:.4f}, train_acc: {accuracy:.4f}, eval_acc: {eval_acc:.4f}")

        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            torch.save(model.state_dict(), model_path)

    # Restore the best checkpoint. With num_epochs == 0 and no earlier file
    # there is nothing to restore, so the model is left as it is.
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path, weights_only=False))


In [17]:
# Platforms to evaluate; each one gets its own 10-fold stratified cross validation.
platform_list = ["amd", "nvidia"]

data = []
for platform in platform_list:
    # Load and encode the dataset for this platform
    sequences, aux_in, y, df = load_data(data_folder, platform)
    aux_in_tensor = torch.tensor(aux_in, dtype=torch.float32)
    y_tensor = torch.tensor(y, dtype=torch.int64)

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=204)
    for j, (train_index, test_index) in enumerate(kf.split(sequences, y)):
        print('--- Cross validation step [', j, '/ 10 ]')

        model_basename = 'inst2vec_modern_bert_lstm'
        # Kept in its own variable: reusing the global `model_path` here would
        # overwrite the ModernBERT checkpoint directory set at the top of the
        # notebook, so re-running the encoder-loading cell would break.
        fold_model_path = os.path.join(out_folder, f"models/{model_basename}-{platform}-{j}.pth")
        os.makedirs(os.path.dirname(fold_model_path), exist_ok=True)
        log_dir = os.path.join(out_folder, "logs")

        # Build the train / test splits for this fold
        train_data = DevMapDataset([sequences[k] for k in train_index], aux_in_tensor[train_index], y_tensor[train_index])
        test_data = DevMapDataset([sequences[k] for k in test_index], aux_in_tensor[test_index], y_tensor[test_index])

        train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_data, batch_size=batch_size)

        if not os.path.exists(fold_model_path):
            # Create and train a fresh model
            model = DevMapLSTM(768, 3, 0.5).to(device)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.001)

            print('--- Training model... ')
            train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, fold_model_path)

        else:
            # Reuse the cached weights. map_location lets a checkpoint saved on
            # one device type be loaded when running on another.
            model = DevMapLSTM(768, 3, 0.5)
            model.load_state_dict(torch.load(fold_model_path, map_location=device, weights_only=False))
            model = model.to(device)
            print("Found trained model in", fold_model_path, ", skipping...")

        # Predict on the held-out fold
        eval_acc, pred_list, label_list = eval_model(model, test_loader)
        print(f'--- Evaluate Accuracy {eval_acc:.4f}')
        benchmarks = df['benchmark'].values[test_index]
        correct = np.array(pred_list) == np.array(label_list)
        # Static-mapping baseline device used as the speedup reference.
        zero_r_dev = "runtime_cpu" if platform == "amd" else "runtime_gpu"
        zer_r_runtimes = df[zero_r_dev].values[test_index]
        runtimes = df[['runtime_cpu', 'runtime_gpu']].values[test_index]
        # Runtime on the predicted device: column 0 = CPU, column 1 = GPU.
        p_runtimes = [r[p_] for p_, r in zip(np.array(pred_list, dtype=int), runtimes)]
        p_speedup = zer_r_runtimes / p_runtimes

        assert len(benchmarks) == len(label_list) == len(correct) == len(pred_list) == len(p_speedup)

        for benchmark_, o_, p_, correct_, p_speedup_ in zip(benchmarks, label_list, pred_list, correct, p_speedup):
            data.append({
                "Model": model_basename,
                "Platform": platform2str[platform],
                'Benchmark': escape_benchmark_name(benchmark_),
                'Benchmark Suite': escape_suite_name(benchmark_),
                "Oracle Mapping": int(o_),
                "Predicted Mapping": int(p_),
                "Correct?": bool(correct_),
                "Speedup": float(p_speedup_),
            })

# One row per (platform, held-out kernel), indexed from 1.
result =  pd.DataFrame(
    data, index=range(1, len(data) + 1), columns=[
        "Model",
        "Platform",
        "Benchmark",
        "Benchmark Suite",
        "Oracle Mapping",
        "Predicted Mapping",
        "Correct?",
        "Speedup"
    ])

--- Read data from /home/xucong24/Compiler/datasets/devmap
--- Preparing to read 680 input files from folder /home/xucong24/Compiler/datasets/devmap/kernels_ir/


Encoding files: 100%|██████████| 680/680 [01:04<00:00, 10.56it/s]


[555, 556, 557, 558, 559, 560, 561, 562, 564, 566, 568, 569, 570, 571, 573]
--- Cross validation step [ 0 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-amd-0.pth , skipping...
--- Evaluate Accuracy 0.8209
--- Cross validation step [ 1 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-amd-1.pth , skipping...
--- Evaluate Accuracy 0.8060
--- Cross validation step [ 2 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-amd-2.pth , skipping...
--- Evaluate Accuracy 0.7313
--- Cross validation step [ 3 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-amd-3.pth , skipping...
--- Evaluate Accuracy 0.6567
--- Cross validation step [ 4 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/

Encoding files: 100%|██████████| 680/680 [00:30<00:00, 21.95it/s]


[555, 556, 557, 558, 559, 560, 561, 562, 564, 566, 568, 569, 570, 571, 573]
--- Cross validation step [ 0 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-nvidia-0.pth , skipping...
--- Evaluate Accuracy 0.6567
--- Cross validation step [ 1 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-nvidia-1.pth , skipping...
--- Evaluate Accuracy 0.6418
--- Cross validation step [ 2 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-nvidia-2.pth , skipping...
--- Evaluate Accuracy 0.6866
--- Cross validation step [ 3 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert_for_devmap/models/inst2vec_modern_bert_lstm-nvidia-3.pth , skipping...
--- Evaluate Accuracy 0.6269
--- Cross validation step [ 4 / 10 ]
Found trained model in /home/xucong24/Compiler/work_dirs/modernbert

In [18]:
# Mean accuracy and mean speedup, first per platform and suite, then per platform.
metric_cols = ['Correct?', 'Speedup']

print('\n--- Prediction results')
by_suite = result.groupby(['Platform', 'Benchmark Suite'])[metric_cols].mean()
print(by_suite)

print('\n--- Prediction results (summarized)')
by_platform = result.groupby(['Platform'])[metric_cols].mean()
print(by_platform)


--- Prediction results
                                 Correct?    Speedup
Platform        Benchmark Suite                     
AMD Tahiti 7970 AMD SDK          0.562500   0.913787
                NPB              0.749526   2.958338
                NVIDIA SDK       0.750000   3.394625
                Parboil          0.750000   1.771634
                Polybench        0.851852  13.750485
                Rodinia          0.451613   3.748234
                SHOC             0.562500   1.247279
NVIDIA GTX 970  AMD SDK          0.250000   0.545483
                NPB              0.721063   1.333634
                NVIDIA SDK       0.500000   1.066231
                Parboil          0.500000   1.336314
                Polybench        0.444444   0.929101
                Rodinia          0.516129   1.396762
                SHOC             0.916667   2.109839

--- Prediction results (summarized)
                 Correct?   Speedup
Platform                           
AMD Tahiti 7970  0.

In [19]:
# Hard-coded reference numbers for the baseline models shown in the comparison
# tables below. Each *_vals list is ordered [AMD Tahiti 7970, NVIDIA GTX 970]
# (the row order of those tables) and each *_mean is the corresponding "Average" row.
# *_pred_* are prediction accuracies in percent, *_sp_* are speedups.
# NOTE(review): presumably copied from previously published results - confirm the source.
static_pred_vals = [58.823529, 56.911765]
static_pred_mean = 57.867647
static_sp_vals = [1.0, 1.0]
static_sp_mean = 1.0
grewe_pred_vals = [73.382353, 72.941176]
grewe_pred_mean = 73.161765
grewe_sp_vals = [2.905822, 1.264801]
grewe_sp_mean = 2.085312
deeptune_pred_vals = [83.676471, 80.294118]
deeptune_pred_mean = 81.985294
deeptune_sp_vals = [2.998314, 1.088315]
deeptune_sp_mean = 2.043315

In [20]:
# Model comparison: prediction accuracy
print('\n--- Model comparison: prediction accuracy')
# This notebook's accuracy in percent: one value per platform, then the overall mean.
per_platform_acc = result.groupby(['Platform'])['Correct?'].mean().values * 100
overall_acc = result['Correct?'].mean() * 100
# One row per model, transposed so models become columns and platforms rows.
d = np.array([
    np.append(static_pred_vals, static_pred_mean),
    np.append(grewe_pred_vals, grewe_pred_mean),
    np.append(deeptune_pred_vals, deeptune_pred_mean),
    np.append(per_platform_acc, overall_acc),
]).T
accuracy_table = pd.DataFrame(
    d,
    columns=['Static mapping', 'Grewe et al.', 'DeepTune', 'Modernbert'],
    index=['AMD Tahiti 7970', 'NVIDIA GTX 970', 'Average'],
)
print('\n', accuracy_table)



--- Model comparison: prediction accuracy

                  Static mapping  Grewe et al.   DeepTune  Modernbert
AMD Tahiti 7970       58.823529     73.382353  83.676471   72.180451
NVIDIA GTX 970        56.911765     72.941176  80.294118   69.774436
Average               57.867647     73.161765  81.985294   70.977444


In [21]:
# Model comparison: speedups
print('\n--- Model comparison: speedups')
# One row per model: [AMD, NVIDIA, average]; transposed below so that models
# become columns and platforms become rows.
d = list()
d.append(np.append(static_sp_vals, static_sp_mean))
d.append(np.append(grewe_sp_vals, grewe_sp_mean))
d.append(np.append(deeptune_sp_vals, deeptune_sp_mean))
d.append(np.append(result.groupby(['Platform'])['Speedup'].mean().values,
                    result['Speedup'].mean()))
d = np.array(d).T
# The third column holds the deeptune_sp_* numbers, so it is labelled 'DeepTune'
# (it was 'DeepTuneInst2Vec'), matching the accuracy comparison table.
print('\n', pd.DataFrame(d, columns=['Static mapping', 'Grewe et al.', 'DeepTune', 'Modernbert'],
                            index=['AMD Tahiti 7970', 'NVIDIA GTX 970', 'Average']))



--- Model comparison: speedups

                  Static mapping  Grewe et al.  DeepTuneInst2Vec  Modernbert
AMD Tahiti 7970             1.0      2.905822          2.998314    3.261375
NVIDIA GTX 970              1.0      1.264801          1.088315    1.352407
Average                     1.0      2.085312          2.043315    2.306891
