In [3]:

import sys
import os
import math
import torch
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader


import os
import logging
from datetime import datetime
import argparse
import yaml
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    logging as hf_logging
)
from datasets import load_from_disk
sys.path.append("/home/xucong24/Compiler")
from src.model.tokenizer import Inst2VecTokenizer
from sklearn.model_selection import StratifiedKFold

In [4]:
# 设置模型路径
model_path = "/home/xucong24/Compiler/work_dirs/inst2vec_modernbert_poj104_mlm_train/20250825_063731/final_model"
tokenizer_path = "/home/xucong24/Compiler/work_dirs/inst2vec_modernbert_poj104_mlm_train/20250825_063731/final_model"

# 加载模型和分词器
print("Loading tokenizer...")
tokenizer = Inst2VecTokenizer.from_pretrained(tokenizer_path)

Loading tokenizer...


In [5]:
print("Initializing model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
modernbert = AutoModelForMaskedLM.from_pretrained(model_path)
modernbert = modernbert.model
modernbert = modernbert.to(device)
n_params = sum(p.numel() for p in modernbert.parameters())
print(f"{n_params=:.4e}")

Initializing model...
Using device: cuda
n_params=1.1691e+08


In [6]:
modernbert

ModernBertModel(
  (embeddings): ModernBertEmbeddings(
    (tok_embeddings): Embedding(8569, 768, padding_idx=8565)
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (drop): Dropout(p=0.0, inplace=False)
  )
  (layers): ModuleList(
    (0): ModernBertEncoderLayer(
      (attn_norm): Identity()
      (attn): ModernBertAttention(
        (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
        (rotary_emb): ModernBertRotaryEmbedding()
        (Wo): Linear(in_features=768, out_features=768, bias=False)
        (out_drop): Identity()
      )
      (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): ModernBertMLP(
        (Wi): Linear(in_features=768, out_features=2304, bias=False)
        (act): GELUActivation()
        (drop): Dropout(p=0.0, inplace=False)
        (Wo): Linear(in_features=1152, out_features=768, bias=False)
      )
    )
    (1-21): 21 x ModernBertEncoderLayer(
      (attn_norm): LayerNorm((768,), eps=1e-05, ele

In [7]:
with open("/home/xucong24/Compiler/datasets/poj104/ir_test/86/21.ll", "r") as f:
    llvm = f.read()
inputs = tokenizer([llvm], return_tensors="pt", max_length=512, padding='max_length', truncation=True)  
inputs

{'input_ids': tensor([[8566,    0, 8564,   40,    8,   71,   77, 8564, 8564, 8564,  263, 1313,
          8564,  289,  289,  289,  289, 8564,  289,  289,  289, 8564, 8564,  364,
           216, 8564,  295,  295,  394,  204, 8564, 8564,  364,  216, 8564,  295,
           295,  394,  204, 8564,  295,  252, 8564, 8564,  216, 8564,  295,  431,
           311,  216, 8564,  295,  311,  295, 6277,  252, 8564,  295,  295,  626,
           695, 8564,  204, 8564,  295, 8564,  694,  311,  216, 8564,  216, 8564,
           295, 6277,  252, 8564,  295,  295,  626,  695, 6277,  425,  204, 8564,
           295, 6277,  311,  216, 8564,  295, 6277,  252, 8564,  295,  295,  626,
           695, 6277, 8564,  204, 8564,  295, 6277,  252, 8564,  295,  311,  216,
          8564,  295, 8564,  694,  311,  216, 8564,  216, 8564,  295, 8564,  216,
          8564,  295,  431,  311,  216, 8564,  481,   77, 8564,  231,  263, 8567,
          8565, 8565, 8565, 8565, 8565, 8565, 8565, 8565, 8565, 8565, 8565, 8565,
   

In [8]:
# 设备处理
inputs = {k: v.to(device) for k, v in inputs.items()}

# 设置模型为评估模式
modernbert.eval()

# 正确的推理方式 - 使用 ** 解包字典
with torch.no_grad():
    outputs = modernbert(**inputs, output_hidden_states=True, return_dict=True)


In [9]:
outputs[0].shape

torch.Size([1, 512, 768])

In [10]:
data_folder = '/home/xucong24/Compiler/datasets/threadcoarsening'
platform = 'all'
num_epochs = 50
batch_size = 64
dense_layer_size = 32
print_summary = False
out_folder = '/home/xucong24/Compiler/work_dirs/inst2vec_modernbert_for_threadcoarsening'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cfs = [1, 2, 4, 8, 16, 32]  # thread coarsening factors
platform_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]
_FLAG_TO_DEVICE_NAME = {
    'Cypress': 'AMD Radeon HD 5900',
    'Tahiti': 'AMD Tahiti 7970',
    'Fermi': 'NVIDIA GTX 480',
    'Kepler': 'NVIDIA Tesla K20c'
}

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

assert platform in ["all", "Cypress", "Tahiti", "Fermi", "Kepler"], \
        'Choose device among: all, Cypress, Tahiti, Fermi, Kepler'

In [11]:
def find_runtime(df, kernel, cf, platform):
    filter1 = df["kernel"] == kernel
    filter2 = df["cf"] == cf
    return df.where(filter1 & filter2)["runtime_" + platform].dropna()


def load_data(data_folder):
    oracle_file = os.path.join(data_folder + "/pact-2014-oracles.csv")
    oracles = pd.read_csv(oracle_file)

    runtimes_file = os.path.join(data_folder + "/pact-2014-runtimes.csv")
    df = pd.read_csv(runtimes_file)

    cfs = np.array([1, 2, 4, 8, 16, 32])
    kernel_freq = df["kernel"].value_counts().sort_index().reset_index()

    inferencetime = []
    llfiles = pd.read_csv(data_folder + "/all.txt", sep="\s+")
    fileNum = llfiles["FileNum"]
    filesname = llfiles["ProgramName"]
    oracles["kernel_path"] = str("./") + oracles["kernel"] + str(".ll")
    resultant_data = pd.DataFrame()
    for i, platform in enumerate(platform_list):
        data = pd.merge(
            llfiles, oracles, left_on="ProgramName", right_on="kernel_path"
        )
        data["cf"] = data["cf_" + platform]
        data["device"] = i + 1
        resultant_data = pd.concat([resultant_data, data])

    resultant_data = pd.get_dummies(resultant_data, columns=["device"])
    resultant_data.reset_index(inplace=True)

    print('--- Preparing to read', len(resultant_data), 'input files from folder', data_folder + '/kernels_ir/')
    seqs = list()
    num_unks = 0

    # 遍历文件
    for file in tqdm(resultant_data["kernel"], desc='Encoding files'):
        # print(file)
        file = data_folder + '/kernels_ir/' + file + str(".ll")
        if os.path.exists(file):

            with open(file) as f:
                ir = f.read()

            with torch.no_grad():
                # 假设 data 是输入张量或字典
                inputs = tokenizer([ir], return_tensors="pt", max_length=512, padding='max_length', truncation=True)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                outputs = modernbert(**inputs, output_hidden_states=True, return_dict=True)
                seqs.append(outputs[0].squeeze().to('cpu'))

        else:
            raise FileNotFoundError('Input file not found: ' + file)

    targetLabel = resultant_data["cf"]
    data = resultant_data
    data = data.drop(
        columns=[
            "index",
            "FileNum",
            "ProgramName",
            "kernel",
            "cf_Fermi",
            "runtime_Fermi",
            "cf_Kepler",
            "runtime_Kepler",
            "cf_Cypress",
            "runtime_Cypress",
            "cf_Tahiti",
            "runtime_Tahiti",
            "kernel_path",
            "cf",
        ]
    )

    return seqs, data, targetLabel, df, oracles

class ThreadCoaDataset(Dataset):
    def __init__(self, seqs, data, targetLabel):
        super().__init__()
        self.sequences = seqs
        self.dev = data
        self.y = targetLabel
        
    def __getitem__(self, index):
        seqs = self.sequences[index]
        dev = self.dev[index]
        label = self.y[index]
        return seqs, dev, label

    def __len__(self):
        return len(self.y)

  llfiles = pd.read_csv(data_folder + "/all.txt", sep="\s+")


In [12]:
# 定义网络结构
class ThreadCoaLSTM(nn.Module):
    def __init__(self, embedding_dim, dense_layer_size):
        super(ThreadCoaLSTM, self).__init__()
        self.lstm_1 = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
        self.lstm_2 = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
        self.language_model_out = nn.Linear(embedding_dim, 6)
        self.batch_norm = nn.BatchNorm1d(embedding_dim + 4)
        self.dense_1 = nn.Linear(embedding_dim + 4, dense_layer_size)
        self.output = nn.Linear(dense_layer_size, 6)
        
    def forward(self, x, device_input):
        out, _ = self.lstm_1(x)
        out, _ = self.lstm_2(out)
        lang_output = torch.sigmoid(self.language_model_out(out[:, -1, :]))
        x_combined = torch.cat((device_input, out[:, -1, :]), dim=1)
        x_combined = self.batch_norm(x_combined)
        x_combined = torch.relu(self.dense_1(x_combined))
        final_output = torch.sigmoid(self.output(x_combined))
        return final_output, lang_output


In [13]:
def eval_model(model, loader):
    model.eval()
    correct = 0
    pred_list, label_list =  [], []
    with torch.no_grad():
        for batch in loader:
            sequences, aux_input, labels = [b.to(device) for b in batch]
            outputs, lang_outputs = model(sequences, aux_input)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            pred_list.extend(preds.tolist())
            label_list.extend(batch[2].tolist())
            
    accuracy = correct / len(loader.dataset)
    return accuracy, pred_list, label_list

def train_model(model, train_loader, test_loader,  criterion, optimizer, num_epochs, model_path):
    # 模型训练
    pre_eval_acc = -1
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct = 0
        model.train()
        for batch in train_loader:
            sequences, aux_input, labels = [b.to(device) for b in batch]
            
            optimizer.zero_grad()
            outputs, lang_outputs = model(sequences, aux_input)

            # 计算loss值 由output和lang_outputs与label计算CrossEntropyLoss
            loss = criterion(outputs, labels) + 0.2 * criterion(lang_outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
                    
        accuracy = correct / len(train_loader.dataset)
        eval_acc,_ , _ = eval_model(model, test_loader)
        # print(f"epoch {epoch+1}/{num_epochs}, loss: {epoch_loss:.4f}, train_acc: {accuracy:.4f}, eval_acc: {eval_acc:.4f}")

        if eval_acc > pre_eval_acc:
            pre_eval_acc = eval_acc
            torch.save(model.state_dict(), model_path)
    
    model.load_state_dict(torch.load(model_path, weights_only=False))


In [14]:
from sklearn.model_selection import KFold

In [15]:
# platform_list = ["amd", "nvidia"]
sequences, data, targetLabel, df, oracles = load_data(data_folder)
device_onehot_tensor = torch.tensor(data.values, dtype=torch.float32)
y_tensor = torch.argmax(torch.tensor(pd.get_dummies(targetLabel).values, dtype=torch.float32), dim=1)
cfs = np.array([1.0, 2.0, 4.0, 8.0, 16.0, 32.0])
kernel_freq = df["kernel"].value_counts().sort_index().reset_index()

--- Preparing to read 68 input files from folder /home/xucong24/Compiler/datasets/threadcoarsening/kernels_ir/


Encoding files: 100%|██████████| 68/68 [00:01<00:00, 36.14it/s]


In [16]:
data = []
kf = KFold(n_splits=len(targetLabel), shuffle=False)
for j, (train_index, test_index) in enumerate(kf.split(sequences, targetLabel)):
    print('--- Cross validation step [', j, '/ ',len(targetLabel),' ]')
    kernel = sorted(set(df["kernel"]))[test_index[0] % 17]

    model_basename = 'lstm'
    model_path = os.path.join(out_folder, f"models/{model_basename}-{platform}-{j}.pth")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    log_dir = os.path.join(out_folder, "logs")
    
    # 读取数据集
    train_data = ThreadCoaDataset([sequences[i] for i in train_index], device_onehot_tensor[train_index], y_tensor[train_index])
    test_data = ThreadCoaDataset([sequences[i] for i in test_index], device_onehot_tensor[test_index], y_tensor[test_index])
    
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    if not os.path.exists(model_path):
        # 创建模型
        model = ThreadCoaLSTM(embedding_dim=768, dense_layer_size=dense_layer_size).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        print('--- Training model... ')
        train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, model_path)
        
    else:
        # 读取模型权重文件
        model = ThreadCoaLSTM(embedding_dim=768, dense_layer_size=dense_layer_size).to(device)
        model.load_state_dict(torch.load(model_path, weights_only=False))
        model = model.to(device)
        print("Found trained model in", model_path, ", skipping...")

    # 模型预测
    accuracy, pred_list, label_list = eval_model(model, test_loader)
    # accuracy, pred_list, label_list
    prediction = cfs[pred_list[0]]

    if device_onehot_tensor[test_index].tolist()[0][0] == 1:
        platform = platform_list[0]
    elif device_onehot_tensor[test_index].tolist()[0][1] == 1:
        platform = platform_list[1]
    elif device_onehot_tensor[test_index].tolist()[0][2] == 1:
        platform = platform_list[2]
    elif device_onehot_tensor[test_index].tolist()[0][3] == 1:
        platform = platform_list[3]

    oracle_runtimes = np.array([float(x) for x in oracles["runtime_" + platform]])
    oracle = targetLabel[test_index[0]]
    print(oracle == prediction)

    rt_baseline = float(find_runtime(df, kernel, 1, platform))
    # rt_pred = float(find_runtime(df, kernel, prediction, platform))
    result = find_runtime(df, kernel, prediction, platform)
    if not result.empty:
        rt_pred = float(result.iloc[0])
    else:
        # rt_pred = float('nan')  # 或你可以 raise 或跳过这个样本
        print('error')
        continue
    rt_oracle = float(oracle_runtimes[test_index[0] % 17])
    data.append(
        {
            "Model": "IR2vec",
            "Platform": _FLAG_TO_DEVICE_NAME[platform],
            "Kernel": kernel,
            "Oracle-CF": oracle,
            "Predicted-CF": prediction,
            "Speedup": rt_baseline / rt_pred,
            "Oracle": rt_oracle / rt_pred,
            "OracleSpeedUp": rt_baseline / rt_oracle,
        }
    )

--- Cross validation step [ 0 /  68  ]
--- Training model... 
True
--- Cross validation step [ 1 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 2 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
error
--- Cross validation step [ 3 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 4 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 5 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 6 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 7 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 8 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 9 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 10 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 11 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 12 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 13 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 14 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 15 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 16 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 17 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 18 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 19 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 20 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 21 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 22 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 23 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 24 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 25 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 26 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 27 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 28 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 29 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 30 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 31 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 32 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 33 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 34 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 35 /  68  ]


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


--- Training model... 
True
--- Cross validation step [ 36 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 37 /  68  ]


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


--- Training model... 
False
--- Cross validation step [ 38 /  68  ]


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


--- Training model... 
True
--- Cross validation step [ 39 /  68  ]


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


--- Training model... 
False
--- Cross validation step [ 40 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 41 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 42 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 43 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 44 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 45 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 46 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 47 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 48 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 49 /  68  ]


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


--- Training model... 
True
--- Cross validation step [ 50 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 51 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 52 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 53 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 54 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 55 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 56 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 57 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 58 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


False
--- Cross validation step [ 59 /  68  ]


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


--- Training model... 
True
--- Cross validation step [ 60 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 61 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 62 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 63 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 64 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 65 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 66 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True
--- Cross validation step [ 67 /  68  ]
--- Training model... 


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


True


  rt_baseline = float(find_runtime(df, kernel, 1, platform))


# Results from other works

The accuracies and speedups are taken from the results quoted by NCC in their work for the purpose of comparison. For detailed analysis (discussed later), we run these models and the obtained results are stored as pickle files in ./data/prior_art_results. 

In [17]:
magni_sp_vals = [1.21, 1.01, 0.86, 0.94]
magni_sp_mean = [1.005]
deeptune_sp_vals = [1.10, 1.05, 1.10, 0.99]
deeptune_sp_mean = [1.06]
deeptuneTL_sp_vals = [1.17, 1.23, 1.14, 0.93]
deeptuneTL_sp_mean = [1.1175]
ncc_sp_vals = [1.29, 1.07, 0.97, 1.01]
ncc_sp_mean = [1.086]
inst2vec_sp_vals = [1.228449, 1.252760, 1.109817, 1.154982]
inst2vec_sp_mean = [1.186502]

# IR2Vec Flow-Aware Vs. Others

In [18]:
modernbert_result = pd.DataFrame(
    data,
    columns=[
        "Model",
        "Platform",
        "Kernel",
        "Oracle-CF",
        "Predicted-CF",
        "Speedup",
        "Oracle",
        "OracleSpeedUp",
    ],
)
print("\nSpeedup Matrix: Modernbert Vs. others\n")
modernbert_result_sp_vals = modernbert_result.groupby(["Platform"])["Speedup"].mean().values
modernbert_result_sp_vals_sp_mean = modernbert_result_sp_vals.mean()
sp_df = pd.DataFrame(
    {
        "Magni et al.": magni_sp_vals + magni_sp_mean,
        "DeepTune": deeptune_sp_vals + deeptune_sp_mean,
        "DeepTune-TL": deeptuneTL_sp_vals + deeptuneTL_sp_mean,
        "NCC": ncc_sp_vals + ncc_sp_mean,
        "Inst2Vec": inst2vec_sp_vals + inst2vec_sp_mean,
        'ModernBert': list(modernbert_result_sp_vals) + [modernbert_result_sp_vals_sp_mean],
    },
    index=[
        "AMD Radeon HD 5900",
        "AMD Tahiti 7970",
        "NVIDIA GTX 480",
        "NVIDIA Tesla K20c",
        "Average",
    ],
)
print(sp_df)


Speedup Matrix: Modernbert Vs. others

                    Magni et al.  DeepTune  DeepTune-TL    NCC  Inst2Vec  \
AMD Radeon HD 5900         1.210      1.10       1.1700  1.290  1.228449   
AMD Tahiti 7970            1.010      1.05       1.2300  1.070  1.252760   
NVIDIA GTX 480             0.860      1.10       1.1400  0.970  1.109817   
NVIDIA Tesla K20c          0.940      0.99       0.9300  1.010  1.154982   
Average                    1.005      1.06       1.1175  1.086  1.186502   

                    ModernBert  
AMD Radeon HD 5900    1.466084  
AMD Tahiti 7970       1.342378  
NVIDIA GTX 480        1.271334  
NVIDIA Tesla K20c     1.158082  
Average               1.309470  
