In [44]:
import os
import math
import torch
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold

from src.utils.system import read_ir_from_file
from sklearn.model_selection import StratifiedKFold
from src.observation.inst2vec import Inst2vecEncoder
from scipy.stats import gmean

In [2]:
# ---- Experiment configuration ------------------------------------------------
# Everything a reader may want to tune lives in this cell.
data_folder = 'data/threadcoarsening_data'
platform = 'all'
num_epochs = 50
batch_size = 64
dense_layer_size = 32
print_summary = False
out_folder = 'output/inst2vec_for_threadcoarsening'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
cfs = [1, 2, 4, 8, 16, 32]  # thread coarsening factors
platform_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]
# Short platform flag -> full device name used in the result tables.
_FLAG_TO_DEVICE_NAME = {
    'Cypress': 'AMD Radeon HD 5900',
    'Tahiti': 'AMD Tahiti 7970',
    'Fermi': 'NVIDIA GTX 480',
    'Kepler': 'NVIDIA Tesla K20c'
}

# exist_ok avoids the check-then-create race and matches the makedirs call
# used for the model directory later in this notebook.
os.makedirs(out_folder, exist_ok=True)

# Derive the allowed values from platform_list so the two cannot drift apart.
assert platform in ["all"] + platform_list, \
        'Choose device among: all, Cypress, Tahiti, Fermi, Kepler'

In [10]:
def find_runtime(df, kernel, cf, platform):
    """Look up the runtime(s) recorded for one kernel at one coarsening factor.

    Args:
        df: runtimes table with ``kernel``, ``cf`` and ``runtime_<platform>``
            columns.
        kernel: value to match in the ``kernel`` column.
        cf: value to match in the ``cf`` column.
        platform: suffix selecting the ``runtime_<platform>`` column.

    Returns:
        A Series (original row index preserved) holding the non-null
        ``runtime_<platform>`` values of the rows that match both filters.
    """
    is_match = (df["kernel"] == kernel) & (df["cf"] == cf)
    runtime_column = df["runtime_" + platform]
    # Non-matching rows become NaN and are then dropped together with any
    # genuinely missing measurements.
    return runtime_column.where(is_match).dropna()


def load_data(data_folder):
    """Load the thread-coarsening dataset and encode every kernel with inst2vec.

    Reads ``pact-2014-oracles.csv``, ``pact-2014-runtimes.csv`` and ``all.txt``
    from ``data_folder``, builds one row per (kernel, platform) pair for every
    entry of the module-level ``platform_list``, and encodes the IR file
    ``<data_folder>/kernels_ir/<kernel>.ll`` of each row.

    Args:
        data_folder: directory containing the CSV files, ``all.txt`` and the
            ``kernels_ir`` sub-directory.

    Returns:
        A 6-tuple ``(encoded, data, targetLabel, embeddings, df, oracles)``:

        * ``encoded``     - 2-D integer array, one row per (kernel, platform),
          each sequence right-padded to the longest one with the encoder's
          unknown-element index.
        * ``data``        - the merged frame with the bookkeeping columns
          dropped; what remains are the ``device_*`` one-hot columns created
          by ``pd.get_dummies`` (plus any extra columns the inputs carry).
        * ``targetLabel`` - Series with the oracle coarsening factor per row.
        * ``embeddings``  - ``encoder.embeddings``.
        * ``df``          - the runtimes table as read from disk.
        * ``oracles``     - the oracle table, with an added ``kernel_path``
          column.

    Raises:
        FileNotFoundError: if the IR file of a kernel does not exist.
        ValueError: if the merged dataset contains no rows.
    """
    oracles = pd.read_csv(os.path.join(data_folder, "pact-2014-oracles.csv"))
    df = pd.read_csv(os.path.join(data_folder, "pact-2014-runtimes.csv"))

    # Raw string: "\s" inside a normal literal is an invalid escape sequence.
    llfiles = pd.read_csv(os.path.join(data_folder, "all.txt"), sep=r"\s+")
    oracles["kernel_path"] = "./" + oracles["kernel"] + ".ll"

    # The merge does not depend on the platform, so do it once and derive the
    # per-platform frames from it; concatenate a single time at the end.
    merged = pd.merge(
        llfiles, oracles, left_on="ProgramName", right_on="kernel_path"
    )
    per_platform = [
        merged.assign(cf=merged["cf_" + platform], device=i + 1)
        for i, platform in enumerate(platform_list)
    ]
    resultant_data = pd.get_dummies(pd.concat(per_platform), columns=["device"])
    resultant_data = resultant_data.reset_index()

    encoder = Inst2vecEncoder()  # inst2vec encoder
    unk_idx = encoder.unknown_vocab_element
    print('--- Preparing to read', len(resultant_data), 'input files from folder', data_folder + '/kernels_ir/')
    seqs = list()
    seq_lengths = list()
    num_unks = 0

    # Every kernel appears once per platform; encode each IR file only once
    # and reuse the result for the remaining platforms.
    encoded_by_kernel = {}
    for kernel_name in tqdm(resultant_data["kernel"], desc='Encoding files'):
        if kernel_name not in encoded_by_kernel:
            ir_path = data_folder + '/kernels_ir/' + kernel_name + ".ll"
            if not os.path.exists(ir_path):
                raise FileNotFoundError('Input file not found: ' + ir_path)
            ir = encoder.preprocess(ir_path)
            encode_ir = encoder.encode(ir)  # inst2vec encoding
            # NOTE(review): this counts elements equal to the *string* form of
            # unk_idx; if `encode` returns integers the count is always 0.
            # Confirm against Inst2vecEncoder before trusting the UNK stats.
            encoded_by_kernel[kernel_name] = (
                [int(s) for s in encode_ir],
                encode_ir.count(str(unk_idx)),
            )
        seq, unk_count = encoded_by_kernel[kernel_name]
        seqs.append(seq)
        seq_lengths.append(len(seq))
        num_unks += unk_count

    if not seqs:
        raise ValueError('No kernels to encode: the merged dataset is empty')

    maxlen = max(seq_lengths)
    print('Number of benchmark  : {:>5}'.format(len(resultant_data)))
    print('Shortest sequence    : {:>5}'.format(min(seq_lengths)))
    print('Longest sequence     : {:>5}'.format(maxlen))
    print('Mean sequence length : {:>5} (rounded down)'.format(math.floor(np.mean(seq_lengths))))
    print('Number of \'UNK\'      : {:>5}'.format(num_unks))
    print('Percentage of \'UNK\'  : {:>8.4} (% among all stmts)'.format((num_unks*100)/sum(seq_lengths)))
    print('\'UNK\' index          : {:>5}'.format(unk_idx))

    # Right-pad every sequence to maxlen with the unknown-element index
    # (a no-op for sequences that already have the maximum length).
    padded_sequences = [seq + [unk_idx] * (maxlen - len(seq)) for seq in seqs]
    encoded = np.array(padded_sequences)

    targetLabel = resultant_data["cf"]
    data = resultant_data.drop(
        columns=[
            "index",
            "FileNum",
            "ProgramName",
            "kernel",
            "cf_Fermi",
            "runtime_Fermi",
            "cf_Kepler",
            "runtime_Kepler",
            "cf_Cypress",
            "runtime_Cypress",
            "cf_Tahiti",
            "runtime_Tahiti",
            "kernel_path",
            "cf",
        ]
    )

    assert len(encoded) == len(data) == len(targetLabel)

    return encoded, data, targetLabel, encoder.embeddings, df, oracles

class ThreadCoaDataset(Dataset):
    def __init__(self, encoded, data, targetLabel, embeddings):
        super().__init__()
        self.sequences = encoded
        self.dev = data
        self.y = targetLabel
        self.embeddings = embeddings
        self.embedding_input = self.embeddings[self.sequences]
        
        
    def __getitem__(self, index):
        seqs = self.embedding_input[index]
        dev = self.dev[index]
        label = self.y[index]
        return seqs, dev, label

    def __len__(self):
        return len(self.y)

In [4]:
# Network definition
class ThreadCoaLSTM(nn.Module):
    """Two stacked LSTMs over the embedded sequence plus a device-aware head.

    The last time step of the second LSTM feeds two heads:

    * an auxiliary "language model" head that predicts the class from the
      sequence alone, and
    * the main head, which concatenates the device features with the sequence
      representation, batch-normalises the result and classifies it through
      one hidden dense layer.

    Args:
        embedding_dim: size of each input embedding vector; also used as the
            LSTM hidden size.
        dense_layer_size: width of the hidden dense layer of the main head.
        num_classes: number of output classes (default 6, the number of
            thread-coarsening factors used in this notebook).
        num_devices: width of the device feature vector (default 4, one
            one-hot column per platform).
    """

    def __init__(self, embedding_dim, dense_layer_size, num_classes=6, num_devices=4):
        super(ThreadCoaLSTM, self).__init__()
        combined_dim = embedding_dim + num_devices
        self.lstm_1 = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
        self.lstm_2 = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)
        self.language_model_out = nn.Linear(embedding_dim, num_classes)
        self.batch_norm = nn.BatchNorm1d(combined_dim)
        self.dense_1 = nn.Linear(combined_dim, dense_layer_size)
        self.output = nn.Linear(dense_layer_size, num_classes)

    def forward(self, x, device_input):
        """Run both heads.

        Args:
            x: embedded sequences, ``(batch, seq_len, embedding_dim)``
                (the LSTMs are built with ``batch_first=True``).
            device_input: device features, ``(batch, num_devices)``.

        Returns:
            ``(final_output, lang_output)``, each ``(batch, num_classes)`` with
            values in ``(0, 1)`` because both heads end in a sigmoid.
        """
        out, _ = self.lstm_1(x)
        out, _ = self.lstm_2(out)
        last_step = out[:, -1, :]  # representation of the whole sequence

        # NOTE(review): both heads apply a sigmoid, yet the training code in
        # this notebook feeds them to nn.CrossEntropyLoss, which expects raw
        # logits. The argmax is unaffected, but the loss sees logits squashed
        # into (0, 1) - confirm this is intended.
        lang_output = torch.sigmoid(self.language_model_out(last_step))

        x_combined = torch.cat((device_input, last_step), dim=1)
        x_combined = self.batch_norm(x_combined)
        x_combined = torch.relu(self.dense_1(x_combined))
        final_output = torch.sigmoid(self.output(x_combined))
        return final_output, lang_output


In [35]:
def eval_model(model, loader):
    """Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    Batches are expected to unpack into ``(sequences, aux_input, labels)`` and
    are moved to the module-level ``device`` before the forward pass.

    Returns:
        ``(accuracy, pred_list, label_list)`` where ``accuracy`` is the number
        of correct predictions divided by ``len(loader.dataset)`` and the two
        lists hold the predicted and true class ids in iteration order.
    """
    model.eval()
    pred_list, label_list = [], []
    n_correct = 0
    with torch.no_grad():
        for batch in loader:
            sequences, aux_input, labels = (t.to(device) for t in batch)
            # Only the main head is used for prediction.
            outputs, _ = model(sequences, aux_input)
            preds = torch.argmax(outputs, dim=1)
            n_correct += int((preds == labels).sum())
            pred_list += preds.tolist()
            label_list += batch[2].tolist()

    return n_correct / len(loader.dataset), pred_list, label_list

def train_model(model, train_loader, test_loader,  criterion, optimizer, num_epochs, model_path, verbose=False):
    """Train ``model`` and leave it holding the weights of its best checkpoint.

    After every epoch the model is evaluated on ``test_loader``; whenever the
    accuracy strictly improves, the state dict is written to ``model_path``.
    When training ends, the best checkpoint is loaded back into ``model``.

    Args:
        model: network returning ``(outputs, lang_outputs)``.
        train_loader: loader yielding ``(sequences, aux_input, labels)``.
        test_loader: loader used for checkpoint selection.
        criterion: loss applied to both heads.
        optimizer: optimizer already bound to ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.
        model_path: file the best state dict is saved to.
        verbose: when True, print loss and accuracies after every epoch.

    NOTE(review): the checkpoint is selected on ``test_loader``. When that
    loader holds the final evaluation split (as in the cross-validation cell
    of this notebook), the reported accuracy is optimistically biased.
    """
    best_eval_acc = -1
    checkpoint_saved = False
    for epoch in range(num_epochs):
        epoch_loss = 0
        correct = 0
        model.train()
        for batch in train_loader:
            sequences, aux_input, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()
            outputs, lang_outputs = model(sequences, aux_input)

            # Total loss: main head plus the auxiliary language-model head,
            # the latter weighted by 0.2.
            loss = criterion(outputs, labels) + 0.2 * criterion(lang_outputs, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            correct += (outputs.argmax(dim=1) == labels).sum().item()

        eval_acc, _, _ = eval_model(model, test_loader)
        if verbose:
            accuracy = correct / len(train_loader.dataset)
            print(f"epoch {epoch+1}/{num_epochs}, loss: {epoch_loss:.4f}, train_acc: {accuracy:.4f}, eval_acc: {eval_acc:.4f}")

        if eval_acc > best_eval_acc:
            best_eval_acc = eval_acc
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True

    # Nothing is written when num_epochs is 0, so only reload if a checkpoint
    # exists. A state dict contains only tensors, hence weights_only=True is
    # sufficient and avoids unpickling arbitrary objects.
    if checkpoint_saved:
        model.load_state_dict(
            torch.load(model_path, map_location=device, weights_only=True)
        )


In [None]:
# Leave-one-out cross validation: one model per (kernel, platform) sample.
sequences, device_df, targetLabel, embeddings, df, oracles = load_data(data_folder)
device_onehot_tensor = torch.tensor(device_df.values, dtype=torch.float32)

cfs = np.array([1.0, 2.0, 4.0, 8.0, 16.0, 32.0])
# Map every coarsening factor to its position in `cfs`. Deriving the class ids
# from pd.get_dummies column positions silently mis-maps predictions back to
# `cfs` whenever one of the factors never occurs as an oracle.
cf_to_class = {int(cf): idx for idx, cf in enumerate(cfs)}
y_tensor = torch.tensor([cf_to_class[int(cf)] for cf in targetLabel], dtype=torch.long)

# L2-normalise the embedding matrix with F.normalize.
embeddings = torch.tensor(embeddings, dtype=torch.float32)
embedding_matrix_normalized = F.normalize(embeddings, p=2, dim=1)

# Samples are laid out platform after platform, so the position of a sample
# within its platform identifies the kernel.
# NOTE(review): this assumes the per-platform row order equals both the sorted
# kernel names and the row order of `oracles` - verify against load_data.
kernel_names = sorted(set(df["kernel"]))
rows_per_platform = len(targetLabel) // len(platform_list)
assert rows_per_platform == len(kernel_names) == len(oracles), \
    'kernel / oracle / sample counts disagree; positional lookup would be wrong'

model_basename = 'lstm'
data = []
kf = KFold(n_splits=len(targetLabel), shuffle=False)
for j, (train_index, test_index) in enumerate(kf.split(sequences, targetLabel)):
    print('--- Cross validation step [', j, '/ ',len(targetLabel),' ]')
    sample_idx = test_index[0]
    kernel_pos = sample_idx % rows_per_platform
    kernel = kernel_names[kernel_pos]

    # `platform` is the configured value here. The device of the held-out
    # sample is kept in `test_platform` below, so the checkpoint name no
    # longer depends on the previous fold.
    model_path = os.path.join(out_folder, f"models/{model_basename}-{platform}-{j}.pth")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Build the datasets for this fold.
    train_data = ThreadCoaDataset(sequences[train_index], device_onehot_tensor[train_index], y_tensor[train_index], embedding_matrix_normalized)
    test_data = ThreadCoaDataset(sequences[test_index], device_onehot_tensor[test_index], y_tensor[test_index], embedding_matrix_normalized)

    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = ThreadCoaLSTM(embedding_dim=embedding_matrix_normalized.shape[1], dense_layer_size=dense_layer_size).to(device)
    if not os.path.exists(model_path):
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        print('--- Training model... ')
        train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, model_path)

    else:
        # Reuse the cached weights. A state dict holds only tensors, so the
        # restricted unpickler (weights_only=True) is sufficient.
        model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
        print("Found trained model in", model_path, ", skipping...")

    # Predict the coarsening factor of the held-out sample.
    _, pred_list, _ = eval_model(model, test_loader)
    prediction = cfs[pred_list[0]]

    # The one-hot device columns follow the order of platform_list.
    test_platform = platform_list[int(torch.argmax(device_onehot_tensor[sample_idx]))]

    oracle = targetLabel[sample_idx]
    print(oracle == prediction)

    # .iloc[0] instead of float(Series): converting a one-element Series with
    # float() is deprecated in recent pandas.
    rt_baseline = float(find_runtime(df, kernel, 1, test_platform).iloc[0])
    rt_pred = float(find_runtime(df, kernel, prediction, test_platform).iloc[0])
    rt_oracle = float(oracles["runtime_" + test_platform].iloc[kernel_pos])
    data.append(
        {
            "Model": "IR2vec",
            "Platform": _FLAG_TO_DEVICE_NAME[test_platform],
            "Kernel": kernel,
            "Oracle-CF": oracle,
            "Predicted-CF": prediction,
            "Speedup": rt_baseline / rt_pred,
            "Oracle": rt_oracle / rt_pred,
            "OracleSpeedUp": rt_baseline / rt_oracle,
        }
    )

# Results from other works

The accuracies and speedups are taken from the results quoted by NCC in their work, for the purpose of comparison. For the detailed analysis (discussed later), we ran these models ourselves; the obtained results are stored as pickle files in `data/threadcoarsening_data/prior_art_results`.

In [38]:
# Speedups quoted from prior work. Each pair is (per-platform values, mean);
# the per-platform order is: Radeon HD 5900, Tahiti 7970, GTX 480, Tesla K20c.
# The means are kept as one-element lists so they can be appended with `+`.
magni_sp_vals, magni_sp_mean = [1.21, 1.01, 0.86, 0.94], [1.005]
deeptune_sp_vals, deeptune_sp_mean = [1.10, 1.05, 1.10, 0.99], [1.06]
deeptuneTL_sp_vals, deeptuneTL_sp_mean = [1.17, 1.23, 1.14, 0.93], [1.1175]
ncc_sp_vals, ncc_sp_mean = [1.29, 1.07, 0.97, 1.01], [1.086]

# Inst2Vec vs. Others

In [41]:
# Collect the per-sample cross-validation records into one frame.
ir2vec = pd.DataFrame(
    data,
    columns=[
        "Model",
        "Platform",
        "Kernel",
        "Oracle-CF",
        "Predicted-CF",
        "Speedup",
        "Oracle",
        "OracleSpeedUp",
    ],
)
print("\nSpeedup Matrix: IR2Vec Vs. others\n")

# Row order of the comparison table; the quoted prior-art lists follow it.
device_rows = [
    "AMD Radeon HD 5900",
    "AMD Tahiti 7970",
    "NVIDIA GTX 480",
    "NVIDIA Tesla K20c",
]
# Reindex explicitly so every mean lands on its own row instead of relying on
# groupby's alphabetical sort happening to match the labels above.
ir2vec_sp_vals = (
    ir2vec.groupby(["Platform"])["Speedup"].mean().reindex(device_rows).values
)
ir2vec_sp_mean = ir2vec_sp_vals.mean()
sp_df = pd.DataFrame(
    {
        "Magni et al.": magni_sp_vals + magni_sp_mean,
        "DeepTune": deeptune_sp_vals + deeptune_sp_mean,
        "DeepTune-TL": deeptuneTL_sp_vals + deeptuneTL_sp_mean,
        "NCC": ncc_sp_vals + ncc_sp_mean,
        "Inst2Vec": list(ir2vec_sp_vals) + [ir2vec_sp_mean],
    },
    index=device_rows + ["Average"],
)
print(sp_df)


Speedup Matrix: IR2Vec Vs. others

                    Magni et al.  DeepTune  DeepTune-TL    NCC  Inst2Vec
AMD Radeon HD 5900         1.210      1.10       1.1700  1.290  1.228449
AMD Tahiti 7970            1.010      1.05       1.2300  1.070  1.252760
NVIDIA GTX 480             0.860      1.10       1.1400  0.970  1.109817
NVIDIA Tesla K20c          0.940      0.99       0.9300  1.010  1.154982
Average                    1.005      1.06       1.1175  1.086  1.186502


# Other related observations
For this comparison, we use the results we obtained by training the models from the earlier works ourselves.  
## Speedup comparison

In [42]:
# Load the per-kernel results of the prior-art models from
# <data_folder>/prior_art_results.
# NOTE: pd.read_pickle unpickles the files, which can execute arbitrary code;
# only load result files that come from a trusted source.
magni_res = pd.read_pickle(data_folder + "/prior_art_results/magni_tf.results")
deeptune_res = pd.read_pickle(data_folder + "/prior_art_results/deeptune_tf.results")
deeptune_tl_res = pd.read_pickle(data_folder + "/prior_art_results/deeptune_tl_tf.results")
ncc_res = pd.read_pickle(data_folder + "/prior_art_results/ncc_fix_tf.results")

In [46]:
# Geometric mean of the per-sample speedups of every model, over all platforms.
(
    magni_geomean,
    deeptune_geomean,
    deeptune_tl_geomean,
    ncc_geomean,
    inst2vec_geomean,
) = (
    gmean(results["Speedup"].values)
    for results in (magni_res, deeptune_res, deeptune_tl_res, ncc_res, ir2vec)
)

# Only these three are reported here; the DeepTune-TL and NCC means are
# computed above but not printed.
print(f"Geometric mean of Magni et al. {magni_geomean:.2f}x")
print(f"Geometric mean of DeepTune {deeptune_geomean:.2f}x")
print(f"Geometric mean of Inst2Vec {inst2vec_geomean:.2f}x")

Geometric mean of Magni et al. 0.86x
Geometric mean of DeepTune 1.00x
Geometric mean of Inst2Vec 1.15x


In [48]:
def calcSpeedup(platform):
    """Print and return the per-platform geometric-mean speedups.

    Args:
        platform: full device name as stored in the ``Platform`` column of the
            result frames (e.g. ``"NVIDIA GTX 480"``).

    Returns:
        ``(magni, deeptune, deeptune_tl, inst2vec)`` geometric means. The first
        three are rounded to 2 decimals, the Inst2Vec one to 3.
        NOTE(review): the differing precision is kept as found - confirm it is
        intentional.
    """

    def _platform_geomean(results):
        # Geometric mean of the speedups recorded for `platform` only.
        on_platform = results[results["Platform"] == platform]
        return gmean(on_platform["Speedup"].values)

    # The NCC mean was previously computed here but never printed or returned,
    # so it has been dropped.
    magni_geomean = _platform_geomean(magni_res)
    deeptune_geomean = _platform_geomean(deeptune_res)
    deeptune_tl_geomean = _platform_geomean(deeptune_tl_res)
    ir2vec_sym_geomean = _platform_geomean(ir2vec)

    print(f"Geometric mean of Magni et al. {magni_geomean:.2f}x")
    print(f"Geometric mean of DeepTune {deeptune_geomean:.2f}x")
    print(f"Geometric mean of DeepTune-TL {deeptune_tl_geomean:.2f}x")
    print(f"Geometric mean of Inst2Vec {ir2vec_sym_geomean:.2f}x")

    return (
        round(magni_geomean, 2),
        round(deeptune_geomean, 2),
        round(deeptune_tl_geomean, 2),
        round(ir2vec_sym_geomean, 3),
    )

In [50]:
# Geometric-mean speedups on the AMD Radeon HD 5900.
rad_magni, rad_dt, rad_dtTL, rad_inst2vec = calcSpeedup("AMD Radeon HD 5900")

Geometric mean of Magni et al. 0.94x
Geometric mean of DeepTune 1.14x
Geometric mean of DeepTune-TL 1.14x
Geometric mean of Inst2Vec 1.19x


In [51]:
# Geometric-mean speedups on the AMD Tahiti 7970.
tah_magni, tah_dt, tah_dtTL, tah_inst2vec = calcSpeedup("AMD Tahiti 7970")

Geometric mean of Magni et al. 0.98x
Geometric mean of DeepTune 0.95x
Geometric mean of DeepTune-TL 0.90x
Geometric mean of Inst2Vec 1.20x


In [52]:
# Geometric-mean speedups on the NVIDIA GTX 480.
gtx_magni, gtx_dt, gtx_dtTL, gtx_inst2vec = calcSpeedup("NVIDIA GTX 480")

Geometric mean of Magni et al. 0.81x
Geometric mean of DeepTune 0.94x
Geometric mean of DeepTune-TL 0.99x
Geometric mean of Inst2Vec 1.09x


In [53]:
# Geometric-mean speedups on the NVIDIA Tesla K20c.
tes_magni, tes_dt, tes_dtTL, tes_inst2vec = calcSpeedup("NVIDIA Tesla K20c")

Geometric mean of Magni et al. 0.74x
Geometric mean of DeepTune 0.98x
Geometric mean of DeepTune-TL 1.01x
Geometric mean of Inst2Vec 1.13x
