# Latency Dataset - GNN Model

This example will demonstrate our ability to predict latency with data from NN-meter through GNN.

Let's start our journey!

## Step 1: Download data and Construct our graph

In order to predict latency through GNN, we need to build adjacent matrix and feature matrix of an original model.

### Step 1.1: Define URL and Hardware

We first give the url to download the data. And we have four types of hardware: CortexA76 CPU, Adreno 640 GPU, Adreno 630 GPU and Myriad VPU.

In [1]:
import os

RAW_DATA_URL = "https://github.com/microsoft/nn-Meter/releases/download/v1.0-data/datasets.zip"

hws = [
    "cortexA76cpu_tflite21",
    "adreno640gpu_tflite21",
    "adreno630gpu_tflite21",
    "myriadvpu_openvino2019r2",
]

### Step 1.2: Construct our graph

We now download the dataset, load latency and the corresponding model data, and build the graphs. In order to reduce the time of data loading, we also saved the processed data for convenient use next time. You can change the dgl-cu version shown below based on your CUDA version. Since our CUDA version is 11.0, we install dgl-cu110 here

In [2]:
# !pip install jsonlines
# !pip install dgl-cu110

In [3]:
import os
import torch
import pickle
import jsonlines

from torch.serialization import save

class LatencyDataset(torch.utils.data.Dataset):

    def __init__(self, data_dir='./dataset', train=True, device='cpu', split_ratio=0.8):
        """
        Dataloader of the Latency Dataset

        Parameters
        ----------
        data_dir : string
            Path to save the downloaded dataset
        train: bool
            Get the train dataset or the test dataset
        device: string
            The Device type of the corresponding latency
        shuffle: bool
            If shuffle the dataset at the begining of an epoch
        batch_size: int
            Batch size.
        split_ratio: float
            The ratio to split the train dataset and the test dataset.
        """
        err_str = "Only support device type cpu/gpu640/gpu630/vpu"
        assert device in ['cpu', 'gpu640', 'gpu630', 'vpu'], err_str
        if device == 'cpu':
            self.device = hws[0]
        elif device == 'gpu640':
            self.device = hws[1]
        elif device == 'gpu630':
            self.device = hws[2]
        else:
            self.device = hws[3]
        self.data_dir = data_dir
        self.train = train
        self.split_ratio = split_ratio
        self.adjs = {}
        self.attrs = {}
        self.nodename2id = {}
        self.id2nodename = {}
        self.op_types = set()
        self.opname2id = {}
        self.raw_data = {}
        self.name_list = []
        self.latencies = {}
        self.download_data()
        if self.train:
            if not os.path.exists(os.path.join(self.data_dir, 'train_data_package.pkl')):
                self.load_model_archs_and_latencies(self.data_dir)
                self.construct_attrs()
                self.save_data()
            else:
                self.load_data()

        else:
            if not os.path.exists(os.path.join(self.data_dir, 'test_data_package.pkl')):
                self.load_model_archs_and_latencies(self.data_dir)
                self.construct_attrs()
                self.save_data()
            else:
                self.load_data()

        self.name_list = list(
            filter(lambda x: x in self.latencies, self.name_list))



    def download_data(self):
        print("Downloading.")
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir, exist_ok=True)
            os.system('wget -P %s  %s' % (self.data_dir, RAW_DATA_URL))
            os.system('unzip %s/datasets.zip -d %s' %
                      (self.data_dir, self.data_dir))

    def load_model_archs_and_latencies(self, data_dir):
        filelist = os.listdir(data_dir)
        for filename in filelist:
            if os.path.splitext(filename)[-1] != '.jsonl':
                continue
            self.load_model(os.path.join(data_dir, filename))

    def load_model(self, fpath):
        """
        Load a concrete model type.
        """
        print('Loading models in ', fpath)
        assert os.path.exists(fpath), '{} does not exists'.format(fpath)

        with jsonlines.open(fpath) as reader:
            _names = []
            for obj in reader:
                if obj[self.device]:
                    # print(obj['id'])
                    _names.append(obj['id'])
                    self.latencies[obj['id']] = float(obj[self.device])

            _names = sorted(_names)
            split_ratio = self.split_ratio if self.train else 1-self.split_ratio
            count = int(len(_names) * split_ratio)

            if self.train:
                _model_names = _names[:count]
            else:
                _model_names = _names[-1*count:]

            self.name_list.extend(_model_names)

        with jsonlines.open(fpath) as reader:
            for obj in reader:
                if obj['id'] in _model_names:
                    model_name = obj['id']
                    model_data = obj['graph']
                    self.parse_model(model_name, model_data)
                    self.raw_data[model_name] = model_data
    

    def construct_attrs(self):
        """
        Construct the attributes matrix for each model.
        Attributes tensor:
        one-hot encoded type + input_channel , output_channel,
        input_h, input_w + kernel_size + stride
        """
        op_types_list = list(sorted(self.op_types))
        for i, _op in enumerate(op_types_list):
            self.opname2id[_op] = i
        n_op_type = len(self.op_types)
        attr_len = n_op_type + 6
        for model_name in self.raw_data:
            n_node = len(self.raw_data[model_name])
            # print("Model: ", model_name, " Number of Nodes: ", n_node)
            t_attr = torch.zeros(n_node, attr_len)
            for node in self.raw_data[model_name]:
                node_attr = self.raw_data[model_name][node]
                nid = self.nodename2id[model_name][node]
                op_type = node_attr['attr']['type']
                op_id = self.opname2id[op_type]
                t_attr[nid][op_id] = 1
                other_attrs = self.parse_node(model_name, node)
                # t_attr[nid+1][-6:] = other_attrs
                t_attr[nid][-6:] = other_attrs
            # t_attr[0][n_op_type] = 1 # global
            self.attrs[model_name] = t_attr

    def parse_node(self, model_name, node_name):
        """
        Parse the attributes of specified node
        Get the input_c, output_c, input_h, input_w, kernel_size, stride
        of this node. Note: filled with 0 by default if this doesn't have
        coressponding attribute.
        """
        node_data = self.raw_data[model_name][node_name]
        t_attr = torch.zeros(6)
        op_type = node_data['attr']['type']
        if op_type =='Conv2D':
            weight_shape = node_data['attr']['attr']['weight_shape']
            kernel_size, _, in_c, out_c = weight_shape
            stride, _= node_data['attr']['attr']['strides']
            _, h, w, _ = node_data['attr']['output_shape'][0]
            t_attr = torch.tensor([in_c, out_c, h, w, kernel_size, stride])
        elif op_type == 'DepthwiseConv2dNative':
            weight_shape = node_data['attr']['attr']['weight_shape']
            kernel_size, _, in_c, out_c = weight_shape
            stride, _= node_data['attr']['attr']['strides']
            _, h, w, _ = node_data['attr']['output_shape'][0]
            t_attr = torch.tensor([in_c, out_c, h, w, kernel_size, stride])
        elif op_type == 'MatMul':
            in_node = node_data['inbounds'][0]
            in_shape = self.raw_data[model_name][in_node]['attr']['output_shape'][0]
            in_c = in_shape[-1]
            out_c = node_data['attr']['output_shape'][0][-1]
            t_attr[0] = in_c
            t_attr[1] = out_c
        elif len(node_data['inbounds']):
            in_node = node_data['inbounds'][0]
            h, w, in_c, out_c = 0, 0, 0, 0
            in_shape = self.raw_data[model_name][in_node]['attr']['output_shape'][0]
            in_c = in_shape[-1]
            if 'ConCat' in op_type:
                for i in range(1, len(node_data['in_bounds'])):
                    in_shape = self.raw_data[node_data['in_bounds']
                                             [i]]['attr']['output_shape'][0]
                    in_c += in_shape[-1]
            if len(node_data['attr']['output_shape']):
                out_shape = node_data['attr']['output_shape'][0]
                # N, H, W, C
                out_c = out_shape[-1]
                if len(out_shape) == 4:
                    h, w = out_shape[1], out_shape[2]
            t_attr[-6:-2] = torch.tensor([in_c, out_c, h, w])

        return t_attr

    def parse_model(self, model_name, model_data):
        """
        Parse the model data and build the adjacent matrixes
        """
        n_nodes = len(model_data)
        m_adj = torch.zeros(n_nodes, n_nodes, dtype=torch.int32)
        id2name = {}
        name2id = {}
        tmp_node_id = 0
        # build the mapping between the node name and node id

        for node_name in model_data.keys():
            id2name[tmp_node_id] = node_name
            name2id[node_name] = tmp_node_id
            op_type = model_data[node_name]['attr']['type']
            self.op_types.add(op_type)
            tmp_node_id += 1

        for node_name in model_data:
            cur_id = name2id[node_name]
            for node in model_data[node_name]['inbounds']:
                if node not in name2id:
                    # weight node
                    continue
                in_id = name2id[node]
                m_adj[in_id][cur_id] = 1
            for node in model_data[node_name]['outbounds']:
                if node not in name2id:
                    # weight node
                    continue
                out_id = name2id[node]
                m_adj[cur_id][out_id] = 1
        
        for idx in range(n_nodes):
            m_adj[idx][idx] = 1

        self.adjs[model_name] = m_adj
        self.nodename2id[model_name] = name2id
        self.id2nodename[model_name] = id2name

    def __getitem__(self, index):
        model_name = self.name_list[index]
        return (self.adjs[model_name], self.attrs[model_name]), self.latencies[model_name]

    def __len__(self):
        return len(self.name_list)

    def save_data(self):
        data_package = {}
        data_package['adjs'] = self.adjs
        data_package['attrs'] = self.attrs
        data_package['nodename2id'] = self.nodename2id
        data_package['latencies'] = self.latencies
        data_package['id2nodename'] = self.id2nodename
        data_package['op_types'] = self.op_types
        data_package['opname2id'] = self.opname2id
        data_package['raw_data'] = self.raw_data
        data_package['name_list'] = self.name_list
        if self.train:
            with open(os.path.join(self.data_dir, 'train_data_package.pkl'),"wb") as file:
                pickle.dump(data_package, file)
            print("Processing and saving train data successfully.")
        else:
            with open(os.path.join(self.data_dir, 'test_data_package.pkl'),"wb") as file:
                pickle.dump(data_package, file)
            print("Processing and saving test data successfully.")

    def load_data(self):
        if self.train:
            with open(os.path.join(self.data_dir, 'train_data_package.pkl'), "rb") as file:
                data_package = pickle.load(file)
                self.adjs = data_package['adjs']
                self.attrs = data_package['attrs']
                self.nodename2id = data_package['nodename2id']
                self.latencies = data_package['latencies']
                self.id2nodename = data_package['id2nodename']
                self.op_types = data_package['op_types']
                self.opname2id = data_package['opname2id']
                self.raw_data = data_package['raw_data']
                self.name_list = data_package['name_list']
            print("Loading train data successfully.")
        else:
            with open(os.path.join(self.data_dir, 'test_data_package.pkl'), "rb") as file:
                data_package = pickle.load(file)
                self.adjs = data_package['adjs']
                self.attrs = data_package['attrs']
                self.nodename2id = data_package['nodename2id']
                self.latencies = data_package['latencies']
                self.id2nodename = data_package['id2nodename']
                self.op_types = data_package['op_types']
                self.opname2id = data_package['opname2id']
                self.raw_data = data_package['raw_data']
                self.name_list = data_package['name_list']
            print("Loading test data successfully.")

# Step 2: Build our Dataloader

We build our DataLoader here with the help of the DGL library, and we will pass latency of the model as well as the graph built from the DGL library.

In [4]:
import random
import dgl

MAX_NORM = torch.tensor([1]*20 + [6963, 6963, 224, 224, 11, 4])
def default_transform(t_in):
    return t_in/MAX_NORM

class DGLDataloader(torch.utils.data.DataLoader):
    def __init__(self, dataset, transforms=default_transform, shuffle=False, batchsize=1):
        self.dataset = dataset
        self.shuffle = shuffle
        self.batchsize = batchsize
        self.transforms = transforms # used to normalized the features
        self.length = len(self.dataset)
        self.indexes = list(range(self.length))
        self.pos = 0
        self.graphs = {}
        self.latencies = {}
        self.construct_graphs()

    def construct_graphs(self):
        for gid in range(self.length):
            (adj, attrs), latency = self.dataset[gid]
            u, v = torch.nonzero(adj, as_tuple=True)
            # import pdb; pdb.set_trace()
            graph = dgl.graph((u, v))
            if self.transforms:
                attrs = self.transforms(attrs)
            graph.ndata['h'] = attrs
            self.graphs[gid] = graph
            self.latencies[gid] = latency

    def __iter__(self):
        if self.shuffle:
            random.shuffle(self.indexes)
        self.pos = 0
        return self

    def __len__(self):
        return self.length

    def __next__(self):
        start = self.pos
        end = min(start + self.batchsize, self.length)
        self.pos = end
        if end - start <= 0:
            raise StopIteration
        batch_indexes = self.indexes[start:end]
        batch_graphs = [self.graphs[i] for i in batch_indexes]
        batch_latencies = [self.latencies[i] for i in batch_indexes]
        return torch.tensor(batch_latencies), dgl.batch(batch_graphs)

Using backend: pytorch


## Step 3: Build Model and Training

In this part, we will first build our GNN model, which is constructed based on GraphSAGE, and maxpooling is selected as out pooling method. Next, we will start training after the data is loaded.

### Step3.1: Build our GraphSAGE Model

We built our model mainly with the help of DGL library.

In [5]:
from os import access
import torch.nn as nn
from torch.nn.modules.module import Module

from dgl.nn.pytorch.glob import MaxPooling
import dgl.nn as dglnn
from torch.optim.lr_scheduler import CosineAnnealingLR


class GNN(Module):
    def __init__(self, 
                num_features=0, 
                num_layers=2,
                num_hidden=32,
                dropout_ratio=0,
                binary_classifier=False):

        super(GNN, self).__init__()
        self.nfeat = num_features
        self.nlayer = num_layers
        self.nhid = num_hidden
        self.dropout_ratio = dropout_ratio
        self.gc = nn.ModuleList([dglnn.SAGEConv(self.nfeat if i==0 else self.nhid, self.nhid, 'pool') for i in range(self.nlayer)])
        self.bn = nn.ModuleList([nn.LayerNorm(self.nhid) for i in range(self.nlayer)])
        self.relu = nn.ModuleList([nn.ReLU() for i in range(self.nlayer)])
        self.pooling = MaxPooling()
        self.fc = nn.Linear(self.nhid, 1)
        self.fc1 = nn.Linear(self.nhid, self.nhid)
        self.dropout = nn.ModuleList([nn.Dropout(self.dropout_ratio) for i in range(self.nlayer)])

        self.binary_classifier = binary_classifier

    def forward_single_model(self, g, features):
        x = self.relu[0](self.bn[0](self.gc[0](g, features)))
        x = self.dropout[0](x)
        for i in range(1,self.nlayer):
            x = self.relu[i](self.bn[i](self.gc[i](g, x)))
            x = self.dropout[i](x)
        return x

    def forward(self, g, features):
        x = self.forward_single_model(g, features)
        with g.local_scope():
            g.ndata['h'] = x
            x = self.pooling(g, x)
            x = self.fc1(x)
            return self.fc(x)

### Step 3.2: Loading Data.

Next, we will finish loading the data and learn about the size of the Training and Testing datasets

In [6]:
print("Processing Training Set.")
train_set = LatencyDataset('./dataset', train=True, device='cpu') 
print("Processing Testing Set.")
test_set = LatencyDataset('./dataset', train=False, device='cpu')

train_loader = DGLDataloader(train_set, batchsize=1 , shuffle=True)
test_loader = DGLDataloader(test_set, batchsize=1, shuffle=False)
print('Train Dataset Size:', len(train_set))
print('Testing Dataset Size:', len(test_set))
print('Attribute tensor shape:', next(train_loader)[1].ndata['h'].size(1))
ATTR_COUNT = next(train_loader)[1].ndata['h'].size(1)

Processing Training Set.
Downloading.
Loading train data successfully.
Processing Testing Set.
Downloading.
Loading test data successfully.
Train Dataset Size: 22324
Testing Dataset Size: 5571
Attribute tensor shape: 26


### Step3.3: Run and Test

We can run the model and evaluate it now!

In [7]:
if torch.cuda.is_available():
    print("Using CUDA.")
# device = "cpu"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

load_model = False
if load_model:
    model = GNN(ATTR_COUNT, 3, 400, 0.1).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=4e-4)
    checkpoint = torch.load('LatencyGNN.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    opt.load_state_dict(checkpoint['optimizer_state_dict'])
    # EPOCHS = checkpoint['epoch']
    EPOCHS = 0
    loss_func = checkpoint['loss']
else:
    model = GNN(ATTR_COUNT, 3, 400, 0.1).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=4e-4)
    EPOCHS=20
    loss_func = nn.L1Loss()

lr_scheduler = CosineAnnealingLR(opt, T_max=EPOCHS)
loss_sum = 0
for epoch in range(EPOCHS):
    train_length = len(train_set)
    tran_acc_ten = 0
    print('Epoch %d' % epoch)
    loss_sum = 0 
    # latency, graph, types, flops
    for batched_l, batched_g in train_loader:
        opt.zero_grad()
        batched_l = batched_l.to(device).float()
        batched_g = batched_g.to(device)
        batched_f = batched_g.ndata['h'].float()
        logits = model(batched_g, batched_f)
        for i in range(len(batched_l)):
            pred_latency = logits[i].item()
            prec_latency = batched_l[i].item()
            if (pred_latency >= 0.9 * prec_latency) and (pred_latency <= 1.1 * prec_latency):
                tran_acc_ten += 1
        # print("true latency: ", batched_l)
        # print("Predict latency: ", logits)
        batched_l = torch.reshape(batched_l, (-1 ,1))
        loss = loss_func(logits, batched_l)
        loss_sum += loss
        loss.backward()
        opt.step()
    lr_scheduler.step()
    print("Training accuracy within 10%: ", tran_acc_ten / train_length * 100, " %.")
    print('Learning Rate:', lr_scheduler.get_last_lr())
    print('Loss:', loss_sum / train_length)

torch.save({
    'epoch': EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': opt.state_dict(),
    'loss': loss_func,
}, 'LatencyGNN.pt')


count = 0
with torch.no_grad():
    test_length = len(test_set)
    test_acc_ten = 0
    for batched_l, batched_g in test_loader:
        batched_l = batched_l.to(device).float()
        batched_g = batched_g.to(device)
        batched_f = batched_g.ndata['h'].float()
        result = model(batched_g, batched_f)
        if (result.item() >= 0.9 * batched_l.item()) and (result.item() <= 1.1 * batched_l.item()):
            test_acc_ten += 1
        acc = (abs(result.item() - batched_l.item()) / batched_l.item()) * 100
        count += 1
    print("Testing accuracy within 10%: ", test_acc_ten / test_length * 100, " %.")

Epoch 0
Training accuracy within 10%:  22.41981723705429  %.
Learning Rate: [0.00039753766811902755]
Loss: tensor(130.2296, grad_fn=<DivBackward0>)
Epoch 1
Training accuracy within 10%:  29.667622289912206  %.
Learning Rate: [0.00039021130325903074]
Loss: tensor(97.9046, grad_fn=<DivBackward0>)
Epoch 2
Training accuracy within 10%:  30.984590575165743  %.
Learning Rate: [0.0003782013048376736]
Loss: tensor(88.9308, grad_fn=<DivBackward0>)
Epoch 3
Training accuracy within 10%:  32.516574090664754  %.
Learning Rate: [0.0003618033988749895]
Loss: tensor(82.0981, grad_fn=<DivBackward0>)
Epoch 4
Training accuracy within 10%:  34.74735710446157  %.
Learning Rate: [0.0003414213562373095]
Loss: tensor(78.0732, grad_fn=<DivBackward0>)
Epoch 5
Training accuracy within 10%:  36.34205339544884  %.
Learning Rate: [0.00031755705045849464]
Loss: tensor(72.6763, grad_fn=<DivBackward0>)
Epoch 6
Training accuracy within 10%:  37.94122917039957  %.
Learning Rate: [0.00029079809994790937]
Loss: tensor(69.