# Designing an execution time predictor for arbitrary OpenCL kernels, based on measurements taken by the oclude OpenCL profiling tool

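This notebook builds and evaluates an execution time predictor for OpenCL kernels on top of measurements taken with the `oclude` profiling tool. It first introduces `OCLBoi`, a kernel-specific regressor from `gsize` to instruction counts, then `OCLMan`, a kernel-agnostic predictor that maps (predicted) instruction counts to execution time, and finally compares `OCLMan` against `OCLBase`, a baseline that relies only on static instruction counts.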

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#0.-Libraries-and-helper-functions" data-toc-modified-id="0.-Libraries-and-helper-functions-1">0. Libraries and helper functions</a></span></li><li><span><a href="#1.-OCLBoi,-the-&quot;small&quot;-predictor---from-oclude-measurements-to-instruction-counts" data-toc-modified-id="1.-OCLBoi,-the-&quot;small&quot;-predictor---from-oclude-measurements-to-instruction-counts-2">1. OCLBoi, the "small" predictor - from oclude measurements to instruction counts</a></span></li><li><span><a href="#2.-OCLMan,-the-&quot;big&quot;-predictor---from-oclude-measurements-to-execution-time" data-toc-modified-id="2.-OCLMan,-the-&quot;big&quot;-predictor---from-oclude-measurements-to-execution-time-3">2. OCLMan, the "big" predictor - from oclude measurements to execution time</a></span></li><li><span><a href="#3.-The-moment-of-&quot;truth&quot;---OCLMan-vs.-a-simplistic-static-predictor" data-toc-modified-id="3.-The-moment-of-&quot;truth&quot;---OCLMan-vs.-a-simplistic-static-predictor-4">3. The moment of "truth" - OCLMan vs. a simplistic static predictor</a></span></li></ul></div>

## 0. Libraries and helper functions

In [1]:
%%capture
!pip install --upgrade numpy
!pip install --upgrade scipy
!pip install --upgrade scikit-learn
!pip install --upgrade pandas
!pip install --upgrade plotly
!pip install --upgrade matplotlib
!pip install --upgrade seaborn

In [2]:
### basic imports ###

import oclude

import os
import itertools
from collections import defaultdict
from time import time
import numpy as np
import pandas as pd
from oclude import profile_opencl_kernel
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
from IPython.display import display, HTML
from pprint import pprint
from tqdm import tqdm

# for plotly: set notebook mode to work offline
pyo.init_notebook_mode(connected=True)

### regression models and related stuff ###
from sklearn.linear_model import LinearRegression, MultiTaskElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

### globals go here ###
outdir = os.path.join(os.pardir, 'desk', 'outputs')
from oclude.utils.constants import llvm_instructions

pd.set_option('display.max_rows', 500)

%matplotlib inline

In [3]:
# some helper functions, do not bother with them

def sorted_filenames_keyfunc(filename):
    '''
    Sort key for profiling filenames: the integer found in the fourth
    `__`-separated field, with everything from the first dot onwards dropped
    (e.g. 'bfs__Kernels.cl__BFS_2__1024.json' -> 1024).
    '''
    fourth_field = filename.split('__')[3]
    number_text, _, _ = fourth_field.partition('.')
    return int(number_text)

def get_grouped_all_profilings_filenames():
    '''
    List every file in `outdir` grouped by its first three `__`-separated fields.

    Returns a list of (key, filenames) pairs, where key is the list of those
    three fields and filenames are the group members ordered by
    sorted_filenames_keyfunc.
    '''
    def first_three_fields(filename):
        return filename.split('__')[:3]

    # groupby only merges adjacent items, hence the sort on the same key first
    ordered = sorted(os.listdir(outdir), key=first_three_fields)

    return [
        (group_key, sorted(members, key=sorted_filenames_keyfunc))
        for group_key, members in itertools.groupby(ordered, first_three_fields)
    ]

def get_gsize_from_profiling_filename(filename):
    '''
    Return the gsize encoded in a profiling filename, i.e. the integer in the
    last `__`-separated field, ignoring everything from the first dot onwards.
    '''
    *_, last_field = filename.split('__')
    gsize_text, _, _ = last_field.partition('.')
    return int(gsize_text)

def get_average_instcounts_dicts(arg, discard_zeros=False):
    '''
    Average the per-sample measurements of one or more profilings.

    Parameters
    ----------
    arg : dict, str, or list/tuple of those
        A profiling dict holding a 'results' list of per-sample dicts (each
        with an 'instcounts' dict and optionally a 'timeit' dict), a profiling
        filename (loaded through load_profiling_dicts), or a list/tuple of
        either, which is processed element by element.
    discard_zeros : bool
        If True, instructions whose summed count is not positive are left out
        of the averaged instcounts.

    Returns
    -------
    dict (or list of dicts for list/tuple input)
        A copy of the top-level entries of the profiling dict in which
        'results' is replaced by a single dict with the averaged 'instcounts'
        (integer division by the number of samples) and, when the samples
        carry it and the averaged timings are non-empty, the averaged 'timeit'.

    Raises
    ------
    ValueError
        If the profiling holds no samples.
    '''
    if isinstance(arg, (list, tuple)):
        # forward discard_zeros explicitly: mapping the bare function over the
        # items would silently fall back to the default for every element
        return [get_average_instcounts_dicts(item, discard_zeros) for item in arg]

    from collections import Counter
    from functools import reduce
    from operator import add

    if not isinstance(arg, dict):
        arg = load_profiling_dicts(arg)

    results = arg['results']
    samples = len(results)
    if samples == 0:
        raise ValueError('cannot average a profiling that holds no samples')

    if discard_zeros:
        # Counter addition drops non-positive totals; seeding the fold with an
        # empty Counter makes sure this also happens for a single sample
        summed_instcounts = reduce(
            add, (Counter(sample['instcounts']) for sample in results), Counter()
        )
    else:
        # Counter.update keeps zero totals
        summed_instcounts = Counter()
        for sample in results:
            summed_instcounts.update(sample['instcounts'])

    avg_instcounts = {
        k : int(v) // samples for k, v in summed_instcounts.items()
    }

    if 'timeit' in results[0]:
        summed_timeit = reduce(add, map(Counter, (sample['timeit'] for sample in results)))
        avg_timeit = {
            k : v / samples for k, v in summed_timeit.items()
        }
    else:
        avg_timeit = None

    # carry over every top-level entry except the raw per-sample results
    avg_dict = {k : v for k, v in arg.items() if k != 'results'}

    avg_dict['results'] = dict(instcounts=avg_instcounts)
    if avg_timeit:
        avg_dict['results']['timeit'] = avg_timeit

    return avg_dict

def get_list_of_profilings_filenames(benchmark=None, file=None, kernel=None):
    '''
    List the profiling files found in `outdir`.

    Without a benchmark, the bare directory entries are returned. Otherwise
    only the entries starting with '<benchmark>__<file>__<kernel>__' are kept,
    and each of them is returned joined with `outdir`.
    '''
    entries = os.listdir(outdir)

    if benchmark is None:
        return entries

    return [
        os.path.join(outdir, entry)
        for entry in entries
        if entry.startswith('__'.join([benchmark, file, kernel]) + '__')
    ]

def get_sorted_list_of_profilings_filenames(benchmark=None, file=None, kernel=None):
    '''
    Same as get_list_of_profilings_filenames, with the result ordered by
    sorted_filenames_keyfunc.
    '''
    ordered = list(get_list_of_profilings_filenames(benchmark, file, kernel))
    ordered.sort(key=sorted_filenames_keyfunc)
    return ordered

def load_profiling_dicts(arg, append_gsize=True):
    '''
    Load one or more profiling JSON files into dicts.

    Parameters
    ----------
    arg : str, or list/tuple of str
        A profiling filename, either bare (it is then looked up inside
        `outdir`) or already starting with `outdir`. A list/tuple is loaded
        element by element.
    append_gsize : bool
        If True, the gsize parsed from the filename is stored under the
        'gsize' key of the loaded dict.

    Returns
    -------
    dict, or list of dicts for list/tuple input
    '''
    if isinstance(arg, (list, tuple)):
        # forward append_gsize explicitly: mapping the bare function over the
        # items would silently fall back to the default for every element
        return [load_profiling_dicts(item, append_gsize) for item in arg]

    from json import load
    filedir = arg if arg.startswith(outdir) else os.path.join(outdir, arg)
    with open(filedir, 'r') as f:
        profiling_dict = load(f)

    if append_gsize:
        profiling_dict['gsize'] = get_gsize_from_profiling_filename(arg)

    return profiling_dict

def multi_table(table_list):
    '''
    Accepts a list of objects that provide `_repr_html_()` and returns a
    one-row HTML table which contains the rendering of each object in its own cell
    '''
    pieces = ['<table><tr style="background-color:white;">']
    for table in table_list:
        pieces.extend(('<td>', table._repr_html_(), '</td>'))
    pieces.append('</tr></table>')
    return HTML(''.join(pieces))

In [4]:
class MultiModelGridSearchCV:
    '''A helper class for multi-model gridsearch
    The original class was found in the following blog post:
    http://www.davidsbatista.net/blog/2018/02/23/model_optimization/
    and was heavily modified to support multiple scorers, amongst others

    Attributes
    ----------
    models : dict
        dict of models, keyed by model name
    params : dict(dict)
        dict of parameter grids, keyed by the same model names
    grid_searches : dict
        the fitted GridSearchCV object of every model (filled in by fit)

    Methods
    -------
    fit : same as GridSearchCV, but for every model
        in the model dict; note that refit defaults to False here,
        while scores needs the searches to have been run with refit enabled
    scores : returns a list of dictionaries (one for each model/gridsearch)
             which contain the following:
             - model_name : the name of the model
             - best_estimator : the refitted best estimator of its grid search
             - r2_score : the value of best_estimator.score(x, y)
                          (the R2 score for scikit-learn regressors)
    '''

    def __init__(self, models, params):
        # every model needs a parameter grid (an empty one is fine)
        missing_params = set(models.keys()) - set(params.keys())
        if missing_params:
            raise ValueError('Some estimators are missing parameters: %s' % list(missing_params))
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=5, n_jobs=1, verbose=1, scoring=None, refit=False):
        '''Run one GridSearchCV per model and return the dict of fitted searches.'''
        self.scorer = scoring
        for key in self.keys:
            gs = GridSearchCV(
                self.models[key], self.params[key], cv=cv, n_jobs=n_jobs,
                verbose=verbose, scoring=scoring, refit=refit
            )
            gs.fit(X, y)
            self.grid_searches[key] = gs

        return self.grid_searches

    def scores(self, x, y):
        '''Score the best estimator of EVERY model on (x, y).

        Raises AttributeError if a grid search holds no best_estimator_,
        which is the case when fit was called without refit.
        '''
        model_scores = []
        for k, gs in self.grid_searches.items():
            if not hasattr(gs, 'best_estimator_'):
                # fail with an actionable message instead of a bare attribute error
                raise AttributeError(
                    'grid search for model %r has no best_estimator_; '
                    'call fit(..., refit=True) before scores()' % k
                )
            curr_best_estimator = gs.best_estimator_
            model_scores.append(dict(
                model_name=k,
                best_estimator=curr_best_estimator,
                r2_score=curr_best_estimator.score(x, y)
            ))
        return model_scores

## 1. OCLBoi, the "small" predictor - from oclude measurements to instruction counts

We are going to build a "small" **kernel-specific** predictor, `OCLBoi` (i.e. "OpenCL but one in-particular"), with the following characteristics:

- Regarding the **training** phase:
    - inputs:
        - measurements regarding the **instruction counts of the kernel** as taken by the `oclude` OpenCL profiling tool


- Regarding the **prediction** phase:
    - inputs:
        - a `gsize` value (i.e. the OpenCL global NDRange)
    - output:
        - a vector of predicted instruction counts for the given `gsize` value

For more information regarding `OCLBoi`, see the regression notebook, where it is designed and thoroughly tested.

In [5]:
### some preparations for our small predictor ###

# degree-2 polynomial features followed by ordinary least squares
PolynomialRegressionLinear = Pipeline(
    [
        ('polyfeatures', PolynomialFeatures(degree=2)),
        ('regression', LinearRegression())
    ]
)

# degree-2 polynomial features followed by a multi-task Elastic Net
PolynomialRegressionElasticNet = Pipeline(
    [
        ('polyfeatures', PolynomialFeatures(degree=2)),
        ('regression', MultiTaskElasticNet())
    ]
)

# hyperparameter values shared by both Elastic Net based models
alpha = np.geomspace(1e-2, 1e4, 10)
# 20 evenly spaced ratios 0.05, 0.10, ..., 1.00; linspace pins both endpoints,
# whereas a float-step arange needs a fudged stop value and may drift
l1_ratio = np.linspace(0.05, 1.0, 20)
selection = ['cyclic', 'random']

# parameter grid for Elastic Net
enet_param_grid = dict(
    alpha = alpha,
    l1_ratio = l1_ratio,
    selection = selection
)

# parameter grid for Polynomial Regression based on Linear Regression
poly_l_param_grid = dict(polyfeatures__degree=[2])

# parameter grid for Polynomial Regression based on Elastic Net
poly_enet_param_grid = dict(
    polyfeatures__degree = [2],
    regression__alpha = alpha,
    regression__l1_ratio = l1_ratio,
    regression__selection = selection
)

# regressor models, keyed by their display name
models = {
    'Linear Regression': LinearRegression(),
    'Elastic Net': MultiTaskElasticNet(),
    'Polynomial Regression (Linear)': PolynomialRegressionLinear,
    'Polynomial Regression (Elastic Net)': PolynomialRegressionElasticNet
}

# list of regressor models names
models_names = list(models)

# regressor models parameter grids, keyed by the same names as `models`
pgrids = {
    'Linear Regression': {},
    'Elastic Net': enet_param_grid,
    'Polynomial Regression (Linear)': poly_l_param_grid,
    'Polynomial Regression (Elastic Net)': poly_enet_param_grid
}

In [6]:
class OCLBoi:
    '''Kernel-specific predictor: picks one regressor out of `models` and uses it to map gsizes to instcounts.'''

    def fit(self, x_train, x_test, y_train, y_test):
        '''
        x : gsizes
        y : instcounts

        Grid-searches every model on the train data, scores each best
        estimator on the test data and keeps the one with the highest score.
        '''
        self.clf = MultiModelGridSearchCV(models, pgrids)
        self.clf.fit(x_train, y_train, cv=5, scoring='r2', n_jobs=-1, verbose=0, refit=True)

        candidates = self.clf.scores(x_test, y_test)
        winner = max(candidates, key=lambda info : info['r2_score'])
        self.kernel_regressor = winner['best_estimator']

    def predict(self, x):
        '''Return the predictions of the regressor selected by fit.'''
        return self.kernel_regressor.predict(x)

A small demonstration follows.

First, we create a predictor for a specific kernel, namely `BFS_2` from the `bfs/Kernels.cl` rodinia benchmark file:

In [7]:
# let's get the data for the following kernel:
benchmark, file, kernel = 'bfs', 'Kernels.cl', 'BFS_2'

profs_filenames = get_sorted_list_of_profilings_filenames(benchmark, file, kernel)
profs_dicts = get_average_instcounts_dicts(profs_filenames)

# x: gsizes, as a single column
x = np.array([prof['gsize'] for prof in profs_dicts]).reshape(-1, 1)
# Y: instcounts, one row per profiling and one column per LLVM instruction
Y = np.array([
    [prof['results']['instcounts'][inst] for inst in llvm_instructions]
    for prof in profs_dicts
])

x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=.3)

predictor = OCLBoi()
predictor.fit(x_train, x_test, Y_train, Y_test)

We are now ready to predict instruction counts from arbitrary `gsize` values. For the sake of inspection, we present the following comparative table.

- The `experimental count` column holds the actual instcounts that we got from profiling the specified kernel for the given `gsize`. **Note that this column may vary significantly between runs!**
- The `predicted count` column holds the instcounts that our predictor predicted for the given `gsize`.

In [8]:
GSIZE = 1024

# profile the kernel once for GSIZE and keep the instcounts of that single sample
profiling = profile_opencl_kernel(
    file='../tests/rodinia_kernels/bfs/Kernels.cl', kernel='BFS_2',
    gsize=GSIZE,
    samples=1,
    timeout=0, # no timeout
    instcounts=True
)
experimental_instcounts = profiling['results'][0]['instcounts']

# the predictor returns one row per requested gsize; we asked for exactly one
predicted_row = predictor.predict([[GSIZE]])[0]
predicted_instcounts = {
    inst : int(count) for inst, count in zip(llvm_instructions, predicted_row)
}

comparison_rows = []
for inst in llvm_instructions:
    comparison_rows.append((inst, experimental_instcounts[inst], predicted_instcounts[inst]))

instcounts = pd.DataFrame(
    data=comparison_rows,
    columns=['instruction', 'experimental count', 'predicted count']
)

display(instcounts)

[oclude] INFO: Input file ../tests/rodinia_kernels/bfs/Kernels.cl is cached
[oclude] INFO: Using cached instrumented file
[oclude] Running kernel 'BFS_2' from file ../tests/rodinia_kernels/bfs/Kernels.cl
[hostcode] Using the following device:
[hostcode] Platform:	Intel(R) OpenCL HD Graphics
[hostcode] Device:	Intel(R) Gen9 HD Graphics NEO
[hostcode] Version:	OpenCL 2.1 NEO
[hostcode] Kernel name: BFS_2
[hostcode] Kernel arg 1: g_graph_mask (char*, global)
[hostcode] Kernel arg 2: g_updating_graph_mask (char*, global)
[hostcode] Kernel arg 3: g_graph_visited (char*, global)
[hostcode] Kernel arg 4: g_over (char*, global)
[hostcode] Kernel arg 5: no_of_nodes (int, private)
[hostcode] About to execute kernel with Global NDRange = 1024
[hostcode] Number of executions (a.k.a. samples) to perform: 1
[hostcode] Collecting instruction counts...
[hostcode] Kernel run completed successfully


Unnamed: 0,instruction,experimental count,predicted count
0,add,0,0
1,sub,0,0
2,mul,0,0
3,udiv,0,0
4,sdiv,0,0
5,urem,0,0
6,srem,0,0
7,fneg,0,0
8,fadd,0,0
9,fsub,0,0


## 2. OCLMan, the "big" predictor - from oclude measurements to execution time

We are going to build a "big" **kernel-agnostic** predictor, `OCLMan` (i.e. "OpenCL Maybe? Approximately? Nope!"), with the following characteristics:

- Regarding the **training** phase:
    - inputs:
        - *(more like dependency)* a "small" predictor (see above) for each kernel of the training benchmark suite
        - measurements regarding the **execution times of all the kernels of the training benchmark suite** as taken by the `oclude` OpenCL profiling tool


- Regarding the **prediction** phase:
    - inputs:
        - a kernel
        - measurements regarding the instruction counts of the kernel as taken by the `oclude` OpenCL profiling tool, to create and train a kernel-specific "small" predictor (see above)\*
        - a `gsize` value (i.e. the OpenCL global NDRange)
        - *(internal process)* a vector of predicted instruction counts for the given `gsize` value, from the "small" predictor of the given kernel
    - output:
        - the predicted kernel execution time for the given `gsize` value

\* note that this prediction phase input implies that these measurements have already been taken at some point in the past. To fully automate the whole process, and hence to create a complete predictor which will accept a kernel and a `gsize` value as its sole inputs, the implementation of an automatic measurement subsystem is needed, which should be based on the `oclude` OpenCL kernel profiling tool.

In [9]:
# step 1: extract data (instcounts, device exec times) from ALL kernels
#
# kernels : one (benchmark, file, kernel) tuple per kernel
# xs      : per kernel, the gsizes of its profilings
# Ys      : per kernel, one row per profiling holding the instcounts
#           followed by the device execution time as the last column

kernels, xs, Ys = [], [], []

for group_key, _ in get_grouped_all_profilings_filenames():

    kernel = tuple(group_key)

    profs_filenames = get_sorted_list_of_profilings_filenames(*kernel)
    profs_dicts = get_average_instcounts_dicts(profs_filenames)

    x, Y = [], []
    for prof_dict in profs_dicts:
        x.append(prof_dict['gsize'])
        measured = prof_dict['results']
        row = [measured['instcounts'][i] for i in llvm_instructions]
        row.append(measured['timeit']['device'])
        Y.append(np.array(row))

    kernels.append(kernel)
    xs.append(np.array(x))
    Ys.append(np.array(Y))

In [10]:
# step 2: split into train and test (independently for every kernel)

per_kernel_splits = [
    train_test_split(kernel_x, kernel_Y, test_size=.3)
    for kernel_x, kernel_Y in zip(xs, Ys)
]

# every split is (x_train, x_test, Y_train, Y_test); regroup them per role
x_trains = [split[0] for split in per_kernel_splits]
x_tests  = [split[1] for split in per_kernel_splits]
Y_trains = [split[2] for split in per_kernel_splits]
Y_tests  = [split[3] for split in per_kernel_splits]

In [11]:
class OCLMan:
    '''
    Kernel-agnostic execution time predictor.

    Holds one regressor for the instcounts -> time function and a cache of
    kernel-specific OCLBoi predictors for the gsize -> instcounts function.
    '''

    def __init__(self, regressor=None):
        '''
        regressor : the model used for the instcounts -> time function;
                    a new LinearRegression is created when none is given
        '''
        # build the default per instance: a default written in the signature
        # is created once and would be shared (and refitted) by all instances
        self.regressor = LinearRegression() if regressor is None else regressor
        self.kernel_predictors = {}

    def fit(self, X, y):
        '''
        fit is kernel-agnostic; it just fits the regressor
        (i.e. the instcounts -> time function)
        '''
        return self.regressor.fit(X, y)

    def predict(self, kernel, gsizes, kdata=None):
        '''
        Predict the execution time of a kernel for the given gsizes.

        kernel is the (hashable) key under which the kernel-specific
        predictor is cached; once a predictor is cached, kdata is ignored.

        kdata is a parameter that exists as long as an automatic subsystem
        for kernel microprofiling has not been implemented;
        kdata holds all the measurements from the profilings of the kernel that
        OCLMan is expected to take automatically in the future.
        kdata is a dict which holds 4 numpy arrays:
        - x_train, x_test : gsizes
        - y_train, y_test : instcounts

        Raises AttributeError for a kernel seen for the first time without kdata.
        '''
        # step 1: create "small" kernel-specific predictor
        #         and cache it
        kernel_predictor = self.kernel_predictors.get(kernel)
        if kernel_predictor is None:
            if kdata is None:
                raise AttributeError('new kernel, need train/test data (kdata parameter for now)')
            kernel_predictor = OCLBoi()
            kernel_predictor.fit(**kdata)
            self.kernel_predictors[kernel] = kernel_predictor

        # step 2: get a (predicted) instcounts vector for the given gsize
        instcounts = kernel_predictor.predict(gsizes)

        # step 3: predict execution time given the predicted instcounts vector
        return self.regressor.predict(instcounts)

In [12]:
# pick a random kernel for the demonstration; the bound follows the number of
# kernels that were actually loaded instead of assuming a fixed count
IDX = np.random.randint(len(kernels))

# step 3: train OCLMan!

oclman = OCLMan()

# OCLMan needs all the training data to be trained, concatenated
# OCLMan fits the instcounts -> time function
prof_data = np.vstack(Y_trains)
oclman_X_train = prof_data[:, :-1]   # instcounts
oclman_y_train = prof_data[:, -1:]   # device time, kept as a column
oclman.fit(oclman_X_train, oclman_y_train)

# OCLBoi fits the gsize -> instcounts function, per kernel
oclboi_x = [kernel_gsizes.reshape(-1, 1) for kernel_gsizes in x_trains]
oclboi_Y = [y[:, :-1] for y in Y_trains]

# oclboi needs a further split into train and test
oclboi_x_trains, oclboi_x_tests, oclboi_Y_trains, oclboi_Y_tests = [], [], [], []
for x, Y in zip(oclboi_x, oclboi_Y):
    (
        oclboi_x_train, oclboi_x_test,
        oclboi_Y_train, oclboi_Y_test
    ) = train_test_split(x, Y, test_size=.3)
    oclboi_x_trains.append(oclboi_x_train)
    oclboi_x_tests.append(oclboi_x_test)
    oclboi_Y_trains.append(oclboi_Y_train)
    oclboi_Y_tests.append(oclboi_Y_test)

kdata = dict(
    x_train=oclboi_x_trains[IDX], x_test=oclboi_x_tests[IDX],
    y_train=oclboi_Y_trains[IDX], y_test=oclboi_Y_tests[IDX]
)

# `kernels` already holds the (benchmark, file, kernel) key of every kernel in
# the same order as the data lists, so there is no need to list the directory again
time_preds = oclman.predict(
    kernel=kernels[IDX],
    gsizes=x_tests[IDX].reshape(-1, 1),
    kdata=kdata
)

In [13]:
### compare predicted and measured execution times for kernel IDX ###

# order the held-out points by gsize so that both curves are drawn left to right
order = np.argsort(x_tests[IDX], kind='stable')

gsizes_sorted = x_tests[IDX][order].reshape(-1, 1)
predicted_times = np.asarray(time_preds)[order]
# the measured device time is the last column of the test rows
actual_times = Y_tests[IDX][:, -1:][order]

print('r2 score:', r2_score(actual_times, predicted_times))

gsize_axis = gsizes_sorted.reshape(-1)
fig = go.Figure(data=[
    go.Scatter(name='predicted', x=gsize_axis, y=predicted_times.reshape(-1)),
    go.Scatter(name='actual', x=gsize_axis, y=actual_times.reshape(-1))
])

kernel_label = "/".join(kernels[IDX])
monospace = 'Courier New, monospace'

fig.update_layout(
    title=dict(
        text=f'An OCLMan example regarding kernel {kernel_label}',
        font=dict(family=monospace, size=18),
        x=.5
    ),
    yaxis_title='execution time (ms)',
    xaxis_title='gsize',
    font=dict(family=monospace, size=10)
)

fig.show()

r2 score: 0.42193666145416


## 3. The moment of "truth" - OCLMan vs. a simplistic static predictor

The time has come - our final step is to compare the "big" predictor above with a baseline predictor - one which will rely solely on static features, i.e. the source code instruction counts. Here, no notion of dynamic (micro)profiling is present.

We have proven (see regression notebook) that instcounts are a polynomial (at most) function of the gsize.

Before we start, we need to make one more assumption:

> The execution time is a **linear** function of the instcounts

We base this assumption on nothing more than common sense; it makes a lot of sense for the time a kernel needs to execute to be a linear function of the different individual jobs (i.e. instructions) that it has to complete. We can imagine a function of the following form:

$$t_{exec} = t_{add}count_{add} + t_{sub}count_{sub} + t_{mul}count_{mul} + \dots$$

where:
- $t_i$ is the time that a *single* LLVM instruction of the i type needs to execute on the processing unit of interest, and
- $count_i$ is the number of LLVM instructions of the i type that were executed (i.e. our instcounts).

It is obvious that this approach makes simplifications regarding the behavior of the execution environment - for example, our model does not take into consideration the order of the instructions. However, the simplicity of this model is sufficient to achieve the last objectives of this dissertation, which are to prove that:

1. `oclude` works as designed, expected and promised.
2. the measurements taken by using `oclude` contain valuable information that deepens our understanding of the kernels' behavior and facilitates our effort to better deal with the problems of the heterogeneous computing world; in other words, the process of dynamic microprofiling was worth it.

To further support our second goal, we will compare `OCLMan` with `OCLBase`, a predictor that will rely solely on static features, i.e. source code LLVM instruction counts. Here, no notion of dynamic microprofiling is present.

To be absolutely fair, we must give `OCLBase` the same freedom and constrain it with the same bias as we do with `OCLMan`. This means that we will make `OCLBase` a **polynomial regressor of order 2 based on linear regression**. The reason for this lies in the relationships between $gsize$, $instcounts$ and $execution\ time$:

$$
\begin{array}{c}
gsize \longmapsto instcounts : \text{polynomial, proven} \\
instcounts \longmapsto execution\ time : \text{linear, assumed} \\
\Downarrow \\
gsize \longmapsto execution\ time : \text{polynomial, assumed}
\end{array}
$$

Therefore, if the regressor of `OCLMan` is a simple linear regressor\* and the `OCLBase` is a polynomial regressor of order 2 based on simple linear regression, both models have, in theory, the same capabilities.

\* recall that, regarding `OCLMan`, the polynomial nature of the last relation is manifested (if needed) through the kernel-specific `OCLBoi` predictor that `OCLMan` uses internally.

In [14]:
# step 1: get static features for every available kernel
#
# static_instcounts maps every (benchmark, file, kernel) key to its static
# instruction counts, listed in the order of llvm_instructions

rodinia_base_dir = os.path.join(os.pardir, 'tests', 'rodinia_kernels')
static_instcounts = {}

for kernel in kernels:
    # the first two fields locate the source file, the third names the kernel
    source_path = os.path.join(rodinia_base_dir, *kernel[:2])
    counts_by_instruction = oclude.get_opencl_kernel_static_instcounts(source_path, kernel[2])
    static_instcounts[kernel] = [counts_by_instruction[i] for i in llvm_instructions]

[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (1/2)
[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)
[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (1/2)
[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)
[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (1/2)
[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)
[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (1/2)
[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)
[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (

[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)
[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (1/2)
[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)
[instrumentation] Preprocessing source file
[instrumentation] Compiling source to LLVM bitcode (1/2)
[instrumentation] Retrieving instrumentation data from LLVM bitcode
[instrumentation] Compiling source to LLVM bitcode (2/2)


In [15]:
# step 2: create X and y data for OCLMan, OCLBoi and OCLBase
#
# === for OCLMan (the regressor component) ===
# X : dynamic instcounts
# y : device execution times
#
# === for OCLBoi ===
# x : gsizes
# Y : dynamic instcounts
#
# === for OCLBase ===
# X : gsizes + static instcounts
# y : device execution times
#
# right now, xs and Ys hold gsizes and dynamic instcounts + device execution times, respectively

# step 2a : organize data into appropriate lists (one entry per kernel)
gsizes, sinstcounts, dinstcounts, times = [], [], [], []
for kernel, kernel_xs, kernel_Ys in zip(kernels, xs, Ys):
    gsize_column = kernel_xs.reshape(-1, 1)
    gsizes.append(gsize_column)
    # the static counts of a kernel are repeated once per gsize row
    sinstcounts.append(np.vstack([static_instcounts[kernel] for _ in range(len(gsize_column))]))
    # every row of Ys is the dynamic instcounts followed by the device time
    dinstcounts.append(kernel_Ys[:, :-1])
    times.append(kernel_Ys[:, -1:])

# step 2b : split everything into train and test
#           (whole kernels are held out, not individual measurements)

(
    kernels_train, kernels_test,
    gsizes_trains, gsizes_tests,
    sinstcounts_trains, sinstcounts_tests,
    dinstcounts_trains, dinstcounts_tests,
    times_trains, times_tests
) = train_test_split(kernels, gsizes, sinstcounts, dinstcounts, times, test_size=1/8)

# OCLMan train data
oclman_X_train = np.vstack(dinstcounts_trains)
oclman_y_train = np.vstack(times_trains)

# OCLMan test data
oclman_x_tests = gsizes_tests
oclman_y_tests = times_tests

# OCLBoi data (taken from the held-out kernels)
oclboi_x = gsizes_tests
oclboi_Y = dinstcounts_tests

# OCLBase train data: gsize column followed by the static instcounts columns
oclbase_X_train = np.vstack([
    np.concatenate((gs, si), axis=1)
    for gs, si in zip(gsizes_trains, sinstcounts_trains)
])
oclbase_y_train = np.vstack(times_trains)

# OCLBase test data
oclbase_X_tests = [
    np.concatenate((gs, si), axis=1)
    for gs, si in zip(gsizes_tests, sinstcounts_tests)
]
oclbase_y_tests = times_tests

# train oclman
oclman = OCLMan()
oclman.fit(oclman_X_train, oclman_y_train)

# train a polynomial regressor as oclbase
oclbase = PolynomialRegressionLinear
oclbase.fit(oclbase_X_train, oclbase_y_train)

Pipeline(steps=[('polyfeatures', PolynomialFeatures()),
                ('regression', LinearRegression())])

In [16]:
%%time

###  run OCLMan and OCLBase on the held-out test kernels and keep,  ###
###  for each kernel, the pair (actual times, predicted times)     ###

oclman_scores, oclbase_scores = {}, {}

for i, kernel in enumerate(tqdm(kernels_test)):

    # preprocessing needed for OCLBoi: its own train/test split of the kernel data
    oclboi_split = train_test_split(oclboi_x[i], oclboi_Y[i], test_size=.3)
    kdata = dict(zip(('x_train', 'x_test', 'y_train', 'y_test'), oclboi_split))

    # OCLMan: actual vs. predicted execution times
    oclman_scores[kernel] = (
        oclman_y_tests[i],
        oclman.predict(kernel=kernel, gsizes=oclman_x_tests[i], kdata=kdata)
    )

    # OCLBase: actual vs. predicted execution times
    oclbase_scores[kernel] = (
        oclbase_y_tests[i],
        oclbase.predict(oclbase_X_tests[i])
    )


Objective did not converge. You might want to increase the number of iterations. Duality gap: 1.0694342417189273e+37, tolerance: 3.5278273975282805e+33


Objective did not converge. You might want to increase the number of iterations. Duality gap: 9.958111851373623e+36, tolerance: 3.5278273975282805e+33

100%|██████████| 5/5 [01:13<00:00, 14.61s/it]

CPU times: user 20.3 s, sys: 1.35 s, total: 21.6 s
Wall time: 1min 13s





In [17]:
# every *_scores entry holds the pair (actual times, predicted times) of a kernel
keys = list(oclman_scores.keys())

oclman_r2 = [r2_score(*oclman_scores[k]) for k in keys]
oclbase_r2 = [r2_score(*oclbase_scores[k]) for k in keys]

# RMSE is computed as the square root of the MSE instead of passing
# squared=False, which recent scikit-learn releases no longer accept
oclman_rmse = [np.sqrt(mean_squared_error(*oclman_scores[k])) for k in keys]
oclbase_rmse = [np.sqrt(mean_squared_error(*oclbase_scores[k])) for k in keys]

pd.DataFrame(
    columns=['OCLMan R2 score', 'OCLBase R2 score', 'OCLMan RMS error', 'OCLBase RMS error'],
    data=list(zip(oclman_r2, oclbase_r2, oclman_rmse, oclbase_rmse)),
    # label every row with its kernel instead of a bare position
    index=['/'.join(k) for k in keys]
)

Unnamed: 0,OCLMan R2 score,OCLBase R2 score,OCLMan RMS error,OCLBase RMS error
0,0.902958,0.8998345,1.838034,1.867378
1,-98.151065,-124.6192,176.792961,198.995943
2,0.467303,-1970.361,19.129077,1163.689462
3,0.467474,-17300950.0,3.405769,19412.417732
4,0.792912,-200.5667,12.833066,400.370287


In [18]:
# one row per test kernel; left column OCLMan, right column OCLBase
panels = (('OCLMan', oclman_scores), ('OCLBase', oclbase_scores))

fig = make_subplots(
    rows=len(keys), cols=len(panels),
    # titles are listed row by row and say which kernel/predictor a panel shows
    subplot_titles=[
        f'{"/".join(k)} - {label} (R2 = {r2_score(*scores[k]):.3f})'
        for k in keys
        for label, scores in panels
    ]
)

for row, k in enumerate(keys, start=1):
    for col, (label, scores) in enumerate(panels, start=1):
        y_true, y_pred = (np.asarray(values).reshape(-1) for values in scores[k])
        sample_idx = np.arange(len(y_true))
        # a single pair of legend entries (from the very last panel) is enough
        in_legend = row == len(keys) and col == len(panels)
        fig.add_trace(
            go.Scatter(x=sample_idx, y=y_true, marker_color='indianred', name='actual', showlegend=in_legend),
            row=row, col=col
        )
        fig.add_trace(
            go.Scatter(x=sample_idx, y=y_pred, marker_color='darkblue', name='predicted', showlegend=in_legend),
            row=row, col=col
        )

fig.update_layout(
    autosize=False,
    # keep the height of every row constant, whatever the number of test kernels
    height=800 * len(keys),
    width=1200
)

fig.show()