## Run the executable to measure its runtime

In [None]:
%%writefile exe.pl
#!/usr/bin/env perl

#Author: Xingfu Wu
#MCS, ANL
# exe.pl: average the execution time in 5 runs
#
use Time::HiRes qw(gettimeofday); 

# For every executable named on the command line: run it $nmax times with
# stdout/stderr discarded, and print the mean wall-clock time in seconds
# (four decimals, no trailing newline).
foreach $filename (@ARGV) {
   $nmax = 5;
   $ssum = 0.0;
   foreach my $run (1..$nmax) {
      # scalar-context gettimeofday() yields floating-point seconds
      my $begin = gettimeofday();
      system("$filename >/dev/null 2>&1");
      my $finish = gettimeofday();
      $ssum += $finish - $begin;
   }
   printf("%.4f", $ssum / $nmax);
}

### Make the previous script executable

In [None]:
%%sh
# plopper.py invokes exe.pl directly, so it needs the execute bit
chmod a+x exe.pl

## Parameterized Code

In [None]:
%%writefile mmm_block.cpp
#include <stdlib.h>
#include <assert.h>

#define N MATRIX_SIZE

// Blocked (tiled) N x N integer matrix multiply, C += A * B.
// MATRIX_SIZE (via N) and BLOCK_SIZE are supplied on the compiler command line.
// The j and k dimensions are tiled in steps of BLOCK_SIZE; the parts of both
// dimensions that do not fill a whole tile are handled by separate loops.
// With -DCHECK the result is verified once against a naive triple loop;
// without it the kernel is repeated 10 times and nothing is verified.
// NOTE(review): without CHECK, a, b and c are never initialized before the
// kernel reads them, so the non-CHECK build operates on indeterminate values.
// NOTE(review): all matrices are local arrays of main(), so large MATRIX_SIZE
// values need a correspondingly large stack -- confirm the limits in use.
int main(int argc, const char** argv)
{

  // n = largest multiple of BLOCK_SIZE that is <= N (0 when BLOCK_SIZE > N)
  int n = BLOCK_SIZE * (N/BLOCK_SIZE);
  int a[N][N];
  int b[N][N];
  int c[N][N];
#ifdef CHECK
  // Reference result: random inputs, c zeroed, cref computed naively
  int cref[N][N];
  for(int i = 0; i < N; i++)
    for(int j = 0; j < N; j++) {
        a[i][j] = rand();
        b[i][j] = rand();
    }
  for(int i = 0; i < N; i++)
    for(int j = 0; j < N; j++) {
      c[i][j] = 0;
      cref[i][j] = 0;
      for(int k = 0; k < N; k++)
        cref[i][j] += a[i][k] * b[k][j];
    }
#endif
  int sum=0;
  
#ifndef CHECK
  // Repeat so measure is long enough
  for(int repeat = 0; repeat < 10; repeat++) {
#endif
    // For all blocks in j
    for(int j1 = 0; j1 < n; j1 += BLOCK_SIZE) {
      // Multiply by all blocks in k
      for(int k1 = 0; k1 < n; k1 += BLOCK_SIZE)
        for(int i = 0; i < N; i++)
          for(int j = j1; j < j1 + BLOCK_SIZE; j++) {
            // accumulate this k-tile's contribution on top of c[i][j]
            sum = c[i][j];
            for(int k = k1; k < k1 + BLOCK_SIZE; k++)               
              sum += a[i][k] * b[k][j];
            c[i][j] = sum;
          }
      // and by the remainder in k dimension
      for(int i = 0; i < N; i++)
        for(int j = j1; j < j1 + BLOCK_SIZE; j++) {
            sum = c[i][j];
            for(int k = n; k < N; k++)
              sum += a[i][k] * b[k][j];
            c[i][j] = sum;
        }
    }
    // C[0...N][0..n] is already done here
    // Need to compute C[0...N][n...N]
    // First the full k-tiles for the leftover columns j in [n, N) ...
    for(int k1 = 0; k1 < n; k1 += BLOCK_SIZE)
      for(int i = 0; i < N; i++)
        for(int j = n; j < N; j++) {
          sum = c[i][j];
          for(int k = k1; k < k1 + BLOCK_SIZE; k++)
            sum += a[i][k] * b[k][j];
          c[i][j] = sum;
        }
    // ... then the leftover k range [n, N) for those same columns
    for(int i = 0; i < N; i++)
      for(int j = n; j < N; j++) {
        sum = c[i][j];
        for(int k = n; k < N; k++)
          sum += a[i][k] * b[k][j];
        c[i][j] = sum;
      }
#ifndef CHECK
  }
#endif
    
#ifdef CHECK
  // The blocked result must match the naive reference element by element
  for(int i = 0; i < N; i++)
    for(int j = 0; j < N; j++)
      assert(c[i][j] == cref[i][j]);
#endif
  return 0;
}


## Use the selected configuration to generate new code, then compile and execute it

In [None]:
%%writefile plopper.py
import os, sys, subprocess, random, uuid

class Plopper:
    """Compile the blocked matrix-multiply kernel for one configuration and time it.

    ``sourcefile`` is only used to locate the kernel directory, which must
    contain ``mmm_block.cpp`` and the executable timing script ``exe.pl``.
    Compiled binaries are written to ``<outputdir>/tmp_files``.
    """

    def __init__(self,sourcefile,outputdir):
        """Remember the kernel location and make sure the scratch directory exists."""
        self.sourcefile = sourcefile
        self.outputdir = outputdir+"/tmp_files"

        # exist_ok avoids the check-then-create race when several evaluators
        # construct a Plopper at the same time.
        os.makedirs(self.outputdir, exist_ok=True)

    def createDict(self, x, params):
        """Return a dict mapping each label in ``params`` to the matching value in ``x``."""
        return dict(zip(params, x))

    def findRuntime(self, x, params):
        """Compile the kernel with the given configuration and return its runtime.

        Args:
            x: parameter values, in the same order as ``params``.
            params: parameter labels; must include ``'BLOCK_SIZE'`` and
                ``'MATRIX_SIZE'``.

        Returns:
            The average runtime in seconds printed by ``exe.pl``. The value 1
            is returned instead when compilation fails or when the measured
            time is exactly 0.

        Raises:
            ValueError: if the output of ``exe.pl`` is not a number.
            OSError: if ``exe.pl`` cannot be executed.
        """
        failure_cost = 1
        dictVal = self.createDict(x, params)

        # dirname() also copes with a bare file name (no '/'), for which the
        # kernel directory is the current directory.
        kernel_dir = os.path.dirname(self.sourcefile) or os.curdir
        tmpbinary = self.outputdir + '/tmp_'+str(uuid.uuid4())+'.bin'

        # Argument lists (no shell) so paths containing spaces or shell
        # metacharacters are passed through untouched.
        gcc_cmd = [
            'g++',
            os.path.join(kernel_dir, 'mmm_block.cpp'),
            '-D{0}={1}'.format('BLOCK_SIZE', dictVal['BLOCK_SIZE']),
            '-D{0}={1}'.format('MATRIX_SIZE', dictVal['MATRIX_SIZE']),
            '-o', tmpbinary,
        ]
        run_cmd = [os.path.join(kernel_dir, 'exe.pl'), tmpbinary]

        try:
            try:
                compilation_status = subprocess.run(gcc_cmd, stderr=subprocess.PIPE)
            except OSError as err:
                # Compiler missing or not executable: same outcome as a failed build.
                print(err)
                print("compile failed")
                return failure_cost

            if compilation_status.returncode != 0:
                print(compilation_status.stderr.decode('utf-8', errors='replace'))
                print("compile failed")
                return failure_cost

            execution_status = subprocess.run(run_cmd, stdout=subprocess.PIPE)
            output = execution_status.stdout.decode('utf-8')
            try:
                exetime = float(output)
            except ValueError as err:
                raise ValueError(
                    'exe.pl did not report a runtime (output was %r)' % output
                ) from err
            # A zero measurement is replaced by the same fallback cost.
            return exetime if exetime != 0 else failure_cost
        finally:
            # Each evaluation builds a uniquely named binary; remove it so the
            # scratch directory does not grow with every evaluation.
            try:
                os.remove(tmpbinary)
            except FileNotFoundError:
                pass  # nothing was produced (compilation failed)

## Define a simple search space and how to evaluate it

In [None]:
%%writefile problem.py
import numpy as np
from autotune import TuningProblem
from autotune.space import *
import os, sys, time, json, math
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from skopt.space import Real, Integer, Categorical

# Put the directory containing this file (not the file itself) on sys.path,
# so that `plopper` can be imported regardless of the working directory.
sys.path.insert(1, os.path.dirname(os.path.abspath(__file__)))
from plopper import Plopper

# Matrix size of the task being tuned, taken from the environment as a string.
# NOTE(review): os.getenv returns None when MATRIX_SIZE is unset, which would
# then be passed to the compiler as -DMATRIX_SIZE=None -- confirm callers
# always export it.
matrix_size = os.getenv('MATRIX_SIZE')

# create an object of ConfigSpace
cs = CS.ConfigurationSpace(seed=1234)
# BLOCK_SIZE: tile size used by the blocked matrix multiply in mmm_block.cpp
# (earlier ordinal variant kept for reference)
# p0= CSH.OrdinalHyperparameter(name='BLOCK_SIZE', sequence=['1','2','3','4','5','6','7','8','9','10'], default_value='5')
p0= CSH.UniformIntegerHyperparameter(name='BLOCK_SIZE', lower=1, upper=100, default_value=5)
cs.add_hyperparameters([p0])

# problem space
task_space = None
input_space = cs
# single objective: a non-negative real named "time"
output_space = Space([
     Real(0.0, inf, name="time")
])

# Directory of this file; the kernel source is expected next to it
dir_path = os.path.dirname(os.path.realpath(__file__))
kernel_idx = dir_path.rfind('/')
kernel = dir_path[kernel_idx+1:]  # last path component of dir_path
obj = Plopper(dir_path+'/mmm_block.cpp',dir_path)

# names of the tunable parameters, in the order myobj reads them
x1=['BLOCK_SIZE']
def myobj(point: dict):
    """Evaluate one configuration and return its measured cost.

    ``point`` must contain ``'BLOCK_SIZE'``. The value is checked for
    NaN/Inf (``ValueError``), the configuration is compiled and timed via
    ``obj.findRuntime`` together with the module-level ``matrix_size``, and
    both the configuration and the result are echoed to stdout.
    """
    # Same validation as before: build the array, then reject NaN / Inf.
    np.asarray_chkfinite(np.array([point['BLOCK_SIZE']]))

    value = [point[x1[0]], matrix_size]
    print('CONFIG:',point)
    results = obj.findRuntime(value, ['BLOCK_SIZE', 'MATRIX_SIZE'])

    print('OUTPUT:%f' % results)
    return results

# Tuning problem handed to the search (referenced as `problem.Problem`):
# configurations are drawn from input_space and scored by myobj.
Problem = TuningProblem(
    task_space=None,
    input_space=input_space,
    output_space=output_space,
    objective=myobj,
    constraints=None,
    model=None
    )

## Use ytopt to autotune this problem under the ytopt conda environment

In [None]:
%%sh
# Source task 1: tune BLOCK_SIZE for MATRIX_SIZE=100 (30 evaluations, --learner RF),
# then keep the results and log under size-specific names.
MATRIX_SIZE=100 python -m ytopt.search.ambs --evaluator ray --problem problem.Problem --max-evals=30 --learner RF --set-KAPPA 1.96 --acq-func gp_hedge #--set-SEED 1234 
mv results.csv results_rf_100.csv
mv ytopt.log ytopt_rf_100.log

In [None]:
%%sh
# Source task 2: same search for MATRIX_SIZE=200; outputs renamed accordingly.
MATRIX_SIZE=200 python -m ytopt.search.ambs --evaluator ray --problem problem.Problem --max-evals=30 --learner RF --set-KAPPA 1.96 --acq-func gp_hedge #--set-SEED 3579 
mv results.csv results_rf_200.csv
mv ytopt.log ytopt_rf_200.log

In [None]:
%%sh
# Source task 3: same search for MATRIX_SIZE=300; outputs renamed accordingly.
MATRIX_SIZE=300 python -m ytopt.search.ambs --evaluator ray --problem problem.Problem --max-evals=30 --learner RF --set-KAPPA 1.96 --acq-func gp_hedge 
mv results.csv results_rf_300.csv
mv ytopt.log ytopt_rf_300.log

## Autotuning target task online
Next, we describe a standalone script that autotunes a target task online.

In [None]:
%%writefile Run_online_TL.py
import warnings
warnings.filterwarnings("ignore")
import numpy as np
from autotune import TuningProblem
from autotune.space import *
import os, sys, time, json, math
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from skopt.space import Real, Integer, Categorical
import csv, time 
from csv import writer
from csv import reader

# Put the directory containing this file (not the file itself) on sys.path,
# so that `plopper` can be imported regardless of the working directory.
sys.path.insert(1, os.path.dirname(os.path.abspath(__file__)))
from plopper import Plopper
import pandas as pd
from sdv.tabular import GaussianCopula
from sdv.tabular import CopulaGAN
from sdv.evaluation import evaluate
from sdv.constraints import CustomConstraint, Between
import random, argparse

# Command-line interface of the online transfer-learning driver.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--max_evals', type=int, default=10,
    help='maximum number of evaluations')
parser.add_argument(
    '--n_refit', type=int, default=0,
    help='refit the model')
parser.add_argument(
    '--seed', type=int, default=1234,
    help='set seed')
parser.add_argument(
    '--top', type=float, default=0.1,
    help='how much to train')
parser.add_argument(
    '--target', type=int, default=400,
    help='target task')
args = parser.parse_args()

# argparse has already applied the int/float conversions; only the target
# task is kept as a string.
MAX_EVALS, N_REFIT = args.max_evals, args.n_refit
TOP, RANDOM_SEED = args.top, args.seed
TARGET_task = str(args.target)
print ('max_evals',MAX_EVALS, 'number of refit', N_REFIT, 'how much to train', TOP, 'seed', RANDOM_SEED, 'target task', TARGET_task)

# Seed both random number generators from the same value.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Reference point for the elapsed-time column of the results file.
Time_start = time.time()
print ('time...now', Time_start)

 - Define the objective function myobj to evaluate a point in the search space.

In [None]:
%%writefile -a Run_online_TL.py
# Directory of this script; the kernel source and the source-task result
# files are looked up relative to it.
dir_path = os.path.dirname(os.path.realpath(__file__))
kernel_idx = dir_path.rfind('/')
kernel = dir_path[kernel_idx+1:]  # last path component of dir_path
obj = Plopper(dir_path+'/mmm_block.cpp',dir_path)


# names of the tunable parameters, in the order myobj reads them
x1=['BLOCK_SIZE']
def myobj(point: dict):
    """Evaluate one configuration on the target task and return its cost.

    ``point`` must contain ``'BLOCK_SIZE'``. The value is checked for
    NaN/Inf (``ValueError``), the configuration is compiled and timed via
    ``obj.findRuntime`` with ``TARGET_task`` as the matrix size, and both
    the configuration and the result are echoed to stdout.
    """
    # Same validation as before: build the array, then reject NaN / Inf.
    np.asarray_chkfinite(np.array([point['BLOCK_SIZE']]))

    value = [point[x1[0]], TARGET_task]
    print('CONFIG:',point)
    results = obj.findRuntime(value, ['BLOCK_SIZE', 'MATRIX_SIZE'])

    print('OUTPUT:%f' % results)
    return results

 - Load data from source tasks.

In [None]:
%%writefile -a Run_online_TL.py
#### select the best top x% of every source task
X_opt = []
cutoff_p = TOP
print ('----------------------------- how much data to use?', cutoff_p) 
param_names = x1
n_param = len(param_names)
frames = []
for i_size in ('100', '200', '300'):
    source_df = pd.read_csv(dir_path+"/results_rf_"+str(i_size)+".csv")
    # work with log(run time) and tag every row with its matrix size
    source_df['runtime'] = np.log(source_df['objective'])
    source_df['input'] = int(i_size)
    # keep only the rows at or below the requested runtime quantile
    threshold = np.quantile(source_df.runtime.values, cutoff_p)
    is_top = source_df['runtime'] <= threshold
    frames.append(source_df.loc[is_top].drop(columns=['elapsed_sec', 'objective']))
real_data   = pd.concat(frames)

# Keep sampled matrix sizes within [1, 500]
constraint_input = Between(
    column='input',
    low=1,
    high=500,
    )

# Keep sampled block sizes within [1, 100]
constraint_block = Between(
    column='BLOCK_SIZE',
    low=1,
    high=100,
    )

# Generative model over (input, BLOCK_SIZE, runtime); here runtime is the
# log of the measured objective, as computed when the source data is loaded.
model = GaussianCopula(
            field_names = ['input','BLOCK_SIZE','runtime'],    
            field_transformers = {'input': 'integer',
                                  'BLOCK_SIZE': 'integer',
                                  'runtime': 'float'},
            constraints=[constraint_input, constraint_block]
    )

 - Fit the generative model, then evaluate the configurations it suggests.

In [None]:
%%writefile -a Run_online_TL.py
# Online loop: fit the generative model, sample candidate block sizes for the
# target matrix size, evaluate the most promising ones, feed the measurements
# back into the training data and refit.  One CSV row is written per evaluation.
filename = "results_sdv.csv"
fields   = ['BLOCK_SIZE','exe_time','elapsed_sec']

evals_infer = []
Max_evals = MAX_EVALS
eval_master = 0
# Evaluations between two refits.  --n_refit 0 (the default) used to make the
# loop spin forever without evaluating anything; it now means "never refit",
# i.e. a single batch covering the whole budget.
max_evals = N_REFIT if N_REFIT > 0 else Max_evals

# newline='' is required for files handed to csv.writer
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)

    while eval_master < Max_evals:
        # update model
        model.fit(real_data)
        conditions = {'input': int(TARGET_task)}
        ss1 = model.sample(max(100,Max_evals),conditions=conditions)
        ss1 = ss1.drop_duplicates(subset='BLOCK_SIZE', keep="first")
        # most promising (lowest predicted log-runtime) candidates first
        new_sdv = ss1.sort_values(by='runtime')[:Max_evals]
        if new_sdv.empty:
            # Without candidates no progress is possible; fail instead of looping.
            raise RuntimeError('the generative model produced no candidate configurations')

        # Never exceed the overall budget, even when n_refit does not divide it.
        # If fewer unique candidates are available, evaluate those and refit.
        batch = min(max_evals, Max_evals - eval_master)
        for _, row in new_sdv.head(batch).iterrows():
            block_size = int(row['BLOCK_SIZE'])
            sample_point = {x1[0]: block_size}
            res          = myobj(sample_point)
            evals_infer.append(res)
            elapsed = time.time() - Time_start
            csvwriter.writerow([block_size, res, elapsed])
            csvfile.flush()  # keep the file usable if the run is interrupted

            # Append the measurement (as log-runtime) to the training data,
            # matching values to columns by name rather than by position.
            measured = {
                'BLOCK_SIZE': block_size,
                'runtime': float(np.log(res)),
                'input': row['input'],
            }
            real_data.loc[max(real_data.index)+1] = [measured[col] for col in real_data.columns]
            eval_master += 1

In [None]:
%%sh
# Target MATRIX_SIZE=500: 10 evaluations, n_refit equal to the budget,
# trained on the best 30% of each source run.
python Run_online_TL.py --max_evals 10 --n_refit 10 --target 500 --top 0.3
mv results_sdv.csv results_sdv_500.csv

In [None]:
import pandas as pd

# Show the evaluations recorded by the run without intermediate refits
pd.read_csv("results_sdv_500.csv")

In [None]:
%%sh
# Same target and budget, but with --n_refit 5 (refit after every 5 evaluations).
python Run_online_TL.py --max_evals 10 --n_refit 5 --target 500 --top 0.3
mv results_sdv.csv results_sdv_500_refit.csv

In [None]:
# Show the evaluations recorded by the run that refits every 5 evaluations
pd.read_csv("results_sdv_500_refit.csv")