In [None]:
%%writefile exe.pl
#!/usr/bin/env perl

#Author: Xingfu Wu
#MCS, ANL
# exe.pl: average the execution time in 5 runs
#
use Time::HiRes qw(gettimeofday); 

# For each command named on the command line: run it $runs times through the
# shell with stdout/stderr discarded, then print the mean wall-clock time in
# seconds with three decimals (no trailing newline or separator).
for my $command (@ARGV) {
    my $runs    = 5;
    my $elapsed = 0.0;

    for my $run (1 .. $runs) {
        # gettimeofday in scalar context yields fractional seconds
        my $start = gettimeofday();
        system("$command >/dev/null 2>&1");
        my $stop = gettimeofday();
        $elapsed += $stop - $start;
    }

    my $mean = $elapsed / $runs;
    printf("%.3f", $mean);
}


In [None]:
%%sh
# Mark the timing wrapper as executable: plopper.py launches it directly as <dir>/exe.pl
chmod a+x exe.pl

In [None]:
%%writefile mmm_block.cpp: parameterized code
#include <stdlib.h>
#include <assert.h>

#define N 100

int main(int argc, const char** argv)
{
  // Blocked (tiled) N x N integer matrix multiply, c += a * b.
  // BLOCK_SIZE must be supplied at compile time (e.g. -DBLOCK_SIZE=4).
  // With -DCHECK the blocked result is verified once against a naive triple
  // loop; without it the multiply is repeated 10 times so the run is long
  // enough to be timed.

  // n is the largest multiple of BLOCK_SIZE that is <= N; indices in [n, N)
  // are the remainder that does not fill a complete block.
  int n = BLOCK_SIZE * (N/BLOCK_SIZE);
  int a[N][N];
  int b[N][N];
  int c[N][N];

  // Always initialize the operands and the accumulator: reading indeterminate
  // ints is undefined behaviour, also in the timing-only build.
  // Entries are kept below 100 so that no product or running sum can overflow
  // int (worst case 10 repeats * 100 terms * 99 * 99 < 2^31).
  for(int i = 0; i < N; i++)
    for(int j = 0; j < N; j++) {
        a[i][j] = rand() % 100;
        b[i][j] = rand() % 100;
        c[i][j] = 0;
    }
#ifdef CHECK
  // Reference result computed with the straightforward i-j-k loop nest
  int cref[N][N];
  for(int i = 0; i < N; i++)
    for(int j = 0; j < N; j++) {
      cref[i][j] = 0;
      for(int k = 0; k < N; k++)
        cref[i][j] += a[i][k] * b[k][j];
    }
#endif
  int sum=0;
  
#ifndef CHECK
  // Repeat so measure is long enough
  for(int repeat = 0; repeat < 10; repeat++) {
#endif
    // For all blocks in j
    for(int j1 = 0; j1 < n; j1 += BLOCK_SIZE) {
      // Multiply by all blocks in k
      for(int k1 = 0; k1 < n; k1 += BLOCK_SIZE)
        for(int i = 0; i < N; i++)
          for(int j = j1; j < j1 + BLOCK_SIZE; j++) {
            sum = c[i][j];
            for(int k = k1; k < k1 + BLOCK_SIZE; k++)               
              sum += a[i][k] * b[k][j];
            c[i][j] = sum;
          }
      // and by the remainder in k dimension
      for(int i = 0; i < N; i++)
        for(int j = j1; j < j1 + BLOCK_SIZE; j++) {
            sum = c[i][j];
            for(int k = n; k < N; k++)
              sum += a[i][k] * b[k][j];
            c[i][j] = sum;
        }
    }
    // C[0...N][0..n] is already done here
    // Need to compute C[0...N][n...N]
    for(int k1 = 0; k1 < n; k1 += BLOCK_SIZE)
      for(int i = 0; i < N; i++)
        for(int j = n; j < N; j++) {
          sum = c[i][j];
          for(int k = k1; k < k1 + BLOCK_SIZE; k++)
            sum += a[i][k] * b[k][j];
          c[i][j] = sum;
        }
    // Remainder columns times remainder k range
    for(int i = 0; i < N; i++)
      for(int j = n; j < N; j++) {
        sum = c[i][j];
        for(int k = n; k < N; k++)
          sum += a[i][k] * b[k][j];
        c[i][j] = sum;
      }
#ifndef CHECK
  }
#endif
    
#ifdef CHECK
  // The blocked product must match the naive reference element by element
  for(int i = 0; i < N; i++)
    for(int j = 0; j < N; j++)
      assert(c[i][j] == cref[i][j]);
#endif
  return 0;
}

In [None]:
%%writefile plopper.py: use the selected configuration to generate a new code, compile and execute it
import os, sys, subprocess, random, uuid

class Plopper:
    """Compile and time the mmm_block.cpp kernel for one parameter configuration.

    The kernel source and the exe.pl timing script are looked up in the
    directory of ``sourcefile``; compiled binaries are written to
    ``<outputdir>/tmp_files`` and removed again after each measurement.
    """

    def __init__(self,sourcefile,outputdir):
        """Store the kernel location and make sure the scratch directory exists.

        sourcefile: path of the kernel source; only its directory is used.
        outputdir:  directory under which ``tmp_files`` is created.
        """
        # Initializing instance attributes
        self.sourcefile = sourcefile
        self.outputdir = outputdir+"/tmp_files"

        # exist_ok avoids a failure when several workers create the directory at once
        os.makedirs(self.outputdir, exist_ok=True)

    def createDict(self, x, params):
        """Return a dict mapping each label in ``params`` to the matching value in ``x``."""
        return dict(zip(params, x))

    def findRuntime(self, x, params):
        """Compile the kernel with the given configuration and return its cost.

        x:      parameter values, in the same order as ``params``.
        params: parameter labels; must include 'BLOCK_SIZE'.

        Returns the number printed by exe.pl as a float. Returns 1 instead when
        compilation fails, when the timing script cannot be run or prints
        something that is not a number, or when the reported time is 0.
        Raises KeyError if 'BLOCK_SIZE' is not among ``params``.
        """
        exetime = 1  # cost reported for every failure case

        dictVal = self.createDict(x, params)

        # A unique name per call so that concurrent evaluations do not collide
        tmpbinary = self.outputdir + '/tmp_'+str(uuid.uuid4())+'.bin'
        # abspath keeps a bare file name (no directory part) resolvable
        kernel_dir = os.path.dirname(os.path.abspath(self.sourcefile))
        # Argument lists (no shell) so that unusual characters in paths are passed through verbatim
        gcc_cmd = [
            'g++',
            os.path.join(kernel_dir, 'mmm_block.cpp'),
            '-D{0}={1}'.format('BLOCK_SIZE', dictVal['BLOCK_SIZE']),
            '-o', tmpbinary,
        ]
        run_cmd = [os.path.join(kernel_dir, 'exe.pl'), tmpbinary]

        try:
            # Compile; a missing compiler is reported like any other build failure
            try:
                compilation_status = subprocess.run(gcc_cmd, stderr=subprocess.PIPE)
            except OSError as err:
                print(err)
                print("compile failed")
                return exetime

            if compilation_status.returncode != 0:
                print(compilation_status.stderr.decode('utf-8', errors='replace'))
                print("compile failed")
                return exetime

            # Time the binary; unusable output must not abort the whole search
            try:
                execution_status = subprocess.run(run_cmd, stdout=subprocess.PIPE)
                measured = float(execution_status.stdout.decode('utf-8'))
            except (OSError, ValueError) as err:
                print(err)
                print("execution failed")
                return exetime

            # A reported time of exactly 0 is replaced by the failure cost
            if measured != 0:
                exetime = measured
            return exetime #return execution time as cost
        finally:
            # The binary is only needed for this one measurement
            if os.path.exists(tmpbinary):
                os.remove(tmpbinary)

In [None]:
%%writefile problem.py: define a simple search space and how to evaluate it
import numpy as np
from autotune import TuningProblem
from autotune.space import *
import os, sys, time, json, math
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
from skopt.space import Real, Integer, Categorical

# Put the directory that contains this file on the module search path so that
# the sibling plopper module can be imported regardless of the working directory.
# (sys.path entries must be directories; the path of the file itself has no effect.)
sys.path.insert(1, os.path.dirname(os.path.abspath(__file__)))
from plopper import Plopper

# create an object of ConfigSpace (fixed seed for the space's own sampling)
cs = CS.ConfigurationSpace(seed=1234)
# BLOCK_SIZE is the tile size handed to mmm_block.cpp at compile time (-DBLOCK_SIZE=<value>)
# Alternative ordinal encoding of the same parameter (disabled):
# p0= CSH.OrdinalHyperparameter(name='BLOCK_SIZE', sequence=['1','2','3','4','5','6','7','8','9','10'], default_value='5')
p0= CSH.UniformIntegerHyperparameter(name='BLOCK_SIZE', lower=1, upper=10, default_value=5)
cs.add_hyperparameters([p0])

# problem space
task_space = None
# the tunable inputs are exactly the hyperparameters registered in cs
input_space = cs
# single non-negative real output named "time"
# NOTE(review): `inf` is presumably provided by one of the star/explicit imports above - confirm
output_space = Space([
     Real(0.0, inf, name="time")
])

# Directory containing this file (symlinks resolved); mmm_block.cpp is addressed relative to it
dir_path = os.path.dirname(os.path.realpath(__file__))
# kernel = last path component of dir_path, i.e. the name of that directory
kernel_idx = dir_path.rfind('/')
kernel = dir_path[kernel_idx+1:]
# Plopper instance used by the objective; dir_path is passed both as the kernel location and as the output directory
obj = Plopper(dir_path+'/mmm_block.cpp',dir_path)

x1=['BLOCK_SIZE']
def myobj(point: dict):
    """Objective function: evaluate one configuration and return its cost.

    point: configuration dict; must contain the key 'BLOCK_SIZE'.

    Prints the configuration and the resulting cost, and returns whatever
    ``obj.findRuntime`` reports for it. Raises ValueError when the
    BLOCK_SIZE value is a NaN or infinite float.
    """
    block_size = point['BLOCK_SIZE']

    # ValueError if any NaN or Inf
    np.asarray_chkfinite(np.array([block_size]))

    labels = ["BLOCK_SIZE"]
    values = [point[x1[0]]]
    print('CONFIG:',point)
    cost = obj.findRuntime(values, labels)

    print('OUTPUT:%f' % cost)
    return cost

# Tuning problem handed to the search (referenced as problem.Problem on the
# ytopt command line): inputs come from the ConfigSpace defined above, the
# single output is the measured time, and myobj evaluates one configuration.
Problem = TuningProblem(
    task_space=None,
    input_space=input_space,
    output_space=output_space,
    objective=myobj,
    constraints=None,
    model=None
    )

In [None]:
%% use ytopt to autotune this problem under the ytopt conda environment
# Launch the ytopt search on the Problem object defined in problem.py, limited to 5 evaluations (--max-evals=5)
python -m ytopt.search.ambs --evaluator ray --problem problem.Problem --max-evals=5 --learner RF