# Part I - Devito Performance modes

This tutorial is the first in a series describing the code generated under the different modes of the Devito Loop Engine (DLE). In this tutorial we present the performance optimizations applied by the Devito compiler, including those of the DLE, which delivers optimizations for parallelism and cache locality.

For the purposes of this tutorial we will compare the generated code between several combinations of DLE modes.

We will use a trivial `Operator` that, at each time step, increments all points in the physical domain by 1, and we will compare the code produced in each case.

In [1]:
# This function will be used to print the difference between the generated code.
def _unidiff_output(expected, actual):
    """
    Helper function. Returns a string containing the unified diff of two multiline strings.
    """
    import difflib
    expected=expected.splitlines(1)
    actual=actual.splitlines(1)

    diff=difflib.unified_diff(expected, actual)

    return ''.join(diff)

In [2]:
from devito import clear_cache
# NOTE(review): `np` is not used in any of the cells visible here — confirm
# whether this import is still needed.
import numpy as np
# Presumably resets Devito's internal cache so the operators built below start
# from a clean state — confirm against the Devito documentation.
clear_cache()


In [3]:
from devito import Grid, TimeFunction, Eq, Operator, clear_cache
from examples.cfd import plot_field, init_hat
from devito import Eq, solve
from devito import configuration

# Problem setup: a 200 x 200 grid carrying a single time-dependent field `u`.
nx, ny = 200, 200
grid = Grid(shape=(nx, ny))
u = TimeFunction(name='u', grid=grid)
# NOTE(review): the right-hand side is a tuple of three expressions, whereas
# the tutorial text describes a plain "increment by 1" — confirm this is the
# intended equation.
eq = Eq(u.forward, (u + 0.1, u + 0.2, u + 0.1))

# Build the same equation twice: once with the DLE set to noop and once with
# the DLE set to advanced.
op_noop = Operator(eq, dle='noop')
op_advanced = Operator(eq, dle='advanced')

# Turn both operators into their generated-code listings ...
str_op_noop = str(op_noop)
str_op_advanced = str(op_advanced)

# ... and show what changes when going from noop to advanced.
print(_unidiff_output(str_op_noop, str_op_advanced))

--- 
+++ 
@@ -2,6 +2,8 @@
 #include "stdlib.h"
 #include "math.h"
 #include "sys/time.h"
+#include "xmmintrin.h"
+#include "pmmintrin.h"
 
 struct dataobj
 {
@@ -23,6 +25,9 @@
 int Kernel(struct dataobj *restrict u_vec, const int time_M, const int time_m, struct profiler * timers, const int x_M, const int x_m, const int y_M, const int y_m)
 {
   float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;
+  /* Flush denormal numbers to zero in hardware */
+  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
   for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))
   {
     struct timeval start_section0, end_section0;
@@ -30,6 +35,7 @@
     /* Begin section0 */
     for (int x = x_m; x <= x_M; x += 1)
     {
+      #pragma omp simd aligned(u:32)
       for (int y = y_m; y <= y_M; y += 1

The code diff in the cell above depicts some differences between these two modes.
First of all, we can notice the addition of 
```
+#include "xmmintrin.h"
+#include "pmmintrin.h"
```
and 
```
+  /* Flush denormal numbers to zero in hardware */
+  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
```
Denormals are normally flushed when using SSE-based instruction sets, except when compiling shared objects.


We can then see the addition of SIMD vectorization
```
+      #pragma omp simd aligned(u:32)
```
before the loop that iterates over the y-direction.

In our next comparison we import `mode_performance` from Devito and call it, which enables more optimizations; we then build an `Operator` with the DLE set to `speculative`:
TO ADD MORE

In [5]:
from devito import mode_performance
# Switch Devito to its performance mode; per the tutorial text this enables
# more optimizations.
mode_performance()

# Same equation as before, this time with the DLE set to speculative.
op_speculative = Operator(eq, dle='speculative')

# Regenerate the noop listing alongside the speculative one so both strings
# used in the comparison are produced in this cell.
str_op_noop = str(op_noop)
str_op_speculative = str(op_speculative)

# Show what changes when going from noop to speculative.
print(_unidiff_output(str_op_noop, str_op_speculative))

--- 
+++ 
@@ -19,26 +19,42 @@
   double section0;
 } ;
 
+void bf0(struct dataobj *restrict u_vec, const int t0, const int t1, const int x0_blk_M, const int x0_blk_m, const int x0_blk_size, const int y0_blk_M, const int y0_blk_m, const int y0_blk_size);
 
-int Kernel(struct dataobj *restrict u_vec, const int time_M, const int time_m, struct profiler * timers, const int x_M, const int x_m, const int y_M, const int y_m)
+int Kernel(struct dataobj *restrict u_vec, const int time_M, const int time_m, struct profiler * timers, const int x0_blk_size, const int x_M, const int x_m, const int y0_blk_size, const int y_M, const int y_m)
 {
-  float (*restrict u)[u_vec->size[1]][u_vec->size[2]] __attribute__ ((aligned (64))) = (float (*)[u_vec->size[1]][u_vec->size[2]]) u_vec->data;
   for (int time = time_m, t0 = (time)%(2), t1 = (time + 1)%(2); time <= time_M; time += 1, t0 = (time)%(2), t1 = (time + 1)%(2))
   {
     struct timeval start_section0, end_section0;
     gettimeofday(&start_section0

You can now see that the field computation happens inside blocked loops.
The size of the blocks is decided by the autotuner.