# Benchmark Analysis

In [1]:
# Install the requirements with the interpreter that runs this kernel (sys.executable),
# so the packages are installed for the same Python that executes the notebook.
import sys
!{sys.executable} -m pip install -r requirements.txt

You should consider upgrading via the '/Users/diego/Documents/Studium/4_Semester/Masterarbeit/master_benchmarks/analysis/venv/bin/python -m pip install --upgrade pip' command.


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from pathlib import Path
from typing import List

% matplotlib inline

In [3]:
# Root directory of the analysis input (presumably the collected benchmark results - confirm).
# Set the BENCHMARK_ROOT_DIR environment variable to use another location; the original
# machine-specific path remains the fallback so existing setups keep working.
root_dir = os.environ.get('BENCHMARK_ROOT_DIR', '/Users/diego/Desktop/BENCHMARK_COLLECTED_NEBULAH_B1-B4_1')

## Utils 

In [4]:
def get_path(*entries):
    """
    Joins the given path components into a single path.
    :param entries: path components, handed to os.path.join in the given order
    :return: the joined path
    """
    joined_path = os.path.join(*entries)
    return joined_path


In [5]:
def ensure_file_existence(output_filename):
    """
    Checks whether the path to the file exists. If not, it creates the folder structure and the final file.
    :param output_filename: path to the file
    :return: None
    """

    output_path = Path(output_filename)
    # exist_ok=True makes this safe when the folders already exist and avoids the race between
    # a separate existence check and the creation
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.touch(exist_ok=True)  # creates the file; an already existing file keeps its content

In [6]:
def _split_benchmark_name(full_name):
    """
    Splits a row name of the form ``<benchmark>/<n>_<statistic>`` into its three parts.
    :param full_name: value of the ``name`` column
    :return: tuple ``(benchmark, n, statistic)`` of strings
    """
    benchmark, _, tail = full_name.partition('/')
    size, _, statistic = tail.rpartition('_')
    return benchmark, size, statistic


def extraction_pandas_frame_algo(path):
    """
    Loads a benchmark result CSV and condenses its aggregate rows into one row per problem size.

    Only rows whose name ends in ``mean``, ``median`` or ``stddev`` are used, and the names are expected to
    look like ``<benchmark>/<n>_<statistic>``. The statistics are matched by problem size and statistic name,
    so the order of the rows in the file does not matter. The file is expected to hold a single benchmark;
    if a problem size occurs several times, the first row per statistic is used.

    :param path: path to the CSV file
    :return: data frame sorted ascending by ``n``. Each row carries the remaining CSV columns of the mean row
             (name, real_time, cpu_time, time_unit for the expected input) with ``name`` reduced to the
             benchmark name, followed by ``n`` (int), ``median`` and ``stddev`` (real_time of the matching
             aggregate rows, NaN if missing) and ``Compiler`` (placeholder ``'TODO'``)
    :raises ValueError: if the part of a name between ``/`` and the last ``_`` is not an integer
    """
    raw = pd.read_csv(path)

    # dropping columns we do not care about
    raw = raw.drop(['iterations', 'bytes_per_second', 'items_per_second', 'label', 'error_occurred', 'error_message'],
                   axis=1)

    # keep only the aggregate rows; copy() because helper columns are added to the selection below
    is_aggregate = raw['name'].str.endswith(('mean', 'median', 'stddev'))
    stats = raw[is_aggregate].copy()
    value_columns = list(stats.columns)

    # split every name once into benchmark name, problem size and statistic
    parts = [_split_benchmark_name(full_name) for full_name in stats['name']]
    stats['name'] = [benchmark for benchmark, _, _ in parts]
    stats['n'] = np.array([int(size) for _, size, _ in parts], dtype=int)
    stats['_statistic'] = [statistic for _, _, statistic in parts]

    # one row per (problem size, statistic); the first occurrence in the file wins
    first_per_n = stats.drop_duplicates(subset=['n', '_statistic'], keep='first')

    # convert to format
    # name	real_time	cpu_time	time_unit	n	median	stddev	Compiler
    means = first_per_n[first_per_n['_statistic'] == 'mean']
    results = means[value_columns + ['n']].sort_values('n', ascending=True).reset_index(drop=True)

    # look the other statistics up by problem size instead of by row position
    for statistic in ('median', 'stddev'):
        real_time_by_n = first_per_n[first_per_n['_statistic'] == statistic].set_index('n')['real_time']
        results[statistic] = results['n'].map(real_time_by_n)

    results['Compiler'] = 'TODO'

    return results


# Nebulah all Core

TODO: system info

## H1
> Some parallel backends exhibit better performance and scalability when handling nested parallelism for homogeneous
> workloads
>

### Time 

#### Time Comparison - `b1_1_for_each_linear_par`
Check how the runtime develops with increasing input size when the number of threads is not constrained.

In [None]:
# load data gcc (b1_1_for_each_linear_par)

# load data nvhpc (b1_1_for_each_linear_par)

# plot

#### Time Comparison - `b1_2_for_each_quadratic_outer_std::execution::parallel_policy_par`
Check how the runtime develops with increasing input size when the number of threads is not constrained.

In [None]:
# load data gcc (b1_2_for_each_quadratic_outer_std::execution::parallel_policy_par)

# load data nvhpc (b1_2_for_each_quadratic_outer_std::execution::parallel_policy_par)

# plot

#### Time Comparison - `b1_4_for_each_exponential_par`
Check how the runtime develops with increasing input size when the number of threads is not constrained.

In [1]:
# load data gcc (b1_4_for_each_exponential_par)

# load data nvhpc (b1_4_for_each_exponential_par)

# plot

### Strong Scaling

`S(p)= T(1) / T(p)`

As the baseline `T(1)` we use, in turn:
* sequential algorithm 
* parallel algorithm (1 thread)

#### Strong Scaling - `b1_1_for_each_linear` 
1 Million fixed input size with threads 1-32

##### Seq Base

In [2]:
# load gcc (b1_1_for_each_linear_seq)
# load gcc threaded b1_1_for_each_linear_par 

# load nvhpc (b1_1_for_each_linear_seq)
# load nvhpc threaded b1_1_for_each_linear_par 

# plot strong scaling

##### Par(1) Base

In [3]:
# load gcc threaded b1_1_for_each_linear_par 

# load nvhpc threaded b1_1_for_each_linear_par 

# plot strong scaling

#### Strong Scaling - `b1_2_for_each_quadratic` 
1 Million fixed input size with threads 1-32

##### Seq Base

Here we won't use the sequential baseline because it's not really realistic.

##### Par(1) Base

In [None]:
# load gcc threaded b1_2_for_each_quadratic_outer_std::execution::parallel_policy_par 

# load nvhpc threaded b1_2_for_each_quadratic_outer_std::execution::parallel_policy_par 

# plot strong scaling

#### Strong Scaling - `b1_4_for_each_exponential` 
32 fixed input size with threads 1-32

##### Seq Base

In [None]:
# load gcc (b1_4_for_each_exponential_seq)
# load gcc threaded b1_4_for_each_exponential_par 

# load nvhpc (b1_4_for_each_exponential_seq)
# load nvhpc threaded b1_4_for_each_exponential_par 

# plot strong scaling

##### Par(1) Base

In [None]:
# load gcc threaded b1_4_for_each_exponential_par 

# load nvhpc threaded b1_4_for_each_exponential_par

# plot strong scaling

### Performance Portability Calculation (Inter Compiler)

For this group we can "calculate" a performance portability by looking at the strong-scaling speedup each compiler achieves when using the maximum number of cores (i.e. running with 1M entries on all cores) (inspired by [1]).

example:

```
|          | achieved | perfect | efficiency  | 
|----------|----------|---------|-------------|
| GCC(TBB) | 12       | 16      | 12/16=0.75  |
| NVC(OMP) | 16       | 16      | 16/16=1     |
| NVC(GPU) | 0        | 0       | 0           |
| Intel    | 14       | 16      | 14/16=0.875 |

```

Performance Portability for `{GCC(TBB), NVC(OMP), NVC(GPU), Intel}` = 0 (one platform has an efficiency of 0)

Performance Portability for `{GCC(TBB), NVC(OMP), Intel}` = `3 / ((1/0.75) + (1/1) + (1/0.875))` = 86.3%

In [None]:
# calculate efficiency for gcc on max core for `b1_1_for_each_linear`

# calculate efficiency for gcc on max core for `b1_2_for_each_quadratic`

# calculate efficiency for gcc on max core for `b1_4_for_each_exponential`


# calculate efficiency for nvhpc(mc) on max core for `b1_1_for_each_linear`

# calculate efficiency for nvhpc(mc) on max core for `b1_2_for_each_quadratic`

# calculate efficiency for nvhpc(mc) on max core for `b1_4_for_each_exponential`

### Findings for H1