In [None]:
import os
# Directory the kernel was started in; the install trees configured below live under it.
pwd = os.getcwd()

# Paths to sources
# The defaults are the original machine-specific checkouts. Set BLASPP_SOURCE,
# LAPACKPP_SOURCE or TLAPACK_SOURCE in the environment to point elsewhere
# without editing the notebook (an empty value falls back to the default).
blaspp_source = os.environ.get("BLASPP_SOURCE") or "/home/weslleyp/storage/blaspp"
lapackpp_source = os.environ.get("LAPACKPP_SOURCE") or "/home/weslleyp/storage/lapackpp"
tlapack_source = os.environ.get("TLAPACK_SOURCE") or "/home/weslleyp/storage/tlapack"

# Paths to libraries
# Each *_DIR is used further down both as CMake build directory and as install prefix.
tlapack_DIR = pwd+"/tlapack"
tlapackLAPACK_DIR = pwd+"/tlapack_LAPACK"
blaspp_DIR = pwd+"/blaspp"
lapackpp_DIR = pwd+"/lapackpp"
# StarPU library directory; override with STARPU_LIB_DIR.
starpu_DIR = os.environ.get("STARPU_LIB_DIR") or "/home/weslleyp/storage/starpu/build/lib"

# Set environment variables for StarPU
# StarPU's pkg-config directory goes first so it takes precedence, but entries
# that were already on PKG_CONFIG_PATH are kept. Filtering out our own entry
# makes re-running this cell idempotent.
_starpu_pkgconfig = starpu_DIR+"/pkgconfig"
_other_pkgconfig = [
    entry
    for entry in os.environ.get("PKG_CONFIG_PATH", "").split(os.pathsep)
    if entry and entry != _starpu_pkgconfig
]
os.environ["PKG_CONFIG_PATH"] = os.pathsep.join([_starpu_pkgconfig] + _other_pkgconfig)
# Scheduler name read by StarPU at start-up.
os.environ["STARPU_SCHED"] = "dmdas"
# NOTE(review): presumably excludes hwloc's "gl" component — confirm against hwloc docs.
os.environ["HWLOC_COMPONENTS"] = "-gl"

# Load Python modules
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import display, Math
import time

In [None]:
# System: record the kernel name, release and architecture of the benchmark host
!uname -a

In [None]:
# Machine: record the CPU description reported by lscpu, for interpreting the timings
!lscpu

In [None]:
# Nvidia version: record the nvidia-smi report (driver and visible GPUs)
!nvidia-smi

In [None]:
# MKL version: only the location of mkl_link_tool on PATH is printed, no version string.
# NOTE(review): presumably the install path identifies the MKL release — confirm.
!which mkl_link_tool

In [None]:
# Build (plain configuration, without the LAPACK++ wrappers)

# Install <T>LAPACK
# Configure and install from tlapack_source. The same directory ($tlapack_DIR)
# serves as both the CMake build tree and the install prefix.
!cmake -B "$tlapack_DIR" -G Ninja -D CMAKE_BUILD_TYPE=Release -D BUILD_EXAMPLES=OFF -D BUILD_TESTING=OFF -D TLAPACK_NDEBUG=ON -D CMAKE_INSTALL_PREFIX="$tlapack_DIR" -D CMAKE_INSTALL_MESSAGE="LAZY" "$tlapack_source"
!cmake --build "$tlapack_DIR" --target install

# Build the examples in the current directory into ./build
# NOTE(review): CMAKE_PREFIX_PATH="." presumably lets find_package locate the
# ./tlapack install created above — confirm against the project's CMakeLists.
!cmake -B build -G Ninja -D CMAKE_BUILD_TYPE=Release -D CMAKE_PREFIX_PATH="."
!cmake --build build

In [None]:
# Quick run of the potrf example from the plain build.
# NOTE(review): the positional arguments are defined by the example's own CLI (not
# visible here); presumably size, block parameter, precision selector, check flag.
!./build/example_starpu_potrf 1000 20 all yes

In [None]:
# Quick run of the lapack example from the plain build, with a single argument of 1000.
!./build/example_starpu_lapack 1000

In [None]:
# Build with MKL
# Dependency chain, in install order: BLAS++ -> LAPACK++ -> <T>LAPACK -> examples.
# Each library uses one directory as both build tree and install prefix.

# Install BLAS++
!cmake -B "$blaspp_DIR" -G Ninja -D CMAKE_BUILD_TYPE=Release -D build_tests=OFF -D CMAKE_INSTALL_PREFIX="$blaspp_DIR" -D CMAKE_INSTALL_MESSAGE="LAZY" "$blaspp_source"
!cmake --build "$blaspp_DIR" --target install

# Install LAPACK++ (pointed at the BLAS++ install above through blaspp_DIR)
!cmake -B "$lapackpp_DIR" -G Ninja -D CMAKE_BUILD_TYPE=Release -D build_tests=OFF -D CMAKE_INSTALL_PREFIX="$lapackpp_DIR" -D CMAKE_INSTALL_MESSAGE="LAZY" -D blaspp_DIR="$blaspp_DIR" "$lapackpp_source"
!cmake --build "$lapackpp_DIR" --target install

# Install <T>LAPACK with USE_LAPACKPP_WRAPPERS=ON into its own prefix
# ($tlapackLAPACK_DIR), separate from the plain install in $tlapack_DIR
!cmake -B "$tlapackLAPACK_DIR" -G Ninja -D CMAKE_BUILD_TYPE=Release -D BUILD_EXAMPLES=OFF -D BUILD_TESTING=OFF -D TLAPACK_NDEBUG=ON -D CMAKE_INSTALL_PREFIX="$tlapackLAPACK_DIR" -D CMAKE_INSTALL_MESSAGE="LAZY" -D USE_LAPACKPP_WRAPPERS=ON -D blaspp_DIR="$blaspp_DIR" -D lapackpp_DIR="$lapackpp_DIR" "$tlapack_source"
!cmake --build "$tlapackLAPACK_DIR" --target install

# Build the examples in the current directory into ./build_LAPACK against the installs above
!cmake -B build_LAPACK -G Ninja -D CMAKE_BUILD_TYPE=Release -D tlapack_DIR="$tlapackLAPACK_DIR" -D blaspp_DIR="$blaspp_DIR" -D lapackpp_DIR="$lapackpp_DIR"
!cmake --build build_LAPACK

In [None]:
# Quick run of the potrf example from the LAPACK++-enabled build, with the same
# arguments as the plain-build run.
!./build_LAPACK/example_starpu_potrf 1000 20 all yes

# First test: Find optimal block size for a given matrix size

In [None]:
# Set environment variables for StarPU
# Remove any worker-count settings left in the kernel's environment, echoing
# each value that is removed.
for var in ("STARPU_NCUDA", "STARPU_NCPU"):
    if var in os.environ:
        print(var + ": ", os.environ.pop(var))

In [None]:
# Matrix order used for the whole sweep
n = 7680

# Candidate partition counts: every integer in [8, 20] that divides n exactly
parts = list(filter(lambda d: n % d == 0, range(8, 21)))
N = len(parts)

# Precisions to benchmark
precision = ["single", "double"]
NT = len(precision)

# Repetitions per configuration
nRuns = 5

# Benchmark binaries and the legend label that goes with each one
executable = ["build_LAPACK/example_starpu_potrf"]
methods = [r"C++ templates + cuBLAS + cuSOLVER + MKL"]
M = len(executable)

# Partition sizes (float array): block size for each partition count
nb = n / np.asarray(parts)

# FLOPs: multiplication and addition counts, summed
FLOPs_mul = n * ((n/6. + .5)*n + 1./3)
FLOPs_add = n * ((n/6.)*n - 1./6)
FLOPs = FLOPs_mul + FLOPs_add

print(f"Size of the matrix = {n}")
print(f"Partitions: {parts}")
print(f"Precisions: {precision}")
print(f"Number of runs = {nRuns}")

In [None]:
# Single trial run before the sweep: first executable, the block size at index 3
# of nb, and the first precision. Output goes straight to the cell.
expr = executable[0]
!$expr {n} {nb[3]} {precision[0]}

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

data = np.ones([M,N,NT], dtype=np.float64) * 60 * 60 * 24

for s in range(M):
    expr = executable[s]
    for i in range(N):
        nbi = nb[i]
        for k in range(NT):
            for j in range(nRuns):
                time.sleep(5)
                output = !$expr {n} {nbi} {precision[k]} | grep time
                print(output)
                try:
                    aux = float(output[-1].split()[2])
                    data[s,i,k] = np.minimum( aux, data[s,i,k] )
                except:
                    print("Not a float: ", output)

In [None]:
# Line styles, cycled over the methods
markers = ['x-','*-','+-']
plt.rcParams['font.size'] = 12

# One figure per precision: GFLOPS against block size, one curve per method
for p in range(NT):
    print(precision[p])

    fig1, ax1 = plt.subplots()

    for m in range(M):
        gflops = (FLOPs/1e9) / data[m,:,p]
        ax1.plot(nb, gflops, markers[m%3], label=methods[m])

    # Optional axis tweaks, left disabled:
    # ax1.set_xscale("log")
    # ax1.set_yscale("log")
    # ax1.set_xticks(parts)
    ax1.xaxis.set_major_formatter(matplotlib.ticker.ScalarFormatter())

    ax1.set_xlabel("$n_b$")
    ax1.set_ylabel("GFLOPS")
    ax1.legend()

    fig1.tight_layout()
    fig1.savefig("curves_"+precision[p]+".pdf")
    plt.show()

# Take the best block size and compare with MKL and StarPU

In [None]:
# For each precision, the partition count with the smallest recorded time
# (first executable only), stored as integers.
nx_best = np.array(
    [parts[np.argmin(data[0,:,k])] for k in range(NT)],
    dtype=int,
)
# To force a value instead of the measured optimum: nx_best[:] = 16

print(nx_best)

nRuns = 5 # for the best partition size

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

dataMKL = np.ones([NT], dtype=np.float64) * 60 * 60 * 24

expr = "build/example_starpu_lapack"
for k in range(NT):
    for j in range(nRuns):
        time.sleep(5)
        output = !$expr {n} {precision[k]} | grep time
        print(output)
        try:
            aux = float(output[-1].split()[2])
            dataMKL[k] = np.minimum( aux, dataMKL[k] )
        except:
            print("Not a float: ", output)

for p in range(NT):
    print(precision[p])

    gflops = np.divide(FLOPs/1e9,dataMKL[p])
    print("time[s] = ", dataMKL[p])
    print("GFLOPS = ", gflops)

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

dataTLAPACKStarPUmkl = np.ones([NT], dtype=np.float64) * 60 * 60 * 24

expr = "build_LAPACK/example_starpu_potrf"
for k in range(NT):
    for j in range(nRuns):
        time.sleep(5)
        output = !$expr {n} {n/nx_best[k]} {precision[k]} | grep time
        print(output)
        try:
            aux = float(output[-1].split()[2])
            dataTLAPACKStarPUmkl[k] = np.minimum( aux, dataTLAPACKStarPUmkl[k] )
        except:
            print("Not a float: ", output)

for p in range(NT):
    print(precision[p])

    gflops = np.divide(FLOPs/1e9,dataTLAPACKStarPUmkl[p])
    print("time[s] = ", dataTLAPACKStarPUmkl[p])
    print("GFLOPS = ", gflops)

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

# Reference runs of StarPU's own cholesky_implicit example, located under starpu_DIR.
# It is given the matrix size and the best partition count of the first precision;
# only the last output line of each run is shown.
expr = starpu_DIR + "/starpu/examples/cholesky_implicit"
for j in range(nRuns):
    # NOTE(review): presumably a cool-down pause between runs — confirm.
    time.sleep(5)
    output = !$expr -size {n} -nblocks {nx_best[0]} -no-prio
    print(output[-1])

# Second test: Find optimal block size without GPU

In [None]:
# Set environment variables for StarPU
# Clear any STARPU_NCPU setting and set the CUDA worker count to zero for the runs below.
os.environ.pop('STARPU_NCPU', None)
os.environ["STARPU_NCUDA"] = "0"

In [None]:
# Matrix order used for the whole sweep
n = 7680

# Candidate partition counts: every integer in [6, 59] that divides n exactly
parts = [d for d in range(6, 60) if not n % d]
N = len(parts)

# Only single precision is benchmarked in this section
precision = ["single"]
NT = len(precision)

# Repetitions per configuration
nRuns = 5

# Benchmark binaries and the legend label that goes with each one
executable = ["build_LAPACK/example_starpu_potrf"]
methods = [r"C++ templates + MKL"]
M = len(executable)

# Partition sizes (float array): block size for each partition count
nb = n / np.asarray(parts)

# FLOPs: multiplication and addition counts, summed
FLOPs_mul = n * ((n/6. + .5)*n + 1./3)
FLOPs_add = n * ((n/6.)*n - 1./6)
FLOPs = FLOPs_mul + FLOPs_add

for line in (
    f"Size of the matrix = {n}",
    f"Partitions: {parts}",
    f"Precisions: {precision}",
    f"Number of runs = {nRuns}",
):
    print(line)

In [None]:
# Single trial run before the sweep: first executable, the first (largest) block
# size in nb, and the first precision.
# NOTE(review): the trailing "y" is an extra positional argument; presumably it
# enables the example's result check — confirm against the example's CLI.
expr = executable[0]
!$expr {n} {nb[0]} {precision[0]} y

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

data = np.ones([M,N,NT], dtype=np.float64) * 60 * 60 * 24

for s in range(M):
    expr = executable[s]
    for i in range(N):
        nbi = nb[i]
        for k in range(NT):
            for j in range(nRuns):
                time.sleep(5)
                output = !$expr {n} {nbi} {precision[k]} | grep time
                print(output)
                try:
                    aux = float(output[-1].split()[2])
                    data[s,i,k] = np.minimum( aux, data[s,i,k] )
                except:
                    print("Not a float: ", output)

In [None]:
# Line styles, cycled over the methods
markers = ['x-','*-','+-']
plt.rcParams['font.size'] = 12

# One figure per precision: GFLOPS against block size for the run without GPU
for p in range(NT):
    print(precision[p])

    fig1, ax1 = plt.subplots()

    for m in range(M):
        gflops = np.divide(FLOPs/1e9,data[m,:,p])
        plt.plot(nb,gflops,markers[m%3],label=methods[m])

    # ax1.set_xscale("log")
    # ax1.set_yscale("log")
    # ax1.set_xticks(parts)
    ax1.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())

    plt.xlabel("$n_b$")
    plt.ylabel("GFLOPS")
    plt.legend()

    plt.tight_layout()
    # Distinct file name: the GPU sweep earlier in the notebook already writes
    # "curves_<precision>.pdf", and reusing that name here would overwrite it.
    plt.savefig("curves_nogpu_"+precision[p]+".pdf")
    plt.show()

In [None]:
# For each precision, the partition count with the smallest recorded time
# (first executable only). Stored as integers, matching the equivalent cell of
# the GPU section, so that interpolating nx_best into a command line yields
# "15" rather than "15.0".
nx_best = np.zeros([NT], dtype=int)
for k in range(NT):
    nx_best[k] = parts[np.argmin(data[0,:,k])]
    # nx_best[k] = 15
# np.divide still returns floats, so the printed block sizes are unchanged
print("Best nb:", np.divide(n,nx_best))

nRuns = 5 # for the best partition size

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

dataTLAPACKStarPUmkl = np.ones([NT], dtype=np.float64) * 60 * 60 * 24

expr = "build_LAPACK/example_starpu_potrf"
for k in range(NT):
    for j in range(nRuns):
        time.sleep(5)
        output = !$expr {n} {n/nx_best[k]} {precision[k]} | grep time
        print(output)
        try:
            aux = float(output[-1].split()[2])
            dataTLAPACKStarPUmkl[k] = np.minimum( aux, dataTLAPACKStarPUmkl[k] )
        except:
            print("Not a float: ", output)

for p in range(NT):
    print(precision[p])

    gflops = np.divide(FLOPs/1e9,dataTLAPACKStarPUmkl[p])
    print("time[s] = ", dataTLAPACKStarPUmkl[p])
    print("GFLOPS = ", gflops)

In [None]:
# %%script false --no-raise-error
# # Comment the line above to run this cell

# Reference runs of StarPU's own cholesky_implicit example, located under starpu_DIR.
# It is given the matrix size and the best partition count of the first precision
# (interpolated as-is from nx_best); only the last output line of each run is shown.
expr = starpu_DIR + "/starpu/examples/cholesky_implicit"
for j in range(nRuns):
    # NOTE(review): presumably a cool-down pause between runs — confirm.
    time.sleep(5)
    output = !$expr -size {n} -nblocks {nx_best[0]} -no-prio
    print(output[-1])

In [None]:
# One-off single-precision run with a hand-picked partition count of 48, i.e. a
# second argument of n/48 (interpolated as a float). Output goes straight to the cell.
expr = "build_LAPACK/example_starpu_potrf"
!$expr {n} {n/48} single