In [1]:
%%time

from torch import nn
import torch.utils.model_zoo as model_zoo
import torch.onnx
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
# Deserialize the prebuilt TensorRT engine once for the whole notebook.
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))

# The context manager closes the file handle as soon as the bytes are read
# instead of leaving it open for the lifetime of the kernel.
with open("model.trt", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# Fail with an explicit message rather than an AttributeError on None below.
if engine is None:
    raise RuntimeError("Failed to deserialize TensorRT engine from model.trt")

context = engine.create_execution_context()


def modelRT(samples,batch_size=102400):
    """Run ``samples`` through the module-level TensorRT ``context`` in batches.

    Parameters
    ----------
    samples : sequence or array supporting ``len`` and slicing
        Input rows; every batch is converted to a float32 array before upload.
    batch_size : int, optional
        Number of rows sent per execution. The host output buffer is always
        ``(batch_size, 2)``, so this presumably has to match the batch
        dimension the engine was built with -- TODO confirm against model.trt.

    Returns
    -------
    tuple
        ``(predictions, seconds)`` where ``predictions`` has shape
        ``(len(samples), 2)`` and ``seconds`` is the accumulated wall-clock
        time spent in ``execute_async_v2`` plus the following stream sync.
    """
    num_samples = len(samples)
    reftime = 0
    R = np.zeros((num_samples, 2))
    if num_samples == 0:
        # Nothing to infer; return before touching the GPU.
        return np.array(R), reftime

    # Host output buffer, stream and device buffers are reused for every batch.
    output = np.empty([batch_size, 2], dtype=np.float32)
    stream = cuda.Stream()
    d_input = None
    d_output = cuda.mem_alloc(output.nbytes)
    try:
        for i in range(0, num_samples, batch_size):
            batch = np.array(samples[i:i+batch_size], dtype=np.float32)
            valid = len(batch)
            if valid < batch_size:
                # Zero-pad the final partial batch to a full batch so the
                # device input buffer is never smaller than what a
                # full-batch execution reads.
                padded = np.zeros((batch_size,) + batch.shape[1:], dtype=np.float32)
                padded[:valid] = batch
                batch = padded

            if d_input is None:
                # Every (padded) batch has the same byte size, so one
                # allocation serves the whole run.
                d_input = cuda.mem_alloc(batch.nbytes)
            bindings = [int(d_input), int(d_output)]

            # Transfer input data to the device and wait for it, so the timed
            # region below does not include the upload.
            cuda.memcpy_htod_async(d_input, batch, stream)
            stream.synchronize()

            # Execute the model (timed).
            start_time = TIME.time()
            context.execute_async_v2(bindings, stream.handle, None)
            stream.synchronize()
            reftime = reftime + TIME.time() - start_time

            # Transfer predictions back and wait for the copy to finish.
            cuda.memcpy_dtoh_async(output, d_output, stream)
            stream.synchronize()

            # Keep only the rows that correspond to real (non-padding) inputs.
            R[i:i+valid] = output[:valid]
    finally:
        # Release device memory even if a copy or the execution raised.
        if d_input is not None:
            d_input.free()
        d_output.free()

    return np.array(R), reftime

def modelTexec(samples,batch_size=102400):
    """Run ``samples`` through ``model.trt`` using the ``trtexec`` CLI, batch by batch.

    For every batch the float32 input is written to the file ``inp``,
    ``trtexec`` is invoked with ``--dumpRawBindingsToFile`` and
    ``--exportTimes="r.json"``, and the dumped output binding plus the timing
    JSON are read back.

    Parameters
    ----------
    samples : sequence or array supporting ``len`` and slicing
        Input rows; every batch is converted to a float32 array.
    batch_size : int, optional
        Rows per ``trtexec`` invocation. The dump file name read back below is
        hard-coded for a 102400-row output, so other values presumably only
        work with a matching engine -- TODO confirm.

    Returns
    -------
    tuple
        ``(predictions, seconds)`` where ``predictions`` has shape
        ``(len(samples), 2)`` and ``seconds`` is the sum over batches of the
        mean ``latencyMs`` reported by ``trtexec``, divided by 1000.

    Raises
    ------
    subprocess.CalledProcessError
        If ``trtexec`` exits with a non-zero status.
    """
    # Local imports keep the function usable regardless of which notebook
    # cells have already been executed.
    import json
    import subprocess

    # Shape of the dumped output binding, as encoded in its file name.
    rows = 102400
    columns = 2
    dt = np.float32
    metric = 0
    num_samples = len(samples)
    R = np.zeros((num_samples, columns))
    for i in range(0, num_samples, batch_size):
        c = np.array(samples[i:i+batch_size], dtype=dt)
        valid = len(c)
        if valid < batch_size:
            # Zero-pad the final partial batch so the input file always holds
            # one full batch.
            padded = np.zeros((batch_size,) + c.shape[1:], dtype=dt)
            padded[:valid] = c
            c = padded
        with open('inp', 'wb') as file:
            c.tofile(file)

        # check=True surfaces a failed run instead of silently reading dump
        # files left behind by an earlier invocation.
        subprocess.run(
            [
                'trtexec',
                '--loadEngine=model.trt',
                '--loadInputs=input:inp',
                '--dumpRawBindingsToFile',
                '--exportTimes=r.json',
                '--avgRuns=50',
                '--duration=5',
            ],
            stdout=subprocess.DEVNULL,
            check=True,
        )

        # Read the dumped output binding with its known shape and dtype.
        with open('output.output.102400.2.Float.raw', 'rb') as file:
            output = np.fromfile(file, dtype=dt).reshape(rows, columns)
        # Keep only the rows that correspond to real (non-padding) inputs.
        R[i:i+valid] = output[:valid]

        # Accumulate the mean latency reported for this batch.
        with open('r.json', 'r') as file:
            data = json.load(file)
        latencytimes = [entry['latencyMs'] for entry in data]
        metric += sum(latencytimes) / len(latencytimes)

    return np.array(R), metric/1000  # ms -> s

CPU times: user 995 ms, sys: 961 ms, total: 1.96 s
Wall time: 1.03 s


In [2]:
import numpy as np
import time as TIME
import torch_tensorrt
from FHNCUDAlib import FHNCUDA
import numpy as np
import matplotlib.pyplot as plt
import chaospy as cp
from itertools import product
#!nvcc cuda.cu -o a.out -arch=sm_86 -O3 --use_fast_math --ptxas-options=-v -Xptxas -dlcm=cg -Xcompiler -ffast-math --maxrregcount=32


def runCuda(T=0.5*1E2):
    """Sample (u, v, k) parameter sets, run the FHNCUDA solver twice and build
    the per-time-step input table.

    The solver is run once with a 10x finer time step (``dt*0.1``,
    ``rate*10``) as reference and once with the nominal ``dt`` / ``rate``.

    Parameters
    ----------
    T : int or float, optional
        Number of parameter samples to draw; cast to ``int`` before sampling.

    Returns
    -------
    tuple
        ``(p, x0, u_num, u_ref)`` where
        ``p`` is ``p[0]`` from the nominal run with every entry divided by
        1000 (presumably ms -> s timings -- TODO confirm against FHNCUDA),
        ``x0`` is an array with one row ``[t, u, v, k]`` per (sample, time)
        pair, ordered sample-major,
        ``u_num`` is the flattened ``u`` of the nominal run and
        ``u_ref`` is the flattened ``u`` of the fine-step reference run.
    """
    # Parameter ranges of the uniform distributions.
    krange = [0.08, 0.12]
    vrange = [0.0, 0.12]
    urange = [0.0, 0.8]
    joint_dist = cp.J(cp.Uniform(*urange), cp.Uniform(*vrange), cp.Uniform(*krange))

    # The default for T is a float, so make the sample count an integer.
    sample_set = joint_dist.sample(int(T), rule="L").T
    np.random.shuffle(sample_set)
    x0 = np.array(sample_set)

    dt, tt = 0.01, 50
    rate = 50

    # Reference run: 10x smaller step, 10x larger rate.
    u, _, _, _ = FHNCUDA.run(x0, tt, dt*0.1, rate*10)
    u_ref = np.array(u).flatten()

    # Nominal run.
    u, _, t, p = FHNCUDA.run(x0, tt, dt, rate)
    u_num = np.array(u).flatten()

    p = [i/1000 for i in p[0]]
    t = np.array(t).flatten()

    # One row [t, u, v, k] per (sample, time) pair: all time points of the
    # first sample, then all time points of the second sample, and so on.
    x0 = np.column_stack((
        np.tile(t, len(sample_set)),
        np.repeat(sample_set, len(t), axis=0),
    ))
    return p, x0, u_num, u_ref


In [3]:
# Smoke-test runCuda on 120000 + 1024 samples and print the returned tuple.
n_smoke_samples = 120000 + 1024
print(runCuda(n_smoke_samples))

kernel call: ./a.out 50 0.001 500
187
kernel call: ./a.out 50 0.01 50
187
([0.00053232, 0.061937888999999996, 3.556833252], array([[0.00000000e+00, 3.88149772e-01, 1.01516627e-02, 9.28066181e-02],
       [5.00000000e-01, 3.88149772e-01, 1.01516627e-02, 9.28066181e-02],
       [1.00000000e+00, 3.88149772e-01, 1.01516627e-02, 9.28066181e-02],
       ...,
       [4.85000000e+01, 5.20963479e-01, 1.08252699e-01, 9.04674326e-02],
       [4.90000000e+01, 5.20963479e-01, 1.08252699e-01, 9.04674326e-02],
       [4.95000000e+01, 5.20963479e-01, 1.08252699e-01, 9.04674326e-02]]), array([ 0.38815 ,  0.972075,  1.075629, ..., -0.001732,  0.015706,
        0.03304 ]), array([ 3.881500e-01,  9.723270e-01,  1.075212e+00, ..., -9.820000e-04,
        1.643100e-02,  3.374200e-02]))


In [4]:
import json



In [1]:
# Benchmark the CUDA solver (and optionally the TensorRT paths) over growing
# sample counts and plot the collected timings.
ts=[1024*(4**x) for x in range(1,10)]
tcs,tns,tes=[],[],[]

# Toggles for the TensorRT benchmarks. Each timing is only printed and stored
# when the corresponding run actually happened, so no undefined name is used.
run_tensorrt_py = False
run_trtexec = False

for T in ts:
    print("Set of size ",T)

    print("Cuda -\n")
    cuda_time,x0,un,uref=runCuda(T)
    print("time : ",cuda_time, "s\n")
    tcs.append(cuda_time[1])

    if run_tensorrt_py:
        print("tensort py -\n")
        pr,net_time=modelRT(x0)
        print("time : ",net_time, "s\n")
        tns.append(net_time)

    if run_trtexec:
        print("texec -\n")
        er,exectime=modelTexec(x0)
        print("time : ",exectime, "s\n")
        tes.append(exectime)

print(tcs,tns)
plt.plot(ts,tcs,label="Tempo Cuda")
# Only plot the optional series when they were measured.
if tes:
    plt.plot(ts,tes, label="Tempo texec")
if tns:
    plt.plot(ts,tns, label="Tempo tensorrt python")
plt.legend(loc="best")
plt.show()

Set of size  4096
Cuda -



NameError: name 'runCuda' is not defined

In [None]:
# Display the CUDA timings collected by the benchmark loop.
tcs

In [None]:
# Plot every timing series that was actually measured. An empty series (for
# example when a benchmark was not run) is skipped, and the x values are
# truncated to the number of points collected so the lengths always match.
for timings in (tcs, tns):
    if timings:
        plt.plot(ts[:len(timings)], timings)
plt.show()