In [None]:
!nvidia-smi

Tue Apr 15 11:05:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Step 1: Define the CUDA source code as a Python string
code = r'''
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>

#include <cuda_runtime.h>

//------------------------------------------------------------------------------
// Kernel for tanh
//------------------------------------------------------------------------------
// Element-wise hyperbolic tangent: output[i] = tanh(data[i]) for 0 <= i < pixels.
//
// Launch with a 1D grid of 1D blocks. Any grid size is valid: each thread
// starts at its global index and advances by the total number of launched
// threads until it passes `pixels`. Uses no shared memory.
// `data` and `output` must be device pointers to at least `pixels` floats.
__global__ void tanh_kernel(float* data, float* output, unsigned int pixels) {
    // Index in size_t so blockIdx.x * blockDim.x and the stride cannot wrap
    // around in 32-bit arithmetic.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < pixels; i += stride) {
        // Use the library tanhf instead of (e^x - e^-x) / (e^x + e^-x):
        // expf overflows to +inf once |x| exceeds ~88, which turns the
        // hand-written quotient into inf/inf = NaN. tanhf saturates to +/-1.
        output[i] = tanhf(data[i]);
    }
}


//------------------------------------------------------------------------------
// GPU function for memory allocation, transfer, kernel execution and deletion
//------------------------------------------------------------------------------
// Computes output_h[i] = tanh(data_h[i]) on the GPU for a tensor holding
// batches * images * width * height * channels floats.
//
// Allocates two device buffers, copies the input to the device, launches
// tanh_kernel over a 1D grid of 256-thread blocks, copies the result back to
// the host and frees the device buffers.
//
// data_h   host pointer to the input values (read only here)
// output_h host pointer that receives the results
// If any dimension is <= 0 the function returns without touching output_h.
// Any CUDA failure is reported on stderr and terminates the process.
void tanh_gpu(float* data_h, float* output_h, int batches, int images, int width, int height, int channels) {
    // An empty (or malformed) shape has nothing to process. Returning early
    // also avoids launching a grid with zero blocks, which is an invalid
    // launch configuration.
    if (batches <= 0 || images <= 0 || width <= 0 || height <= 0 || channels <= 0) {
        return;
    }

    // tanh_kernel receives the element count as unsigned int.
    const size_t max_pixels = static_cast<size_t>(~0u);

    // Accumulate the element count in size_t; multiplying the ints directly
    // can overflow 32-bit arithmetic before the result is widened.
    const int dims[] = {batches, images, width, height, channels};
    size_t count = 1;
    for (int d : dims) {
        count *= static_cast<size_t>(d);
        if (count > max_pixels) {
            std::cerr << "tanh_gpu: tensor has more elements than tanh_kernel can index" << std::endl;
            std::exit(EXIT_FAILURE);
        }
    }
    const size_t bytes = count * sizeof(float);

    // Report a failed CUDA call and stop; continuing after an error would only
    // make every later call fail or return garbage.
    const auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error in " << what << ": "
                      << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    float* data_d = nullptr;
    float* output_d = nullptr;
    check(cudaMalloc((void**)&data_d, bytes), "cudaMalloc(data_d)");
    check(cudaMalloc((void**)&output_d, bytes), "cudaMalloc(output_d)");

    check(cudaMemcpy(data_d, data_h, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    // One thread per element, rounded up to whole blocks.
    const unsigned int threads = 256;
    const unsigned int blocks = static_cast<unsigned int>((count + threads - 1) / threads);

    tanh_kernel<<<blocks, threads>>>(data_d, output_d, static_cast<unsigned int>(count));
    check(cudaGetLastError(), "tanh_kernel launch");          // bad launch configuration
    check(cudaDeviceSynchronize(), "tanh_kernel execution");  // faults raised while running

    check(cudaMemcpy(output_h, output_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");

    check(cudaFree(data_d), "cudaFree(data_d)");
    check(cudaFree(output_d), "cudaFree(output_d)");
}


int main() {
  // Shape of the test tensor passed to tanh_gpu
  constexpr int kBatches = 5;
  constexpr int kImages = 32;
  constexpr int kHeight = 224;
  constexpr int kWidth = 224;
  constexpr int kChannels = 3;
  constexpr int kCount = kBatches * kImages * kWidth * kHeight * kChannels;

  // Host buffers for the input values and the GPU results
  float* input = new float[kCount];
  float* result = new float[kCount];

  // Fill the input with uniformly distributed random values from [-1, 1)
  std::random_device seed_source;
  std::mt19937 rng(seed_source());
  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
  for (float* p = input; p != input + kCount; ++p) {
    *p = uniform(rng);
  }

  // Apply tanh to every element on the GPU
  tanh_gpu(input, result, kBatches, kImages, kWidth, kHeight, kChannels);

  // Print one element of the result as a spot check
  std::cout << "Sample value after TanH: " << result[123456] << std::endl;

  delete[] result;
  delete[] input;

  return 0;
}



'''

# Step 2: Save the CUDA source string to script.cu (overwrites any existing file)
with open('script.cu', 'w') as f:
    f.write(code)

# Step 3: Compile script.cu with nvcc into an executable named `script`.
# -arch=sm_75 targets compute capability 7.5, which is what the Tesla T4 shown
# by nvidia-smi provides; a mismatched architecture can make kernels silently
# fail to run.
# Ref: https://stackoverflow.com/questions/73361454/i-am-getting-zeros-as-a-result-of-vector-additon-in-cuda-and-no-errors
!nvcc -arch=sm_75 script.cu -o script

# Step 4: Run the compiled binary; it prints one sample value of the result
!./script


Sample value after TanH: -0.453483
