In [1]:
%%writefile cuda_sin_test.cu

#include <iostream>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>

#define PI M_PI

const size_t N = 1000000000ULL;

template <typename T>
__global__ void kernel_sin(T *arr, unsigned long long N) {
    unsigned long long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        arr[i] = sin((T)(i % 360) * PI / 180.0);
}

template <typename T>
__global__ void kernel_sinf(T *arr, unsigned long long N) {
    unsigned long long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        arr[i] = sinf((T)(i % 360) * (T)PI / 180.0f);
}

template <typename T>
__global__ void kernel___sinf(T *arr, unsigned long long N) {
    unsigned long long i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N)
        arr[i] = __sinf((T)(i % 360) * (T)PI / 180.0f);
}

template<typename T>
T compute_error(const T* arr) {
    T err = 0.0;
    for (size_t i = 0; i < N; ++i) {
        T ref = sin((i % 360) * PI / 180.0);
        err += fabs(ref - (T)arr[i]);
    }
    return err / N;
}

template<typename T>
void test() {
    T *d_arr, *h_arr;
    h_arr = new T[N];
    cudaMalloc(&d_arr, N * sizeof(T));

    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);

    {
        std::cout << "Running sin() ..." << std::endl;
        auto start = std::chrono::steady_clock::now();
        kernel_sin<<<grid, block>>>(d_arr, N);
        cudaDeviceSynchronize();
        auto stop = std::chrono::steady_clock::now();
        cudaMemcpy(h_arr, d_arr, N * sizeof(T), cudaMemcpyDeviceToHost);
        std::cout << "Time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()
                  << " ms" << std::endl;
        double err = compute_error(h_arr);
        std::cout << "Average error: " << err << std::endl << std::endl;
    }

    {
        std::cout << "Running sinf() ..." << std::endl;
        auto start = std::chrono::steady_clock::now();
        kernel_sinf<<<grid, block>>>(d_arr, N);
        cudaDeviceSynchronize();
        auto stop = std::chrono::steady_clock::now();
        cudaMemcpy(h_arr, d_arr, N * sizeof(T), cudaMemcpyDeviceToHost);
        std::cout << "Time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()
                  << " ms" << std::endl;
        double err = compute_error(h_arr);
        std::cout << "Average error: " << err << std::endl << std::endl;
    }

    {
        std::cout << "Running __sinf() ..." << std::endl;
        auto start = std::chrono::steady_clock::now();
        kernel___sinf<<<grid, block>>>(d_arr, N);
        cudaDeviceSynchronize();
        auto stop = std::chrono::steady_clock::now();
        cudaMemcpy(h_arr, d_arr, N * sizeof(T), cudaMemcpyDeviceToHost);
        std::cout << "Time: "
                  << std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()
                  << " ms" << std::endl;
        double err = compute_error(h_arr);
        std::cout << "Average error: " << err << std::endl << std::endl;
    }

    cudaFree(d_arr);
    delete[] h_arr;
}

int main() {
    std::cout << "=== CUDA Sine Accuracy Test ===" << std::endl;
    std::cout << "Elements count: " << N << std::endl;

    std::cout << "\n--- FLOAT MODE ---" << std::endl;
    test<float>();
    std::cout << "\n--- DOUBLE MODE ---" << std::endl;
    test<double>();

    std::cout << "\nExperiment completed." << std::endl;
    return 0;
}


Writing cuda_sin_test.cu


In [2]:
!nvcc -arch=sm_75 --use_fast_math cuda_sin_test.cu -o cuda_sin_test
!./cuda_sin_test

=== CUDA Sine Accuracy Test ===
Elements count: 1000000000

--- FLOAT MODE ---
Running sin() ...
Time: 434 ms
Average error: 0

Running sinf() ...
Time: 16 ms
Average error: 1.6e-08

Running __sinf() ...
Time: 16 ms
Average error: 1.6e-08


--- DOUBLE MODE ---
Running sin() ...
Time: 337 ms
Average error: 8.77963e-18

Running sinf() ...
Time: 258 ms
Average error: 1.30149e-07

Running __sinf() ...
Time: 287 ms
Average error: 1.30149e-07


Experiment completed.


In [3]:
!nvidia-smi

Tue Oct 14 15:15:49 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   67C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
%%writefile cuda_specs.cu

#include <iostream>
#include <cuda_runtime.h>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount); // Get the number of CUDA-capable devices

    if (deviceCount == 0) {
        std::cerr << "No CUDA devices found." << std::endl;
        return 1;
    }

    for (int i = 0; i < deviceCount; ++i) {
        cudaDeviceProp prop{}; // Initialize a cudaDeviceProp structure
        cudaGetDeviceProperties(&prop, i); // Get properties for device 'i'

        std::cout << "--- Device Number: " << i << " ---" << std::endl;
        std::cout << "  Device Name: " << prop.name << std::endl;
        std::cout << "  Compute Capability: " << prop.major << "." << prop.minor << std::endl;
        std::cout << "  Total Global Memory (bytes): " << prop.totalGlobalMem << std::endl;
        std::cout << "  Max Threads per Block: " << prop.maxThreadsPerBlock << std::endl;
        std::cout << "  Multiprocessor Count: " << prop.multiProcessorCount << std::endl;
        std::cout << "  Clock Rate (kHz): " << prop.clockRate << std::endl;
        std::cout << "  Shared Memory per Block (bytes): " << prop.sharedMemPerBlock << std::endl;
        std::cout << "  Warp Size: " << prop.warpSize << std::endl;
        std::cout << "  ECC Enabled: " << (prop.ECCEnabled ? "Yes" : "No") << std::endl;
        std::cout << std::endl;
    }

    return 0;
}

Writing cuda_specs.cu


In [5]:
!nvcc -arch=sm_75 cuda_specs.cu -o cuda_specs
!./cuda_specs

--- Device Number: 0 ---
  Device Name: Tesla T4
  Compute Capability: 7.5
  Total Global Memory (bytes): 15828320256
  Max Threads per Block: 1024
  Multiprocessor Count: 40
  Clock Rate (kHz): 1590000
  Shared Memory per Block (bytes): 49152
  Warp Size: 32
  ECC Enabled: Yes

