In [16]:
def fib(n):
    if n<= 1:
        return n
    else:
        return fib(n-1)+fib(n-2);

import time
n = 10;
start = time.time()
print(f"Fibonnci({n}) is {fib(n)}")
end = time.time()
print(f"Time taken: {end - start} seconds")

n = 30;
start = time.time()
print(f"Fibonnci({n}) is {fib(n)}")
end = time.time()
print(f"Time taken: {end - start} seconds")


Fibonnci(10) is 55
Time taken: 0.00012421607971191406 seconds
Fibonnci(30) is 832040
Time taken: 0.15385222434997559 seconds


In [1]:
%%writefile fib_cuda.cu
#include <iostream>
#include <cuda_runtime.h>

// Error checking
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// CUDA Kernel
__global__ void fib_kernel(int* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    if (idx == 0) {
        output[idx] = 0;
    }
    else if (idx == 1) {
        output[idx] = 1;
    }
    else {
        int a = 0, b = 1, c;
        for (int i = 2; i <= idx; ++i) {
            c = a + b;
            a = b;
            b = c;
        }
        output[idx] = b;
    }
}

int main() {
    const int N = 20;
    int* d_output;
    int* h_output = new int[N];

    // Allocate unified memory
    gpuErrchk(cudaMallocManaged(&d_output, N * sizeof(int)));

    int threadsPerBlock = 256;
    int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch the kernel
    fib_kernel<<<blocks, threadsPerBlock>>>(d_output, N);
    gpuErrchk(cudaPeekAtLastError()); // <<< Catches launch errors
    gpuErrchk(cudaDeviceSynchronize()); // <<< Wait for kernel to finish

    // Copy result back (optional for Unified memory, technically not needed)
    for (int i = 0; i < N; ++i) {
        std::cout << "fib(" << i << ") = " << d_output[i] << std::endl;
    }

    // Cleanup
    gpuErrchk(cudaFree(d_output));
    delete[] h_output;

    return 0;
}


Writing fib_cuda.cu


In [1]:
!nvcc -o fib_cuda fib_cuda.cu

/bin/bash: line 1: nvcc: command not found


In [3]:
!./fib_cuda

GPUassert: the provided PTX was compiled with an unsupported toolchain. fib_cuda.cu 50


In [2]:
!pip install -q cupy-cuda11x


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.0/100.0 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import cupy as cp
import time
import pandas as pd
# Write a raw CUDA kernel directly in Python
fib_kernel = cp.RawKernel(r'''
extern "C" __global__
void fib_kernel(int* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;

    if (idx == 0) {
        output[idx] = 0;
    }
    else if (idx == 1) {
        output[idx] = 1;
    }
    else {
        int a = 0, b = 1, c;
        for (int i = 2; i <= idx; ++i) {
            c = a + b;
            a = b;
            b = c;
        }
        output[idx] = b;
    }
}
''', 'fib_kernel')

results = []
for n in [10, 30, 50, 80, 100, 500, 10000, 2**22]:

    # GPU Calculation
    start_gpu = time.time()
    output = cp.zeros(n, dtype=cp.int32)
    threads_per_block = 256
    blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
    fib_kernel((blocks_per_grid,), (threads_per_block,), (output, n))
    cp.cuda.Device(0).synchronize()
    end_gpu = time.time()
    gpu_time = end_gpu - start_gpu

    results.append([n, gpu_time])

    # Clean up GPU memory
    del output
    cp._default_memory_pool.free_all_blocks()

df = pd.DataFrame(results, columns=['n','GPU Time (s)'])
df


Unnamed: 0,n,GPU Time (s)
0,10,0.69411
1,30,0.000136
2,50,5.2e-05
3,80,4e-05
4,100,3.8e-05
5,500,4.1e-05
6,10000,8.7e-05
7,4194304,0.757003
