---
# **LAB 9 - CUDA Libraries**
---

# ▶️ CUDA setup

In [None]:
# Show the version of the CUDA compiler (nvcc) installed in this runtime
!nvcc --version

In [None]:
# Show the GPU attached to this runtime as reported by the NVIDIA driver tool
!nvidia-smi

## [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)

## NVCC Plugin for Jupyter notebook

*Usage*:


*   Load Extension `%load_ext nvcc_plugin`
*   Mark a cell to be treated as cuda cell
`%%cuda --name example.cu --compile false`

**NOTE**: The cell must contain either code or comments to run successfully. The magic accepts two arguments. The first, `-n | --name`, is the name of the CUDA source or header file and must have the extension `.cu` or `.h`. The second, `-c | --compile` (default `false`), is a flag that specifies whether the cell is compiled and run right away. It can be useful when you are experimenting with the `main` function.

*  We are now ready to run CUDA C/C++ code right in the notebook. To do so, we must explicitly tell the interpreter to use the extension by adding `%%cu` at the beginning of each cell that contains CUDA code.




In [None]:
# Install the nvcc4jupyter plugin from the head of its GitHub repository.
# NOTE(review): the install is unpinned, so the extension name and the magics used
# below (`nvcc_plugin`, `%%cu`, `%%cuda --name`) depend on whatever the repository
# ships today — consider pinning a known-good revision.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the plugin installed above so the CUDA cell magics become available
%load_ext nvcc_plugin

Clone the GPUcomputing repository from GitHub (it provides the `utils/common.h` helpers used by the examples below)...

In [None]:
# Clone the course repository into the current directory. The clone is skipped when
# the directory already exists, so the cell can be re-run without git failing.
![ -d GPUcomputing ] || git clone https://github.com/giulianogrossi/GPUcomputing.git

# ▶️ VS Code on Colab

In [None]:
#@title Colab-ssh tunnel
#@markdown Execute this cell to open the ssh tunnel. Check [colab-ssh documentation](https://github.com/WassimBenzarti/colab-ssh) for more details.

# Install colab_ssh on google colab
!pip install colab_ssh --upgrade

# Imported only after the install above, so the package is available when the cell runs top to bottom
from colab_ssh import launch_ssh_cloudflared, init_git_cloudflared
# NOTE(review): the form default below is a short, guessable password that is saved with
# the notebook — choose a private value in the form before opening the tunnel.
ssh_tunnel_password = "gpu" #@param {type: "string"}
# Open the tunnel, protected by the password chosen in the form
launch_ssh_cloudflared(password=ssh_tunnel_password)

# Optional: if you want to clone a Github or Gitlab repository
repository_url="https://github.com/giulianogrossi/GPUcomputing" #@param {type: "string"}
init_git_cloudflared(repository_url)

# ▶️ DeviceQuery

In [None]:
# DeviceQuery of the current device (on Colab!): build the deviceQuery source
# from the cloned repository and run the resulting executable
!nvcc /content/GPUcomputing/utils/deviceQuery.cu -o deviceQuery
!./deviceQuery

Check whether the device can transfer in both directions simultaneously

In [None]:
%%cu
#include <stdio.h>

/*
 * Report whether device 0 can copy host-to-device and device-to-host at the
 * same time.
 *
 * The answer is taken from cudaDeviceProp::asyncEngineCount (two or more copy
 * engines are needed for transfers in both directions). The deprecated
 * deviceOverlap field only says that a copy can overlap a kernel, which is a
 * different capability.
 *
 * Returns 0 on success, 1 when the device properties cannot be read.
 */
int main(void) {

	cudaDeviceProp dProp;
	cudaError_t err = cudaGetDeviceProperties(&dProp, 0);
	if (err != cudaSuccess) {
		fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
		return 1;
	}

	// Shows whether the device can transfer in both directions simultaneously
	printf("Device %s capable of simultaneous CPU-to-GPU and GPU-to-CPU datatransfers\n", dProp.asyncEngineCount >= 2 ? "IS": "NOT");
	return 0;
}

# ✅ cuBLAS

In [None]:
%%cuda --name mat_prod_cublas.cu

#include <stdio.h>
#include <stdlib.h>
#include "cublas_v2.h"
#include "../GPUcomputing/utils/common.h"

// Linear offset of element (r, c) in a row-major matrix whose rows hold D elements.
// Every argument is parenthesized so that expression arguments (e.g. r + 1) expand correctly.
#define IDX2R(r,c,D) ( (r) * (D) + (c) )
// Linear offset of element (r, c) in a column-major matrix whose columns hold D elements.
#define IDX2C(r,c,D) ( (c) * (D) + (r) )

#define BLOCK_SIZE 4
#define M          (1<<12)
#define N          (1<<12)
#define P          (1<<12)

void generate_random_vector(int, float**);
void generate_random_dense_matrix_Row_Maj(int, int, float**);
void generate_random_dense_matrix_Col_Maj(int, int, float**);
void plot_mat_Row_Maj(int, int, float*, char);
void plot_mat_Col_Maj(int, int, float*, char);
__global__ void matProdSMEMstatic(float*, float*, float*, int, int, int);

/*
 * Comparison between the standard product kernel and cuBLAS.
 *
 * Three products are timed with CUDA events and their elapsed time is printed:
 *   1. y = A * x   with cublasSgemv (column-major A)
 *   2. C = A * B   with cublasSgemm (column-major A and B)
 *   3. C = A * B   with the hand-written kernel matProdSMEMstatic (row-major A and B)
 *
 * Sizes come from the M, N and P macros. Every host buffer, device buffer,
 * event and the cuBLAS handle created here is released before returning.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the host result buffer cannot be
 * allocated (CUDA/cuBLAS failures are handled by the CHECK/CHECK_CUBLAS macros).
 */
int main(int argc, char **argv) {

	int n = N, m = M, p = P;
	float *A, *d_A;  // matrix M x N  (row M, col N)
	float *B, *d_B;  // matrix N x P  (row N, col P)
	float *C, *d_C;  // matrix M x P, C = A*B
	float *x, *d_x;  // vector N x 1
	float *y, *d_y;  // vector M x 1, y = A*x
	float beta = 0.0f;
	float alpha = 1.0f;
	float milliseconds;
	cublasHandle_t handle;
	device_name();

	// buffer sizes in bytes, computed in size_t so large dimensions do not overflow int
	const size_t bytesA = (size_t) m * n * sizeof(float);
	const size_t bytesB = (size_t) n * p * sizeof(float);
	const size_t bytesC = (size_t) m * p * sizeof(float);

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// Generate inputs
	srand(10);
	generate_random_dense_matrix_Col_Maj(m, n, &A);
	generate_random_dense_matrix_Col_Maj(n, p, &B);
	generate_random_vector(n, &x);
	generate_random_vector(m, &y);   // y = A*x has one entry per row of A, i.e. m entries

	C = (float *) malloc(bytesC);
	if (C == NULL) {
		fprintf(stderr, "Cannot allocate the host result matrix C\n");
		return EXIT_FAILURE;
	}

	// Allocate device memory
	CHECK(cudaMalloc((void **)&d_A, bytesA));
	CHECK(cudaMalloc((void **)&d_B, bytesB));
	CHECK(cudaMalloc((void **)&d_C, bytesC));
	CHECK(cudaMalloc((void **)&d_x, n * sizeof(float)));
	CHECK(cudaMalloc((void **)&d_y, m * sizeof(float)));

	// Create the cuBLAS handle
	CHECK_CUBLAS(cublasCreate(&handle));
	int version;
	CHECK_CUBLAS(cublasGetVersion(handle, &version));
	printf("Using CUBLAS Version: %d\n", version);

	// Transfer inputs to the device, column-major order.
	// C is not uploaded: it is an output only (beta == 0) and d_C is cleared before each product.
	CHECK_CUBLAS(cublasSetMatrix(m, n, sizeof(float), A, m, d_A, m));
	CHECK_CUBLAS(cublasSetMatrix(n, p, sizeof(float), B, n, d_B, n));
	CHECK_CUBLAS(cublasSetVector(n, sizeof(float), x, 1, d_x, 1));
	CHECK_CUBLAS(cublasSetVector(m, sizeof(float), y, 1, d_y, 1));

	/***************************************************
	 *      Multipl. matrix-vector CUBLAS              *
	 ***************************************************/

	printf("\n**  Matrix-vector product...\n");
	printf("    y(%d x 1) = A(%d x %d) * x(%d x 1)\n",m,m,n,n);

	CHECK(cudaEventRecord(start));
	CHECK_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, d_A, m, d_x, 1, &beta, d_y, 1));
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("    elapsed time: %.5f (sec)\n", milliseconds / 1000.0);

	// Retrieve the output vector from the device
	CHECK_CUBLAS(cublasGetVector(m, sizeof(float), d_y, 1, y, 1));


	/**********************************************
	 *  Multiplic. matrix-matrix CUBLAS           *
	 **********************************************/

	printf("\n**  Matrix-Matrix product...\n");
	printf("    C(%d x %d) = A(%d x %d) * B(%d x %d)\n",m,p,m,n,n,p);

	//plot_mat_Col_Maj(m, n, A, 'A');
	//plot_mat_Col_Maj(n, p, B, 'B');

	CHECK(cudaMemset(d_C, 0, bytesC));
	CHECK(cudaEventRecord(start));
	CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, p, n, &alpha, d_A, m, d_B, n, &beta, d_C, m));
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("    elapsed time: %.5f (sec)\n", milliseconds / 1000.0);

	// Retrieve the output matrix from the device
	CHECK_CUBLAS(cublasGetMatrix(m, p, sizeof(float), d_C, m, C, m));

	//plot_mat_Col_Maj(m, p, C, 'C');


	/*****************************************************
	 *  Multiplicat. matrix-matrix kernel ad-hoc         *
	 *****************************************************/

	printf("\n**  Matrix-Matrix product using ad-hoc kernel (with SMEM)...\n");
	printf("    C(%d x %d) = A(%d x %d) * B(%d x %d)\n",m,p,m,n,n,p);

	// the kernel indexes its operands in row-major order, so it gets its own host copies
	float *A1, *B1;
	srand(10);
	generate_random_dense_matrix_Row_Maj(m, n, &A1);
	generate_random_dense_matrix_Row_Maj(n, p, &B1);

	//plot_mat_Row_Maj(m, n, A1, 'A');
	//plot_mat_Row_Maj(n, p, B1, 'B');

	// copy matrices A and B to the GPU
	CHECK(cudaMemcpy(d_A, A1, bytesA, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(d_B, B1, bytesB, cudaMemcpyHostToDevice));
	CHECK(cudaMemset(d_C, 0, bytesC));

	// grid block dims = shared mem dims = BLOCK_SIZE
	dim3 block(BLOCK_SIZE, BLOCK_SIZE);
	dim3 grid((p + block.x - 1) / block.x, (m + block.y - 1) / block.y);
	CHECK(cudaEventRecord(start));
	matProdSMEMstatic<<<grid, block>>>(d_A, d_B, d_C, n, m, p);
	CHECK(cudaGetLastError());          // report an invalid launch configuration right away
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));  // waits for the kernel that precedes the stop event
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("    elapsed time: %.5f (sec)\n", milliseconds / 1000.0);

	// copy the array 'C' back from the GPU to the CPU
	CHECK(cudaMemcpy(C, d_C, bytesC, cudaMemcpyDeviceToHost));

	//plot_mat_Row_Maj(m, p, C, 'C');

	// free device memory, events and the cuBLAS handle
	CHECK(cudaFree(d_A));
	CHECK(cudaFree(d_B));
	CHECK(cudaFree(d_C));
	CHECK(cudaFree(d_x));
	CHECK(cudaFree(d_y));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	CHECK_CUBLAS(cublasDestroy(handle));

	// free host memory
	free(A);
	free(B);
	free(C);
	free(x);
	free(y);
	free(A1);
	free(B1);

	return EXIT_SUCCESS;
}

/*
 * Allocate a vector of n single-precision values and fill it with
 * pseudo-random numbers in [0, 1] drawn from rand().
 *
 *   n  number of elements
 *   x  receives the address of the new vector (malloc'ed; the caller frees it)
 */

void generate_random_vector(int n, float **x) {
	float *values = (float *) malloc(n * sizeof(float));

	int filled = 0;
	while (filled < n) {
		values[filled] = (float)rand() / RAND_MAX;
		++filled;
	}
	*x = values;
}

/*
 * Allocate a rows x cols single-precision matrix in column-major order.
 * Despite the name the content is not random: the elements are set to
 * 1, 2, 3, ... walking down each column, one column after the other.
 *
 *   rows, cols  matrix dimensions
 *   A           receives the address of the new matrix (malloc'ed; the caller frees it)
 */
void generate_random_dense_matrix_Col_Maj(int rows, int cols, float **A) {
	float *mat = (float *) malloc(sizeof(float) * rows * cols);

	float next = 1.0;
	for (int col = 0; col < cols; col++) {
		// first element of this column
		float *column = mat + IDX2C(0,col,rows);
		for (int row = 0; row < rows; row++) {
			column[row] = next;
			next += 1;
		}
	}
	*A = mat;
}

/*
 * Allocate a rows x cols single-precision matrix in row-major order.
 * Despite the name the content is not random: the elements are set to
 * 1, 2, 3, ... walking along each row, one row after the other.
 *
 *   rows, cols  matrix dimensions
 *   A           receives the address of the new matrix (malloc'ed; the caller frees it)
 */
void generate_random_dense_matrix_Row_Maj(int rows, int cols, float **A) {
	float *mat = (float *) malloc(sizeof(float) * rows * cols);

	float next = 1.0;
	for (int row = 0; row < rows; ++row) {
		// first element of this row
		float *line = mat + IDX2R(row,0,cols);
		for (int col = 0; col < cols; ++col) {
			line[col] = next;
			next += 1;
		}
	}
	*A = mat;
}

/*
 * Print a rows x cols matrix stored in row-major order: a header line carrying
 * the one-character label `name`, then one output line per matrix row.
 */
void plot_mat_Row_Maj(int rows, int cols, float *A, char name) {
	printf("\nShow mat %c...\n", name);
	int r = 0;
	while (r < rows) {
		int c = 0;
		while (c < cols) {
			printf("%4.1f ", A[IDX2R(r,c,cols)]);
			c++;
		}
		printf("\n");
		r++;
	}
	printf("\n");
}

/*
 * Print a rows x cols matrix stored in column-major order: a header line
 * carrying the one-character label `name`, then one output line per matrix row.
 */
void plot_mat_Col_Maj(int rows, int cols, float *A, char name) {
	printf("\nShow mat %c...\n", name);
	int r = 0;
	while (r < rows) {
		int c = 0;
		while (c < cols) {
			printf("%4.1f ", A[IDX2C(r,c,rows)]);
			c++;
		}
		printf("\n");
		r++;
	}
	printf("\n");
}


/*
 * Kernel for matrix product with static SMEM
 *      C   =   A   *   B
 *   (m x p) (m x n) (n x p)
 *
 * All three matrices are indexed in row-major order. Each thread accumulates
 * one element of C by walking BLOCK_SIZE-wide tiles of A and B that the block
 * stages in shared memory. The shared tiles are BLOCK_SIZE x BLOCK_SIZE, so
 * the kernel must be launched with blockDim = (BLOCK_SIZE, BLOCK_SIZE).
 *
 * Tile entries that fall outside A or B are loaded as zero: dimensions that
 * are not multiples of BLOCK_SIZE therefore never read out of bounds, and the
 * padded products add nothing to the sum.
 */
__global__ void matProdSMEMstatic(float* A, float* B, float* C, int n, int m, int p) {
	// indexes of the element of C owned by this thread
	uint row = blockIdx.y * blockDim.y + threadIdx.y; // in [0..m)
	uint col = blockIdx.x * blockDim.x + threadIdx.x; // in [0..p)

	// target: compute the right sum for the given row and col
	float sum = 0.0f;

	// static shared memory
	__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
	__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

	// loop over blocks from block row of matrix A and block column of matrix B
	uint numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;

	for (uint i = 0; i < numBlocks; i++) {

		// copy block from matrix to shared memory, zero-filling what lies outside the matrices
		uint r = i * BLOCK_SIZE + threadIdx.y;
		uint c = i * BLOCK_SIZE + threadIdx.x;
		As[threadIdx.y][threadIdx.x] = (row < m && c < n) ? A[IDX2R(row, c, n)] : 0.0f;
		Bs[threadIdx.y][threadIdx.x] = (r < n && col < p) ? B[IDX2R(r, col, p)] : 0.0f;

		__syncthreads();  //  BARRIER SYNC on SMEM loading

		// compute this part of row-column product; the zero padding makes a
		// shorter loop on the last tile unnecessary
		for (uint k = 0; k < BLOCK_SIZE; k++)
			sum += As[threadIdx.y][k] * Bs[k][threadIdx.x];

		__syncthreads();  //  BARRIER SYNC on prod over blocks
	}

	// store computed element in matrix C
	if (row < m && col < p)
		C[row * p + col] = sum;
}



In [None]:
# Compile and run: builds src/mat_prod_cublas.cu for the sm_75 architecture,
# links it against cuBLAS (-lcublas) and executes the resulting binary

!nvcc -arch=sm_75 src/mat_prod_cublas.cu  -o prod -lcublas
!./prod

# 🔴 TODO

In [None]:
%%cuda --name conj_grad.cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include "../GPUcomputing/utils/common.h"

// Linear offset of element (r, c) in a row-major matrix whose rows hold D elements.
// Every argument is parenthesized so that expression arguments (e.g. r + 1) expand correctly.
#define IDX2R(r,c,D) ( (r) * (D) + (c) )
// Linear offset of element (r, c) in a column-major matrix whose columns hold D elements.
#define IDX2C(r,c,D) ( (c) * (D) + (r) )

#define N          (1<<10)

void generate_random_vector(int, double**);
void generate_rand_posdefinite_mat(int, double**);
void plot_mat(int, double*, char);
void plot_vec(int, double*, char); 
double norm2(int, double *);

/*
 * This sample implements a conjugate gradient solver on GPU using CUBLAS.
 *
 * It builds a random symmetric system A x = b of size N, iterates on the
 * device starting from a random x, then prints the norm of b next to the norm
 * of A*x computed with the final x so the two can be compared.
 *
 * The iteration follows the conjugate-direction form
 *     alpha_k = (p_k^T r_k) / (p_k^T A p_k)
 *     beta_k  = (p_k^T A r_(k+1)) / (p_k^T A p_k)
 * and stops after `maxit` steps, when the residual norm drops below
 * tol * ||b||, or when p_k^T A p_k is no longer positive.
 *
 * cuBLAS scalars are passed by host pointer (the handle's default pointer
 * mode), so every dot product / norm result is available on the host as soon
 * as the call returns.
 */
int main(int argc, char **argv) {
	int n = N;
	double *A, *dA;      // matrix N x N  (square)
	double *x, *dx;      // vector N x 1
	double *b, *db;      // vector N x 1
	double *dr, *dr1;    // vector N x 1 (residual and scratch copy)
	double *dp;          // vector N x 1 (search direction)
	double *dAxp, *dAxr; // vector N x 1 (A*p and A*r)

	cublasHandle_t handle;
	device_name();

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// Generate instance: matrix A and vector b
	srand(time(NULL));
	generate_rand_posdefinite_mat(n, &A);     // random symmetric matrix A
	generate_random_vector(n, &b);            // random vector b
	generate_random_vector(n, &x);            // random initial solution
	//plot_mat(n, A,'A');

	// Allocate device memory
	CHECK(cudaMalloc((void **)&dA, (size_t) n * n * sizeof(double)));
	CHECK(cudaMalloc((void **)&dx, n * sizeof(double)));
	CHECK(cudaMalloc((void **)&db, n * sizeof(double)));
	CHECK(cudaMalloc((void **)&dr, n * sizeof(double)));
	CHECK(cudaMalloc((void **)&dr1, n * sizeof(double)));
	CHECK(cudaMalloc((void **)&dp, n * sizeof(double)));
	CHECK(cudaMalloc((void **)&dAxp, n * sizeof(double)));
	CHECK(cudaMalloc((void **)&dAxr, n * sizeof(double)));

	// Create the cuBLAS handle
	CHECK_CUBLAS(cublasCreate(&handle));
	int version;
	CHECK_CUBLAS(cublasGetVersion(handle, &version));
	printf("Using CUBLAS Version: %d\n", version);

	// Transfer inputs to the device, column-major order
	// (A is symmetric, so its row-major and column-major layouts coincide)
	CHECK_CUBLAS(cublasSetMatrix(n, n, sizeof(double), A, n, dA, n));
	CHECK_CUBLAS(cublasSetVector(n, sizeof(double), b, 1, db, 1));
	CHECK_CUBLAS(cublasSetVector(n, sizeof(double), x, 1, dx, 1));

	// CG
	double beta = 0.0;
	double alpha = 0.0;
	double one = 1.0, minusOne = -1.0, zero = 0.0;
	double num, den = 0, tmp;
	double rnorm = 0.0;
	const double tol = 1e-8;                 // relative residual at which the iteration stops
	const double bnorm = norm2(n, b);
	int k = 0, maxit = 2000;

	CHECK(cudaEventRecord(start));

	//# r0 = b
	CHECK_CUBLAS(cublasDcopy(handle, n, db, 1, dr, 1));
	//# r0 = b − A∗x0
	CHECK_CUBLAS(cublasDgemv(handle, CUBLAS_OP_N, n, n, &minusOne, dA, n, dx, 1, &one, dr, 1));
	//# p0 = r0
	CHECK_CUBLAS(cublasDcopy(handle, n, dr, 1, dp, 1));
	CHECK_CUBLAS(cublasDnrm2(handle, n, dr, 1, &rnorm));

	while (k < maxit && rnorm > tol * bnorm) {
		//# pk^T ∗ rk (num)
		CHECK_CUBLAS(cublasDdot(handle, n, dp, 1, dr, 1, &num));
		//# A ∗ pk
		CHECK_CUBLAS(cublasDgemv(handle, CUBLAS_OP_N, n, n, &one, dA, n, dp, 1, &zero, dAxp, 1));
		//# pk^T ∗ A ∗ pk  (den)
		CHECK_CUBLAS(cublasDdot(handle, n, dp, 1, dAxp, 1, &den));
		if (!(den > 0.0))
			break;      // direction vanished, A is not positive definite, or den is NaN
		//# alpha_k = num/den
		alpha = num / den;
		//# x(k+1) = xk + alpha_k * pk
		CHECK_CUBLAS(cublasDaxpy(handle, n, &alpha, dp, 1, dx, 1));
		//# r(k+1) = rk − alpha_k * A ∗ pk
		tmp = -alpha;
		CHECK_CUBLAS(cublasDaxpy(handle, n, &tmp, dAxp, 1, dr, 1));
		CHECK_CUBLAS(cublasDnrm2(handle, n, dr, 1, &rnorm));
		//# A ∗ r(k+1)
		CHECK_CUBLAS(cublasDgemv(handle, CUBLAS_OP_N, n, n, &one, dA, n, dr, 1, &zero, dAxr, 1));
		//# pk^T ∗ A ∗ r(k+1)  (num)
		CHECK_CUBLAS(cublasDdot(handle, n, dp, 1, dAxr, 1, &num));
		//# beta_k = num/den
		beta = num / den;
		//# r1 = r(k+1)
		CHECK_CUBLAS(cublasDcopy(handle, n, dr, 1, dr1, 1));
		//# r1 = r1 - beta_k * pk
		tmp = -beta;
		CHECK_CUBLAS(cublasDaxpy(handle, n, &tmp, dp, 1, dr1, 1));
		//# p(k+1) = r(k+1) - beta_k * pk
		CHECK_CUBLAS(cublasDcopy(handle, n, dr1, 1, dp, 1));
		k++;
	}

	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	float milliseconds = 0;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("CG iterations: %d, residual norm: %e\n", k, rnorm);
	printf("    elapsed time: %.5f (sec)\n", milliseconds / 1000.0);

	// final solution
	double *y = (double *) malloc(sizeof(double) * n);
	if (y == NULL) {
		fprintf(stderr, "Cannot allocate the host buffer for A*x\n");
		return EXIT_FAILURE;
	}
	CHECK_CUBLAS(cublasGetVector(n, sizeof(double), dx, 1, x, 1));
	// db is reused as scratch on the device; the host copy of b stays intact
	CHECK_CUBLAS(cublasDgemv(handle, CUBLAS_OP_N, n, n, &one, dA, n, dx, 1, &zero, db, 1));   // db = A∗x
	CHECK_CUBLAS(cublasGetVector(n, sizeof(double), db, 1, y, 1));                            // y = A∗x (should approximate b)

	//plot norms of the vectors
	printf("norm b = %f\n", norm2(n, b));
	printf("norm y = %f\n", norm2(n, y));
	//plot_vec(n, b, 'b');
	//plot_vec(n, y, 'y');

	free(A);
	free(x);
	free(b);
	free(y);
	CHECK(cudaFree(dA));
	CHECK(cudaFree(dx));
	CHECK(cudaFree(db));
	CHECK(cudaFree(dr));
	CHECK(cudaFree(dr1));
	CHECK(cudaFree(dp));
	CHECK(cudaFree(dAxp));
	CHECK(cudaFree(dAxr));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	CHECK_CUBLAS(cublasDestroy(handle));

	return EXIT_SUCCESS;
}

/*
 * Build a random symmetric n x n matrix as the product R * R^T, where R is an
 * n x n matrix of rand() values in [0, 1]. A product of this form is symmetric
 * and positive semi-definite.
 *
 *   n  matrix order
 *   A  receives the address of the new matrix (malloc'ed; the caller frees it)
 *
 * The scratch matrix R is released before returning. The program terminates
 * with EXIT_FAILURE when either buffer cannot be allocated.
 */
void generate_rand_posdefinite_mat(int n, double **A) {
	double *a = (double *) malloc(sizeof(double) * n * n);
	double *r = (double *) malloc(sizeof(double) * n * n);
	if (n > 0 && (a == NULL || r == NULL)) {
		fprintf(stderr, "Cannot allocate a %d x %d matrix\n", n, n);
		free(a);
		free(r);
		exit(EXIT_FAILURE);
	}

	// generate a random matrix
	for (int i = 0; i < n; i++)
		for (int j = 0; j < n; j++)
			r[i*n+j] = (double)rand() / RAND_MAX;

	// compute the product with its transpose; only the upper triangle is
	// computed and each value is mirrored into the lower triangle
	for (int i = 0; i < n; i++)
		for (int j = i; j < n; j++) {
			double dot = 0;
			for (int k = 0; k < n; k++)
				dot += r[i*n+k]*r[j*n+k];
			a[i*n+j] = dot;
			a[j*n+i] = dot;
		}

	free(r);   // scratch only: the caller never sees it
	*A = a;
}

/*
 * Print the n x n matrix A (indexed in row-major order): a header line
 * carrying the one-character label `name`, then one output line per row.
 */
void plot_mat(int n, double *A, char name) {
	printf("\nShow mat %c...\n", name);
	int r = 0;
	while (r < n) {
		int c = 0;
		while (c < n) {
			printf("%4.1f ", A[IDX2R(r,c,n)]);
			c++;
		}
		printf("\n");
		r++;
	}
	printf("\n");
}

/*
 * Euclidean (2-)norm of the first n elements of x: the square root of the sum
 * of their squares. Returns 0 when n is not positive.
 */
double norm2(int n, double *x) {
	double sumSquares = 0;
	int i = 0;
	while (i < n) {
		sumSquares += x[i]*x[i];
		i++;
	}
	return sqrt(sumSquares);
}

/*
 * Print the n elements of x on a single line, after a header line carrying
 * the one-character label `name`.
 */
void plot_vec(int n, double *x, char name) {
	printf("\nShow vec %c...\n", name);
	int i = 0;
	while (i < n) {
		printf("%4.1f ", x[i]);
		i++;
	}
	printf("\n");
}

/*
 * Allocate a vector of n double-precision values and fill it with
 * pseudo-random numbers in [0, 1] drawn from rand().
 *
 *   n  number of elements
 *   x  receives the address of the new vector (malloc'ed; the caller frees it)
 */
void generate_random_vector(int n, double **x) {
	double *values = (double *) malloc(n * sizeof(double));

	int filled = 0;
	while (filled < n) {
		values[filled] = (double)rand() / RAND_MAX;
		++filled;
	}
	*x = values;
}

In [None]:
# Compile and run: builds src/conj_grad.cu for the sm_75 architecture,
# links it against cuBLAS (-lcublas) and executes the resulting binary

!nvcc -arch=sm_75 src/conj_grad.cu  -o CG -lcublas
!./CG

# ✅ cuRAND


In [None]:
%%cuda --name PI_kernel_MC.cu

#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand_kernel.h>
#include "../GPUcomputing/utils/common.h"


#define TRIALS_PER_THREAD 10000
#define BLOCKS  264
#define THREADS 264
#define PI 3.1415926535 // known value of pi

/*
 * Monte Carlo estimate of pi on the CPU: draw `trials` points with both
 * coordinates uniform in [0, 1] (from rand()) and return four times the
 * fraction that falls inside the unit quarter circle.
 */
float pi_mc_CPU(long trials) {
	long hits = 0;
	long done = 0;
	while (done < trials) {
		// abscissa first, ordinate second: two rand() draws per point
		float x = rand() / (float) RAND_MAX;
		float y = rand() / (float) RAND_MAX;
		if (x * x + y * y <= 1.0f)
			hits++;
		done++;
	}
	return 4.0f * hits / trials;
}

/*
 * Monte Carlo estimate of pi, one independent estimate per thread.
 *
 * Each thread initialises its own generator state in states[tid] (seed = tid,
 * sequence 0, offset 0), draws TRIALS_PER_THREAD points from curand_uniform
 * and stores four times its hit fraction in estimate[tid]. There is no bounds
 * check: both arrays must hold one entry per launched thread.
 */
__global__ void pi_mc_GPU(float *estimate, curandState *states) {
	const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
	curandState *rng = &states[tid];   // this thread's generator state
	curand_init(tid, 0, 0, rng);

	int hits = 0;
	int remaining = TRIALS_PER_THREAD;
	while (remaining-- > 0) {
		float x = curand_uniform(rng);
		float y = curand_uniform(rng);
		if (x * x + y * y <= 1.0f)
			hits++;
	}
	estimate[tid] = 4.0f * hits / (float) TRIALS_PER_THREAD;
}

/*
 * MAIN: MC method
 *
 * Estimates pi twice with the same number of points, first on the CPU
 * (pi_mc_CPU) and then on the GPU (pi_mc_GPU, one partial estimate per thread
 * averaged on the host), and prints both estimates, their error against PI,
 * the elapsed times and the resulting speed-up.
 *
 * CPU time is taken with seconds() from common.h (not shown here). The stamps
 * are kept in double: assuming seconds() returns a wall-clock value in
 * seconds, a float could not resolve sub-second differences at that magnitude.
 */
int main(void) {

	float host[BLOCKS * THREADS];
	float *dev;

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// CPU procedure
	double iStart = seconds();
	float pi_cpu = pi_mc_CPU((long) THREADS * BLOCKS * TRIALS_PER_THREAD);
	double iElaps = seconds() - iStart;
	printf("CPU elapsed time: %.5f (sec)\n", iElaps);
	printf("CPU estimate of PI = %f [error of %f]\n", pi_cpu, fabs(pi_cpu - PI));

	// GPU procedure
	curandState *devStates;
	CHECK(cudaMalloc((void **) &dev, BLOCKS * THREADS * sizeof(float)));
	CHECK(cudaMalloc((void **) &devStates, BLOCKS * THREADS * sizeof(curandState)));
	CHECK(cudaEventRecord(start));
	pi_mc_GPU<<<BLOCKS, THREADS>>>(dev, devStates);
	CHECK(cudaGetLastError());          // report an invalid launch configuration right away
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaMemcpy(host, dev, BLOCKS * THREADS * sizeof(float), cudaMemcpyDeviceToHost));

	// average the per-thread estimates; the sum is kept in double so that
	// adding tens of thousands of floats does not accumulate rounding error
	double sum = 0.0;
	for (int i = 0; i < BLOCKS * THREADS; i++)
		sum += host[i];
	float pi_gpu = (float) (sum / (BLOCKS * THREADS));

	float milliseconds = 0;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	printf("\nGPU elapsed time (curand Monte Carlo): %.5f (sec)\n", milliseconds / 1000);
	printf("GPU estimate of PI = %f [error of %f ]\n", pi_gpu, fabs(pi_gpu - PI));
	printf("Speed-up           = %.0f\n", iElaps/milliseconds*1000);

	CHECK(cudaFree(dev));
	CHECK(cudaFree(devStates));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	return 0;
}

In [None]:
# Compile and run: builds src/PI_kernel_MC.cu for the sm_75 architecture
# and executes the resulting binary

!nvcc -arch=sm_75 src/PI_kernel_MC.cu  -o mc_PI
!./mc_PI

In [None]:
%%cuda --name PI_host_MC.cu

#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include "../GPUcomputing/utils/common.h"

#define TRIALS_PER_THREAD 10000
#define BLOCKS  264
#define THREADS 264
#define PI 3.1415926535 // known value of pi

/*
 * Monte Carlo estimate of pi using the cuRAND host API.
 *
 * Two arrays of `trials` uniform floats (the x and y coordinates) are
 * generated on the device with a single pseudo-random generator, copied to
 * the host, and the points inside the unit quarter circle are counted there.
 *
 * Each array holds `trials` floats, on the host as well as on the device, so
 * the host allocations are checked before use. Returns EXIT_SUCCESS, or
 * EXIT_FAILURE when the host buffers cannot be allocated.
 */
int main(void) {

	long trials = (long) THREADS * BLOCKS * TRIALS_PER_THREAD; // num points

	printf("Number of random points in the square = %ld\n", trials);

	curandGenerator_t gen;
	float *X_d, *X, *Y_d, *Y ;

	// Allocate points on host
	X = (float *) malloc(trials * sizeof(float));
	Y = (float *) malloc(trials * sizeof(float));
	if (X == NULL || Y == NULL) {
		fprintf(stderr, "Cannot allocate %ld floats per coordinate on the host\n", trials);
		free(X);
		free(Y);
		return EXIT_FAILURE;
	}

	/* Allocate `trials` floats per coordinate on device */
	CHECK(cudaMalloc((void **)&X_d, trials * sizeof(float)));
	CHECK(cudaMalloc((void **)&Y_d, trials * sizeof(float)));

	// Create pseudo-random number generator
	CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

	// Set seed
	CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

	// Generate 2 * trials floats on device (x coordinates first, then y)
	CHECK_CURAND(curandGenerateUniform(gen, X_d, trials));
	CHECK_CURAND(curandGenerateUniform(gen, Y_d, trials));

	// Copy device memory to host
	CHECK(cudaMemcpy(X, X_d, trials * sizeof(float), cudaMemcpyDeviceToHost));
	CHECK(cudaMemcpy(Y, Y_d, trials * sizeof(float), cudaMemcpyDeviceToHost));

	// num of points within the circle
	ulong points_in_circle = 0;
	for (long i = 0; i < trials; i++)
		points_in_circle += (X[i] * X[i] + Y[i] * Y[i] <= 1.0f);

	// estimate PI
	float pi = 4.0f * points_in_circle / (float)trials;
	printf("Estimate of PI = %f [error of %f]\n", pi, fabs(pi - PI));

	// Cleanup
	CHECK_CURAND(curandDestroyGenerator(gen));
	CHECK(cudaFree(X_d));
	CHECK(cudaFree(Y_d));
	free(X);
	free(Y);
	return EXIT_SUCCESS;
}


In [None]:
# Compile and run: builds src/PI_host_MC.cu for the sm_75 architecture,
# links it against cuRAND (-lcurand) and executes the resulting binary

!nvcc -arch=sm_75 src/PI_host_MC.cu -o mc_PI -lcurand
!./mc_PI

# 🔴 TODO

In [None]:
%%cuda --name Gauss_MC.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand_kernel.h>
#include "../GPUcomputing/utils/common.h"

#define TRIALS_PER_THREAD 10000
#define BLOCKS  264
#define THREADS 264
#define PI 3.1415926535 // known value of pi

/*
 * Hit-or-miss Monte Carlo on the CPU for the curve exp(-x^2/2) over [a, b].
 *
 * Draws `trials` points with x uniform in [a, b] and y uniform in [0, 1]
 * (both from rand()) and returns the fraction lying on or below the curve.
 * The `max` parameter is accepted but not used by the computation.
 */
float Gauss_CPU(long trials, float a, float b, float max) {
	long hits = 0;
	long done = 0;
	while (done < trials) {
		// abscissa first, ordinate second: two rand() draws per point
		float x = (b-a)*(rand() / (float) RAND_MAX)+a;
		float y = (rand() / (float) RAND_MAX);
		if (y <= expf(-x*x/2))
			hits++;
		done++;
	}
	return hits / (float)trials;
}

/*
 * Hit-or-miss Monte Carlo on the GPU for the curve exp(-x^2/2) over [a, b],
 * one independent estimate per thread.
 *
 * Each thread initialises its own generator state in states[tid] (seed = tid,
 * sequence 0, offset 0), draws TRIALS_PER_THREAD points with x in [a, b] and
 * y in [0, 1], and stores in estimate[tid] the fraction lying on or below the
 * curve. There is no bounds check: both arrays must hold one entry per
 * launched thread.
 */
__global__ void Gauss_GPU(float *estimate, curandState *states, float a, float b) {
	unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
	int hits = 0;
	curand_init(tid, 0, 0, &states[tid]);
	for (int i = 0; i < TRIALS_PER_THREAD; i++) {
		float x = (b - a) * curand_uniform(&states[tid]) + a;
		float y = curand_uniform(&states[tid]);
		hits += (y <= expf(-x * x / 2));
	}
	estimate[tid] = hits / (float) TRIALS_PER_THREAD;
}

/*
 * Estimates P(a <= X <= b) for a standard normal X with a = -1 and b = 2,
 * first on the CPU (Gauss_CPU) and then on the GPU (Gauss_GPU), and prints
 * both estimates, their error against the reference value, the elapsed times
 * and the speed-up.
 *
 * Both procedures return the fraction of points under exp(-x^2/2) inside the
 * box [a, b] x [0, 1]; multiplying by A = (b - a) / sqrt(2*pi) turns that
 * fraction into the probability.
 *
 * CPU time is taken with seconds() from common.h (not shown here). The stamps
 * are kept in double: assuming seconds() returns a wall-clock value in
 * seconds, a float could not resolve sub-second differences at that magnitude.
 */
int main(int argc, char *argv[]) {

	float host[BLOCKS * THREADS];
	float *dev;
	float a = -1;
	float b = 2;
	float max = 1.0f/sqrt(2*PI);
	float A = (b-a)*max;
	float P_true = 0.818594;

	// events to measure time
	cudaEvent_t start, stop;
	CHECK(cudaEventCreate(&start));
	CHECK(cudaEventCreate(&stop));

	// CPU procedure
	double iStart = seconds();
	long N = (long) THREADS * BLOCKS * TRIALS_PER_THREAD;
	float P_cpu = Gauss_CPU(N,a,b,max);
	double iElaps = seconds() - iStart;
	P_cpu = P_cpu*A;
	printf("CPU elapsed time: %.5f (sec)\n", iElaps);
	printf("CPU estimate of P = %f [error of %f]\n", P_cpu, fabsf(P_cpu - P_true));

	// GPU procedure
	curandState *devStates;
	CHECK(cudaMalloc((void **) &dev, BLOCKS * THREADS * sizeof(float)));
	CHECK(cudaMalloc((void **) &devStates, BLOCKS * THREADS * sizeof(curandState)));
	CHECK(cudaEventRecord(start));
	Gauss_GPU<<<BLOCKS, THREADS>>>(dev, devStates, a, b);
	CHECK(cudaGetLastError());          // report an invalid launch configuration right away
	CHECK(cudaEventRecord(stop));
	CHECK(cudaEventSynchronize(stop));
	CHECK(cudaMemcpy(host, dev, BLOCKS * THREADS * sizeof(float), cudaMemcpyDeviceToHost));

	// average the per-thread fractions (summed in double), then scale by the box area factor
	double sum = 0.0;
	for (int i = 0; i < BLOCKS * THREADS; i++)
		sum += host[i];
	float P = (float) (sum / (BLOCKS * THREADS)) * A;

	float milliseconds = 0;
	CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
	double gpuSeconds = milliseconds / 1000.0;

	printf("GPU elapsed time: %.5f (sec)\n", gpuSeconds);
	printf("GPU estimate of P = %f [error of %f ]\n", P, fabsf(P - P_true));
	printf("Speedup = %f\n", iElaps/gpuSeconds);
	CHECK(cudaFree(dev));
	CHECK(cudaFree(devStates));
	CHECK(cudaEventDestroy(start));
	CHECK(cudaEventDestroy(stop));
	return 0;
}


In [None]:
# Compile and run: builds src/Gauss_MC.cu for the sm_75 architecture
# and executes the resulting binary

!nvcc -arch=sm_75 src/Gauss_MC.cu -o Gauss_MC
!./Gauss_MC

# ✅ cuFFT

In [None]:
%%cuda --name cufft.cu

#include <stdio.h>
#include <stdlib.h>
#include <cufft.h>
#include "../GPUcomputing/utils/common.h"

#define BATCH 16

/*
 * An example usage of the cuFFT library. This example performs a 1D forward
 * FFT.
 */

int nprints = 30;

/*
 * Allocate N single-precision samples of cos(x) taken at x = 0, d, 2d, ...
 * with a step d = pi/20, i.e. 40 samples per period.
 *
 *   N    number of samples
 *   out  receives the address of the new array (malloc'ed; the caller frees it)
 */
void generate_fake_samples(int N, float **out) {
	float *samples = (float *) malloc(sizeof(float) * N);
	float delta = M_PI / 20.0;
	int i = 0;
	while (i < N) {
		samples[i] = cos(i * delta);
		i++;
	}
	*out = samples;
}

/*
 * Allocate N single-precision samples of a rectangular pulse: the first
 * N/100 samples (integer division) are 1, all the others are 0.
 *
 *   N    number of samples
 *   out  receives the address of the new array (calloc'ed; the caller frees it)
 */
void rect(uint N, float **out) {
	float *pulse = (float *) calloc(N, sizeof(float));
	const uint width = N / 100;
	uint k = 0;
	while (k < width) {
		pulse[k] = 1.0f;
		k++;
	}
	*out = pulse;
}

/*
 * Convert a real-valued vector r of length N to a complex-valued vector:
 * each real part is copied from r and each imaginary part is set to zero.
 *
 *   r       input samples (N floats)
 *   complx  receives the address of the new array (malloc'ed; the caller frees it)
 *   N       number of samples
 */
void real_to_complex(float *r, cufftComplex **complx, int N) {
	cufftComplex *converted = (cufftComplex *) malloc(sizeof(cufftComplex) * N);
	*complx = converted;

	for (int k = 0; k < N; ++k) {
		converted[k].x = r[k];
		converted[k].y = 0;
	}
}

/*
 * Computes the forward 1D complex-to-complex FFT of a rectangular pulse of
 * 1024*1024 samples with cuFFT (in place on the device) and writes the
 * spectrum to FFTdata.txt, one "real, imag" pair per line.
 *
 * The printed elapsed time covers the real-to-complex conversion, the plan
 * creation, the device allocation, both transfers and the transform itself.
 * It is taken with seconds() from common.h (not shown here); the stamps are
 * kept in double: assuming seconds() returns a wall-clock value in seconds,
 * a float could not resolve sub-second differences at that magnitude.
 *
 * Returns 0 on success, EXIT_FAILURE when a host buffer cannot be allocated
 * or the output file cannot be opened or completely written.
 */
int main(int argc, char **argv) {

	int i;
	int N = 1024*1024;
	int status = 0;
	float *samples;
	cufftHandle plan = 0;
	cufftComplex *dComplexSamples, *complexSamples, *complexFreq;

	// Input Generation
	rect(N, &samples);

	printf("Start computation...\n");
	double start = seconds();
	real_to_complex(samples, &complexSamples, N);

	complexFreq = (cufftComplex *) malloc(sizeof(cufftComplex) * N);
	if (samples == NULL || complexSamples == NULL || complexFreq == NULL) {
		fprintf(stderr, "Cannot allocate the host buffers for %d samples\n", N);
		free(samples);
		free(complexSamples);
		free(complexFreq);
		return EXIT_FAILURE;
	}

	// Setup the cuFFT plan
	CHECK_CUFFT(cufftPlan1d(&plan, N, CUFFT_C2C, 1));

	// Allocate device memory
	CHECK(cudaMalloc((void **)&dComplexSamples, sizeof(cufftComplex) * N));

	// Transfer inputs into device memory
	CHECK(cudaMemcpy(dComplexSamples, complexSamples, sizeof(cufftComplex) * N, cudaMemcpyHostToDevice));

	// Execute a complex-to-complex 1D FFT (in place: input and output share the buffer)
	CHECK_CUFFT(cufftExecC2C(plan, dComplexSamples, dComplexSamples, CUFFT_FORWARD));

	// Retrieve the results into host memory (this blocking copy also waits for the transform)
	CHECK(cudaMemcpy(complexFreq, dComplexSamples, sizeof(cufftComplex) * N, cudaMemcpyDeviceToHost));

	double elaps = seconds() - start;

	printf("Elapsed time: %f (sec)\n", elaps);

	// save FFT on a file
	printf("Save on file...\n");
	FILE *filePtr = fopen("FFTdata.txt","w");
	if (filePtr == NULL) {
		perror("FFTdata.txt");
		status = EXIT_FAILURE;
	} else {
		for (i = 0; i < N; i++) {
			fprintf(filePtr, "%.3g, %.5g\n", complexFreq[i].x, complexFreq[i].y);
		}
		// closing flushes the buffered lines; a failure here means the file is incomplete
		if (fclose(filePtr) != 0) {
			perror("FFTdata.txt");
			status = EXIT_FAILURE;
		}
	}

	free(samples);
	free(complexSamples);
	free(complexFreq);

	CHECK(cudaFree(dComplexSamples));
	CHECK_CUFFT(cufftDestroy(plan));
	return status;
}

In [None]:
# Compile and run: builds src/cufft.cu for the sm_75 architecture,
# links it against cuFFT (-lcufft) and executes the resulting binary
!nvcc -arch=sm_75 src/cufft.cu -o fft -lcufft
!./fft

In [None]:
# python code: read FFT data file and plot the FFT magnitude

import matplotlib.pyplot as plt
import numpy as np

# Read the spectrum written by the cuFFT program: one "real, imag" pair per line,
# loaded as two columns
real_part, imag_part = np.loadtxt("FFTdata.txt", delimiter=",", unpack=True)

# Magnitude of each frequency bin: |F| = sqrt(re^2 + im^2)
F = np.hypot(real_part, imag_part)

# Plot only the lowest bins, where the lobes of the spectrum are visible
N_BINS = 500
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(F[:N_BINS])
ax.set(
    title=f"FFT magnitude (first {N_BINS} bins)",
    xlabel="Frequency bin",
    ylabel="|F|",
)
plt.show()