<a href="https://colab.research.google.com/github/trainsn/cuda-learning/blob/master/udacity_cs344_hw5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Homework 5 for Udacity CS344 Course, Intro to Parallel Programming
# clone the code repo,
!git clone https://github.com/depctg/udacity-cs344-colab
!pip install git+git://github.com/depctg/nvcc4jupyter.git

# load cuda plugin
%config NVCCPluginV2.static_dir = True
%config NVCCPluginV2.relative_dir = "udacity-cs344-colab/src/HW5"
%load_ext nvcc_plugin

# change to work directory, generate makefiles
!mkdir udacity-cs344-colab/build
%cd udacity-cs344-colab/build
!cmake ../src

Cloning into 'udacity-cs344-colab'...
remote: Enumerating objects: 144, done.[K
remote: Total 144 (delta 0), reused 0 (delta 0), pack-reused 144[K
Receiving objects: 100% (144/144), 3.93 MiB | 6.05 MiB/s, done.
Resolving deltas: 100% (43/43), done.
Collecting git+git://github.com/depctg/nvcc4jupyter.git
  Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-cgd4s54i
  Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-cgd4s54i
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4334 sha256=f70fc4154176f513c6ffc992301887d7e6c22cc69578f6014f0b9562dc4423f9
  Stored in directory: /tmp/pip-ephem-wheel-cache-rjeom66q/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [0]:
%%cuda --name student.cu

/* Udacity HW5
   Histogramming for Speed
   The goal of this assignment is to compute a histogram
   as fast as possible.  We have simplified the problem as much as
   possible to allow you to focus solely on the histogramming algorithm.
   The input values that you need to histogram are already the exact
   bins that need to be updated.  This is unlike in HW3 where you needed
   to compute the range of the data and then do:
   bin = (val - valMin) / valRange to determine the bin.
   Here the bin is just:
   bin = val
   so the serial histogram calculation looks like:
   for (i = 0; i < numElems; ++i)
     histo[val[i]]++;
   That's it!  Your job is to make it run as fast as possible!
   The values are normally distributed - you may take
   advantage of this fact in your implementation.
*/


#include "utils.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>

// Threads per block for the histogram kernel launch; computeHistogram also
// divides the element count by this value to derive the number of blocks.
const int N_THREADS =  1024;



// Baseline histogram kernel: one thread per input value. Each thread whose
// global index is within numVals atomically increments the bin in the
// global histogram that its input value names.
//   vals    - input values, used directly as bin indices
//   histo   - global histogram that receives the counts
//   numVals - number of entries in vals
__global__
void naiveHisto(const unsigned int* const vals, //INPUT
	unsigned int* const histo,      //OUTPUT
	int numVals)
{
	const int idx = blockDim.x*blockIdx.x + threadIdx.x;
	if (idx < numVals) {
		// Atomic because many threads may hit the same bin concurrently.
		atomicAdd(histo + vals[idx], 1);
	}
}

// Histogram kernel with a per-block private histogram in shared memory.
// Each block zeroes its own shared copy, accumulates one input value per
// thread into it with shared-memory atomics, then merges the copy into the
// global histogram with one global atomic per bin.
//   vals    - input values, used directly as bin indices
//   histo   - global histogram that receives the counts
//   numVals - number of entries in vals
//   numBins - number of bins; the launch must provide
//             numBins * sizeof(unsigned int) bytes of dynamic shared memory
__global__
void perBlockHisto(const unsigned int* const vals, //INPUT
	unsigned int* const histo,      //OUTPUT
	int numVals,int numBins) {

	extern __shared__ unsigned int sharedHisto[]; //same size as the global histo

	//zero this block's private histogram; threads stride by blockDim.x so
	//every bin is covered even when numBins > blockDim.x
	for (int i = threadIdx.x; i < numBins; i += blockDim.x) {
		sharedHisto[i] = 0;
	}

	__syncthreads();

	int globalid = threadIdx.x + blockIdx.x*blockDim.x;
	//guard rather than return early: threads past the end of the input must
	//still reach the barrier below and help with the merge loop
	if (globalid < numVals) {
		atomicAdd(&sharedHisto[vals[globalid]], 1);
	}

	//all increments to the private histogram must land before it is merged
	__syncthreads();

	for (int i = threadIdx.x; i < numBins; i += blockDim.x) {
		atomicAdd(&histo[i], sharedHisto[i]);
	}
}



// Host entry point: launches the histogram kernel over numElems values and
// waits for it to finish.
//   d_vals   - input values (passed straight to the kernel)
//   d_histo  - histogram the kernel accumulates into
//   numBins  - number of histogram bins
//   numElems - number of input values
void computeHistogram(const unsigned int* const d_vals, //INPUT
                      unsigned int* const d_histo,      //OUTPUT
                      const unsigned int numBins,
                      const unsigned int numElems)
{
	//nothing to count, and a launch with zero blocks is an invalid configuration
	if (numElems == 0) return;

	//round the block count UP so a final partial block covers the tail;
	//plain integer division would drop the last numElems % N_THREADS values
	const unsigned int blocks = numElems / N_THREADS + (numElems % N_THREADS != 0 ? 1 : 0);

	//baseline for comparison:
	//naiveHisto <<< blocks, N_THREADS >>> (d_vals, d_histo, numElems);

	//author's measurement: more than 7x speedup over naiveHisto
	perBlockHisto <<< blocks, N_THREADS, sizeof(unsigned int)*numBins >>> (d_vals, d_histo, numElems, numBins);

	//check the launch itself first, then any error raised while the kernel ran
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());
}

'File written in /content/udacity-cs344-colab/src/HW5/student.cu'

In [0]:
# Build the HW5 target using the makefiles generated by cmake in the setup
# cell, then run the resulting binary (paths are relative to the build
# directory entered with %cd).
!make HW5
print("\n====== RESULT OF HW5 =======\n")
!bin/HW5

[ 25%] [34m[1mBuilding NVCC (Device) object HW5/CMakeFiles/HW5.dir/HW5_generated_student.cu.o[0m
[ 50%] [32m[1mLinking CXX executable ../bin/HW5[0m
[100%] Built target HW5


489
Your code ran in: 6.133504 msecs.
