Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

wrote a working version of vector add on the gpu. Both CPU and GPU ve…

…rsions take under a second, so it's hard to justify the use of the GPU. And the GPU load/unload to memory takes a couple of sub-seconds longer than on the CPU. However, I'm quite proud that my program runs in parallel on the GPU (~512 work items, each of 2411 adds)
  • Loading branch information...
commit 3ec4b357479a8b74e7d7f9e984daeabd63043938 1 parent 5f62acb
@yglee authored
View
15 testcode/CMakeLists.txt
@@ -2,19 +2,26 @@ include_directories(${EIGEN3_INCLUDE_DIR})
include_directories(${OPENCL_INCLUDE_DIRS})
include_directories(${CMAKE_CURRENT_BINARY_DIR}/..)
-add_library(OPENCLWRAPPER
+link_libraries(${OPENCL_LIBRARIES})
+
+add_library(OPENCL_WRAPPER
openCLWrapper.cpp
+ vector_add_cpu.cpp
)
+target_link_libraries(OPENCL_WRAPPER)
+
install(FILES
openCLWrapper.h
+ utilities.h
+ vector_add_cpu.h
DESTINATION include/fslam/test
)
install(TARGETS
- OPENCLWRAPPER
+ OPENCL_WRAPPER
DESTINATION lib
-)
+ )
-set(FSLAM_USED_LIBS ${OPENCL_LIBRARIES} OPENCLWRAPPER)
+set(FSLAM_USED_LIBS ${OPENCL_LIBRARIES} OPENCL_WRAPPER)
add_fslam_tool(fslam_gpu test_main.cc)
View
10 testcode/openCLWrapper.h
@@ -3,8 +3,18 @@
#ifndef OPENCL_WRAPPER_H
#define OPENCL_WRAPPER_H
+#ifdef __APPLE__
#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __APPLE__
#include <OpenCL/cl_gl.h>
+#else
+#include <CL/cl_gl.h>
+#endif
+
#include <string>
#include <iostream>
View
74 testcode/test_main.cc
@@ -4,6 +4,8 @@
#include <assert.h>
#include <iostream>
#include <math.h>
+#include "utilities.h"
+#include "vector_add_cpu.h"
#define PROGRAM_FILE "vector_add_gpu.cl"
@@ -12,7 +14,8 @@ using namespace std;
const int NUM_SRC_PARAMS = 2;
const int NUM_RET_PARAMS = 1;
-const int VEC_SIZE = 1234567;
+const int VEC_SIZE =
+;
size_t shrRoundUp( const size_t group_size, const size_t global_size )
{
@@ -22,28 +25,48 @@ size_t shrRoundUp( const size_t group_size, const size_t global_size )
return global_size + group_size - r;
}
-int main()
+int main()
{
- OpenCLContext context;
+ MyTimer Timer= MyTimer();
+ Timer.Start();
+ /*
+ float *src1 = new float[VEC_SIZE];
+ float *src2 = new float[VEC_SIZE];
+ float *res = new float[VEC_SIZE];
+
+ for (int i=0; i<VEC_SIZE; i++) {
+ src1[i] = src2[i] = (float) i;
+ }
+
+ vector_add_cpu(src1,src2,res,VEC_SIZE);
+ */
+
+ OpenCLContext *context;
OpenCLKernel *math_kernel;
//initialize context (sets up platform, device, context, command queue)
- context = OpenCLContext(0);
+ context = new OpenCLContext(false);
+ if (!context->IsValid()) {
+ printf("CONTEXT IS NOT VALID\n");
+ assert(false);
+ }
//get kernel paths (hard coded for now)
- string kernelpaths = "/Users/ylee8/FastSLAM/testcode";
+ string kernelpaths = "/Users/ylee8/FastSLAM/testcode/";
//create program (CreateProgramFromFile also builds the prog)
- OpenCLProgram *prog = context.CreateProgramFromFile(kernelpaths + PROGRAM_FILE);
+ cout<<"kernel path: "<<kernelpaths + PROGRAM_FILE<<endl;
+ OpenCLProgram *prog = context->CreateProgramFromFile(kernelpaths + PROGRAM_FILE);
if(!prog) {
cerr<<"Error: couldn't load OpenCL program!"<<endl;
- return 0;
+ return 1;
}
//extract the kernel
math_kernel = prog->CreateKernel("vecAdd_k");
if (!math_kernel) {
cerr<<"Error: couldn't load OpenCL Kernel!"<<endl;
+ return 1;
}
//////////////////////////
@@ -51,9 +74,11 @@ int main()
//////////////////////////
//allocates a buffer of size mem_size and copies mem_size bytes from src1_dataf and src2_dataf
- OpenCLBuffer *src_params;
+ OpenCLBuffer *src_params_1;
+ OpenCLBuffer *src_params_2;
const int mem_size = VEC_SIZE * sizeof(float);
- src_params = context.AllocMemBuffer(mem_size,CL_MEM_READ_ONLY);
+ src_params_1 = context->AllocMemBuffer(mem_size,CL_MEM_READ_ONLY);
+ src_params_2 = context->AllocMemBuffer(mem_size,CL_MEM_READ_ONLY);
// Initialize both vectors
float* src1_dataf = new float[VEC_SIZE];
@@ -61,15 +86,15 @@ int main()
for (int i = 0; i < VEC_SIZE; i++) {
src1_dataf[i] = src2_dataf[i] = (float) i;
}
- src_params->UploadData(src1_dataf, VEC_SIZE * sizeof(float));
- src_params->UploadData(src2_dataf, VEC_SIZE * sizeof(float));
+ src_params_1->UploadData(src1_dataf, VEC_SIZE * sizeof(float));
+ src_params_2->UploadData(src2_dataf, VEC_SIZE * sizeof(float));
//////////////////////////
// allocate mem for result
//////////////////////////
OpenCLBuffer *res_params;
- res_params = context.AllocMemBuffer(mem_size,CL_MEM_WRITE_ONLY);
+ res_params = context->AllocMemBuffer(mem_size,CL_MEM_WRITE_ONLY);
/////////////////////
@@ -77,12 +102,14 @@ int main()
/////////////////////
//set up the kernel args (calls clKernelArgs)
- math_kernel->SetBuf(0,src_params); //SetBuf(which argument, buffer)
- math_kernel->SetBuf(1,res_params);
+ math_kernel->SetBuf(0,src_params_1); //SetBuf(which argument, buffer)
+ math_kernel->SetBuf(1,src_params_2);
+ math_kernel->SetBuf(2,res_params);
+ math_kernel->SetArg<int>(3,VEC_SIZE);
//context finish (blocks until everything in CL queue is processed
- context.Finish();
+ context->Finish();
//run the kernel
const size_t global_size_offset = NULL; //
@@ -92,6 +119,19 @@ int main()
math_kernel->RunKernel(dim, &global_work_size, &local_work_size);
- context.Finish();
- return 1;
+
+ //Get the result
+ float *result = new float[VEC_SIZE];
+ res_params->DownloadData(result, mem_size);
+
+ /*
+ for (size_t i =0; i<VEC_SIZE; i++) {
+ cout<<"result "<<result[i]<<endl;
+ }
+ */
+
+ //context->Finish();
+ Timer.Stop();
+ Timer.Print("gpu vec add done");
+ return 0;
}
View
0  testcode/touch
No changes.
View
51 testcode/utilities.h
@@ -0,0 +1,51 @@
+#include <stdio.h>
+#include <sys/time.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// MyTimer
+////////////////////////////////////////////////////////////////////////////////
+
+// Small stopwatch helper built on gettimeofday(): Start() stores tv1, Stop() stores tv2.
+// Stop() returns the elapsed time in seconds as a double; Print() printf's the tv2 - tv1 interval.
+class MyTimer
+{
+public:
+ MyTimer() // empty constructor: tv1/tv2 hold no meaningful value until Start()/Stop() are called
+ {
+ }
+
+ void Start() // records the starting timestamp
+ {
+ gettimeofday(&tv1, NULL); // tv1 = current time of day
+ }
+ double Stop() // records the ending timestamp and returns elapsed seconds
+ {
+ gettimeofday(&tv2, NULL); // tv2 = current time of day
+ int sec = tv2.tv_sec - tv1.tv_sec; // whole-second difference, stored in an int
+ int usec = tv2.tv_usec - tv1.tv_usec; // microsecond difference; negative when tv2's usec field is the smaller one
+
+ if (usec < 0) // borrow one second so the microsecond part is non-negative
+ {
+ sec--;
+ usec = 1000000 + usec;
+ }
+
+ return (double)sec + (double)usec / 1000000.0; // seconds plus microseconds as the fractional part
+ }
+ void Print(const char *label) // prints "<label> took S sec U usec" for the stored tv1/tv2; does not read the clock itself
+ {
+ int sec = tv2.tv_sec - tv1.tv_sec; // same difference computation as in Stop()
+ int usec = tv2.tv_usec - tv1.tv_usec;
+
+ if (usec < 0) // same borrow adjustment as in Stop()
+ {
+ sec--;
+ usec = 1000000 + usec;
+ }
+ printf("%s took %d sec %d usec\n", label, sec, usec);
+ }
+
+private:
+ struct timeval tv1, tv2; // tv1 is written by Start(), tv2 by Stop()
+};
+
View
14 testcode/vector_add_cpu.cpp
@@ -1,8 +1,10 @@
+#include "vector_add_cpu.h"
+
void vector_add_cpu (const float* src_a,
- const float* src_b,
- float* res,
- const int num)
+ const float* src_b,
+ float* res,
+ const int num)
{
- for (int i = 0; i < num; i++)
- res[i] = src_a[i] + src_b[i];
-}
+ for (int i = 0; i < num; i++)
+ res[i] = src_a[i] + src_b[i];
+}
View
8 testcode/vector_add_cpu.h
@@ -1,4 +1,4 @@
-void vector_add_cpu (const float* src_a,
- const float* src_b,
- float* res,
- const int num);
+void vector_add_cpu (const float* src_a,
+ const float* src_b,
+ float* res,
+ const int num);
View
10 testcode/vector_add_gpu.cl
@@ -1,7 +1,9 @@
-__kernel void vector_add_gpu (__global const float* src_a,
- __global const float* src_b,
- __global float* res,
- const int num)
+//CL
+__kernel void
+vecAdd_k (__global const float* src_a,
+ __global const float* src_b,
+ __global float* res,
+ const int num)
{
/* get_global_id(0) returns the ID of the thread in execution.
As many threads are launched at the same time, executing the same kernel,
Please sign in to comment.
Something went wrong with that request. Please try again.