Permalink
Browse files

examples directory

  • Loading branch information...
1 parent ec0a9b4 commit ee7475ad8ada9143a72acbcbcd7afb095ca88744 @tonyrog committed Nov 24, 2009
Showing with 853 additions and 0 deletions.
  1. +40 −0 examples/Makefile
  2. +149 −0 examples/cl_basic.erl
  3. +21 −0 examples/cl_binary_test.erl
  4. +111 −0 examples/cl_hello.erl
  5. +319 −0 examples/cl_map.erl
  6. +153 −0 examples/cl_mul.erl
  7. +30 −0 examples/mul4x4.cl
  8. +30 −0 examples/z2.cl
View
40 examples/Makefile
@@ -0,0 +1,40 @@
+
+MODULES = \
+ cl_basic \
+ cl_hello \
+ cl_map \
+ cl_binary_test \
+ cl_mul
+
+
+
+EBIN = ../ebin
+ERLC = erlc
+
+override ERLC_FLAGS = -W
+
+OBJS = $(MODULES:%=$(EBIN)/%.beam)
+
+TARGET_FILES = $(OBJS)
+
+debug: ERLC_FLAGS += -Ddebug
+
+all: $(TARGET_FILES)
+
+debug: all
+
+release: all
+
+depend:
+ edep -MM -o ../ebin $(ERLC_FLAGS) $(MODULES:%=%.erl) > depend.mk
+
+dialyze:
+ dialyzer --src -o dia.out $(ERLC_FLAGS) -c $(MODULES:%=%.erl)
+
+clean:
+ rm -f $(OBJS)
+
+-include depend.mk
+
+$(EBIN)/%.beam: %.erl
+ $(ERLC) $(ERLC_FLAGS) -o $(EBIN) $<
View
149 examples/cl_basic.erl
@@ -0,0 +1,149 @@
+%% Basic tests
+-module(cl_basic).
+
+-compile(export_all).
+-import(lists, [foreach/2]).
+
+-include("../include/cl.hrl").
+
+test() ->
+ test(all).
+
+test(DevType) ->
+ E = clu:setup(DevType),
+ {ok,PlatformInfo} = cl:get_platform_info(E#cl.platform),
+ io:format("PlatformInfo: ~p\n", [PlatformInfo]),
+
+ foreach(
+ fun(Device) ->
+ io:format("Device: ~p\n", [Device]),
+ {ok,DeviceInfo} = cl:get_device_info(Device),
+ io:format("DeviceInfo: ~p\n", [DeviceInfo])
+ end, E#cl.devices),
+
+ {ok,ContextInfo} = cl:get_context_info(E#cl.context),
+ io:format("ContextInfo: ~p\n", [ContextInfo]),
+ cl:retain_context(E#cl.context),
+ {ok,ContextInfo2} = cl:get_context_info(E#cl.context),
+ io:format("Context2: ~p\n", [ContextInfo2]),
+
+ foreach(fun(Device) ->
+ test_queue(E, Device) end,
+ E#cl.devices),
+
+ test_sampler(E),
+
+ test_program(E#cl.context, E#cl.devices),
+
+ clu:teardown(E).
+
+test_program(Context, DeviceList) ->
+ %% Program1
+ Source1 = "
+__kernel void program1(int n, int m) {
+ int result = n + m;
+}
+",
+ {ok,Program} = cl:create_program_with_source(Context,Source1),
+ io:format("Program: ~p\n", [Program]),
+ {ok,Info} = cl:get_program_info(Program),
+ io:format("ProgramInfo: ~p\n", [Info]),
+ foreach(
+ fun(Device) ->
+ {ok,BuildInfo} = cl:get_program_build_info(Program,Device),
+ io:format("BuildInfo @ ~w: ~p\n", [Device,BuildInfo])
+ end, DeviceList),
+
+ case cl:build_program(Program, DeviceList, "-Dhello=1 -Dtest") of
+ ok ->
+ foreach(
+ fun(Device) ->
+ {ok,BuildInfo} = cl:get_program_build_info(Program,Device),
+ io:format("BuildInfo @ ~w: ~p\n", [Device,BuildInfo])
+ end, DeviceList),
+ {ok,Info1} = cl:get_program_info(Program),
+ io:format("ProgramInfo1: ~p\n", [Info1]),
+ {ok,Kernels} = cl:create_kernels_in_program(Program),
+ foreach(
+ fun(Kernel) ->
+ {ok,KernelInfo} = cl:get_kernel_info(Kernel),
+ io:format("KernelInfo: ~p\n", [KernelInfo]),
+ foreach(
+ fun(Device) ->
+ {ok,I}=cl:get_kernel_workgroup_info(Kernel,Device),
+ io:format("KernelWorkGroupInfo: ~p\n", [I])
+ end, DeviceList)
+ end, Kernels),
+ foreach(
+ fun(Device) ->
+ {ok,Queue} = cl:create_queue(Context,Device,[]),
+ foreach(
+ fun(Kernel) ->
+ cl:set_kernel_arg(Kernel, 0, 12),
+ cl:set_kernel_arg(Kernel, 1, 13),
+ {ok,Event} = cl:enqueue_task(Queue, Kernel, []),
+ {ok,EventInfo} = cl:get_event_info(Event),
+ io:format("EventInfo: ~p\n", [EventInfo]),
+ cl:flush(Queue),
+ io:format("Event Status:=~p\n",
+ [cl:wait(Event,1000)])
+ end, Kernels)
+ end, DeviceList),
+ ok;
+ Error ->
+ io:format("Build Error: ~p\n", [Error])
+ end,
+ cl:release_program(Program),
+ ok.
+
+
+test_queue(E, Device) ->
+ {ok,Queue} = cl:create_queue(E#cl.context,Device,[]),
+ io:format("Queue: ~p\n", [Queue]),
+ {ok,QueueInfo} = cl:get_queue_info(Queue),
+ io:format("QueueInfo: ~p\n", [QueueInfo]),
+ cl:release_queue(Queue),
+ ok.
+
+
+test_buffer(E) ->
+ %% Read/Write buffer
+ {ok,Buffer} = cl:create_buffer(E#cl.context,[read_write],1024),
+ io:format("Buffer: ~p\n", [Buffer]),
+ {ok,BufferInfo} = cl:get_mem_object_info(Buffer),
+ io:format("BufferInfo: ~p\n", [BufferInfo]),
+ cl:release_mem_object(Buffer),
+
+ %% Read only buffer
+ {ok,Buffer2} = cl:create_buffer(E#cl.context,[read_only],0,
+ <<"Hello brave new world">>),
+ io:format("Buffer2: ~p\n", [Buffer2]),
+ {ok,Buffer2Info} = cl:get_mem_object_info(Buffer2),
+ io:format("Buffer2Info: ~p\n", [Buffer2Info]),
+ cl:release_mem_object(Buffer2),
+ ok.
+
+
+
+test_sampler(E) ->
+ %% Sampler1
+ {ok,Sampler1} = cl:create_sampler(E#cl.context,true,clamp,nearest),
+ io:format("Sampler1: ~p\n", [Sampler1]),
+ {ok,Sampler1Info} = cl:get_sampler_info(Sampler1),
+ io:format("Sampler1Info: ~p\n", [Sampler1Info]),
+ cl:release_sampler(Sampler1),
+
+ %% Sampler2
+ {ok,Sampler2} = cl:create_sampler(E#cl.context,false,repeat,linear),
+ io:format("Sampler2: ~p\n", [Sampler2]),
+ {ok,Sampler2Info} = cl:get_sampler_info(Sampler2),
+ io:format("Sampler2Info: ~p\n", [Sampler2Info]),
+ cl:release_sampler(Sampler2),
+ ok.
+
+
+
+
+
+
+
View
21 examples/cl_binary_test.erl
@@ -0,0 +1,21 @@
+%%% File : cl_binary_test.erl
+%%% Author : Tony Rogvall <tony@rogvall.se>
+%%% Description : test build of binary programs
+%%% Created : 7 Nov 2009 by Tony Rogvall <tony@rogvall.se>
+
+-module(cl_binary_test).
+
+-export([test/0]).
+
+test() ->
+ E = clu:setup(),
+ {ok,P1} = clu:build_source(E, "__kernel void foo(int n) { int x; x = n; }"),
+ {ok,B} = clu:get_program_binaries(P1),
+ ok = cl:release_program(P1),
+ {ok,P2} = clu:build_binary(E, B),
+ ok = cl:release_program(P2),
+ ok.
+
+
+
+
View
111 examples/cl_hello.erl
@@ -0,0 +1,111 @@
+%%
+%% Hello program adpoted from "Hello World" OpenCL examples by apple
+%%
+-module(cl_hello).
+
+-compile(export_all).
+
+-import(lists, [map/2]).
+
+-include("../include/cl.hrl").
+
+-define(DATA_SIZE, 1024).
+
+source() ->
+"
+__kernel void square( __global float* input,
+ __global float* output,
+ const unsigned int count)
+{
+ int i = get_global_id(0);
+ if (i < count)
+ output[i] = input[i]*input[i];
+}
+".
+
+test_data() ->
+ << <<X:32/native-float>> || X <- lists:seq(1,?DATA_SIZE) >>.
+
+dump_data(Bin) ->
+ io:format("data=~p\n", [[ X || <<X:32/native-float>> <= Bin ]]).
+
+test() ->
+ test(cpu).
+
+test(DevType) ->
+ %% Create binary with floating points 1.0 ... 1024.0
+ Data = test_data(),
+ run(Data, DevType).
+
+%%
+%% execute a kernel that squares floating point numbers
+%% now only one device is used (We run on cpu for debugging)
+%%
+run(Data, DevType) ->
+ E = clu:setup(DevType),
+ io:format("platform created\n"),
+ {ok,Program} = clu:build_source(E, source()),
+ io:format("program built\n"),
+
+ N = byte_size(Data), %% number of bytes in indata
+ Count = N div 4, %% number of floats in indata
+
+ %% Create input data memory (implicit copy_host_ptr)
+ {ok,Input} = cl:create_buffer(E#cl.context,[read_only],N),
+ io:format("input memory created\n"),
+
+ %% Create the output memory
+ {ok,Output} = cl:create_buffer(E#cl.context,[write_only],N),
+ io:format("output memory created\n"),
+
+ %% Create the command queue for the first device
+ {ok,Queue} = cl:create_queue(E#cl.context,hd(E#cl.devices),[]),
+ io:format("queue created\n"),
+
+ %% Create the squre kernel object
+ {ok,Kernel} = cl:create_kernel(Program, "square"),
+ io:format("kernel created: ~p\n", [Kernel]),
+
+ clu:apply_kernel_args(Kernel, [Input, Output, Count]),
+ io:format("kernel args set\n"),
+
+ %% Write data into input array
+ {ok,Event1} = cl:enqueue_write_buffer(Queue, Input, 0, N, Data, []),
+ io:format("write data enqueued\n"),
+ erlang:display_string("enqueu write\n"),
+
+ Device = hd(E#cl.devices),
+ {ok,Local} = cl:get_kernel_workgroup_info(Kernel, Device, work_group_size),
+ io:format("work_group_size = ~p\n", [Local]),
+
+ %% Enqueue the kernel
+ Global = Count,
+ {ok,Event2} = cl:enqueue_nd_range_kernel(Queue, Kernel,
+ [Global], [Local], [Event1]),
+ io:format("nd range [~p, ~p] kernel enqueued\n",
+ [[Global],[Local]]),
+
+ %% Enqueue the read from device memory (wait for kernel to finish)
+ {ok,Event3} = cl:enqueue_read_buffer(Queue,Output,0,N,[Event2]),
+ io:format("read buffer enqueued\n"),
+
+ %% Now flush the queue to make things happend
+ ok = cl:flush(Queue),
+ io:format("flushed\n"),
+
+ %% Wait for Result buffer to be written
+ io:format("wait\n"),
+ io:format("Event1 = ~p\n", [cl:wait(Event1)]),
+ io:format("Event2 = ~p\n", [cl:wait(Event2)]),
+ Event3Res = cl:wait(Event3),
+ io:format("Event3 = ~p\n", [Event3Res]),
+
+ %%
+ cl:release_mem_object(Input),
+ cl:release_mem_object(Output),
+ cl:release_queue(Queue),
+ cl:release_kernel(Kernel),
+ cl:release_program(Program),
+
+ clu:teardown(E),
+ Event3Res.
View
319 examples/cl_map.erl
@@ -0,0 +1,319 @@
+-module(cl_map).
+
+-include_lib("cl/include/cl.hrl").
+
+-compile(export_all).
+-import(lists, [map/2, foreach/2, foldl/3]).
+
+-record(kwork,
+ {
+ queue, %% the queue
+ local, %% kernel work_group_size
+ freq, %% device max_clock_frequenct
+ units, %% device max_compute_units
+ weight, %% weight [0..1]
+ e1,e2,e3, %% events (fixme)
+ imem, %% input memory object
+ omem, %% output memory object
+ isize, %% item size
+ idata %% input data
+ }).
+
+test() ->
+ Args = << <<X:32/native-float>> || X <- lists:seq(1, 1024) >>,
+ ResultList = run("fun(<<X/cl_float>>) -> X*X+1 end", Args),
+ lists:flatmap(
+ fun(Result) ->
+ [ X || <<X:32/native-float>> <= Result ]
+ end, ResultList).
+
+%%
+%% Run a map operation over data
+%% Restrictions: the output must currently equal the size of
+%%
+%%
+run(Function, Data) ->
+ E = clu:setup(all), %% gpu needs more work
+ {_NArgs,ItemSize,Source} = p_program(Function),
+ io:format("Program:\n~s\n", [Source]),
+ {ok,Program} = clu:build_source(E, Source),
+ {ok,Kernel} = cl:create_kernel(Program, "main"),
+
+ Kws =
+ map(
+ fun(Device) ->
+ {ok,Queue} = cl:create_queue(E#cl.context,Device,[]),
+ {ok,Local} = cl:get_kernel_workgroup_info(Kernel,Device,
+ work_group_size),
+ {ok,Freq} = cl:get_device_info(Device,max_clock_frequency),
+ {ok,K} = cl:get_device_info(Device, max_compute_units),
+ #kwork{ queue=Queue, local=Local, freq=Freq, units=K,
+ isize=ItemSize }
+ end, E#cl.devices),
+ io:format("Kws = ~p\n", [Kws]),
+
+ %% Sum the weights and scale to [0..1]
+ Tw = foldl(fun(K,Sum) -> Sum + K#kwork.freq*K#kwork.units end,
+ 0, Kws),
+ Kws1 = map(fun(K) ->
+ K#kwork { weight = (K#kwork.freq*K#kwork.units)/Tw }
+ end, Kws),
+ io:format("Kws1 = ~p\n", [Kws1]),
+
+ %% Split data according to Weights but start with data
+ %% That have hard requirements on work_group_size
+ Kws11 = lists:reverse(lists:keysort(#kwork.local,Kws1)),
+ Kws2 = kwork_set_data(Kws11, Data),
+ io:format("Kws2 = ~p\n", [Kws2]),
+
+ %% Create memory objects
+ Kws3 = map(
+ fun(K) ->
+ Nk = byte_size(K#kwork.idata),
+ {ok,I} = cl:create_buffer(E#cl.context,[read_only],Nk),
+ {ok,O} = cl:create_buffer(E#cl.context,[write_only],Nk),
+ K#kwork { imem=I, omem=O }
+ end, Kws2),
+ io:format("Kws3 = ~p\n", [Kws3]),
+
+ %% Enque input data
+ Kws4 = map(
+ fun(K) ->
+ Nk = byte_size(K#kwork.idata),
+ Count = Nk div K#kwork.isize,
+ {ok,E1} = cl:enqueue_write_buffer(K#kwork.queue,
+ K#kwork.imem,
+ 0, Nk,
+ K#kwork.idata, []),
+ %% Set kernel arguments
+ ok = cl:set_kernel_arg(Kernel, 0, K#kwork.imem),
+ ok = cl:set_kernel_arg(Kernel, 1, K#kwork.omem),
+ ok = cl:set_kernel_arg(Kernel, 2, Count),
+
+ %% Enqueue the kernel
+ Global = Count,
+ io:format("Global=~w, Local=~w\n", [Global,K#kwork.local]),
+ {ok,E2} = cl:enqueue_nd_range_kernel(K#kwork.queue,
+ Kernel,
+ [Global], [K#kwork.local],
+ [E1]),
+ %% Enqueue the read from device memory (wait for kernel to finish)
+ {ok,E3} = cl:enqueue_read_buffer(K#kwork.queue,
+ K#kwork.omem,0,Nk,[E2]),
+ %% Now flush the queue to make things happend
+ ok = cl:flush(K#kwork.queue),
+ %% FIXME: here we should release E1,E2
+ K#kwork { e1=E1,e2=E2,e3=E3 }
+ end, Kws3),
+ io:format("Kws4 = ~p\n", [Kws4]),
+
+ %% Wait for Result buffer to be written
+ Bs = map(
+ fun(K) ->
+ io:format("E1 = ~p\n", [cl:wait(K#kwork.e1)]),
+ io:format("E2 = ~p\n", [cl:wait(K#kwork.e2)]),
+ {ok,Bin} = cl:wait(K#kwork.e3),
+ cl:release_mem_object(K#kwork.imem),
+ cl:release_mem_object(K#kwork.omem),
+ cl:release_queue(K#kwork.queue),
+ %% Release built into cl:wait!
+ %% cl:release_event(K#kwork.e1),
+ %% cl:release_event(K#kwork.e2),
+ %% cl:release_event(K#kwork.e3),
+ Bin
+ end, Kws4),
+
+
+ cl:release_kernel(Kernel),
+ cl:release_program(Program),
+ clu:teardown(E),
+ Bs.
+%%
+%% Assume at least one kwork
+%% Data must be a multiple of local (work_group_size)
+%% FIXME: This must be reworked to handle all cases
+%%
+kwork_set_data([K], Data) ->
+ [K#kwork { idata = Data }];
+kwork_set_data([K|Ks], Data) ->
+ N = byte_size(Data) div K#kwork.isize,
+ M = trunc(K#kwork.weight * N), %% make a multiple of local
+ L = K#kwork.local,
+ R = ((L - (M rem L)) rem L),
+ ML = M + R,
+ io:format("N=~w, M=~w, L=~w, R=~w, ML=~w\n", [N,M,L,R,ML]),
+ if ML =< N ->
+ Md = ML*K#kwork.isize,
+ <<Data1:Md/binary, Data2/binary>> = Data,
+ [K#kwork { idata = Data1 } | kwork_set_data(Ks, Data2)];
+ true ->
+ Rd = R*K#kwork.isize,
+ [K#kwork { idata = <<Data/binary, 0:Rd/unit:8>> } | Ks]
+ end.
+
+%%
+%% Function:
+%% fun(<<X:32/T>>,P1,..,Pn) ->
+%% F(X,P1,...Pn)
+%%
+%% Translates to
+%% __kernel main(__global T0* input, __global T0* output,
+%% const unsigned int item_count,
+%% T1 p1, T2 p2 .. Tn Pn)
+%% {
+%% int i = get_global_id(0);
+%% if (i < item_count) {
+%% output[i] = F(input[i],p1,..Pn)
+%% }
+%% }
+%%
+%%
+%%
+p_program(Function) ->
+ case erl_scan:string(Function) of
+ {ok,Ts,_Ln} ->
+ case erl_parse:parse_exprs(add_dot(Ts)) of
+ {ok, Exprs} ->
+ p_fun(Exprs);
+ Error ->
+ Error
+ end;
+ Error ->
+ Error
+ end.
+
+add_dot(Ts) ->
+ case lists:last(Ts) of
+ {dot,_} -> Ts;
+ E ->
+ Ts ++ [{dot,element(2,E)}]
+ end.
+
+
+p_fun([{'fun',_Ln1,{clauses,[{clause,_Ln3,H,[],B}]}}]) ->
+ As = p_header(H),
+ NArgs = length(As),
+ {_MainVar,MainType} = hd(As),
+ ItemSize = sizeof(MainType),
+ {NArgs,ItemSize,
+ lists:flatten([g_header(As), g_body(As,B)])};
+p_fun(Fs) ->
+ io:format("Fs=~p\n", [Fs]),
+ erlang:error(not_supported).
+
+p_header(Params) ->
+ map(fun p_arg/1, Params).
+
+g_header([{V,T}|Ps]) ->
+ ["__kernel void main(",
+ "__global ", g_type(T), "*", "in", ",",
+ "__global ", g_type(T), "*", "out",",",
+ "const uint n",
+ map(fun({X,Tx}) ->
+ [",", "const ", g_type(Tx), " ",
+ atom_to_list(X)]
+ end, Ps),
+ ")\n",
+ "{",
+ " int i = get_global_id(0);\n",
+ " if (i < n) {\n"
+ " ", g_type(T), " ", atom_to_list(V), "= in[i];\n"
+ ].
+
+g_body(Vs,[E]) ->
+ ["out[i] = ", p_expr(Vs, E),";\n",
+ " }\n",
+ "}\n"];
+g_body(Vs,[E|Es]) ->
+ [p_expr(Vs,E),";\n",
+ g_body(Vs, Es)];
+g_body(_Vs,[]) ->
+ [" }\n",
+ "}\n"].
+
+p_arg({bin,_,[{bin_element,_,{var,_,V},Size,[Type]}]}) ->
+ S = t_vector_size(Size),
+ T = t_type(S,Type),
+ {V,T}.
+
+p_expr(Vs, {var,_,V}) ->
+ true = lists:keymember(V, 1, Vs),
+ [atom_to_list(V)];
+p_expr(_Vs, {integer,_,I}) ->
+ [integer_to_list(I)];
+p_expr(_Vs, {float,_,F}) ->
+ io_lib:format("~f", [F]);
+p_expr(Vs, {op,_Ln,Op,L,R}) ->
+ [p_expr(Vs,L),atom_to_list(Op),p_expr(Vs,R)];
+p_expr(Vs, {op,_Ln,Op,M}) ->
+ [atom_to_list(Op),p_expr(Vs,M)];
+p_expr(Vs, {match,_Ln,L,R}) ->
+ [p_expr(Vs,L),"=",p_expr(Vs,R)];
+p_expr(Vs, {record_field,_Ln,{var,_,V},{atom,_,Selector}}) ->
+ true = lists:keymember(V, 1, Vs),
+ [atom_to_list(V),".",atom_to_list(Selector)];
+p_expr(Vs, {record_field,_Ln,Expr,{atom,_,Selector}}) ->
+ E = p_expr(Vs, Expr),
+ %% fixme: normalize vector selector and check that
+ %% the permutation is valid.
+ [E,".",atom_to_list(Selector)];
+p_expr(Vs, {call,_Ln,{atom,_,F},As}) ->
+ Ps = map(fun(A) -> p_expr(Vs, A) end, As),
+ [atom_to_list(F),"(", g_args(Ps), ")"].
+
+
+t_vector_size(default) ->
+ default;
+t_vector_size({integer,_,Sz}) ->
+ Sz.
+
+g_args([]) -> [];
+g_args([A]) -> [A];
+g_args([A|As]) -> [A,"," | g_args(As)].
+
+g_type({T,S}) when is_atom(T), is_integer(S) ->
+ [atom_to_list(T),integer_to_list(T)];
+g_type(T) when is_atom(T) ->
+ [atom_to_list(T)].
+
+%% size scalar type
+sizeof('char') -> 1;
+sizeof('uchar') -> 1;
+sizeof('short') -> 2;
+sizeof('ushort') -> 2;
+sizeof('int') -> 4;
+sizeof('uint') -> 4;
+sizeof('long') -> 8;
+sizeof('ulong') -> 8;
+sizeof('float') -> 4;
+sizeof('half') -> 2;
+sizeof({T,default}) -> sizeof(T);
+sizeof({T,S}) -> S*sizeof(T).
+
+%% scalar types (api -> opencl)
+t_type(Size,Type) ->
+ Scalar = t_type(Type),
+ if Size == default -> Scalar;
+ Size == 1 -> Scalar;
+ Scalar == 'half' ->
+ erlang:error({bad_vector_type,Scalar,Size});
+ Size == 2 -> {Scalar,2};
+ Size == 4 -> {Scalar,4};
+ Size == 8 -> {Scalar,8};
+ Size == 16 -> {Scalar,16};
+ true -> erlang:error({bad_vector_type,Scalar,Size})
+ end.
+
+t_type(cl_char) -> 'char';
+t_type(cl_uchar) -> 'uchar';
+t_type(cl_short) -> 'short';
+t_type(cl_ushort) -> 'ushort';
+t_type(cl_int) -> 'int';
+t_type(cl_uint) -> 'uint';
+t_type(cl_long) -> 'long';
+t_type(cl_ulong) -> 'ulong';
+t_type(cl_float) -> 'float';
+t_type(cl_half) -> 'half';
+t_type(T) ->
+ erlang:error({bad_type,T}).
+
View
153 examples/cl_mul.erl
@@ -0,0 +1,153 @@
+%%% File : cl_mul.erl
+%%% Author : Tony Rogvall <tony@rogvall.se>
+%%% Description : Multiply matrix with list of matrices
+%%% Created : 16 Nov 2009 by Tony Rogvall <tony@rogvall.se>
+
+-module(cl_mul).
+
+-compile(export_all).
+
+-import(lists, [map/2]).
+
+-include("../include/cl.hrl").
+
+-define(DATA_SIZE, 1024).
+-define(ITEM_SIZE, (16*4)).
+
+encode_matrix(M) ->
+ cl:encode_argument({float16,M}).
+
+decode_matrix(Data) ->
+ case Data of
+ <<
+ ?cl_float(A11), ?cl_float(A12), ?cl_float(A13), ?cl_float(A14),
+ ?cl_float(A21), ?cl_float(A22), ?cl_float(A23), ?cl_float(A24),
+ ?cl_float(A31), ?cl_float(A32), ?cl_float(A33), ?cl_float(A34),
+ ?cl_float(A41), ?cl_float(A42), ?cl_float(A43), ?cl_float(A44),
+ Rest/binary
+ >> ->
+ [{A11,A12,A13,A14,
+ A21,A22,A23,A24,
+ A31,A32,A33,A34,
+ A41,A42,A43,A44} | decode_matrix(Rest)];
+ <<>> ->
+ []
+ end.
+
+id_matrix() ->
+ {float16,{1,0,0,0,
+ 0,1,0,0,
+ 0,0,1,0,
+ 0,0,0,1}}.
+
+zero_matrix() ->
+ {float16,{0,0,0,0,
+ 0,0,0,0,
+ 0,0,0,0,
+ 0,0,0,0}}.
+
+r() -> random:uniform().
+
+random_matrices(N) ->
+ list_to_binary(
+ lists:map(
+ fun(_I) ->
+ M = {r(),r(),r(),r(),
+ r(),r(),r(),r(),
+ r(),r(),r(),r(),
+ r(),r(),r(),r()},
+ encode_matrix(M)
+ end, lists:seq(1, N))).
+
+test_data() ->
+ random_matrices(64).
+
+dump_data(Bin) ->
+ io:format("data=~p\n", [decode_matrix(Bin)]).
+
+test() ->
+ test(cpu).
+
+test(DevType) ->
+ %% Create binary with floating points 1.0 ... 1024.0
+ Data = test_data(),
+ run(Data, DevType).
+
+examples_dir() ->
+ filename:join(code:lib_dir(cl), "examples").
+
+%%
+%% execute a kernel that squares floating point numbers
+%% now only one device is used (We run on cpu for debugging)
+%%
+run(Data, DevType) ->
+ E = clu:setup(DevType),
+ io:format("platform created\n"),
+
+ Filename = filename:join(examples_dir(),"mul4x4.cl"),
+ io:format("build: ~s\n", [Filename]),
+ {ok,Program} = clu:build_source_file(E, Filename),
+ io:format("program built\n"),
+
+ N = byte_size(Data), %% number of bytes in indata
+ Count = N div ?ITEM_SIZE, %% number of matrices in indata
+
+ %% Create input data memory (implicit copy_host_ptr)
+ {ok,Input} = cl:create_buffer(E#cl.context,[read_only],N),
+ io:format("input memory created\n"),
+
+ %% Create the output memory
+ {ok,Output} = cl:create_buffer(E#cl.context,[write_only],N),
+ io:format("output memory created\n"),
+
+ %% Create the command queue for the first device
+ {ok,Queue} = cl:create_queue(E#cl.context,hd(E#cl.devices),[]),
+ io:format("queue created\n"),
+
+ %% Create the squre kernel object
+ {ok,Kernel} = cl:create_kernel(Program, "mul4x4"),
+ io:format("kernel created: ~p\n", [Kernel]),
+
+ clu:apply_kernel_args(Kernel, [Input,Output,id_matrix(),Count]),
+ io:format("kernel args set\n"),
+
+ %% Write data into input array
+ {ok,Event1} = cl:enqueue_write_buffer(Queue, Input, 0, N, Data, []),
+ io:format("write data enqueued\n"),
+ erlang:display_string("enqueu write\n"),
+
+ Device = hd(E#cl.devices),
+ {ok,Local} = cl:get_kernel_workgroup_info(Kernel, Device, work_group_size),
+ io:format("work_group_size = ~p\n", [Local]),
+
+ %% Enqueue the kernel
+ Global = Count,
+ {ok,Event2} = cl:enqueue_nd_range_kernel(Queue, Kernel,
+ [Global], [Local], [Event1]),
+ io:format("nd range [~w, ~w] kernel enqueued\n",
+ [[Global],[Local]]),
+
+ %% Enqueue the read from device memory (wait for kernel to finish)
+ {ok,Event3} = cl:enqueue_read_buffer(Queue,Output,0,N,[Event2]),
+ io:format("read buffer enqueued\n"),
+
+ %% Now flush the queue to make things happend
+ ok = cl:flush(Queue),
+ io:format("flushed\n"),
+
+ %% Wait for Result buffer to be written
+ io:format("wait\n"),
+ io:format("Event1 = ~p\n", [cl:wait(Event1,1000)]),
+ io:format("Event2 = ~p\n", [cl:wait(Event2,1000)]),
+ Event3Res = cl:wait(Event3,1000),
+ io:format("Event3 = ~p\n", [Event3Res]),
+
+ %%
+ cl:release_mem_object(Input),
+ cl:release_mem_object(Output),
+ cl:release_queue(Queue),
+ cl:release_kernel(Kernel),
+ cl:release_program(Program),
+
+ clu:teardown(E),
+ Event3Res.
View
30 examples/mul4x4.cl
@@ -0,0 +1,30 @@
+//
+// Multiply count 4x4 matrices with a constant matrix
+//
+
+__kernel void mul4x4(__global float* input,
+ __global float* output,
+ const float16 a,
+ const unsigned int count)
+{
+ int i;
+
+ i = get_global_id(0);
+ if (i < count) {
+ int j,k;
+ __global float* b = input + i*16;
+ __global float* c = output + i*16;
+
+ for (i=0; i<3; i++) {
+ for (j=0; j<3; j++) {
+ float e = 0.0;
+ for (k=0; k<3; k++)
+ e += a[3*i+k]*b[3*k+j];
+ c[3*i+j] = e;
+ }
+ }
+ }
+}
+
+
+
View
30 examples/z2.cl
@@ -0,0 +1,30 @@
+//
+// Calculate mandelbrot
+// f(0) = x+yi
+// f(n) = f(n)^2 + c
+//
+
+__kernel void z2(const float x, const float y,
+ const float xs, const float ys,
+ const unsigned int n,
+ __global unsigned int* out)
+{
+ int i = get_global_id(0);
+ int j = get_global_id(0);
+ if ((i < n) && (j < n)) {
+ int k = 0;
+ float cx = x + i*xs;
+ float cy = y + j*ys;
+ float a = 0, b = 0;
+ float a2 = 0, b2 = 0;
+
+ while ((k < n) && ((a2 + b2) < 4)) {
+ a = a2-b2 + cx;
+ b = 2*a*b + cy;
+ a2 = a*a;
+ b2 = b*b;
+ k++;
+ }
+ out[i*n + j] = k;
+ }
+}

0 comments on commit ee7475a

Please sign in to comment.