Large diffs are not rendered by default.

@@ -0,0 +1,156 @@
from ctypes import *
import math
import random

def sample(probs):
    """Draw a random index, weighted by ``probs``.

    The weights are divided by their sum before sampling, so they do not
    have to add up to 1.

    Args:
        probs: sequence of non-negative numeric weights.

    Returns:
        The index selected. If rounding error leaves the draw unassigned
        after the last weight, ``len(probs) - 1`` is returned (``-1`` for
        an empty sequence).

    Raises:
        ZeroDivisionError: if ``probs`` is non-empty and sums to zero.
    """
    # float() forces true division: under Python 2, integer weights divided
    # by an integer sum would all floor to 0.
    total = float(sum(probs))
    normalised = [weight / total for weight in probs]
    remaining = random.uniform(0, 1)
    # Walk the cumulative distribution until the draw is used up.
    for index, share in enumerate(normalised):
        remaining -= share
        if remaining <= 0:
            return index
    return len(probs) - 1

def c_array(ctype, values):
    """Build a ctypes array of ``ctype`` holding a copy of ``values``.

    The array has exactly ``len(values)`` elements; each value is converted
    by ctypes on assignment.
    """
    array_type = ctype * len(values)
    buf = array_type()
    buf[:] = values
    return buf

class BOX(Structure):
    """ctypes structure with four float fields: x, y, w and h.

    Used as the ``bbox`` member of DETECTION; detect() reports it as the
    tuple (x, y, w, h).
    """
    # NOTE(review): presumably mirrors the C `box` struct of libdarknet, so
    # field order and types must stay in sync with the library - confirm.
    _fields_ = [("x", c_float),
                ("y", c_float),
                ("w", c_float),
                ("h", c_float)]

class DETECTION(Structure):
    """ctypes structure describing one detection returned by the library.

    detect() reads ``bbox`` and indexes ``prob`` once per class
    (``prob[i]`` for ``i < meta.classes``).
    """
    # NOTE(review): presumably mirrors the C `detection` struct; the order
    # and types of the fields below must match the library - confirm.
    _fields_ = [("bbox", BOX),
                ("classes", c_int),
                ("prob", POINTER(c_float)),
                ("mask", POINTER(c_float)),
                ("objectness", c_float),
                ("sort_class", c_int)]


class IMAGE(Structure):
    """ctypes structure for an image: three ints and a float data pointer.

    Passed to and returned from the library by value (see make_image,
    load_image, letterbox_image, free_image).
    """
    # NOTE(review): w/h/c are presumably width, height and channel count of
    # the buffer behind `data` - confirm against the library's image struct.
    _fields_ = [("w", c_int),
                ("h", c_int),
                ("c", c_int),
                ("data", POINTER(c_float))]

class METADATA(Structure):
    """ctypes structure returned by load_meta (lib.get_metadata).

    ``classes`` is the number of entries that classify() and detect() read
    from ``names``; ``names`` is a pointer to C strings.
    """
    _fields_ = [("classes", c_int),
                ("names", POINTER(c_char_p))]



# Load the shared library and declare the prototype (argtypes / restype) of
# every function this module calls. Functions without an explicit restype
# fall back to the ctypes default return type (C int); their return value is
# not used by this module.
#lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
lib = CDLL("libdarknet.so", RTLD_GLOBAL)
# Networks are passed around as opaque c_void_p handles (see load_net).
lib.network_width.argtypes = [c_void_p]
lib.network_width.restype = c_int
lib.network_height.argtypes = [c_void_p]
lib.network_height.restype = c_int

# Raw prediction: takes a float buffer, returns a pointer to floats.
predict = lib.network_predict
predict.argtypes = [c_void_p, POINTER(c_float)]
predict.restype = POINTER(c_float)

set_gpu = lib.cuda_set_device
set_gpu.argtypes = [c_int]

# Returns an IMAGE struct by value.
make_image = lib.make_image
make_image.argtypes = [c_int, c_int, c_int]
make_image.restype = IMAGE

# detect() passes None/0 for the POINTER(c_int)/c_int pair and receives the
# number of detections through the final POINTER(c_int) argument.
get_network_boxes = lib.get_network_boxes
get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int)]
get_network_boxes.restype = POINTER(DETECTION)

make_network_boxes = lib.make_network_boxes
make_network_boxes.argtypes = [c_void_p]
make_network_boxes.restype = POINTER(DETECTION)

# Releases an array obtained from get_network_boxes (see detect()).
free_detections = lib.free_detections
free_detections.argtypes = [POINTER(DETECTION), c_int]

free_ptrs = lib.free_ptrs
free_ptrs.argtypes = [POINTER(c_void_p), c_int]

# Second name for lib.network_predict. CDLL caches attribute lookups, so
# this is the same function object as `predict` above and shares its restype.
network_predict = lib.network_predict
network_predict.argtypes = [c_void_p, POINTER(c_float)]

reset_rnn = lib.reset_rnn
reset_rnn.argtypes = [c_void_p]

# (cfg path, weights path, int flag) -> opaque network handle.
load_net = lib.load_network
load_net.argtypes = [c_char_p, c_char_p, c_int]
load_net.restype = c_void_p

# Both NMS routines take (detections, count, classes, threshold).
do_nms_obj = lib.do_nms_obj
do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]

do_nms_sort = lib.do_nms_sort
do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]

free_image = lib.free_image
free_image.argtypes = [IMAGE]

letterbox_image = lib.letterbox_image
letterbox_image.argtypes = [IMAGE, c_int, c_int]
letterbox_image.restype = IMAGE

# load_meta(path) returns a METADATA struct by value.
load_meta = lib.get_metadata
lib.get_metadata.argtypes = [c_char_p]
lib.get_metadata.restype = METADATA

# load_image(path, int, int) returns an IMAGE; detect() passes 0, 0.
load_image = lib.load_image_color
load_image.argtypes = [c_char_p, c_int, c_int]
load_image.restype = IMAGE

rgbgr_image = lib.rgbgr_image
rgbgr_image.argtypes = [IMAGE]

# Runs the network on an IMAGE and returns a pointer to the float outputs.
predict_image = lib.network_predict_image
predict_image.argtypes = [c_void_p, IMAGE]
predict_image.restype = POINTER(c_float)

def classify(net, meta, im):
    """Run the network on ``im`` and rank every class by its score.

    Args:
        net: network handle returned by load_net.
        meta: METADATA giving the class count and class names.
        im: IMAGE to classify.

    Returns:
        A list of ``(name, score)`` tuples, highest score first.
    """
    scores = predict_image(net, im)
    ranked = [(meta.names[k], scores[k]) for k in range(meta.classes)]
    ranked.sort(key=lambda pair: -pair[1])
    return ranked

def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
    """Detect objects in the image file ``image``.

    Args:
        net: network handle returned by load_net.
        meta: METADATA giving the class count and class names.
        image: path of the image file, passed to load_image.
        thresh: threshold forwarded to get_network_boxes.
        hier_thresh: hierarchical threshold forwarded to get_network_boxes.
        nms: threshold for do_nms_obj; a falsy value skips that step.

    Returns:
        A list of ``(name, probability, (x, y, w, h))`` tuples, one per
        detection/class pair with probability > 0, highest probability first.
    """
    im = load_image(image, 0, 0)
    count = c_int(0)
    count_ptr = pointer(count)
    predict_image(net, im)
    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, count_ptr)
    total = count_ptr[0]
    if nms:
        do_nms_obj(dets, total, meta.classes, nms)

    found = []
    for j in range(total):
        for i in range(meta.classes):
            if dets[j].prob[i] > 0:
                box = dets[j].bbox
                found.append((meta.names[i], dets[j].prob[i], (box.x, box.y, box.w, box.h)))
    found.sort(key=lambda item: -item[1])
    # All values were copied into Python objects above, so the C buffers can
    # be released before returning.
    free_image(im)
    free_detections(dets, total)
    return found

if __name__ == "__main__":
    # Demo: load a detector and print the detections for one sample image.
    # Paths are byte literals so they are accepted by the c_char_p
    # parameters on Python 3 as well as Python 2, and print() with a single
    # argument behaves the same on both.
    #net = load_net("cfg/densenet201.cfg", "/home/pjreddie/trained/densenet201.weights", 0)
    #im = load_image("data/wolf.jpg", 0, 0)
    #meta = load_meta("cfg/imagenet1k.data")
    #r = classify(net, meta, im)
    #print r[:10]
    net = load_net(b"cfg/tiny-yolo.cfg", b"tiny-yolo.weights", 0)
    meta = load_meta(b"cfg/coco.data")
    r = detect(net, meta, b"data/dog.jpg")
    print(r)


@@ -0,0 +1,37 @@
from darknet import *

def predict_tactic(net, s):
    """Sample one tactic string from the network, seeded with ``s``.

    Every character of ``s`` is fed to the network as a 256-wide one-hot
    vector; characters are then sampled from the network output until the
    accumulated tactic ends with '.'.

    Args:
        net: network handle returned by load_net.
        s: seed string; an empty seed is replaced by a newline.

    Returns:
        ``(tactic, log_prob)`` where ``log_prob`` is the sum of the natural
        logs of the probabilities of the sampled characters.
    """
    log_prob = 0
    one_hot = c_array(c_float, [0.0] * 256)
    tactic = ''
    if len(s) == 0:
        s = '\n'
    # Prime the network with all but the last seed character; the outputs
    # are discarded.
    for ch in s[:-1]:
        code = ord(ch)
        one_hot[code] = 1
        predict(net, one_hot)
        one_hot[code] = 0
    ch = s[-1]
    # NOTE(review): the loop has no length limit; it only stops once a '.'
    # has been sampled.
    while True:
        code = ord(ch)
        one_hot[code] = 1
        raw = predict(net, one_hot)
        one_hot[code] = 0
        dist = [raw[k] for k in range(256)]
        choice = sample(dist)
        ch = chr(choice)
        log_prob += math.log(dist[choice])
        # NOTE(review): the stop test runs after the next character has been
        # sampled, so one extra character contributes to log_prob without
        # being appended - confirm this is intended.
        if tactic.endswith('.'):
            break
        tactic = tactic + ch
    return (tactic, log_prob)

def predict_tactics(net, s, n):
    """Sample ``n`` tactics for seed ``s`` and rank them.

    reset_rnn(net) is called before each sample.

    Returns:
        A list of ``(tactic, log_prob)`` tuples, highest log-probability
        first.
    """
    candidates = []
    for _ in range(n):
        reset_rnn(net)
        candidates.append(predict_tactic(net, s))
    candidates.sort(key=lambda pair: -pair[1])
    return candidates

# Script body: load the model and print ten sampled tactics.
# The paths are byte literals so the c_char_p parameters accept them on
# Python 3 too; print() with one argument behaves the same on Python 2 and 3.
net = load_net(b"cfg/coq.test.cfg", b"/home/pjreddie/backup/coq.backup", 0)
t = predict_tactics(net, "+++++\n", 10)
print(t)
@@ -10,8 +10,8 @@ extern "C" {

// Piecewise-linear activation: identity on [0, 1], slope .001 outside.
// Only the single-precision form is kept; the superseded double-precision
// lines that preceded it made these statements unreachable.
__device__ float lhtan_activate_kernel(float x)
{
    if(x < 0) return .001f*x;
    if(x > 1) return .001f*(x-1.f) + 1.f;
    return x;
}
__device__ float lhtan_gradient_kernel(float x)
@@ -27,25 +27,25 @@ __device__ float hardtan_activate_kernel(float x)
return x;
}
// One-line activation functions, single-precision throughout (float
// literals and expf). Each function is defined exactly once; the superseded
// double-precision duplicates are dropped.
__device__ float linear_activate_kernel(float x){return x;}
__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}
__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}
__device__ float relu_activate_kernel(float x){return x*(x>0);}
__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}
__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}
__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}
__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);}
// Piecewise-linear activation: slope .125 on [-4, 4], slope .01 outside.
// Only the single-precision statements are kept; the superseded
// double-precision copies made them unreachable.
__device__ float plse_activate_kernel(float x)
{
    if(x < -4) return .01f * (x + 4);
    if(x > 4) return .01f * (x - 4) + 1;
    return .125f*x + .5f;
}
// Staircase activation: for even floor(x) returns floor(x/2); otherwise
// adds the fractional part of x to floor(x/2).
// Uses floorf only; the duplicated double-precision statements (including a
// second declaration of n) are dropped.
__device__ float stair_activate_kernel(float x)
{
    int n = floorf(x);
    if (n%2 == 0) return floorf(x/2);
    else return (x - n) + floorf(x/2);
}


@@ -58,19 +58,19 @@ __device__ float linear_gradient_kernel(float x){return 1;}
// Gradient functions matching the activation kernels, single-precision
// throughout. Each function is defined exactly once; the superseded
// double-literal duplicates are dropped.
// NOTE(review): forms such as (1-x)*x and 1-x*x suggest x is the activation
// output rather than its input - confirm against the callers.
__device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
__device__ float loggy_gradient_kernel(float x)
{
    float y = (x+1)/2;
    return 2*(1-y)*y;
}
__device__ float relu_gradient_kernel(float x){return (x>0);}
__device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
__device__ float tanh_gradient_kernel(float x){return 1-x*x;}
__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
// Gradient of the staircase activation: 0 at integer x, 1 elsewhere.
// Uses floorf only; the duplicated double-precision test is dropped.
__device__ float stair_gradient_kernel(float x)
{
    if (floorf(x) == x) return 0;
    return 1;
}

@@ -140,6 +140,41 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
return 0;
}

// Backward pass of the pairwise-product activation: for element id,
// dx of each factor is the other factor times the incoming gradient dy[id].
//   x  : inputs,        dy : incoming gradient, n entries
//   s  : stride used to split id into (b, i)
//   a  : activation selector (unused here)
//   dx : output gradient, written at b*s + i and b*s + s/2 + i
// NOTE(review): i ranges over [0, s) while the second operand sits at offset
// s/2, so pairs from adjacent groups overlap unless the caller passes a
// matching s - confirm the intended layout.
__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    // Surplus threads exit before touching memory; the loads below used to
    // run ahead of the bounds check and could read past the end of x.
    if(id >= n) return;
    int i = id % s;
    int b = id / s;
    float x1 = x[b*s + i];
    float x2 = x[b*s + s/2 + i];
    float de = dy[id];
    dx[b*s + i] = x2*de;
    dx[b*s + s/2 + i] = x1*de;
}

// Host launcher for binary_gradient_array_kernel over n/2 elements.
// Note the argument mapping: this function's `dx` is the kernel's incoming
// gradient (dy) and this function's `y` is the kernel's output (dx).
extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y)
{
    int half = n/2;
    binary_gradient_array_kernel<<<cuda_gridsize(half), BLOCK>>>(x, dx, half, size, a, y);
    check_error(cudaPeekAtLastError());
}
// Pairwise-product activation: y[id] = x[b*s + i] * x[b*s + s/2 + i],
// where i = id % s and b = id / s.
//   n : number of outputs, s : stride used to split id,
//   a : activation selector (unused here)
__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    // Surplus threads exit before touching memory; the loads used to run
    // ahead of the bounds check and could read past the end of x.
    if(id >= n) return;
    int i = id % s;
    int b = id / s;
    float x1 = x[b*s + i];
    float x2 = x[b*s + s/2 + i];
    y[id] = x1*x2;
}

// Host launcher for binary_activate_array_kernel: n inputs yield n/2 outputs
// in y. Launch errors are reported through check_error.
extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y)
{
    int half = n/2;
    binary_activate_array_kernel<<<cuda_gridsize(half), BLOCK>>>(x, half, size, a, y);
    check_error(cudaPeekAtLastError());
}

__global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -152,13 +187,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt
if(i < n) delta[i] *= gradient_kernel(x[i], a);
}

// Applies activation `a` in place to the n floats at x on the GPU.
// Only the `_gpu` signature is kept; the superseded `_ongpu` signature line
// stacked above it does not parse.
extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a)
{
    activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
    check_error(cudaPeekAtLastError());
}

extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)
{
gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
check_error(cudaPeekAtLastError());
@@ -51,13 +51,13 @@ void backward_activation_layer(layer l, network net)

/* GPU forward pass: copy the network input into the layer output, then
 * apply the layer's activation to it in place.
 * Each step is issued once, through the `_gpu` names; the superseded
 * `_ongpu` calls are dropped. */
void forward_activation_layer_gpu(layer l, network net)
{
    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

/* GPU backward pass: scale the layer delta by the activation gradient, then
 * copy it to the network delta.
 * Each step is issued once, through the `_gpu` names; with the superseded
 * `_ongpu` calls also present the gradient would be applied twice. */
void backward_activation_layer_gpu(layer l, network net)
{
    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
    copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif
@@ -12,8 +12,8 @@ float gradient(float x, ACTIVATION a);
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
void activate_array(float *x, const int n, const ACTIVATION a);
#ifdef GPU
void activate_array_ongpu(float *x, int n, ACTIVATION a);
void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
void activate_array_gpu(float *x, int n, ACTIVATION a);
void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta);
#endif

static inline float stair_activate(float x)
@@ -188,8 +188,8 @@ void push_batchnorm_layer(layer l)

void forward_batchnorm_layer_gpu(layer l, network net)
{
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
if (net.train) {
#ifdef CUDNN
float one = 1;
@@ -215,14 +215,14 @@ void forward_batchnorm_layer_gpu(layer l, network net)
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);

scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);

copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);

scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
@@ -263,7 +263,7 @@ void backward_batchnorm_layer_gpu(layer l, network net)
.00001,
l.mean_gpu,
l.variance_gpu);
copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
@@ -274,6 +274,6 @@ void backward_batchnorm_layer_gpu(layer l, network net)
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif
@@ -55,7 +55,17 @@ void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
}
}

void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
/* Accumulates gradients of a weighted blend of a and b with weights s:
 *   da[i] += dc[i] * s[i]
 *   db[i] += dc[i] * (1 - s[i])
 *   ds[i] += dc[i] * (a[i] - b[i])
 * n is the element count and dc the incoming gradient. Any of da, db and ds
 * may be NULL, in which case that gradient is skipped (ds used to be
 * dereferenced unconditionally, unlike da and db). */
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
{
    int i;
    for(i = 0; i < n; ++i){
        if(da) da[i] += dc[i] * s[i];
        if(db) db[i] += dc[i] * (1-s[i]);
        if(ds) ds[i] += dc[i] * (a[i] - b[i]);
    }
}

void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
{
int stride = w1/w2;
int sample = w2/w1;
@@ -74,7 +84,7 @@ void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2,
for(i = 0; i < minw; ++i){
int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
out[out_index] += add[add_index];
out[out_index] = s1*out[out_index] + s2*add[add_index];
}
}
}
@@ -113,6 +123,27 @@ void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, fl
}
}

void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial)
{
int b,f,i;
for(b = 0; b < batch; ++b){
for(i = 0; i < spatial; ++i){
float sum = 0;
for(f = 0; f < filters; ++f){
int index = b*filters*spatial + f*spatial + i;
sum += powf(x[index], 2);
}
sum = sqrtf(sum);
for(f = 0; f < filters; ++f){
int index = b*filters*spatial + f*spatial + i;
x[index] /= sum;
dx[index] = (1 - x[index]) / sum;
}
}
}
}


void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
{
int b, f, i;
@@ -162,12 +193,48 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
}

/* Inverse of inter_cpu: OUT holds, for each of B batches, NX values followed
 * by NY values. The NX values are added into X and the NY values into Y.
 * Either X or Y may be NULL; its values are then skipped. */
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
    int b, k;
    int pos = 0;
    for(b = 0; b < B; ++b){
        for(k = 0; k < NX; ++k, ++pos){
            if(X) X[b*NX + k] += OUT[pos];
        }
        for(k = 0; k < NY; ++k, ++pos){
            if(Y) Y[b*NY + k] += OUT[pos];
        }
    }
}

/* Interleaves X and Y batch by batch: for each of B batches, writes that
 * batch's NX values of X followed by its NY values of Y to OUT. */
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
    int b, k;
    float *dst = OUT;
    for(b = 0; b < B; ++b){
        for(k = 0; k < NX; ++k){
            *dst++ = X[b*NX + k];
        }
        for(k = 0; k < NY; ++k){
            *dst++ = Y[b*NY + k];
        }
    }
}

/* Strided copy: Y[k*INCY] = X[k*INCX] for k in [0, N). */
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    int k;
    for(k = 0; k < N; ++k){
        const float value = X[k*INCX];
        Y[k*INCY] = value;
    }
}

/* Element-wise multiply-accumulate: Z[k] += X[k] * Y[k] for k in [0, N). */
void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
{
    int k = 0;
    while(k < N){
        Z[k] += X[k]*Y[k];
        ++k;
    }
}

void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
int i;
@@ -195,6 +262,28 @@ void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
}
}

void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
int i;
for(i = 0; i < n; ++i){
float t = truth[i];
float p = pred[i];
error[i] = (t) ? -log(p) : 0;
delta[i] = t-p;
}
}

void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
int i;
for(i = 0; i < n; ++i){
float t = truth[i];
float p = pred[i];
error[i] = -t*log(p) - (1-t)*log(1-p);
delta[i] = t-p;
}
}

void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
int i;
@@ -242,3 +331,21 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i
}
}

/* Nearest-neighbour upsampling by an integer stride.
 * `in` is batch x c x h x w, `out` is batch x c x (h*stride) x (w*stride).
 *   forward != 0: out[...] = scale * in[...]      (each input pixel repeated)
 *   forward == 0: in[...] += scale * out[...]     (contributions accumulated) */
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
{
    int n, ch, row, col;
    for(n = 0; n < batch; ++n){
        for(ch = 0; ch < c; ++ch){
            for(row = 0; row < h*stride; ++row){
                for(col = 0; col < w*stride; ++col){
                    int src = n*w*h*c + ch*w*h + (row/stride)*w + col/stride;
                    int dst = n*w*h*c*stride*stride + ch*w*h*stride*stride + row*w*stride + col;
                    if(forward){
                        out[dst] = scale*in[src];
                    } else {
                        in[src] += scale*out[dst];
                    }
                }
            }
        }
    }
}


@@ -1,5 +1,7 @@
#ifndef BLAS_H
#define BLAS_H
#include "darknet.h"

void flatten(float *x, int size, int layers, int batch, int forward);
void pm(int M, int N, float *A);
float *random_matrix(int rows, int cols);
@@ -8,57 +10,60 @@ void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward

void test_blas();

void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void mult_add_into_cpu(int N, float *X, float *Y, float *Z);

void const_cpu(int N, float ALPHA, float *X, int INCX);
void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
void constrain_gpu(int N, float ALPHA, float * X, int INCX);
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);

void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
void scal_cpu(int N, float ALPHA, float *X, int INCX);
void fill_cpu(int N, float ALPHA, float * X, int INCX);
float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
int test_gpu_blas();
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);

void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);

void scale_bias(float *output, float *scales, int batch, int n, int size);
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial);

void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc);

void softmax(float *input, int n, float temp, int stride, float *output);
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);

#ifdef GPU
#include "cuda.h"
#include "tree.h"

void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void scal_ongpu(int N, float ALPHA, float * X, int INCX);
void add_ongpu(int N, float ALPHA, float * X, int INCX);
void supp_ongpu(int N, float ALPHA, float * X, int INCX);
void mask_ongpu(int N, float * X, float mask_num, float * mask);
void const_ongpu(int N, float ALPHA, float *X, int INCX);
void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
void fill_ongpu(int N, float ALPHA, float * X, int INCX);
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void add_gpu(int N, float ALPHA, float * X, int INCX);
void supp_gpu(int N, float ALPHA, float * X, int INCX);
void mask_gpu(int N, float * X, float mask_num, float * mask, float val);
void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale);
void const_gpu(int N, float ALPHA, float *X, int INCX);
void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_gpu(int N, float *X, int INCX, float *Y, int INCY);

void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial);

void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);

@@ -67,27 +72,34 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc

void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);

void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
void mult_add_into_gpu(int num, float *a, float *b, float *c);
void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);

void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);

void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);

void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out);
void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);

#endif
#endif

Large diffs are not rendered by default.

160 src/box.c
@@ -3,9 +3,94 @@
#include <math.h>
#include <stdlib.h>

int nms_comparator(const void *pa, const void *pb)
{
detection a = *(detection *)pa;
detection b = *(detection *)pb;
float diff = 0;
if(b.sort_class >= 0){
diff = a.prob[b.sort_class] - b.prob[b.sort_class];
} else {
diff = a.objectness - b.objectness;
}
if(diff < 0) return 1;
else if(diff > 0) return -1;
return 0;
}

/* Non-maximum suppression driven by objectness.
 * Detections with objectness 0 are moved to the tail and ignored. The rest
 * are sorted by descending objectness, and any detection whose box overlaps
 * a higher-ranked surviving box with IoU > thresh gets its objectness and
 * all `classes` probabilities zeroed. dets is reordered in place. */
void do_nms_obj(detection *dets, int total, int classes, float thresh)
{
    int i, j, c;
    int last = total-1;

    /* Partition: swap zero-objectness entries to the end of the array. */
    i = 0;
    while(i <= last){
        if(dets[i].objectness == 0){
            detection tmp = dets[i];
            dets[i] = dets[last];
            dets[last] = tmp;
            --last;
        } else {
            ++i;
        }
    }
    total = last+1;

    /* sort_class = -1 makes nms_comparator rank by objectness. */
    for(i = 0; i < total; ++i){
        dets[i].sort_class = -1;
    }
    qsort(dets, total, sizeof(detection), nms_comparator);

    for(i = 0; i < total; ++i){
        if(dets[i].objectness == 0) continue;
        box keep = dets[i].bbox;
        for(j = i+1; j < total; ++j){
            if(dets[j].objectness != 0 && box_iou(keep, dets[j].bbox) > thresh){
                dets[j].objectness = 0;
                for(c = 0; c < classes; ++c){
                    dets[j].prob[c] = 0;
                }
            }
        }
    }
}


/* Per-class non-maximum suppression.
 * Detections with objectness 0 are moved to the tail and ignored. For each
 * class, the rest are sorted by descending probability of that class, and a
 * detection's probability for the class is zeroed when its box overlaps a
 * higher-ranked box (with non-zero probability) with IoU > thresh.
 * dets is reordered in place. */
void do_nms_sort(detection *dets, int total, int classes, float thresh)
{
    int i, j, cls;
    int last = total-1;

    /* Partition: swap zero-objectness entries to the end of the array. */
    i = 0;
    while(i <= last){
        if(dets[i].objectness == 0){
            detection tmp = dets[i];
            dets[i] = dets[last];
            dets[last] = tmp;
            --last;
        } else {
            ++i;
        }
    }
    total = last+1;

    for(cls = 0; cls < classes; ++cls){
        /* nms_comparator reads the class to rank by from sort_class. */
        for(i = 0; i < total; ++i){
            dets[i].sort_class = cls;
        }
        qsort(dets, total, sizeof(detection), nms_comparator);
        for(i = 0; i < total; ++i){
            if(dets[i].prob[cls] == 0) continue;
            box keep = dets[i].bbox;
            for(j = i+1; j < total; ++j){
                if(box_iou(keep, dets[j].bbox) > thresh){
                    dets[j].prob[cls] = 0;
                }
            }
        }
    }
}

box float_to_box(float *f, int stride)
{
box b;
box b = {0};
b.x = f[0];
b.y = f[1*stride];
b.w = f[2*stride];
@@ -230,79 +315,6 @@ dbox diou(box a, box b)
return dd;
}

typedef struct{
int index;
int class;
float **probs;
} sortable_bbox;

int nms_comparator(const void *pa, const void *pb)
{
sortable_bbox a = *(sortable_bbox *)pa;
sortable_bbox b = *(sortable_bbox *)pb;
float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
if(diff < 0) return 1;
else if(diff > 0) return -1;
return 0;
}

void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh)
{
int i, j, k;
sortable_bbox *s = calloc(total, sizeof(sortable_bbox));

for(i = 0; i < total; ++i){
s[i].index = i;
s[i].class = classes;
s[i].probs = probs;
}

qsort(s, total, sizeof(sortable_bbox), nms_comparator);
for(i = 0; i < total; ++i){
if(probs[s[i].index][classes] == 0) continue;
box a = boxes[s[i].index];
for(j = i+1; j < total; ++j){
box b = boxes[s[j].index];
if (box_iou(a, b) > thresh){
for(k = 0; k < classes+1; ++k){
probs[s[j].index][k] = 0;
}
}
}
}
free(s);
}


void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh)
{
int i, j, k;
sortable_bbox *s = calloc(total, sizeof(sortable_bbox));

for(i = 0; i < total; ++i){
s[i].index = i;
s[i].class = 0;
s[i].probs = probs;
}

for(k = 0; k < classes; ++k){
for(i = 0; i < total; ++i){
s[i].class = k;
}
qsort(s, total, sizeof(sortable_bbox), nms_comparator);
for(i = 0; i < total; ++i){
if(probs[s[i].index][k] == 0) continue;
box a = boxes[s[i].index];
for(j = i+1; j < total; ++j){
box b = boxes[s[j].index];
if (box_iou(a, b) > thresh){
probs[s[j].index][k] = 0;
}
}
}
}
free(s);
}

void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
{
@@ -6,13 +6,8 @@ typedef struct{
float dx, dy, dw, dh;
} dbox;

box float_to_box(float *f, int stride);
float box_iou(box a, box b);
float box_rmse(box a, box b);
dbox diou(box a, box b);
void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh);
void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh);
box decode_box(box b, box anchor);
box encode_box(box b, box anchor);

@@ -6,7 +6,7 @@ void col2im_cpu(float* data_col,
int ksize, int stride, int pad, float* data_im);

#ifdef GPU
void col2im_ongpu(float *data_col,
void col2im_gpu(float *data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_im);
#endif
@@ -41,7 +41,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
}
}

void col2im_ongpu(float *data_col,
void col2im_gpu(float *data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_im){
// We are going to launch channels * height_col * width_col kernels, each
@@ -54,7 +54,7 @@ void train_compare(char *cfgfile, char *weightfile)
float loss = train_network(net, train);
if(avg_loss == -1) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;
printf("%.3f: %f, %f avg, %lf seconds, %d images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
free_data(train);
if(i%100 == 0){
char buff[256];
@@ -1,4 +1,5 @@
#include "connected_layer.h"
#include "convolutional_layer.h"
#include "batchnorm_layer.h"
#include "utils.h"
#include "cuda.h"
@@ -10,10 +11,11 @@
#include <stdlib.h>
#include <string.h>

connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
{
int i;
connected_layer l = {0};
layer l = {0};
l.learning_rate_scale = 1;
l.type = CONNECTED;

l.inputs = inputs;
@@ -50,6 +52,14 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
l.biases[i] = 0;
}

if(adam){
l.m = calloc(l.inputs*l.outputs, sizeof(float));
l.v = calloc(l.inputs*l.outputs, sizeof(float));
l.bias_m = calloc(l.outputs, sizeof(float));
l.scale_m = calloc(l.outputs, sizeof(float));
l.bias_v = calloc(l.outputs, sizeof(float));
l.scale_v = calloc(l.outputs, sizeof(float));
}
if(batch_normalize){
l.scales = calloc(outputs, sizeof(float));
l.scale_updates = calloc(outputs, sizeof(float));
@@ -82,10 +92,16 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT

l.output_gpu = cuda_make_array(l.output, outputs*batch);
l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
if(batch_normalize){
l.scales_gpu = cuda_make_array(l.scales, outputs);
l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
if (adam) {
l.m_gpu = cuda_make_array(0, inputs*outputs);
l.v_gpu = cuda_make_array(0, inputs*outputs);
l.bias_m_gpu = cuda_make_array(0, outputs);
l.bias_v_gpu = cuda_make_array(0, outputs);
l.scale_m_gpu = cuda_make_array(0, outputs);
l.scale_v_gpu = cuda_make_array(0, outputs);
}

if(batch_normalize){
l.mean_gpu = cuda_make_array(l.mean, outputs);
l.variance_gpu = cuda_make_array(l.variance, outputs);

@@ -95,6 +111,9 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
l.variance_delta_gpu = cuda_make_array(l.variance, outputs);

l.scales_gpu = cuda_make_array(l.scales, outputs);
l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);

l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
#ifdef CUDNN
@@ -110,8 +129,12 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
return l;
}

void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
void update_connected_layer(layer l, update_args a)
{
float learning_rate = a.learning_rate*l.learning_rate_scale;
float momentum = a.momentum;
float decay = a.decay;
int batch = a.batch;
axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
scal_cpu(l.outputs, momentum, l.bias_updates, 1);

@@ -125,9 +148,8 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f
scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}

void forward_connected_layer(connected_layer l, network net)
void forward_connected_layer(layer l, network net)
{
int i;
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
int m = l.batch;
int k = l.inputs;
@@ -137,44 +159,21 @@ void forward_connected_layer(connected_layer l, network net)
float *c = l.output;
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
if(l.batch_normalize){
if(net.train){
mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

scal_cpu(l.outputs, .95, l.rolling_mean, 1);
axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
scal_cpu(l.outputs, .95, l.rolling_variance, 1);
axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
} else {
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
}
scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
}
for(i = 0; i < l.batch; ++i){
axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
forward_batchnorm_layer(l, net);
} else {
add_bias(l.output, l.biases, l.batch, l.outputs, 1);
}
activate_array(l.output, l.outputs*l.batch, l.activation);
}

void backward_connected_layer(connected_layer l, network net)
void backward_connected_layer(layer l, network net)
{
int i;
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
for(i = 0; i < l.batch; ++i){
axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
}
if(l.batch_normalize){
backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);

scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);

mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
if(l.batch_normalize){
backward_batchnorm_layer(l, net);
} else {
backward_bias(l.bias_updates, l.delta, l.batch, l.outputs, 1);
}

int m = l.outputs;
@@ -233,7 +232,7 @@ void statistics_connected_layer(layer l)

#ifdef GPU

void pull_connected_layer(connected_layer l)
void pull_connected_layer(layer l)
{
cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
@@ -246,7 +245,7 @@ void pull_connected_layer(connected_layer l)
}
}

void push_connected_layer(connected_layer l)
void push_connected_layer(layer l)
{
cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
cuda_push_array(l.biases_gpu, l.biases, l.outputs);
@@ -259,53 +258,61 @@ void push_connected_layer(connected_layer l)
}
}

/*
 * update_connected_layer_gpu: apply the accumulated *_updates_gpu buffers to
 * the connected layer's GPU-side weights, biases and (when present) scales.
 * Uses adam_update_gpu when a.adam is set, and the axpy/scal sequence otherwise.
 *
 * NOTE(review): this span holds two signatures back to back and pairs of
 * *_ongpu / *_gpu calls with identical arguments. It reads like the before and
 * after lines of a rename shown together rather than one compilable body --
 * confirm against the real source file before building from it.
 */
void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
void update_connected_layer_gpu(layer l, update_args a)
{
axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
// Per-layer step size: the rate carried in update_args scaled by this layer's learning_rate_scale.
float learning_rate = a.learning_rate*l.learning_rate_scale;
float momentum = a.momentum;
float decay = a.decay;
int batch = a.batch;
if(a.adam){
// Weights (inputs*outputs elements) and biases (outputs elements) each use their own m/v buffers.
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.inputs*l.outputs, batch, a.t);
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
// Scales are only updated when a scales buffer exists on the GPU.
if(l.scales_gpu){
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
}
}else{
// Non-Adam path. Presumably axpy_gpu is BLAS-style y += alpha*x and scal_gpu is x *= alpha -- TODO confirm.
axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);

if(l.batch_normalize){
axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
}
if(l.batch_normalize){
axpy_gpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_gpu(l.outputs, momentum, l.scale_updates_gpu, 1);
}

// Weight step: a -decay*batch multiple of the weights is folded into weight_updates first,
// then weight_updates is applied with learning_rate/batch and finally scaled by momentum.
axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
axpy_gpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_gpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_gpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
}
}

/*
 * forward_connected_layer_gpu: GPU forward pass of the connected layer.
 * Zero-fills l.output_gpu, runs one GEMM over net.input_gpu and l.weights_gpu
 * into l.output_gpu, then either calls forward_batchnorm_layer_gpu or adds the
 * biases, and finally applies l.activation over outputs*batch elements.
 *
 * NOTE(review): two signatures, paired *_ongpu / *_gpu calls and an unclosed
 * `if(l.batch_normalize){` appear here. This looks like before/after lines of a
 * change shown together, so the braces do not balance as written -- confirm
 * against the real source file.
 */
void forward_connected_layer_gpu(connected_layer l, network net)
void forward_connected_layer_gpu(layer l, network net)
{
int i;
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);

// GEMM dimensions: m = batch, k = inputs, n = outputs.
int m = l.batch;
int k = l.inputs;
int n = l.outputs;
float * a = net.input_gpu;
float * b = l.weights_gpu;
float * c = l.output_gpu;
// Second flag is 1: presumably "transpose B", i.e. weights stored as outputs x inputs -- TODO confirm.
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
if(l.batch_normalize){
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

// Bias is added here only when batch normalization is off.
if (l.batch_normalize) {
forward_batchnorm_layer_gpu(l, net);
} else {
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
}
// Per-sample bias add: one axpy per batch item over that item's l.outputs-wide row.
for(i = 0; i < l.batch; ++i){
axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
}
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}

void backward_connected_layer_gpu(connected_layer l, network net)
void backward_connected_layer_gpu(layer l, network net)
{
int i;
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
for(i = 0; i < l.batch; ++i){
axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
}

constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
if(l.batch_normalize){
backward_batchnorm_layer_gpu(l, net);
} else {
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.outputs, 1);
}

int m = l.outputs;
@@ -314,7 +321,7 @@ void backward_connected_layer_gpu(connected_layer l, network net)
float * a = l.delta_gpu;
float * b = net.input_gpu;
float * c = l.weight_updates_gpu;
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
gemm_gpu(1,0,m,n,k,1,a,m,b,n,1,c,n);

m = l.batch;
k = l.outputs;
@@ -324,6 +331,6 @@ void backward_connected_layer_gpu(connected_layer l, network net)
b = l.weights_gpu;
c = net.delta_gpu;

if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
if(c) gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
#endif
@@ -5,22 +5,18 @@
#include "layer.h"
#include "network.h"

typedef layer connected_layer;
layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam);

connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize);

void forward_connected_layer(connected_layer layer, network net);
void backward_connected_layer(connected_layer layer, network net);
void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
void denormalize_connected_layer(layer l);
void statistics_connected_layer(layer l);
void forward_connected_layer(layer l, network net);
void backward_connected_layer(layer l, network net);
void update_connected_layer(layer l, update_args a);

#ifdef GPU
void forward_connected_layer_gpu(connected_layer layer, network net);
void backward_connected_layer_gpu(connected_layer layer, network net);
void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
void push_connected_layer(connected_layer layer);
void pull_connected_layer(connected_layer layer);
void forward_connected_layer_gpu(layer l, network net);
void backward_connected_layer_gpu(layer l, network net);
void update_connected_layer_gpu(layer l, update_args a);
void push_connected_layer(layer l);
void pull_connected_layer(layer l);
#endif

#endif
@@ -33,7 +33,7 @@ __global__ void binarize_input_kernel(float *input, int n, int size, float *bina
int i = 0;
float mean = 0;
for(i = 0; i < n; ++i){
mean += abs(input[i*size + s]);
mean += fabsf(input[i*size + s]);
}
mean = mean / n;
for(i = 0; i < n; ++i){
@@ -55,7 +55,7 @@ __global__ void binarize_weights_kernel(float *weights, int n, int size, float *
int i = 0;
float mean = 0;
for(i = 0; i < size; ++i){
mean += abs(weights[f*size + i]);
mean += fabsf(weights[f*size + i]);
}
mean = mean / size;
for(i = 0; i < size; ++i){
@@ -72,14 +72,14 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary)

void forward_convolutional_layer_gpu(convolutional_layer l, network net)
{
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
if(l.binary){
binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
swap_binary(&l);
}

if(l.xnor){
binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
swap_binary(&l);
binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
net.input_gpu = l.binary_input_gpu;
@@ -102,16 +102,20 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
l.output_gpu);

#else
int i;
int m = l.n;
int k = l.size*l.size*l.c;
int i, j;
int m = l.n/l.groups;
int k = l.size*l.size*l.c/l.groups;
int n = l.out_w*l.out_h;
for(i = 0; i < l.batch; ++i){
im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
float * a = l.weights_gpu;
float * b = net.workspace;
float * c = l.output_gpu;
gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
for(j = 0; j < l.groups; ++j){
float *a = l.weights_gpu + j*l.nweights/l.groups;
float *b = net.workspace;
float *c = l.output_gpu + (i*l.groups + j)*n*m;

im2col_gpu(net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w,
l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
}
#endif

@@ -121,7 +125,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
}

activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
//if(l.dot > 0) dot_error_gpu(l);
if(l.binary || l.xnor) swap_binary(&l);
}
@@ -139,8 +143,8 @@ __global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, fl
id /= c;
int b = id;

int w_offset = -(size/2.);
int h_offset = -(size/2.);
int w_offset = -(size/2.f);
int h_offset = -(size/2.f);

int out_index = j + w*(i + h*(k + c*b));
int l, m;
@@ -173,8 +177,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
if(l.smooth){
smooth_layer(l, 5, l.smooth);
}
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
//constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);


if(l.batch_normalize){
@@ -217,112 +221,102 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
l.dsrcTensorDesc,
net.delta_gpu);
if(l.binary || l.xnor) swap_binary(&l);
if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
}

#else
int m = l.n;
int n = l.size*l.size*l.c;
int m = l.n/l.groups;
int n = l.size*l.size*l.c/l.groups;
int k = l.out_w*l.out_h;

int i;
int i, j;
for(i = 0; i < l.batch; ++i){
float * a = l.delta_gpu;
float * b = net.workspace;
float * c = l.weight_updates_gpu;

im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);

if(net.delta_gpu){
if(l.binary || l.xnor) swap_binary(&l);
float * a = l.weights_gpu;
float * b = l.delta_gpu;
float * c = net.workspace;

gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);

col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w);
if(l.binary || l.xnor) {
swap_binary(&l);
for(j = 0; j < l.groups; ++j){
float *a = l.delta_gpu + (i*l.groups + j)*m*k;
float *b = net.workspace;
float *c = l.weight_updates_gpu + j*l.nweights/l.groups;

float *im = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;

im2col_gpu(im, l.c/l.groups, l.h, l.w,
l.size, l.stride, l.pad, b);
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);

if(net.delta_gpu){
if(l.binary || l.xnor) swap_binary(&l);
a = l.weights_gpu + j*l.nweights/l.groups;
b = l.delta_gpu + (i*l.groups + j)*m*k;
c = net.workspace;

gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);

col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride,
l.pad, net.delta_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w);
if(l.binary || l.xnor) {
swap_binary(&l);
}
}
if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
}
}
#endif
}

/*
 * pull_convolutional_layer: call cuda_pull_array for each parameter buffer of
 * the layer (weights, biases, their update buffers, and -- when
 * batch_normalize is set -- scales, rolling_mean and rolling_variance),
 * pairing every *_gpu pointer with its host counterpart.
 * Presumably this copies device -> host; verify against cuda_pull_array.
 *
 * NOTE(review): the span contains two signatures and the same sequence of
 * calls written once against `layer` and once against `l`. It looks like the
 * before/after lines of a change shown together -- confirm against the real
 * source file.
 */
void pull_convolutional_layer(convolutional_layer layer)
void pull_convolutional_layer(layer l)
{
// Variant sized by the explicit product c*n*size*size.
cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
if (layer.batch_normalize){
cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
}
if (layer.adam){
cuda_pull_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
cuda_pull_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
// Variant sized by l.nweights; it has no m/v transfer.
cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
cuda_pull_array(l.biases_gpu, l.biases, l.n);
cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
if (l.batch_normalize){
cuda_pull_array(l.scales_gpu, l.scales, l.n);
cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
}
}

/*
 * push_convolutional_layer: call cuda_push_array for each parameter buffer of
 * the layer (weights, biases, their update buffers, and -- when
 * batch_normalize is set -- scales, rolling_mean and rolling_variance),
 * pairing every *_gpu pointer with its host counterpart.
 * Presumably this copies host -> device; verify against cuda_push_array.
 *
 * NOTE(review): the span contains two signatures and the same sequence of
 * calls written once against `layer` and once against `l`. It looks like the
 * before/after lines of a change shown together -- confirm against the real
 * source file.
 */
void push_convolutional_layer(convolutional_layer layer)
void push_convolutional_layer(layer l)
{
// Variant sized by the explicit product c*n*size*size.
cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
if (layer.batch_normalize){
cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
}
if (layer.adam){
cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
// Variant sized by l.nweights; it has no m/v transfer.
cuda_push_array(l.weights_gpu, l.weights, l.nweights);
cuda_push_array(l.biases_gpu, l.biases, l.n);
cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
if (l.batch_normalize){
cuda_push_array(l.scales_gpu, l.scales, l.n);
cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
}
}

/*
 * This span shows two functions:
 *   - adam_update_gpu(w, d, m, v, ...): one Adam step over n elements, where
 *     w/d/m/v are the parameter, its update buffer and the two moment buffers.
 *   - update_convolutional_layer_gpu: applies the accumulated updates to the
 *     convolutional layer's GPU weights, biases and (when present) scales.
 *
 * NOTE(review): signatures and bodies of both are interleaved (an
 * adam_update_gpu signature is followed directly by an
 * update_convolutional_layer_gpu signature, and *_ongpu / *_gpu calls appear
 * in pairs). This looks like before/after lines of a change shown together,
 * not compilable code -- confirm against the real source file.
 */
void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
void update_convolutional_layer_gpu(layer l, update_args a)
{
// m and v are scaled by B1 and B2; d receives a -decay*batch multiple of w.
scal_ongpu(n, B1, m, 1);
scal_ongpu(n, B2, v, 1);
axpy_ongpu(n, -decay*batch, w, 1, d, 1);

// d is accumulated into m with factor (1-B1); d is then multiplied element-wise
// by itself (presumably squaring it in place -- TODO confirm mul_ongpu) and
// accumulated into v with factor (1-B2).
axpy_ongpu(n, (1-B1), d, 1, m, 1);
mul_ongpu(n, d, 1, d, 1);
axpy_ongpu(n, (1-B2), d, 1, v, 1);

// The parameter step itself is delegated to adam_gpu; d is zero-filled afterwards.
adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t);
fill_ongpu(n, 0, d, 1);
}

void update_convolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
{
int size = l.size*l.size*l.c*l.n;

if(l.adam){
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t);
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
// Per-layer step size: the rate carried in update_args scaled by this layer's learning_rate_scale.
float learning_rate = a.learning_rate*l.learning_rate_scale;
float momentum = a.momentum;
float decay = a.decay;
int batch = a.batch;

// In this variant the Adam settings (adam, B1, B2, eps, t) are read from update_args, not the layer.
if(a.adam){
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
// Scales are only updated when a scales buffer exists on the GPU.
if(l.scales_gpu){
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
}
}else{
// Non-Adam path: decay folded into weight_updates, weights stepped by learning_rate/batch,
// then weight_updates scaled by momentum. The element count is `size` in one variant
// and l.nweights in the other.
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);

axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);

if(l.scales_gpu){
axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
}
}
// When l.clip is non-zero, constrain_gpu is run over the weights with l.clip as its bound argument
// (presumably clamping to [-clip, clip] -- TODO confirm constrain_gpu).
if(l.clip){
constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
}
}


@@ -115,7 +115,7 @@ static size_t get_workspace_size(layer l){
return most;
}
#endif
return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
return (size_t)l.out_h*l.out_w*l.size*l.size*l.c/l.groups*sizeof(float);
}

#ifdef GPU
@@ -124,47 +124,62 @@ void cudnn_convolutional_setup(layer *l)
{
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size);
#if CUDNN_MAJOR >= 6
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
#else
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
#endif

#if CUDNN_MAJOR >= 7
cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
#else
if(l->groups > 1){
error("CUDNN < 7 doesn't support groups, please upgrade!");
}
#endif

cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
l->srcTensorDesc,
l->weightDesc,
l->convDesc,
l->dstTensorDesc,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
0,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
4000000000,
&l->fw_algo);
cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
l->weightDesc,
l->ddstTensorDesc,
l->convDesc,
l->dsrcTensorDesc,
CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
0,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
4000000000,
&l->bd_algo);
cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
l->srcTensorDesc,
l->ddstTensorDesc,
l->convDesc,
l->dweightDesc,
CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
0,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
4000000000,
&l->bf_algo);
}
#endif
#endif

convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
{
int i;
convolutional_layer l = {0};
l.type = CONVOLUTIONAL;

l.groups = groups;
l.h = h;
l.w = w;
l.c = c;
@@ -177,20 +192,21 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
l.pad = padding;
l.batch_normalize = batch_normalize;

l.weights = calloc(c*n*size*size, sizeof(float));
l.weight_updates = calloc(c*n*size*size, sizeof(float));
l.weights = calloc(c/groups*n*size*size, sizeof(float));
l.weight_updates = calloc(c/groups*n*size*size, sizeof(float));

l.biases = calloc(n, sizeof(float));
l.bias_updates = calloc(n, sizeof(float));

l.nweights = c*n*size*size;
l.nweights = c/groups*n*size*size;
l.nbiases = n;

// float scale = 1./sqrt(size*size*c);
float scale = sqrt(2./(size*size*c));
float scale = sqrt(2./(size*size*c/l.groups));
//printf("convscale %f\n", scale);
//scale = .02;
//for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal();
for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal();
int out_w = convolutional_out_width(l);
int out_h = convolutional_out_height(l);
l.out_h = out_h;
@@ -206,12 +222,12 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
l.backward = backward_convolutional_layer;
l.update = update_convolutional_layer;
if(binary){
l.binary_weights = calloc(c*n*size*size, sizeof(float));
l.cweights = calloc(c*n*size*size, sizeof(char));
l.binary_weights = calloc(l.nweights, sizeof(float));
l.cweights = calloc(l.nweights, sizeof(char));
l.scales = calloc(n, sizeof(float));
}
if(xnor){
l.binary_weights = calloc(c*n*size*size, sizeof(float));
l.binary_weights = calloc(l.nweights, sizeof(float));
l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
}

@@ -234,9 +250,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
}
if(adam){
l.adam = 1;
l.m = calloc(c*n*size*size, sizeof(float));
l.v = calloc(c*n*size*size, sizeof(float));
l.m = calloc(l.nweights, sizeof(float));
l.v = calloc(l.nweights, sizeof(float));
l.bias_m = calloc(n, sizeof(float));
l.scale_m = calloc(n, sizeof(float));
l.bias_v = calloc(n, sizeof(float));
@@ -250,16 +265,16 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int

if(gpu_index >= 0){
if (adam) {
l.m_gpu = cuda_make_array(l.m, c*n*size*size);
l.v_gpu = cuda_make_array(l.v, c*n*size*size);
l.m_gpu = cuda_make_array(l.m, l.nweights);
l.v_gpu = cuda_make_array(l.v, l.nweights);
l.bias_m_gpu = cuda_make_array(l.bias_m, n);
l.bias_v_gpu = cuda_make_array(l.bias_v, n);
l.scale_m_gpu = cuda_make_array(l.scale_m, n);
l.scale_v_gpu = cuda_make_array(l.scale_v, n);
}

l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
l.weights_gpu = cuda_make_array(l.weights, l.nweights);
l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);

l.biases_gpu = cuda_make_array(l.biases, n);
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
@@ -268,10 +283,10 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);

if(binary){
l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
}
if(xnor){
l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
}

@@ -307,7 +322,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
l.workspace_size = get_workspace_size(l);
l.activation = activation;

fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.);

return l;
}
@@ -317,8 +332,8 @@ void denormalize_convolutional_layer(convolutional_layer l)
int i, j;
for(i = 0; i < l.n; ++i){
float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
for(j = 0; j < l.c*l.size*l.size; ++j){
l.weights[i*l.c*l.size*l.size + j] *= scale;
for(j = 0; j < l.c/l.groups*l.size*l.size; ++j){
l.weights[i*l.c/l.groups*l.size*l.size + j] *= scale;
}
l.biases[i] -= l.rolling_mean[i] * scale;
l.scales[i] = 1;
@@ -429,54 +444,50 @@ void backward_bias(float *bias_updates, float *delta, int batch, int n, int size

/*
 * forward_convolutional_layer: CPU forward pass of the convolutional layer.
 * Zero-fills l.output, optionally binarizes weights and input (xnor), then for
 * each batch item unrolls the input with im2col_cpu into net.workspace and
 * multiplies it by the weights with gemm. Afterwards it applies either
 * forward_batchnorm_layer or add_bias, then the activation.
 *
 * NOTE(review): `i`, `m`, `k`, `n`, `a`, `b`, `c` are each declared twice and
 * the batch loop contains both an ungrouped im2col/gemm sequence and a
 * per-group inner loop. This looks like before/after lines of a change shown
 * together, not one compilable body -- confirm against the real source file.
 */
void forward_convolutional_layer(convolutional_layer l, network net)
{
int out_h = l.out_h;
int out_w = l.out_w;
int i;
int i, j;

fill_cpu(l.outputs*l.batch, 0, l.output, 1);

if(l.xnor){
binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
// swap_binary is called again at the end of the function (presumably to restore the weights -- TODO confirm).
swap_binary(&l);
binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
// net is passed by value, so this redirects the input for this call only.
net.input = l.binary_input;
}

// Ungrouped GEMM shape: m = l.n, k = size*size*c, n = out_h*out_w.
int m = l.n;
int k = l.size*l.size*l.c;
int n = out_h*out_w;


float *a = l.weights;
float *b = net.workspace;
float *c = l.output;

// Grouped GEMM shape: m and k are both divided by l.groups; n is unchanged.
int m = l.n/l.groups;
int k = l.size*l.size*l.c/l.groups;
int n = l.out_w*l.out_h;
for(i = 0; i < l.batch; ++i){
im2col_cpu(net.input, l.c, l.h, l.w,
l.size, l.stride, l.pad, b);
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
// Advance output and input pointers by one batch item.
c += n*m;
net.input += l.c*l.h*l.w;
for(j = 0; j < l.groups; ++j){
// Each group reads its own slice of the weights (nweights/groups apart) and writes
// its own n*m slice of the output, indexed by (i*groups + j).
float *a = l.weights + j*l.nweights/l.groups;
float *b = net.workspace;
float *c = l.output + (i*l.groups + j)*n*m;

im2col_cpu(net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w,
l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
}

// add_bias runs only when batch normalization is off.
if(l.batch_normalize){
forward_batchnorm_layer(l, net);
} else {
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
}

// The two variants size the activation by m*n*l.batch and by l.outputs*l.batch respectively;
// with the grouped m (l.n/l.groups) the former would cover only 1/groups of the output.
activate_array(l.output, m*n*l.batch, l.activation);
activate_array(l.output, l.outputs*l.batch, l.activation);
if(l.binary || l.xnor) swap_binary(&l);
}

void backward_convolutional_layer(convolutional_layer l, network net)
{
int i;
int m = l.n;
int n = l.size*l.size*l.c;
int i, j;
int m = l.n/l.groups;
int n = l.size*l.size*l.c/l.groups;
int k = l.out_w*l.out_h;

gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);

if(l.batch_normalize){
backward_batchnorm_layer(l, net);
@@ -485,31 +496,38 @@ void backward_convolutional_layer(convolutional_layer l, network net)
}

for(i = 0; i < l.batch; ++i){
float *a = l.delta + i*m*k;
float *b = net.workspace;
float *c = l.weight_updates;
for(j = 0; j < l.groups; ++j){
float *a = l.delta + (i*l.groups + j)*m*k;
float *b = net.workspace;
float *c = l.weight_updates + j*l.nweights/l.groups;

float *im = net.input+i*l.c*l.h*l.w;
float *im = net.input+(i*l.groups + j)*l.c/l.groups*l.h*l.w;

im2col_cpu(im, l.c, l.h, l.w,
l.size, l.stride, l.pad, b);
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
im2col_cpu(im, l.c/l.groups, l.h, l.w,
l.size, l.stride, l.pad, b);
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);

if(net.delta){
a = l.weights;
b = l.delta + i*m*k;
c = net.workspace;
if(net.delta){
a = l.weights + j*l.nweights/l.groups;
b = l.delta + (i*l.groups + j)*m*k;
c = net.workspace;

gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);

col2im_cpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta+i*l.c*l.h*l.w);
col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride,
l.pad, net.delta + (i*l.groups + j)*l.c/l.groups*l.h*l.w);
}
}
}
}

void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
void update_convolutional_layer(convolutional_layer l, update_args a)
{
int size = l.size*l.size*l.c*l.n;
float learning_rate = a.learning_rate*l.learning_rate_scale;
float momentum = a.momentum;
float decay = a.decay;
int batch = a.batch;

axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
scal_cpu(l.n, momentum, l.bias_updates, 1);

@@ -518,17 +536,17 @@ void update_convolutional_layer(convolutional_layer l, int batch, float learning
scal_cpu(l.n, momentum, l.scale_updates, 1);
}

axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
scal_cpu(size, momentum, l.weight_updates, 1);
axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
axpy_cpu(l.nweights, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
scal_cpu(l.nweights, momentum, l.weight_updates, 1);
}


/*
 * get_convolutional_weight: return float_to_image(w, h, c, ...) over the slice
 * of l.weights that starts at offset i*h*w*c, with w = h = l.size.
 * Whether the returned image aliases l.weights or copies it depends on
 * float_to_image -- verify there.
 *
 * NOTE(review): `c` is declared twice (l.c and l.c/l.groups). This looks like
 * the before/after lines of a change shown together -- confirm against the
 * real source file.
 */
image get_convolutional_weight(convolutional_layer l, int i)
{
int h = l.size;
int w = l.size;
int c = l.c;
// Grouped variant: the channel count is l.c/l.groups.
int c = l.c/l.groups;
return float_to_image(w,h,c,l.weights+i*h*w*c);
}

@@ -564,10 +582,10 @@ image *get_weights(convolutional_layer l)
weights[i] = copy_image(get_convolutional_weight(l, i));
normalize_image(weights[i]);
/*
char buff[256];
sprintf(buff, "filter%d", i);
save_image(weights[i], buff);
*/
char buff[256];
sprintf(buff, "filter%d", i);
save_image(weights[i], buff);
*/
}
//error("hey");
return weights;
@@ -12,7 +12,7 @@ typedef layer convolutional_layer;
#ifdef GPU
// GPU entry points of the convolutional layer.
void forward_convolutional_layer_gpu(convolutional_layer layer, network net);
void backward_convolutional_layer_gpu(convolutional_layer layer, network net);
// Single prototype only: a second declaration with the scalar
// (batch, learning_rate, momentum, decay) parameter list would conflict with
// this one. Hyper-parameters are passed bundled in update_args.
void update_convolutional_layer_gpu(convolutional_layer layer, update_args a);

void push_convolutional_layer(convolutional_layer layer);
void pull_convolutional_layer(convolutional_layer layer);
@@ -25,11 +25,10 @@ void cudnn_convolutional_setup(layer *l);
#endif
#endif

// Build a convolutional layer. `groups` sits right after the filter count `n`;
// callers that want an ordinary (ungrouped) convolution pass 1.
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
void denormalize_convolutional_layer(convolutional_layer l);
void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
void forward_convolutional_layer(const convolutional_layer layer, network net);
// The update step reads learning_rate, momentum, decay and batch from `a`.
// Only this prototype is kept; the scalar-argument form conflicts with it.
void update_convolutional_layer(convolutional_layer layer, update_args a);
image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
void binarize_weights(float *weights, int n, int size, float *binary);
void swap_binary(convolutional_layer *l);
@@ -40,15 +39,12 @@ void backward_convolutional_layer(convolutional_layer layer, network net);
void add_bias(float *output, float *biases, int batch, int n, int size);
void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);

image *get_weights(convolutional_layer l);
image get_convolutional_image(convolutional_layer layer);
image get_convolutional_delta(convolutional_layer layer);
image get_convolutional_weight(convolutional_layer layer, int i);

int convolutional_out_height(convolutional_layer layer);
int convolutional_out_width(convolutional_layer layer);
void rescale_weights(convolutional_layer l, float scale, float trans);
void rgbgr_weights(convolutional_layer l);

#endif

@@ -9,17 +9,21 @@

/*
 * Translate a cost-type name into its COST_TYPE value.
 *
 * s: NUL-terminated name; matching is exact and case-sensitive
 *    ("seg", "sse", "masked", "smooth", "L1", "wgan").
 *
 * Returns the matching COST_TYPE. An unrecognised name is reported on stderr
 * and SSE is returned instead.
 */
COST_TYPE get_cost_type(char *s)
{
    // Lookup table, kept in the same order the names are tested in.
    static const struct {
        const char *name;
        COST_TYPE type;
    } known[] = {
        {"seg",    SEG},
        {"sse",    SSE},
        {"masked", MASKED},
        {"smooth", SMOOTH},
        {"L1",     L1},
        {"wgan",   WGAN}
    };
    const int count = (int)(sizeof(known)/sizeof(known[0]));
    int idx;

    for(idx = 0; idx < count; ++idx){
        if(strcmp(s, known[idx].name) == 0){
            return known[idx].type;
        }
    }

    fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s);
    return SSE;
}

char *get_cost_string(COST_TYPE a)
{
switch(a){
case SEG:
return "seg";
case SSE:
return "sse";
case MASKED:
@@ -28,6 +32,8 @@ char *get_cost_string(COST_TYPE a)
return "smooth";
case L1:
return "L1";
case WGAN:
return "wgan";
}
return "sse";
}
@@ -122,33 +128,40 @@ void forward_cost_layer_gpu(cost_layer l, network net)
{
if (!net.truth) return;
if(l.smooth){
scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
}
if (l.cost_type == MASKED) {
mask_ongpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu);
scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
}

if(l.cost_type == SMOOTH){
smooth_l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
} else if (l.cost_type == L1){
l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
} else if (l.cost_type == WGAN){
wgan_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
} else {
l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
}

if (l.cost_type == SEG && l.noobject_scale != 1) {
scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale);
scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale);
}
if (l.cost_type == MASKED) {
mask_gpu(l.batch*l.inputs, net.delta_gpu, SECRET_NUM, net.truth_gpu, 0);
}

if(l.ratio){
cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare);
int n = (1-l.ratio) * l.batch*l.inputs;
float thresh = l.delta[n];
thresh = 0;
printf("%f\n", thresh);
supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
}

if(l.thresh){
supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
}

cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
@@ -157,7 +170,7 @@ void forward_cost_layer_gpu(cost_layer l, network net)

/*
 * Backward pass of the cost layer on the GPU: add this layer's delta, scaled
 * by l.scale, into the upstream delta buffer net.delta_gpu.
 *
 * l:   cost layer; l.delta_gpu holds l.batch*l.inputs values.
 * net: network state; net.delta_gpu is the accumulation target.
 */
void backward_cost_layer_gpu(const cost_layer l, network net)
{
    // Exactly one accumulation. Issuing the axpy under both the axpy_ongpu and
    // the axpy_gpu spelling would add the scaled delta to net.delta_gpu twice.
    axpy_gpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

@@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou

l.input_layer = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
*(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 1, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
l.input_layer->batch = batch;

l.self_layer = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
*(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 1, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
l.self_layer->batch = batch;

l.output_layer = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
*(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 1, 3, 1, 1, activation, batch_normalize, 0, 0, 0);
l.output_layer->batch = batch;

l.output = l.output_layer->output;
@@ -81,11 +81,11 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
return l;
}

void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
void update_crnn_layer(layer l, update_args a)
{
update_convolutional_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
update_convolutional_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
update_convolutional_layer(*(l.input_layer), a);
update_convolutional_layer(*(l.self_layer), a);
update_convolutional_layer(*(l.output_layer), a);
}

void forward_crnn_layer(layer l, network net)
@@ -194,11 +194,11 @@ void push_crnn_layer(layer l)
push_convolutional_layer(*(l.output_layer));
}

void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
void update_crnn_layer_gpu(layer l, update_args a)
{
update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
update_convolutional_layer_gpu(*(l.input_layer), a);
update_convolutional_layer_gpu(*(l.self_layer), a);
update_convolutional_layer_gpu(*(l.output_layer), a);
}

void forward_crnn_layer_gpu(layer l, network net)
@@ -209,10 +209,10 @@ void forward_crnn_layer_gpu(layer l, network net)
layer self_layer = *(l.self_layer);
layer output_layer = *(l.output_layer);

fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);

for (i = 0; i < l.steps; ++i) {
s.input_gpu = net.input_gpu;
@@ -224,12 +224,12 @@ void forward_crnn_layer_gpu(layer l, network net)
float *old_state = l.state_gpu;
if(net.train) l.state_gpu += l.hidden*l.batch;
if(l.shortcut){
copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
}else{
fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
}
axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);

s.input_gpu = l.state_gpu;
forward_convolutional_layer_gpu(output_layer, s);
@@ -254,8 +254,8 @@ void backward_crnn_layer_gpu(layer l, network net)
increment_layer(&output_layer, l.steps - 1);
l.state_gpu += l.hidden*l.batch*l.steps;
for (i = l.steps-1; i >= 0; --i) {
copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);

s.input_gpu = l.state_gpu;
s.delta_gpu = self_layer.delta_gpu;
@@ -268,8 +268,8 @@ void backward_crnn_layer_gpu(layer l, network net)
if (i == 0) s.delta_gpu = 0;
backward_convolutional_layer_gpu(self_layer, s);

copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
else s.delta_gpu = 0;
@@ -10,12 +10,12 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou

// CPU entry points of the CRNN layer.
void forward_crnn_layer(layer l, network net);
void backward_crnn_layer(layer l, network net);
// Declared once, with the update_args parameter list; a second prototype
// taking scalar hyper-parameters would conflict with it.
void update_crnn_layer(layer l, update_args a);

#ifdef GPU
// GPU entry points; only declared when GPU is defined.
void forward_crnn_layer_gpu(layer l, network net);
void backward_crnn_layer_gpu(layer l, network net);
void update_crnn_layer_gpu(layer l, update_args a);
void push_crnn_layer(layer l);
void pull_crnn_layer(layer l);
#endif
@@ -113,9 +113,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
float r3 = rand[8*id + 3];

saturation = r0*(saturation - 1) + 1;
saturation = (r1 > .5) ? 1./saturation : saturation;
saturation = (r1 > .5f) ? 1.f/saturation : saturation;
exposure = r2*(exposure - 1) + 1;
exposure = (r3 > .5) ? 1./exposure : exposure;
exposure = (r3 > .5f) ? 1.f/exposure : exposure;

size_t offset = id * h * w * 3;
image += offset;
@@ -131,18 +131,18 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
} else {
shift = 0;
}
image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5)*shift;
image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5)*shift;
image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5)*shift;
image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift;
image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift;
image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift;
}

__global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
{
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(id >= size) return;

float cx = w/2.;
float cy = h/2.;
float cx = w/2.f;
float cy = h/2.f;

int count = id;
int j = id % crop_width;
@@ -160,11 +160,11 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i

float dw = (w - crop_width)*r4;
float dh = (h - crop_height)*r5;
flip = (flip && (r6 > .5));
flip = (flip && (r6 > .5f));
angle = 2*angle*r7 - angle;
if(!train){
dw = (w - crop_width)/2.;
dh = (h - crop_height)/2.;
dw = (w - crop_width)/2.f;
dh = (h - crop_height)/2.f;
flip = 0;
angle = 0;
}
@@ -174,8 +174,8 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
float x = (flip) ? w - dw - j - 1 : j + dw;
float y = i + dh;

float rx = cos(angle)*(x-cx) - sin(angle)*(y-cy) + cx;
float ry = sin(angle)*(x-cx) + cos(angle)*(y-cy) + cy;
float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx;
float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy;

output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
}
@@ -184,7 +184,7 @@ extern "C" void forward_crop_layer_gpu(crop_layer layer, network net)
{
cuda_random(layer.rand_gpu, layer.batch*8);

float radians = layer.angle*3.14159265/180.;
float radians = layer.angle*3.14159265f/180.f;

float scale = 2;
float translate = -1;
@@ -96,6 +96,8 @@ float *cuda_make_array(float *x, size_t n)
if(x){
status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
check_error(status);
} else {
fill_gpu(n, 0, x_gpu, 1);
}
if(!x_gpu) error("Cuda malloc failed\n");
return x_gpu;
@@ -170,5 +172,7 @@ float cuda_mag_array(float *x_gpu, size_t n)
free(temp);
return m;
}
#else
// Do-nothing stand-in for cuda_set_device, compiled in the #else branch so
// the symbol still exists there (presumably the non-GPU build -- confirm
// against the matching #if). The device index is ignored.
void cuda_set_device(int n)
{
    (void)n;
}

#endif
@@ -7,16 +7,10 @@

// Prototypes of the CUDA helper routines.
// Takes a cudaError_t status; callers pass the result of a CUDA runtime call.
void check_error(cudaError_t status);
cublasHandle_t blas_handle();
// Returns a device buffer of n floats. The definition copies x to the device
// when x is non-NULL and zero-fills the buffer otherwise.
float *cuda_make_array(float *x, size_t n);
int *cuda_make_int_array(int *x, size_t n);
void cuda_push_array(float *x_gpu, float *x, size_t n);
// Used to bring n floats from a device buffer into a host buffer before
// host-side work (e.g. l.delta_gpu into l.delta ahead of a qsort).
void cuda_pull_array(float *x_gpu, float *x, size_t n);
void cuda_set_device(int n);
void cuda_free(float *x_gpu);
// Fills a device buffer with n random floats -- presumably uniform in [0,1],
// since the crop kernels compare the values against .5; TODO confirm.
void cuda_random(float *x_gpu, size_t n);
float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
dim3 cuda_gridsize(size_t n);
float cuda_mag_array(float *x_gpu, size_t n);

#ifdef CUDNN
cudnnHandle_t cudnn_handle();