Skip to content

Commit

Permalink
removed unroll in convolutions [harming most normal size convolutions]
Browse files Browse the repository at this point in the history
  • Loading branch information
andresy committed Jun 5, 2012
1 parent 14dbece commit 3e96df6
Showing 1 changed file with 28 additions and 50 deletions.
78 changes: 28 additions & 50 deletions lib/TH/THVector.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,31 +41,20 @@


/*
 * THDoubleVector_add(y, x, c, n): in-place scaled accumulate on doubles,
 * y[i] += c * x[i] for every i in [0, n).
 *
 * This span is a diff hunk, so two bodies follow the #define line:
 *   1. the body removed by this commit, whose main loop handles 6 doubles per
 *      iteration (three 2-wide unaligned loads of x and of y, multiply by the
 *      broadcast scalar, add, three unaligned stores);
 *   2. the body added by this commit, whose main loop handles 2 doubles per
 *      iteration (one 2-wide load of each operand).
 * Both bodies finish with the same scalar loop that covers the elements the
 * vector loop did not reach.
 *
 * NOTE(review): the scalar tail expands y, c and x without parentheses, while
 * the vector loop uses (x)+i / (y)+i. An argument such as `a+b` for c would
 * expand to `a+b * x[i]` in the tail only.
 * NOTE(review): the macro declares its own `long i`; an argument expression
 * that mentions a caller variable named `i` would bind to the macro's i.
 * NOTE(review): n is expanded in both loop conditions, so an argument with
 * side effects would be evaluated repeatedly.
 * NOTE(review): the bound `i<=((n)-K)` presumably relies on n being a signed
 * integer so that n < K yields a negative bound and skips the vector loop;
 * with an unsigned n the subtraction would wrap -- TODO confirm at call sites.
 */
#define THDoubleVector_add(y, x, c, n) { \
long i = 0; \
__m128d XMM7 = _mm_set1_pd(c); \
__m128d XMM0,XMM1,XMM2; \
__m128d XMM3,XMM4,XMM5; \
for (; i<=((n)-6); i+=6) { \
XMM0 = _mm_loadu_pd((x)+i); \
XMM1 = _mm_loadu_pd((x)+i+2); \
XMM2 = _mm_loadu_pd((x)+i+4); \
XMM3 = _mm_loadu_pd((y)+i); \
XMM4 = _mm_loadu_pd((y)+i+2); \
XMM5 = _mm_loadu_pd((y)+i+4); \
XMM0 = _mm_mul_pd(XMM0, XMM7); \
XMM1 = _mm_mul_pd(XMM1, XMM7); \
XMM2 = _mm_mul_pd(XMM2, XMM7); \
XMM3 = _mm_add_pd(XMM3, XMM0); \
XMM4 = _mm_add_pd(XMM4, XMM1); \
XMM5 = _mm_add_pd(XMM5, XMM2); \
_mm_storeu_pd((y)+i , XMM3); \
_mm_storeu_pd((y)+i+2, XMM4); \
_mm_storeu_pd((y)+i+4, XMM5); \
} \
for (; i<(n); i++) { \
y[i] += c * x[i]; \
} \
}
/* End of the removed (3x unrolled) body. The lines below are the body this
 * commit adds in its place: XMM7 still holds c broadcast to both lanes, XMM0
 * carries the x pair and XMM2 the y pair / result. */
long i = 0; \
__m128d XMM7 = _mm_set1_pd(c); \
__m128d XMM0,XMM2; \
for (; i<=((n)-2); i+=2) { \
XMM0 = _mm_loadu_pd((x)+i); \
XMM2 = _mm_loadu_pd((y)+i); \
XMM0 = _mm_mul_pd(XMM0, XMM7); \
XMM2 = _mm_add_pd(XMM2, XMM0); \
_mm_storeu_pd((y)+i , XMM2); \
} \
for (; i<(n); i++) { \
y[i] += c * x[i]; \
} \
}

#define THDoubleVector_diff(z, x, y, n) { \
long i; \
Expand Down Expand Up @@ -152,31 +141,20 @@
}

/*
 * THFloatVector_add(y, x, c, n): in-place scaled accumulate on floats,
 * y[i] += c * x[i] for every i in [0, n).
 *
 * This span is a diff hunk, so two bodies follow the #define line:
 *   1. the body removed by this commit, whose main loop handles 12 floats per
 *      iteration (three 4-wide unaligned loads of x and of y, multiply by the
 *      broadcast scalar, add, three unaligned stores);
 *   2. the body added by this commit, whose main loop handles 4 floats per
 *      iteration (one 4-wide load of each operand).
 * Both bodies finish with the same scalar loop that covers the elements the
 * vector loop did not reach.
 *
 * NOTE(review): the scalar tail expands y, c and x without parentheses, while
 * the vector loop uses (x)+i / (y)+i. An argument such as `a+b` for c would
 * expand to `a+b * x[i]` in the tail only.
 * NOTE(review): the macro declares its own `long i`; an argument expression
 * that mentions a caller variable named `i` would bind to the macro's i.
 * NOTE(review): n is expanded in both loop conditions, so an argument with
 * side effects would be evaluated repeatedly.
 * NOTE(review): the bound `i<=((n)-K)` presumably relies on n being a signed
 * integer so that n < K yields a negative bound and skips the vector loop;
 * with an unsigned n the subtraction would wrap -- TODO confirm at call sites.
 */
#define THFloatVector_add(y, x, c, n) { \
long i = 0; \
__m128 XMM7 = _mm_set_ps1(c); \
__m128 XMM0,XMM1,XMM2; \
__m128 XMM3,XMM4,XMM5; \
for (; i<=((n)-12); i+=12) { \
XMM0 = _mm_loadu_ps((x)+i); \
XMM1 = _mm_loadu_ps((x)+i+4); \
XMM2 = _mm_loadu_ps((x)+i+8); \
XMM3 = _mm_loadu_ps((y)+i); \
XMM4 = _mm_loadu_ps((y)+i+4); \
XMM5 = _mm_loadu_ps((y)+i+8); \
XMM0 = _mm_mul_ps(XMM0, XMM7); \
XMM1 = _mm_mul_ps(XMM1, XMM7); \
XMM2 = _mm_mul_ps(XMM2, XMM7); \
XMM3 = _mm_add_ps(XMM3, XMM0); \
XMM4 = _mm_add_ps(XMM4, XMM1); \
XMM5 = _mm_add_ps(XMM5, XMM2); \
_mm_storeu_ps((y)+i , XMM3); \
_mm_storeu_ps((y)+i+4, XMM4); \
_mm_storeu_ps((y)+i+8, XMM5); \
} \
for (; i<(n); i++) { \
y[i] += c * x[i]; \
} \
}
/* End of the removed (3x unrolled) body. The lines below are the body this
 * commit adds in its place: XMM7 still holds c broadcast to all four lanes,
 * XMM0 carries the x quad and XMM2 the y quad / result. */
long i = 0; \
__m128 XMM7 = _mm_set_ps1(c); \
__m128 XMM0,XMM2; \
for (; i<=((n)-4); i+=4) { \
XMM0 = _mm_loadu_ps((x)+i); \
XMM2 = _mm_loadu_ps((y)+i); \
XMM0 = _mm_mul_ps(XMM0, XMM7); \
XMM2 = _mm_add_ps(XMM2, XMM0); \
_mm_storeu_ps((y)+i , XMM2); \
} \
for (; i<(n); i++) { \
y[i] += c * x[i]; \
} \
}

#define THFloatVector_diff(z, x, y, n) { \
long i; \
Expand Down

0 comments on commit 3e96df6

Please sign in to comment.