Join GitHub today
GitHub is home to over 50 million developers working together to host and review code, manage projects, and build software together.
Sign up| /* | |
| * Copyright (c) 2019 Trail of Bits, Inc. | |
| * | |
| * Licensed under the Apache License, Version 2.0 (the "License"); | |
| * you may not use this file except in compliance with the License. | |
| * You may obtain a copy of the License at | |
| * | |
| * http://www.apache.org/licenses/LICENSE-2.0 | |
| * | |
| * Unless required by applicable law or agreed to in writing, software | |
| * distributed under the License is distributed on an "AS IS" BASIS, | |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| * See the License for the specific language governing permissions and | |
| * limitations under the License. | |
| */ | |
| #include <stdint.h> | |
| #include <inttypes.h> | |
| #include <stdbool.h> | |
| #include <stdlib.h> | |
| #include "output.h" | |
| #include "sse.h" | |
| #ifndef __FILE_NAME__ | |
| #define __FILE_NAME__ "SSE" | |
| #endif | |
| #if !defined(__x86_64__) | |
| bool sse_check(void) { | |
| return false; | |
| } | |
| uint64_t sse_do_range( | |
| uint64_t start, uint64_t end, | |
| uint64_t secret, bool *found) | |
| { | |
| (void)(start); | |
| (void)(end); | |
| (void)(secret); | |
| *found = false; | |
| return 0; | |
| } | |
| #else | |
| #include <emmintrin.h> | |
| #include <immintrin.h> | |
| #include <smmintrin.h> | |
| bool sse_check(void) { | |
| return __builtin_cpu_supports("sse4.1"); | |
| } | |
| // use SSE instructions to compare 2 64-bit quantities at once | |
| // The inner loop is unrolled to maximize SIMD execution performance | |
| typedef uint64_t VECTOR[2] __attribute__ ((aligned (16))); | |
| // SSE 4.1 based loop unrolled to 8 SSE compares (aka 16 ops per loop) | |
| // AVX is better, but this works on older machines | |
| // And was a good test of using intrinsics | |
| uint64_t sse_do_range( | |
| uint64_t start, uint64_t end, | |
| uint64_t secret, bool *found) | |
| { | |
| // how many ops were done | |
| uint64_t out = 0; | |
| // how many operations per loop | |
| const unsigned OPS_PER_LOOP = 16; | |
| uint64_t max_iterations = (end - start) / OPS_PER_LOOP; | |
| // start at the following values | |
| VECTOR start_vec = { start+0, start+1}; | |
| // every iteration, add 4 to each since we compare 4 values at once | |
| VECTOR increment = { 0x2, 0x2}; | |
| // Test each lane of the vector against the following values | |
| VECTOR secret_vec = { secret, secret}; | |
| *found = false; | |
| __m128i sse_start = _mm_load_si128((__m128i*)&start_vec); | |
| __m128i sse_increment = _mm_load_si128((__m128i*)&increment); | |
| __m128i sse_secret = _mm_load_si128((__m128i*)&secret_vec); | |
| uint64_t counter = max_iterations; | |
| while(counter > 0) { | |
| __m128i result0 = _mm_cmpeq_epi64(sse_start, sse_secret); | |
| int summary0 = _mm_movemask_epi8(result0); | |
| __m128i next1 = _mm_add_epi64(sse_start, sse_increment); | |
| __m128i result1 = _mm_cmpeq_epi64(next1, sse_secret); | |
| int summary1 = _mm_movemask_epi8(result1); | |
| __m128i next2 = _mm_add_epi64(next1, sse_increment); | |
| __m128i result2 = _mm_cmpeq_epi64(next2, sse_secret); | |
| int summary2 = _mm_movemask_epi8(result2); | |
| __m128i next3 = _mm_add_epi64(next2, sse_increment); | |
| __m128i result3 = _mm_cmpeq_epi64(next3, sse_secret); | |
| int summary3 = _mm_movemask_epi8(result3); | |
| __m128i next4 = _mm_add_epi64(next3, sse_increment); | |
| __m128i result4 = _mm_cmpeq_epi64(next4, sse_secret); | |
| int summary4 = _mm_movemask_epi8(result4); | |
| __m128i next5 = _mm_add_epi64(next4, sse_increment); | |
| __m128i result5 = _mm_cmpeq_epi64(next5, sse_secret); | |
| int summary5 = _mm_movemask_epi8(result5); | |
| __m128i next6 = _mm_add_epi64(next5, sse_increment); | |
| __m128i result6 = _mm_cmpeq_epi64(next6, sse_secret); | |
| int summary6 = _mm_movemask_epi8(result6); | |
| __m128i next7 = _mm_add_epi64(next6, sse_increment); | |
| __m128i result7 = _mm_cmpeq_epi64(next7, sse_secret); | |
| int summary7 = _mm_movemask_epi8(result7); | |
| __m128i next8 = _mm_add_epi64(next7, sse_increment); | |
| sse_start = next8; | |
| if(0 != summary0 || | |
| 0 != summary1 || | |
| 0 != summary2 || | |
| 0 != summary3 || | |
| 0 != summary4 || | |
| 0 != summary5 || | |
| 0 != summary6 || | |
| 0 != summary7 | |
| ) | |
| { | |
| // found it | |
| break; | |
| } | |
| --counter; | |
| } | |
| out = max_iterations - counter; | |
| uint64_t index = out; | |
| for(unsigned i = 0; i < OPS_PER_LOOP; i++) { | |
| uint64_t val = start + (index * OPS_PER_LOOP) + i; | |
| if(val <= end && val == secret) { | |
| *found = true; | |
| break; | |
| } | |
| } | |
| if(secret == end) { | |
| *found = true; | |
| } | |
| return out*OPS_PER_LOOP; | |
| } | |
| #endif | |
| uint64_t sse_method(uint64_t secret, bool *found, uint64_t h_start, uint64_t h_end) { | |
| if (!sse_check()) { | |
| log_error( | |
| __FILE_NAME__, | |
| "Requested to use SSE method but your CPU doesn't support SSE4.1\n"); | |
| exit(-1); | |
| } | |
| return sse_do_range(h_start, h_end, secret, found); | |
| } |