Skip to content

Commit

Permalink
add monero POW v7
Browse files Browse the repository at this point in the history
- add new monero POW v7
- increase version
  • Loading branch information
psychocrypt committed Mar 29, 2018
1 parent e124c51 commit b69d6c3
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 36 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -36,3 +36,4 @@ obj/
# merge original backup files
*.orig

/build/
36 changes: 29 additions & 7 deletions cryptonight.c
Expand Up @@ -13,6 +13,23 @@
#include "crypto/c_skein.h"
#include "cryptonight.h"

#define VARIANT1_1(p) \
do if (variant > 0) \
{ \
const uint8_t tmp = ((const uint8_t*)(p))[11]; \
static const uint32_t table = 0x75310; \
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
} while(0)

#define VARIANT1_INIT() \
if (variant > 0 && len < 43) \
{ \
printf("length must be 43 byte\n"); \
return; \
} \
const uint64_t tweak1_2 = variant > 0 ? *((const uint64_t*) (((const uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0

struct cryptonight_ctx {
uint8_t long_state[MEMORY];
union cn_slow_hash_state state;
Expand Down Expand Up @@ -104,14 +121,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}

static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];

((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant > 0 ? lo ^ tweak1_2 : lo;
}

static void copy_block(uint8_t* dst, const uint8_t* src) {
Expand All @@ -129,12 +146,14 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}

void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx) {
void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx, int variant) {
size_t i, j;
hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);

VARIANT1_INIT();

oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
Expand All @@ -157,14 +176,17 @@ void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cr
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
VARIANT1_1(&ctx->long_state[j]);

mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2);

mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);

j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
VARIANT1_1(&ctx->long_state[j]);

mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2);
}

memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
Expand All @@ -188,8 +210,8 @@ void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cr
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}

void cryptonight_hash(void* output, const void* input, size_t len) {
void cryptonight_hash(void* output, const void* input, size_t len, int variant) {
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx);
cryptonight_hash_ctx(output, input, len, ctx,variant);
free(ctx);
}
7 changes: 5 additions & 2 deletions cryptonight.h
Expand Up @@ -154,10 +154,13 @@ void hash_permutation(union hash_state *state);
void hash_process(union hash_state *state, const uint8_t *buf, size_t count);

void cryptonight_core_cpu_init(int thr_id, int threads);
void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state,
uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2,
uint32_t variant, uint32_t *d_ctx_tweak1_2);

void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_prepare(int thr_id, int threads, uint32_t startNonce,
uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint32_t *d_ctx_tweak1_2);
void cryptonight_extra_cpu_final(int thr_id, int threads, uint32_t startNonce, uint32_t *nonce, uint32_t *d_ctx_state);

16 changes: 11 additions & 5 deletions cryptonight/cryptonight.cu
Expand Up @@ -130,19 +130,21 @@ static uint32_t *d_ctx_b[8];
static uint32_t *d_ctx_key1[8];
static uint32_t *d_ctx_key2[8];
static uint32_t *d_ctx_text[8];
static uint32_t *d_ctx_tweak1_2[8];

extern "C"
{
extern bool opt_benchmark;
}

extern "C" void cryptonight_hash(void* output, const void* input, size_t len);
extern "C" void cryptonight_hash(void* output, const void* input, size_t len, int variant);

extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done, uint32_t *results)
{
cudaError_t err;
int res;
uint32_t *nonceptr = (uint32_t*)(((char*)pdata) + 39);
int variant = ((uint8_t*)pdata)[0] >= 7 ? ((uint8_t*)pdata)[0] - 6 : 0;
const uint32_t first_nonce = *nonceptr;
uint32_t nonce = *nonceptr;
int cn_blocks = device_config[thr_id][0];
Expand Down Expand Up @@ -187,6 +189,8 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t
exit_if_cudaerror(thr_id, __FILE__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FILE__, __LINE__);
cudaMalloc(&d_ctx_tweak1_2[thr_id], 2 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FILE__, __LINE__);

cryptonight_extra_cpu_init(thr_id);

Expand All @@ -199,8 +203,10 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t
{
uint32_t foundNonce[2];

cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id],
d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak1_2[thr_id]);
cryptonight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id],
d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak1_2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, foundNonce, d_ctx_state[thr_id]);

if(foundNonce[0] < 0xffffffff)
Expand All @@ -210,7 +216,7 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t
memcpy(tempdata, pdata, 76);
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
*tempnonceptr = foundNonce[0];
cryptonight_hash(vhash64, tempdata, 76);
cryptonight_hash(vhash64, tempdata, 76, variant);
if((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget))
{
res = 1;
Expand All @@ -219,7 +225,7 @@ extern "C" int scanhash_cryptonight(int thr_id, uint32_t *pdata, const uint32_t
if(foundNonce[1] < 0xffffffff)
{
*tempnonceptr = foundNonce[1];
cryptonight_hash(vhash64, tempdata, 76);
cryptonight_hash(vhash64, tempdata, 76, variant);
if((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget))
{
res++;
Expand Down
47 changes: 42 additions & 5 deletions cryptonight/cuda_cryptonight_core.cu
Expand Up @@ -115,10 +115,19 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr, const uint32
#endif
}

__device__ __forceinline__ uint32_t variant1_1(const uint32_t src)
{
const uint8_t tmp = src >> 24;
const uint32_t table = 0x75310;
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
return (src & 0x00ffffff) | ((tmp ^ ((table >> index) & 0x30)) << 24);
}

template< uint32_t variant >
#ifdef XMR_THREADS
__launch_bounds__(XMRMINER_THREADS * 4)
#endif
__global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
__global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, const uint32_t * d_tweak1_2)
{
__shared__ uint32_t sharedMemory[1024];

Expand All @@ -139,6 +148,13 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partid
if (thread >= threads)
return;

uint32_t tweak1_2[2];
if (variant > 0)
{
tweak1_2[0] = d_tweak1_2[thread * 2];
tweak1_2[1] = d_tweak1_2[thread * 2 + 1];
}

int i, k;
uint32_t j;
const int batchsize = ITER >> (2 + bfactor);
Expand Down Expand Up @@ -174,7 +190,8 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partid
//XOR_BLOCKS_DST(c, b, &long_state[j]);
t1[0] = shuffle(sPtr, sub, d[x], 0);
//long_state[j] = d[0] ^ d[1];
storeGlobal32(long_state + j, d[0] ^ d[1]);
const uint32_t z = d[0] ^ d[1];
storeGlobal32(long_state + j, (variant > 0 && sub == 2) ? variant1_1(z) : z);

//MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
j = ((*t1 & 0x1FFFF0) >> 2) + sub;
Expand All @@ -194,7 +211,7 @@ __global__ void cryptonight_core_gpu_phase2(int threads, int bfactor, int partid

res = *((uint64_t *) t2) >> (sub & 1 ? 32 : 0);

storeGlobal32(long_state + j, res);
storeGlobal32(long_state + j, (variant > 0 && sub2) ? (tweak1_2[sub & 1] ^ res) : res);
a = (sub & 1 ? yy[1] : yy[0]) ^ res;
}
}
Expand Down Expand Up @@ -241,7 +258,11 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
}
}

__host__ void cryptonight_core_cpu_hash( int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2 )
template< uint32_t variant >
__host__ void cryptonight_core_cpu_hash_template( int thr_id, int blocks,
int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2,
uint32_t *d_ctx_tweak1_2)
{
dim3 grid( blocks );
dim3 block( threads );
Expand Down Expand Up @@ -275,7 +296,8 @@ __host__ void cryptonight_core_cpu_hash( int thr_id, int blocks, int threads, ui
}
for ( i = 0; i < partcount; i++ )
{
cryptonight_core_gpu_phase2<<< grid, block4, block4.x * sizeof (uint32_t) * static_cast<int> (device_arch[thr_id][0] < 3) >>>( blocks*threads, device_bfactor[thr_id], i, d_long_state, d_ctx_a, d_ctx_b );
cryptonight_core_gpu_phase2<variant><<< grid, block4, block4.x * sizeof (uint32_t) * static_cast<int> (device_arch[thr_id][0] < 3) >>>(
blocks*threads, device_bfactor[thr_id], i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak1_2 );
exit_if_cudaerror( thr_id, __FILE__, __LINE__ );
if ( partcount > 1 ) usleep( device_bsleep[thr_id] );
}
Expand All @@ -293,3 +315,18 @@ __host__ void cryptonight_core_cpu_hash( int thr_id, int blocks, int threads, ui
exit_if_cudaerror( thr_id, __FILE__, __LINE__ );
}
}

__host__ void cryptonight_core_cpu_hash( int thr_id, int blocks,
int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2,
uint32_t variant, uint32_t *d_ctx_tweak1_2)
{

if(variant == 0)
cryptonight_core_cpu_hash_template<0>(thr_id, blocks, threads, d_long_state, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1,
d_ctx_key2, d_ctx_tweak1_2);
else if(variant >= 1)
cryptonight_core_cpu_hash_template<1>(thr_id, blocks, threads, d_long_state, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1,
d_ctx_key2, d_ctx_tweak1_2);
}

20 changes: 17 additions & 3 deletions cryptonight/cuda_cryptonight_extra.cu
Expand Up @@ -88,7 +88,11 @@ __device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__
}
}

__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 )
__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a,
uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2,
int variant, uint32_t * d_ctx_tweak1_2
)
{
int thread = ( blockDim.x * blockIdx.x + threadIdx.x );

Expand All @@ -100,6 +104,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
uint32_t ctx_key1[40];
uint32_t ctx_key2[40];
uint32_t input[19];
uint32_t tweak1_2[2];

MEMCPY4( input, d_input, 19 );
//*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
Expand All @@ -113,6 +118,15 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );

if (variant > 0)
{
tweak1_2[0] = (input[8] >> 24) | (input[9] << 8);
tweak1_2[0] ^= ctx_state[48];
tweak1_2[1] = nonce;
tweak1_2[1] ^= ctx_state[49];
MEMCPY4(d_ctx_tweak1_2 + thread * 2, tweak1_2, 2);
}

memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 );
Expand Down Expand Up @@ -205,14 +219,14 @@ __host__ void cryptonight_extra_cpu_init( int thr_id )
exit_if_cudaerror( thr_id, __FILE__, __LINE__ );
}

__host__ void cryptonight_extra_cpu_prepare( int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2 )
__host__ void cryptonight_extra_cpu_prepare( int thr_id, int threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint32_t *d_ctx_tweak1_2 )
{
int threadsperblock = 128;

dim3 grid( ( threads + threadsperblock - 1 ) / threadsperblock );
dim3 block( threadsperblock );

cryptonight_extra_gpu_prepare<<<grid, block >>>( threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2 );
cryptonight_extra_gpu_prepare<<<grid, block >>>( threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak1_2 );
exit_if_cudaerror( thr_id, __FILE__, __LINE__ );
}

Expand Down
2 changes: 1 addition & 1 deletion miner.h
Expand Up @@ -198,7 +198,7 @@ extern int scanhash_cryptonight(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t *results);

extern void cryptonight_hash(void* output, const void* input, size_t len);
extern void cryptonight_hash(void* output, const void* input, size_t len, int variant);

struct thr_info {
int id;
Expand Down
2 changes: 1 addition & 1 deletion xmrMiner-config.h.cmake
Expand Up @@ -35,7 +35,7 @@


/* Define to the version of this package. */
#define PACKAGE_VERSION "0.2.1"
#define PACKAGE_VERSION "0.3.0"
#define PACKAGE_NAME "xmrMiner"
#define PROGRAM_NAME "xmrMiner"
#define PACKAGE_STRING PACKAGE_NAME " " PACKAGE_VERSION

0 comments on commit b69d6c3

Please sign in to comment.