In [None]:
!pip3 install pycuda

Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m25.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2023.1.1-py2.py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting mako (from pycuda)
  Downloadi

In [None]:
import math
import random
import numpy as np
import pycuda
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [None]:
cuda_kernel = """
#include<stdio.h>

// Initial values of the four 32-bit MD5 state words (loaded by md5Init).
#define A 0x67452301
#define B 0xefcdab89
#define C 0x98badcfe
#define D 0x10325476
// Bitwise mixing functions, one per group of 16 rounds in md5Step.
// Every argument is parenthesized so the macros also expand correctly when
// they are handed expressions instead of plain identifiers.
#define F(X, Y, Z) (((X) & (Y)) | (~(X) & (Z)))
#define G(X, Y, Z) (((X) & (Z)) | ((Y) & ~(Z)))
#define H(X, Y, Z) ((X) ^ (Y) ^ (Z))
#define I(X, Y, Z) ((Y) ^ ((X) | ~(Z)))
// First dimension of the per-thread scratch arrays (upper bound on threads per launch).
#define MX_THREAD 6912
// Second dimension of the per-thread scratch arrays (string bytes plus terminator).
#define MX_LEN 50
// Fixed-width integer aliases used throughout the kernel source.
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;

// Running state of one MD5 computation. md5Init resets it, md5Update feeds
// bytes into it and md5Finalize writes the result into `digest`.
typedef struct{
    uint64_t size;        // Size of input in bytes
    uint32_t buffer[4];   // Current accumulation of hash
    uint8_t input[64];    // Input to be used in the next step
    uint8_t digest[16];   // Result of algorithm
}MD5Context;

// Left-rotation amount for each of the 64 rounds; md5Step reads S[i] in round i.
__device__ static uint32_t S[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
                       5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
                       4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
                       6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};

// Additive constant for each of the 64 rounds; md5Step adds K[i] in round i.
// (These are the MD5 round constants tabulated in RFC 1321.)
__device__ static uint32_t K[] = {0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
                       0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
                       0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
                       0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
                       0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
                       0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
                       0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
                       0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
                       0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
                       0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
                       0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
                       0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
                       0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
                       0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
                       0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
                       0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};


// Padding bytes: a single 0x80 followed by zeros (64 bytes in total).
// md5Finalize passes a prefix of this array to md5Update.
__device__ static uint8_t PADDING[] = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};

// Forward declarations, so the MD5 routines below may call one another
// regardless of the order in which they are defined.

// Context lifecycle: reset, absorb bytes, produce the digest.
__device__ void md5Init(MD5Context *ctx);
__device__ void md5Update(MD5Context *ctx, uint8_t *input_buffer, size_t input_len);
__device__ void md5Finalize(MD5Context *ctx);

// Compression of one 16-word block into the 4-word state.
__device__ void md5Step(uint32_t *buffer, uint32_t *input);

// Hash `len` bytes of `input` and write a hex string to `output`.
__device__ void md5String(char *input, int len, char *output);

// Copy `len` bytes from src (starting at offset s_id) into dest (starting at
// offset d_id), then write a terminating zero byte at dest[d_id + len].
// dest therefore needs room for len + 1 bytes beyond d_id.
__device__ void strcpy_gpu(char* dest, int d_id, char* src, int s_id, int len) {
    char *out = dest + d_id;
    const char *in = src + s_id;
    int copied = 0;
    while(copied < len){
        out[copied] = in[copied];
        ++copied;
    }
    out[len] = 0;
}

// Rotate the 32-bit value x left by n bit positions.
// Both shift counts are reduced modulo 32, so n == 0 (or any multiple of 32)
// returns x unchanged instead of shifting by the full word width, which is
// undefined behaviour. Results for n in 1..31 are the plain rotation.
__device__ uint32_t rotateLeft(uint32_t x, uint32_t n){
    n &= 31u;
    return (x << n) | (x >> ((32u - n) & 31u));
}

// Reset a context for a new message: zero the byte counter and load the four
// initial state words A, B, C, D into ctx->buffer.
__device__ void md5Init(MD5Context *ctx){
    const uint32_t initial_state[4] = {(uint32_t)A, (uint32_t)B, (uint32_t)C, (uint32_t)D};

    ctx->size = (uint64_t)0;
    for(unsigned int w = 0; w < 4; ++w){
        ctx->buffer[w] = initial_state[w];
    }
}

// Absorb input_len bytes from input_buffer into the context.
// Bytes are appended to ctx->input; each time 64 bytes have accumulated they
// are packed into sixteen 32-bit words and run through md5Step.
// ctx->size is advanced by input_len up front.
__device__ void md5Update(MD5Context *ctx, uint8_t *input_buffer, size_t input_len){
    uint32_t words[16];
    unsigned int fill = ctx->size % 64;   // bytes already waiting in ctx->input
    ctx->size += (uint64_t)input_len;

    for(unsigned int pos = 0; pos < input_len; ++pos){
        ctx->input[fill++] = (uint8_t)input_buffer[pos];

        // Nothing more to do until a whole 64-byte chunk is available.
        if(fill % 64 != 0) continue;

        // Assemble each word from four consecutive bytes, lowest byte first.
        for(unsigned int w = 0; w < 16; ++w){
            const uint8_t *b = ctx->input + (w * 4);
            words[w] = (uint32_t)b[0]
                     | ((uint32_t)b[1] <<  8)
                     | ((uint32_t)b[2] << 16)
                     | ((uint32_t)b[3] << 24);
        }
        md5Step(ctx->buffer, words);
        fill = 0;
    }
}

// Finish the hash: pad the message, append its length in bits, run the last
// block through md5Step and store the 16 result bytes in ctx->digest.
__device__ void md5Finalize(MD5Context *ctx){
    uint32_t block[16];
    const unsigned int used = ctx->size % 64;
    unsigned int pad_len;

    // Pad so that exactly 56 bytes of the final 64-byte block are occupied.
    if(used < 56){
        pad_len = 56 - used;
    }else{
        pad_len = (56 + 64) - used;
    }

    // md5Update counts the padding as message bytes; take that back out.
    md5Update(ctx, PADDING, pad_len);
    ctx->size -= (uint64_t)pad_len;

    // The first 14 words come from the buffered bytes (lowest byte first) ...
    for(unsigned int w = 0; w < 14; ++w){
        const uint8_t *b = ctx->input + (w * 4);
        block[w] = (uint32_t)b[0]
                 | ((uint32_t)b[1] <<  8)
                 | ((uint32_t)b[2] << 16)
                 | ((uint32_t)b[3] << 24);
    }
    // ... and the last two hold the low and high halves of the bit count.
    const uint64_t bit_count = ctx->size * 8;
    block[14] = (uint32_t)bit_count;
    block[15] = (uint32_t)(bit_count >> 32);

    md5Step(ctx->buffer, block);

    // Serialize each state word into the digest, lowest byte first.
    for(unsigned int w = 0; w < 4; ++w){
        const uint32_t word = ctx->buffer[w];
        for(unsigned int k = 0; k < 4; ++k){
            ctx->digest[(w * 4) + k] = (uint8_t)(word >> (8 * k));
        }
    }
}

// Run the 64 MD5 rounds over one block of sixteen 32-bit words (`input`) and
// add the outcome into the four state words in `buffer`.
__device__ void md5Step(uint32_t *buffer, uint32_t *input){
    uint32_t a = buffer[0];
    uint32_t b = buffer[1];
    uint32_t c = buffer[2];
    uint32_t d = buffer[3];

    for(unsigned int r = 0; r < 64; ++r){
        uint32_t mixed;     // output of this round's bitwise function
        unsigned int w;     // which message word this round consumes

        if(r < 16){
            mixed = F(b, c, d);
            w = r;
        }else if(r < 32){
            mixed = G(b, c, d);
            w = ((r * 5) + 1) % 16;
        }else if(r < 48){
            mixed = H(b, c, d);
            w = ((r * 3) + 5) % 16;
        }else{
            mixed = I(b, c, d);
            w = (r * 7) % 16;
        }

        // Compute the new b from the old values, then shift the other
        // three words along: a <- d, d <- c, c <- b.
        const uint32_t next_b = b + rotateLeft(a + mixed + K[r] + input[w], S[r]);
        a = d;
        d = c;
        c = b;
        b = next_b;
    }

    buffer[0] += a;
    buffer[1] += b;
    buffer[2] += c;
    buffer[3] += d;
}

// Lowercase hexadecimal alphabet, indexed by nibble value.
__device__ static char ch_mapping[16] = {'0', '1', '2', '3',
                       '4', '5', '6', '7',
                       '8', '9', 'a', 'b',
                       'c', 'd', 'e', 'f'};

// Hash the `len` bytes at `input` with MD5 and write a `len`-character
// reduced hex digest to `output`, followed by a terminating zero byte
// (output needs room for len + 1 bytes).
//
// The reduced digest is the first len/2 characters of the 32-character hex
// digest followed by its last (len - len/2) characters. `len` is expected to
// be in 0..64 so that every selected position lies inside the digest.
//
// The hex digits are derived on the fly from ctx.digest; no per-thread global
// scratch array is needed, so this function places no limit on the number of
// threads that may call it.
__device__ void md5String(char *input, int len, char *output){
    MD5Context ctx;
    md5Init(&ctx);
    md5Update(&ctx, (uint8_t *)input, len);
    md5Finalize(&ctx);

    const int head = len / 2;
    for(int i = 0; i < len; i++){
        // Position of this character within the full 32-character hex digest.
        const int pos = (i < head) ? i : (32 - len + i);
        const uint8_t byte = ctx.digest[pos >> 1];
        // Even positions are the high nibble of a byte, odd positions the low one.
        const uint8_t nibble = (pos & 1) ? (byte & 0xf) : ((byte >> 4) & 0xf);
        output[i] = ch_mapping[nibble];
    }
    output[len] = 0;
}

// One step of the walk: writes the reduced MD5 hex digest of data[0..len)
// into str_child. Thin wrapper around md5String.
__device__ void fn(char *data, int len, char *str_child) {
    md5String(data, len, str_child);
}

// Return 1 when `str` (of length `len`) is a distinguished point, 0 otherwise.
// A string qualifies when it starts with "a5b" and its last two characters
// are 'b' then '7' (i.e. str[len-1] == '7' and str[len-2] == 'b').
// Characters are examined in that order and checking stops at the first
// mismatch.
__device__ int distinct_property(char* str, int len){
    const int starts_ok = str[0] == 'a' && str[1] == '5' && str[2] == 'b';
    if(!starts_ok){
        return 0;
    }
    const int ends_ok = str[len - 1] == '7' && str[len - 2] == 'b';
    return ends_ok ? 1 : 0;
}

// Extract record number `id` from `str`, which is treated as a sequence of
// back-to-back records of `len` bytes each, into sub_str_loc, and append a
// terminating zero byte (sub_str_loc needs room for len + 1 bytes).
__device__ void getString(char* str, int id, int len, char* sub_str_loc){
  const int first = id * len;
  for(int k = 0; k < len; k++){
    sub_str_loc[k] = str[first + k];
  }
  sub_str_loc[len] = 0;
}

// Per-thread scratch strings: the current point of the walk (sub_str) and the
// hash of that point (str_child). Row `idx` belongs to thread `idx`.
__device__ char str_child[MX_THREAD][MX_LEN];
__device__ char sub_str[MX_THREAD][MX_LEN];

// Each thread walks x -> fn(x) -> fn(fn(x)) ... from its own start string
// until it reaches a string accepted by distinct_property or has checked
// mx_path points.
//
//   str         in : num_strings start strings of 2*slay bytes each, packed
//                    back to back with no terminators
//   op_str      out: same layout; slot idx receives the accepted string of
//                    thread idx (left untouched if none was reached)
//   mx_path     in : number of loop iterations per thread
//   retLen      out: retLen[idx] = i + 1, where i is the number of fn steps
//                    applied before the accepted string was seen; not written
//                    if none was reached
//   slay        in : half of the string length
//   num_strings in : number of walks; threads with idx >= num_strings idle
//
// The start string itself (i == 0) is never accepted.
__global__ void random_walk(char* str, char* op_str, int mx_path, int* retLen, int slay, int num_strings){
    int string_len = 2*slay;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Stay inside the caller's buffers and inside the scratch arrays.
    if(idx >= num_strings || idx >= MX_THREAD) return;
    // Scratch rows hold string_len bytes plus a terminator.
    if(string_len <= 0 || string_len >= MX_LEN) return;

    getString(str, idx, string_len, sub_str[idx]);
    for(int i = 0 ; i < mx_path; i++){
        if(i != 0 && distinct_property(sub_str[idx], string_len)){
            retLen[idx] =  i + 1;
            // Copy exactly string_len bytes. Output slots are packed with no
            // terminator, so writing one would land in the next thread's slot
            // (or past the end of the buffer for the last thread).
            char *slot = op_str + idx*string_len;
            for(int k = 0; k < string_len; k++) slot[k] = str_child[idx][k];
            break;
        }
        fn(sub_str[idx], string_len, str_child[idx]);
        strcpy_gpu(sub_str[idx], 0, str_child[idx], 0, string_len);
    }
}
"""

In [None]:
import hashlib

# Half of the reduced-digest length: fn() keeps SLAY leading and SLAY trailing hex characters.
SLAY = 10
# getSolution() derives the per-walk step limit from 1/theta.
theta = 1/1e6
# Walk end string (bytes) -> [start string, walk length]; filled in by getSolution().
hashMapSync = dict()

def fn(string, reduced = True):
    """Return the MD5 hex digest of ``string`` (encoded as UTF-8).

    With ``reduced`` true (the default) only the first ``SLAY`` and the last
    ``SLAY`` hex characters are kept and concatenated; otherwise the full
    32-character digest is returned. ``SLAY`` is read from module scope at
    call time.
    """
    digest = hashlib.md5(string.encode('utf-8')).hexdigest()
    return digest[:SLAY] + digest[-SLAY:] if reduced else digest

def randomSt():
    """Return a random lowercase hex string of length ``2 * SLAY``.

    Each character is drawn independently with ``random.randint`` from the
    16 hex digits. ``SLAY`` is read from module scope at call time.
    """
    alphabet = "0123456789abcdef"  # HEX FAMILY
    return "".join(
        alphabet[random.randint(0, len(alphabet) - 1)] for _ in range(2 * SLAY)
    )

def getSolution(mx_iter = 100):
  """Search on the GPU for two walks that end on the same string.

  Each batch launches one ``random_walk`` thread per random start string.
  A thread reports the string it stopped on and the walk length
  (``retLen``, 0 when it did not stop on an accepted string). End strings are
  remembered in the global ``hashMapSync``; as soon as a second walk ends on an
  already-seen string, both ``[start, length]`` pairs are printed and the
  search stops.

  Side effects: increments the global ``cnt`` once per reported walk (``cnt``
  must exist before the call) and adds entries to ``hashMapSync``.

  Args:
    mx_iter: maximum number of batches to launch.

  Returns:
    ``([start, length], [start, length])`` for the two walks that met, or
    ``None`` if no such pair was found within ``mx_iter`` batches.
  """
  global cnt

  block_size = 64  # threads per block
  grid_size = 80   # blocks per launch
  n_walks = grid_size * block_size  # must not exceed MX_THREAD in the kernel source
  str_len = 2 * SLAY
  batch_nbytes = n_walks * str_len

  # Compile and look up the kernel once rather than once per batch.
  mod = SourceModule(cuda_kernel)
  fetch = mod.get_function("random_walk")

  # Scalar kernel arguments, passed by value as numpy scalars.
  # ceil(1 / theta): `1 // theta` would already floor, making the ceil a no-op.
  mx_path = np.int32(math.ceil(1 / theta))
  slay = np.int32(SLAY)
  num_strings = np.int32(n_walks)

  # Device and host buffers are the same size for every batch; reuse them.
  str_gpu_ip = cuda.mem_alloc(batch_nbytes)
  str_gpu_op = cuda.mem_alloc(batch_nbytes)
  retLen = np.zeros(n_walks, dtype = np.int32)
  retLen_gpu = cuda.mem_alloc(retLen.nbytes)
  output_bytes = np.empty(batch_nbytes, dtype = np.uint8)

  for _ in range(mx_iter):
    # Fresh random start strings, packed back to back as fixed-width bytes.
    strArr = np.array([randomSt().encode("utf-8") for _ in range(n_walks)])
    cuda.memcpy_htod(str_gpu_ip, strArr)

    # The kernel only writes retLen for walks that stop, so clear it first.
    retLen.fill(0)
    cuda.memcpy_htod(retLen_gpu, retLen)

    # Function CALL
    fetch(str_gpu_ip, str_gpu_op, mx_path, retLen_gpu, slay, num_strings,
          block=(block_size, 1, 1), grid=(grid_size, 1))

    cuda.memcpy_dtoh(output_bytes, str_gpu_op)
    cuda.memcpy_dtoh(retLen, retLen_gpu)
    raw_output = output_bytes.tobytes()

    # Parse Output
    for res_id in range(n_walks):
      walk_len = int(retLen[res_id])  # plain int: stable repr when printed
      if not walk_len: continue
      cnt += 1
      x = strArr[res_id].decode('utf-8')
      hash_x = raw_output[res_id * str_len: (res_id + 1) * str_len]
      if hash_x not in hashMapSync:
        hashMapSync[hash_x] = [x, walk_len]
      else:
        print(f'{[x, walk_len]}, {hashMapSync[hash_x]}')
        return [x, walk_len], hashMapSync[hash_x]

  return None


In [None]:
%%time
# getSolution() increments the global `cnt` once per reported walk, so the
# counter has to exist (and start from zero) before the search is launched.
cnt = 0
getSolution(mx_iter=1_000_000)
print(cnt)

In [None]:
import hashlib

reduced = True
def fn(string, reduced = True):
    """Return the MD5 hex digest of ``string`` (encoded as UTF-8).

    With ``reduced`` true (the default) only the first ``SLAY`` and the last
    ``SLAY`` hex characters are kept and concatenated; otherwise the full
    32-character digest is returned. ``SLAY`` is read from module scope at
    call time.
    """
    digest = hashlib.md5(string.encode('utf-8')).hexdigest()
    return digest[:SLAY] + digest[-SLAY:] if reduced else digest

# Two walks that ended on the same string, each given as [start, length].
# NOTE(review): these look like values pasted from a getSolution() printout
# of an earlier run - confirm before reusing.
seq0, seq1 =  ['2c40437b1487732f11', 79725], ['7acfbd132c01382f2a', 98213]
# fn() truncates digests using the global SLAY, so it has to match these strings.
SLAY = len(seq0[0])//2
x, y = seq0[0], seq1[0]

# Advance the start of the longer walk until both walks have the same number
# of steps left. In either case x ends up on the walk that was longer.
if seq0[-1] > seq1[-1]:
    x, y = seq0[0], seq1[0]
    while seq0[-1] > seq1[-1]:
        x = fn(x)
        seq0[-1] -= 1

if seq0[-1] < seq1[-1]:
    x, y = seq1[0], seq0[0]
    while seq0[-1] < seq1[-1]:
        x = fn(x)
        seq1[-1] -= 1

# Step both walks together until their successors coincide. Each successor is
# hashed once and reused as the next point.
fx, fy = fn(x), fn(y)
while seq0[-1] and fx != fy:
    x, y = fx, fy
    seq0[-1] -= 1
    fx, fy = fn(x), fn(y)

if x != y and fx == fy:
    # Two different strings with the same reduced digest.
    print(f'Result ~ [{x}: {fn(x, reduced = False)}]  [{y}: {fn(y, reduced = False)}]')
elif x == y:
    # One start lies on the other walk, so both walks are on the same string.
    print(f'No collision: both walks are on the same string {x} after alignment')
else:
    print('No collision: ran out of steps before the walks met')