From e3ab02f1e4c02193cfb0d450bc57d92436f4c999 Mon Sep 17 00:00:00 2001 From: Aditya Bisht Date: Mon, 5 Feb 2024 14:02:13 +0530 Subject: [PATCH] feat: js to rust compiler --- Cargo.lock | 10 +- .../circom/circuits/common/email_addr.json | 14 +- .../circuits/common/email_addr_regex.circom | 79 ++-- .../circom/circuits/common/email_domain.json | 30 +- .../circuits/common/email_domain_regex.circom | 75 ++-- .../circom/circuits/common/from_addr.json | 38 +- .../circom/circuits/common/message_id.json | 30 +- .../circuits/common/message_id_regex.circom | 353 +++++++++--------- .../circom/tests/circuits/simple_regex.circom | 2 - packages/compiler/Cargo.toml | 2 + packages/compiler/package.json | 4 +- packages/compiler/src/js_caller.rs | 14 +- packages/compiler/src/lib.rs | 6 +- packages/compiler/src/regex.rs | 177 +++++++++ 14 files changed, 501 insertions(+), 333 deletions(-) create mode 100644 packages/compiler/src/regex.rs diff --git a/Cargo.lock b/Cargo.lock index e61b1e3..405856c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1016,9 +1016,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", @@ -1028,9 +1028,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -1745,6 +1745,8 @@ dependencies = [ "js-sandbox", "neon", "petgraph", + "regex", + "regex-automata", "serde", "serde_json", "tabbycat", diff --git a/packages/circom/circuits/common/email_addr.json b/packages/circom/circuits/common/email_addr.json index c590439..0842165 100644 --- a/packages/circom/circuits/common/email_addr.json +++ b/packages/circom/circuits/common/email_addr.json @@ -1,8 +1,8 @@ { - "parts": [ - { - "is_public": true, - "regex_def": "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|!|#|$|%|&|'|\\*|\\+|-|/|=|\\?|^|_|`|{|\\||}|~|\\.)+@(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|\\.|-)+" - } - ] -} \ No newline at end of file + "parts": [ + { + "is_public": true, + "regex_def": "[A-Za-z0-9!#$%&'*+=?^_`{|}~.]+@[A-Za-z0-9.-]+" + } + ] +} diff --git a/packages/circom/circuits/common/email_addr_regex.circom b/packages/circom/circuits/common/email_addr_regex.circom index 9a029d5..0cda99f 100644 --- a/packages/circom/circuits/common/email_addr_regex.circom +++ b/packages/circom/circuits/common/email_addr_regex.circom @@ -13,7 +13,7 @@ template EmailAddrRegex(msg_bytes) { in[i+1] <== msg[i]; } - component eq[25][num_bytes]; + component eq[24][num_bytes]; component lt[8][num_bytes]; component and[9][num_bytes]; component multi_or[5][num_bytes]; @@ -71,55 +71,49 @@ template EmailAddrRegex(msg_bytes) { eq[7][i].in[1] <== 43; eq[8][i] = IsEqual(); eq[8][i].in[0] <== in[i]; - eq[8][i].in[1] <== 45; + eq[8][i].in[1] <== 46; eq[9][i] = IsEqual(); eq[9][i].in[0] <== in[i]; - eq[9][i].in[1] <== 46; + eq[9][i].in[1] <== 48; eq[10][i] = IsEqual(); eq[10][i].in[0] <== in[i]; - eq[10][i].in[1] <== 47; + eq[10][i].in[1] <== 49; eq[11][i] = IsEqual(); eq[11][i].in[0] <== in[i]; - eq[11][i].in[1] <== 48; + eq[11][i].in[1] <== 50; eq[12][i] = IsEqual(); eq[12][i].in[0] <== in[i]; - eq[12][i].in[1] <== 49; + eq[12][i].in[1] <== 51; eq[13][i] = IsEqual(); eq[13][i].in[0] <== in[i]; - eq[13][i].in[1] <== 50; + eq[13][i].in[1] <== 52; eq[14][i] = IsEqual(); eq[14][i].in[0] <== in[i]; - eq[14][i].in[1] <== 51; + eq[14][i].in[1] <== 53; eq[15][i] = IsEqual(); eq[15][i].in[0] <== in[i]; - eq[15][i].in[1] <== 52; + eq[15][i].in[1] <== 54; eq[16][i] = IsEqual(); eq[16][i].in[0] <== in[i]; - eq[16][i].in[1] <== 53; + eq[16][i].in[1] <== 55; eq[17][i] = IsEqual(); eq[17][i].in[0] <== in[i]; - eq[17][i].in[1] <== 54; + eq[17][i].in[1] <== 56; eq[18][i] = IsEqual(); eq[18][i].in[0] <== in[i]; - eq[18][i].in[1] <== 55; + eq[18][i].in[1] <== 57; eq[19][i] = IsEqual(); eq[19][i].in[0] <== in[i]; - eq[19][i].in[1] <== 56; + eq[19][i].in[1] <== 61; eq[20][i] = IsEqual(); eq[20][i].in[0] <== in[i]; - eq[20][i].in[1] <== 57; + eq[20][i].in[1] <== 63; eq[21][i] = IsEqual(); eq[21][i].in[0] <== in[i]; - eq[21][i].in[1] <== 61; - eq[22][i] = IsEqual(); - eq[22][i].in[0] <== in[i]; - eq[22][i].in[1] <== 63; - eq[23][i] = IsEqual(); - eq[23][i].in[0] <== in[i]; - eq[23][i].in[1] <== 255; + eq[21][i].in[1] <== 255; and[2][i] = AND(); and[2][i].a <== states[i][0]; - multi_or[0][i] = MultiOR(26); + multi_or[0][i] = MultiOR(24); multi_or[0][i].in[0] <== and[0][i].out; multi_or[0][i].in[1] <== and[1][i].out; multi_or[0][i].in[2] <== eq[0][i].out; @@ -144,8 +138,6 @@ template EmailAddrRegex(msg_bytes) { multi_or[0][i].in[21] <== eq[19][i].out; multi_or[0][i].in[22] <== eq[20][i].out; multi_or[0][i].in[23] <== eq[21][i].out; - multi_or[0][i].in[24] <== eq[22][i].out; - multi_or[0][i].in[25] <== eq[23][i].out; and[2][i].b <== multi_or[0][i].out; lt[4][i] = LessEqThan(8); lt[4][i].in[0] <== 94; @@ -158,7 +150,7 @@ template EmailAddrRegex(msg_bytes) { and[3][i].b <== lt[5][i].out; and[4][i] = AND(); and[4][i].a <== states[i][1]; - multi_or[1][i] = MultiOR(25); + multi_or[1][i] = MultiOR(23); multi_or[1][i].in[0] <== and[0][i].out; multi_or[1][i].in[1] <== and[3][i].out; multi_or[1][i].in[2] <== eq[0][i].out; @@ -182,20 +174,18 @@ template EmailAddrRegex(msg_bytes) { multi_or[1][i].in[20] <== eq[18][i].out; multi_or[1][i].in[21] <== eq[19][i].out; multi_or[1][i].in[22] <== eq[20][i].out; - multi_or[1][i].in[23] <== eq[21][i].out; - multi_or[1][i].in[24] <== eq[22][i].out; and[4][i].b <== multi_or[1][i].out; multi_or[2][i] = MultiOR(2); multi_or[2][i].in[0] <== and[2][i].out; multi_or[2][i].in[1] <== and[4][i].out; states[i+1][1] <== multi_or[2][i].out; state_changed[i].in[0] <== states[i+1][1]; - eq[24][i] = IsEqual(); - eq[24][i].in[0] <== in[i]; - eq[24][i].in[1] <== 64; + eq[22][i] = IsEqual(); + eq[22][i].in[0] <== in[i]; + eq[22][i].in[1] <== 64; and[5][i] = AND(); and[5][i].a <== states[i][1]; - and[5][i].b <== eq[24][i].out; + and[5][i].b <== eq[22][i].out; states[i+1][2] <== and[5][i].out; state_changed[i].in[1] <== states[i+1][2]; lt[6][i] = LessEqThan(8); @@ -207,23 +197,26 @@ template EmailAddrRegex(msg_bytes) { and[6][i] = AND(); and[6][i].a <== lt[6][i].out; and[6][i].b <== lt[7][i].out; + eq[23][i] = IsEqual(); + eq[23][i].in[0] <== in[i]; + eq[23][i].in[1] <== 45; and[7][i] = AND(); and[7][i].a <== states[i][2]; multi_or[3][i] = MultiOR(14); multi_or[3][i].in[0] <== and[0][i].out; multi_or[3][i].in[1] <== and[6][i].out; - multi_or[3][i].in[2] <== eq[8][i].out; - multi_or[3][i].in[3] <== eq[9][i].out; - multi_or[3][i].in[4] <== eq[11][i].out; - multi_or[3][i].in[5] <== eq[12][i].out; - multi_or[3][i].in[6] <== eq[13][i].out; - multi_or[3][i].in[7] <== eq[14][i].out; - multi_or[3][i].in[8] <== eq[15][i].out; - multi_or[3][i].in[9] <== eq[16][i].out; - multi_or[3][i].in[10] <== eq[17][i].out; - multi_or[3][i].in[11] <== eq[18][i].out; - multi_or[3][i].in[12] <== eq[19][i].out; - multi_or[3][i].in[13] <== eq[20][i].out; + multi_or[3][i].in[2] <== eq[23][i].out; + multi_or[3][i].in[3] <== eq[8][i].out; + multi_or[3][i].in[4] <== eq[9][i].out; + multi_or[3][i].in[5] <== eq[10][i].out; + multi_or[3][i].in[6] <== eq[11][i].out; + multi_or[3][i].in[7] <== eq[12][i].out; + multi_or[3][i].in[8] <== eq[13][i].out; + multi_or[3][i].in[9] <== eq[14][i].out; + multi_or[3][i].in[10] <== eq[15][i].out; + multi_or[3][i].in[11] <== eq[16][i].out; + multi_or[3][i].in[12] <== eq[17][i].out; + multi_or[3][i].in[13] <== eq[18][i].out; and[7][i].b <== multi_or[3][i].out; and[8][i] = AND(); and[8][i].a <== states[i][3]; diff --git a/packages/circom/circuits/common/email_domain.json b/packages/circom/circuits/common/email_domain.json index fb60f82..bef3e81 100644 --- a/packages/circom/circuits/common/email_domain.json +++ b/packages/circom/circuits/common/email_domain.json @@ -1,16 +1,16 @@ { - "parts": [ - { - "is_public": false, - "regex_def": "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|!|#|$|%|&|'|\\*|\\+|-|/|=|\\?|^|_|`|{|\\||}|~|\\.)+" - }, - { - "is_public": false, - "regex_def": "@" - }, - { - "is_public": true, - "regex_def": "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|\\.|-)+" - } - ] -} \ No newline at end of file + "parts": [ + { + "is_public": false, + "regex_def": "[A-Za-z0-9!#$%&'\\*\\+-/=\\?^_`{\\|}~\\.]+" + }, + { + "is_public": false, + "regex_def": "@" + }, + { + "is_public": true, + "regex_def": "[A-Za-z0-9\\.-]+" + } + ] +} diff --git a/packages/circom/circuits/common/email_domain_regex.circom b/packages/circom/circuits/common/email_domain_regex.circom index 20f3cc5..484576d 100644 --- a/packages/circom/circuits/common/email_domain_regex.circom +++ b/packages/circom/circuits/common/email_domain_regex.circom @@ -13,7 +13,7 @@ template EmailDomainRegex(msg_bytes) { in[i+1] <== msg[i]; } - component eq[25][num_bytes]; + component eq[26][num_bytes]; component lt[8][num_bytes]; component and[9][num_bytes]; component multi_or[5][num_bytes]; @@ -71,55 +71,58 @@ template EmailDomainRegex(msg_bytes) { eq[7][i].in[1] <== 43; eq[8][i] = IsEqual(); eq[8][i].in[0] <== in[i]; - eq[8][i].in[1] <== 45; + eq[8][i].in[1] <== 44; eq[9][i] = IsEqual(); eq[9][i].in[0] <== in[i]; - eq[9][i].in[1] <== 46; + eq[9][i].in[1] <== 45; eq[10][i] = IsEqual(); eq[10][i].in[0] <== in[i]; - eq[10][i].in[1] <== 47; + eq[10][i].in[1] <== 46; eq[11][i] = IsEqual(); eq[11][i].in[0] <== in[i]; - eq[11][i].in[1] <== 48; + eq[11][i].in[1] <== 47; eq[12][i] = IsEqual(); eq[12][i].in[0] <== in[i]; - eq[12][i].in[1] <== 49; + eq[12][i].in[1] <== 48; eq[13][i] = IsEqual(); eq[13][i].in[0] <== in[i]; - eq[13][i].in[1] <== 50; + eq[13][i].in[1] <== 49; eq[14][i] = IsEqual(); eq[14][i].in[0] <== in[i]; - eq[14][i].in[1] <== 51; + eq[14][i].in[1] <== 50; eq[15][i] = IsEqual(); eq[15][i].in[0] <== in[i]; - eq[15][i].in[1] <== 52; + eq[15][i].in[1] <== 51; eq[16][i] = IsEqual(); eq[16][i].in[0] <== in[i]; - eq[16][i].in[1] <== 53; + eq[16][i].in[1] <== 52; eq[17][i] = IsEqual(); eq[17][i].in[0] <== in[i]; - eq[17][i].in[1] <== 54; + eq[17][i].in[1] <== 53; eq[18][i] = IsEqual(); eq[18][i].in[0] <== in[i]; - eq[18][i].in[1] <== 55; + eq[18][i].in[1] <== 54; eq[19][i] = IsEqual(); eq[19][i].in[0] <== in[i]; - eq[19][i].in[1] <== 56; + eq[19][i].in[1] <== 55; eq[20][i] = IsEqual(); eq[20][i].in[0] <== in[i]; - eq[20][i].in[1] <== 57; + eq[20][i].in[1] <== 56; eq[21][i] = IsEqual(); eq[21][i].in[0] <== in[i]; - eq[21][i].in[1] <== 61; + eq[21][i].in[1] <== 57; eq[22][i] = IsEqual(); eq[22][i].in[0] <== in[i]; - eq[22][i].in[1] <== 63; + eq[22][i].in[1] <== 61; eq[23][i] = IsEqual(); eq[23][i].in[0] <== in[i]; - eq[23][i].in[1] <== 255; + eq[23][i].in[1] <== 63; + eq[24][i] = IsEqual(); + eq[24][i].in[0] <== in[i]; + eq[24][i].in[1] <== 255; and[2][i] = AND(); and[2][i].a <== states[i][0]; - multi_or[0][i] = MultiOR(26); + multi_or[0][i] = MultiOR(27); multi_or[0][i].in[0] <== and[0][i].out; multi_or[0][i].in[1] <== and[1][i].out; multi_or[0][i].in[2] <== eq[0][i].out; @@ -146,6 +149,7 @@ template EmailDomainRegex(msg_bytes) { multi_or[0][i].in[23] <== eq[21][i].out; multi_or[0][i].in[24] <== eq[22][i].out; multi_or[0][i].in[25] <== eq[23][i].out; + multi_or[0][i].in[26] <== eq[24][i].out; and[2][i].b <== multi_or[0][i].out; lt[4][i] = LessEqThan(8); lt[4][i].in[0] <== 94; @@ -158,7 +162,7 @@ template EmailDomainRegex(msg_bytes) { and[3][i].b <== lt[5][i].out; and[4][i] = AND(); and[4][i].a <== states[i][1]; - multi_or[1][i] = MultiOR(25); + multi_or[1][i] = MultiOR(26); multi_or[1][i].in[0] <== and[0][i].out; multi_or[1][i].in[1] <== and[3][i].out; multi_or[1][i].in[2] <== eq[0][i].out; @@ -184,18 +188,19 @@ template EmailDomainRegex(msg_bytes) { multi_or[1][i].in[22] <== eq[20][i].out; multi_or[1][i].in[23] <== eq[21][i].out; multi_or[1][i].in[24] <== eq[22][i].out; + multi_or[1][i].in[25] <== eq[23][i].out; and[4][i].b <== multi_or[1][i].out; multi_or[2][i] = MultiOR(2); multi_or[2][i].in[0] <== and[2][i].out; multi_or[2][i].in[1] <== and[4][i].out; states[i+1][1] <== multi_or[2][i].out; state_changed[i].in[0] <== states[i+1][1]; - eq[24][i] = IsEqual(); - eq[24][i].in[0] <== in[i]; - eq[24][i].in[1] <== 64; + eq[25][i] = IsEqual(); + eq[25][i].in[0] <== in[i]; + eq[25][i].in[1] <== 64; and[5][i] = AND(); and[5][i].a <== states[i][1]; - and[5][i].b <== eq[24][i].out; + and[5][i].b <== eq[25][i].out; states[i+1][2] <== and[5][i].out; state_changed[i].in[1] <== states[i+1][2]; lt[6][i] = LessEqThan(8); @@ -212,18 +217,18 @@ template EmailDomainRegex(msg_bytes) { multi_or[3][i] = MultiOR(14); multi_or[3][i].in[0] <== and[0][i].out; multi_or[3][i].in[1] <== and[6][i].out; - multi_or[3][i].in[2] <== eq[8][i].out; - multi_or[3][i].in[3] <== eq[9][i].out; - multi_or[3][i].in[4] <== eq[11][i].out; - multi_or[3][i].in[5] <== eq[12][i].out; - multi_or[3][i].in[6] <== eq[13][i].out; - multi_or[3][i].in[7] <== eq[14][i].out; - multi_or[3][i].in[8] <== eq[15][i].out; - multi_or[3][i].in[9] <== eq[16][i].out; - multi_or[3][i].in[10] <== eq[17][i].out; - multi_or[3][i].in[11] <== eq[18][i].out; - multi_or[3][i].in[12] <== eq[19][i].out; - multi_or[3][i].in[13] <== eq[20][i].out; + multi_or[3][i].in[2] <== eq[9][i].out; + multi_or[3][i].in[3] <== eq[10][i].out; + multi_or[3][i].in[4] <== eq[12][i].out; + multi_or[3][i].in[5] <== eq[13][i].out; + multi_or[3][i].in[6] <== eq[14][i].out; + multi_or[3][i].in[7] <== eq[15][i].out; + multi_or[3][i].in[8] <== eq[16][i].out; + multi_or[3][i].in[9] <== eq[17][i].out; + multi_or[3][i].in[10] <== eq[18][i].out; + multi_or[3][i].in[11] <== eq[19][i].out; + multi_or[3][i].in[12] <== eq[20][i].out; + multi_or[3][i].in[13] <== eq[21][i].out; and[7][i].b <== multi_or[3][i].out; and[8][i] = AND(); and[8][i].a <== states[i][3]; diff --git a/packages/circom/circuits/common/from_addr.json b/packages/circom/circuits/common/from_addr.json index 584c66c..7fdc28e 100644 --- a/packages/circom/circuits/common/from_addr.json +++ b/packages/circom/circuits/common/from_addr.json @@ -1,20 +1,20 @@ { - "parts": [ - { - "is_public": false, - "regex_def": "((\r\n)|^)from:" - }, - { - "is_public": false, - "regex_def": "([^\r\n]+<)?" - }, - { - "is_public": true, - "regex_def": "(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|!|#|$|%|&|'|\\*|\\+|-|/|=|\\?|^|_|`|{|\\||}|~|\\.)+@(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9|_|\\.|-)+" - }, - { - "is_public": false, - "regex_def": ">?\r\n" - } - ] -} \ No newline at end of file + "parts": [ + { + "is_public": false, + "regex_def": "((\r\n)|^)from:" + }, + { + "is_public": false, + "regex_def": "([^\r\n]+<)?" + }, + { + "is_public": true, + "regex_def": "[A-Za-z0-9!#$%&'\\*\\+-/=\\?^_`{\\|}~\\.]+@[A-Za-z0-9\\.-]+" + }, + { + "is_public": false, + "regex_def": ">?\r\n" + } + ] +} diff --git a/packages/circom/circuits/common/message_id.json b/packages/circom/circuits/common/message_id.json index c207af1..f71f0ee 100644 --- a/packages/circom/circuits/common/message_id.json +++ b/packages/circom/circuits/common/message_id.json @@ -1,16 +1,16 @@ { - "parts": [ - { - "is_public": false, - "regex_def": "((\r\n)|^)message-id:" - }, - { - "is_public": true, - "regex_def": "<(=|@|\\.|\\+|_|-|a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|0|1|2|3|4|5|6|7|8|9)+>" - }, - { - "is_public": false, - "regex_def": "\r\n" - } - ] -} \ No newline at end of file + "parts": [ + { + "is_public": false, + "regex_def": "((\r\n)|^)message-id:" + }, + { + "is_public": true, + "regex_def": "<[A-Za-z0-9=@\\.\\+_-]+>" + }, + { + "is_public": false, + "regex_def": "\r\n" + } + ] +} diff --git a/packages/circom/circuits/common/message_id_regex.circom b/packages/circom/circuits/common/message_id_regex.circom index f89299e..af6e982 100644 --- a/packages/circom/circuits/common/message_id_regex.circom +++ b/packages/circom/circuits/common/message_id_regex.circom @@ -13,10 +13,10 @@ template MessageIdRegex(msg_bytes) { in[i+1] <== msg[i]; } - component eq[28][num_bytes]; + component eq[27][num_bytes]; component lt[4][num_bytes]; - component and[23][num_bytes]; - component multi_or[4][num_bytes]; + component and[22][num_bytes]; + component multi_or[3][num_bytes]; signal states[num_bytes+1][19]; component state_changed[num_bytes]; @@ -27,247 +27,238 @@ template MessageIdRegex(msg_bytes) { for (var i = 0; i < num_bytes; i++) { state_changed[i] = MultiOR(18); - lt[0][i] = LessEqThan(8); - lt[0][i].in[0] <== 64; - lt[0][i].in[1] <== in[i]; - lt[1][i] = LessEqThan(8); - lt[1][i].in[0] <== in[i]; - lt[1][i].in[1] <== 90; - and[0][i] = AND(); - and[0][i].a <== lt[0][i].out; - and[0][i].b <== lt[1][i].out; - lt[2][i] = LessEqThan(8); - lt[2][i].in[0] <== 97; - lt[2][i].in[1] <== in[i]; - lt[3][i] = LessEqThan(8); - lt[3][i].in[0] <== in[i]; - lt[3][i].in[1] <== 122; - and[1][i] = AND(); - and[1][i].a <== lt[2][i].out; - and[1][i].b <== lt[3][i].out; eq[0][i] = IsEqual(); eq[0][i].in[0] <== in[i]; - eq[0][i].in[1] <== 43; + eq[0][i].in[1] <== 109; + and[0][i] = AND(); + and[0][i].a <== states[i][0]; + and[0][i].b <== eq[0][i].out; + and[1][i] = AND(); + and[1][i].a <== states[i][4]; + and[1][i].b <== eq[0][i].out; + multi_or[0][i] = MultiOR(2); + multi_or[0][i].in[0] <== and[0][i].out; + multi_or[0][i].in[1] <== and[1][i].out; + states[i+1][1] <== multi_or[0][i].out; + state_changed[i].in[0] <== states[i+1][1]; eq[1][i] = IsEqual(); eq[1][i].in[0] <== in[i]; - eq[1][i].in[1] <== 45; + eq[1][i].in[1] <== 13; + and[2][i] = AND(); + and[2][i].a <== states[i][0]; + and[2][i].b <== eq[1][i].out; + states[i+1][2] <== and[2][i].out; + state_changed[i].in[1] <== states[i+1][2]; eq[2][i] = IsEqual(); eq[2][i].in[0] <== in[i]; - eq[2][i].in[1] <== 46; + eq[2][i].in[1] <== 101; + and[3][i] = AND(); + and[3][i].a <== states[i][1]; + and[3][i].b <== eq[2][i].out; + states[i+1][3] <== and[3][i].out; + state_changed[i].in[2] <== states[i+1][3]; eq[3][i] = IsEqual(); eq[3][i].in[0] <== in[i]; - eq[3][i].in[1] <== 48; + eq[3][i].in[1] <== 10; + and[4][i] = AND(); + and[4][i].a <== states[i][2]; + and[4][i].b <== eq[3][i].out; + states[i+1][4] <== and[4][i].out; + state_changed[i].in[3] <== states[i+1][4]; eq[4][i] = IsEqual(); eq[4][i].in[0] <== in[i]; - eq[4][i].in[1] <== 49; + eq[4][i].in[1] <== 115; + and[5][i] = AND(); + and[5][i].a <== states[i][3]; + and[5][i].b <== eq[4][i].out; + states[i+1][5] <== and[5][i].out; + state_changed[i].in[4] <== states[i+1][5]; + and[6][i] = AND(); + and[6][i].a <== states[i][5]; + and[6][i].b <== eq[4][i].out; + states[i+1][6] <== and[6][i].out; + state_changed[i].in[5] <== states[i+1][6]; eq[5][i] = IsEqual(); eq[5][i].in[0] <== in[i]; - eq[5][i].in[1] <== 50; + eq[5][i].in[1] <== 97; + and[7][i] = AND(); + and[7][i].a <== states[i][6]; + and[7][i].b <== eq[5][i].out; + states[i+1][7] <== and[7][i].out; + state_changed[i].in[6] <== states[i+1][7]; eq[6][i] = IsEqual(); eq[6][i].in[0] <== in[i]; - eq[6][i].in[1] <== 51; + eq[6][i].in[1] <== 103; + and[8][i] = AND(); + and[8][i].a <== states[i][7]; + and[8][i].b <== eq[6][i].out; + states[i+1][8] <== and[8][i].out; + state_changed[i].in[7] <== states[i+1][8]; + and[9][i] = AND(); + and[9][i].a <== states[i][8]; + and[9][i].b <== eq[2][i].out; + states[i+1][9] <== and[9][i].out; + state_changed[i].in[8] <== states[i+1][9]; eq[7][i] = IsEqual(); eq[7][i].in[0] <== in[i]; - eq[7][i].in[1] <== 52; + eq[7][i].in[1] <== 45; + and[10][i] = AND(); + and[10][i].a <== states[i][9]; + and[10][i].b <== eq[7][i].out; + states[i+1][10] <== and[10][i].out; + state_changed[i].in[9] <== states[i+1][10]; eq[8][i] = IsEqual(); eq[8][i].in[0] <== in[i]; - eq[8][i].in[1] <== 53; + eq[8][i].in[1] <== 105; + and[11][i] = AND(); + and[11][i].a <== states[i][10]; + and[11][i].b <== eq[8][i].out; + states[i+1][11] <== and[11][i].out; + state_changed[i].in[10] <== states[i+1][11]; eq[9][i] = IsEqual(); eq[9][i].in[0] <== in[i]; - eq[9][i].in[1] <== 54; + eq[9][i].in[1] <== 100; + and[12][i] = AND(); + and[12][i].a <== states[i][11]; + and[12][i].b <== eq[9][i].out; + states[i+1][12] <== and[12][i].out; + state_changed[i].in[11] <== states[i+1][12]; eq[10][i] = IsEqual(); eq[10][i].in[0] <== in[i]; - eq[10][i].in[1] <== 55; + eq[10][i].in[1] <== 58; + and[13][i] = AND(); + and[13][i].a <== states[i][12]; + and[13][i].b <== eq[10][i].out; + states[i+1][13] <== and[13][i].out; + state_changed[i].in[12] <== states[i+1][13]; eq[11][i] = IsEqual(); eq[11][i].in[0] <== in[i]; - eq[11][i].in[1] <== 56; + eq[11][i].in[1] <== 60; + and[14][i] = AND(); + and[14][i].a <== states[i][13]; + and[14][i].b <== eq[11][i].out; + states[i+1][14] <== and[14][i].out; + state_changed[i].in[13] <== states[i+1][14]; + lt[0][i] = LessEqThan(8); + lt[0][i].in[0] <== 64; + lt[0][i].in[1] <== in[i]; + lt[1][i] = LessEqThan(8); + lt[1][i].in[0] <== in[i]; + lt[1][i].in[1] <== 90; + and[15][i] = AND(); + and[15][i].a <== lt[0][i].out; + and[15][i].b <== lt[1][i].out; + lt[2][i] = LessEqThan(8); + lt[2][i].in[0] <== 97; + lt[2][i].in[1] <== in[i]; + lt[3][i] = LessEqThan(8); + lt[3][i].in[0] <== in[i]; + lt[3][i].in[1] <== 122; + and[16][i] = AND(); + and[16][i].a <== lt[2][i].out; + and[16][i].b <== lt[3][i].out; eq[12][i] = IsEqual(); eq[12][i].in[0] <== in[i]; - eq[12][i].in[1] <== 57; + eq[12][i].in[1] <== 43; eq[13][i] = IsEqual(); eq[13][i].in[0] <== in[i]; - eq[13][i].in[1] <== 61; + eq[13][i].in[1] <== 46; eq[14][i] = IsEqual(); eq[14][i].in[0] <== in[i]; - eq[14][i].in[1] <== 95; - and[2][i] = AND(); - and[2][i].a <== states[i][1]; - multi_or[0][i] = MultiOR(17); - multi_or[0][i].in[0] <== and[0][i].out; - multi_or[0][i].in[1] <== and[1][i].out; - multi_or[0][i].in[2] <== eq[0][i].out; - multi_or[0][i].in[3] <== eq[1][i].out; - multi_or[0][i].in[4] <== eq[2][i].out; - multi_or[0][i].in[5] <== eq[3][i].out; - multi_or[0][i].in[6] <== eq[4][i].out; - multi_or[0][i].in[7] <== eq[5][i].out; - multi_or[0][i].in[8] <== eq[6][i].out; - multi_or[0][i].in[9] <== eq[7][i].out; - multi_or[0][i].in[10] <== eq[8][i].out; - multi_or[0][i].in[11] <== eq[9][i].out; - multi_or[0][i].in[12] <== eq[10][i].out; - multi_or[0][i].in[13] <== eq[11][i].out; - multi_or[0][i].in[14] <== eq[12][i].out; - multi_or[0][i].in[15] <== eq[13][i].out; - multi_or[0][i].in[16] <== eq[14][i].out; - and[2][i].b <== multi_or[0][i].out; - and[3][i] = AND(); - and[3][i].a <== states[i][18]; - and[3][i].b <== multi_or[0][i].out; - multi_or[1][i] = MultiOR(2); - multi_or[1][i].in[0] <== and[2][i].out; - multi_or[1][i].in[1] <== and[3][i].out; - states[i+1][1] <== multi_or[1][i].out; - state_changed[i].in[0] <== states[i+1][1]; + eq[14][i].in[1] <== 48; eq[15][i] = IsEqual(); eq[15][i].in[0] <== in[i]; - eq[15][i].in[1] <== 13; - and[4][i] = AND(); - and[4][i].a <== states[i][0]; - and[4][i].b <== eq[15][i].out; - and[5][i] = AND(); - and[5][i].a <== states[i][3]; - and[5][i].b <== eq[15][i].out; - multi_or[2][i] = MultiOR(2); - multi_or[2][i].in[0] <== and[4][i].out; - multi_or[2][i].in[1] <== and[5][i].out; - states[i+1][2] <== multi_or[2][i].out; - state_changed[i].in[1] <== states[i+1][2]; + eq[15][i].in[1] <== 49; eq[16][i] = IsEqual(); eq[16][i].in[0] <== in[i]; - eq[16][i].in[1] <== 255; - and[6][i] = AND(); - and[6][i].a <== states[i][0]; - and[6][i].b <== eq[16][i].out; + eq[16][i].in[1] <== 50; eq[17][i] = IsEqual(); eq[17][i].in[0] <== in[i]; - eq[17][i].in[1] <== 10; - and[7][i] = AND(); - and[7][i].a <== states[i][2]; - and[7][i].b <== eq[17][i].out; - multi_or[3][i] = MultiOR(2); - multi_or[3][i].in[0] <== and[6][i].out; - multi_or[3][i].in[1] <== and[7][i].out; - states[i+1][3] <== multi_or[3][i].out; - state_changed[i].in[2] <== states[i+1][3]; + eq[17][i].in[1] <== 51; eq[18][i] = IsEqual(); eq[18][i].in[0] <== in[i]; - eq[18][i].in[1] <== 62; - and[8][i] = AND(); - and[8][i].a <== states[i][1]; - and[8][i].b <== eq[18][i].out; - states[i+1][4] <== and[8][i].out; - state_changed[i].in[3] <== states[i+1][4]; + eq[18][i].in[1] <== 52; eq[19][i] = IsEqual(); eq[19][i].in[0] <== in[i]; - eq[19][i].in[1] <== 109; - and[9][i] = AND(); - and[9][i].a <== states[i][3]; - and[9][i].b <== eq[19][i].out; - states[i+1][5] <== and[9][i].out; - state_changed[i].in[4] <== states[i+1][5]; - and[10][i] = AND(); - and[10][i].a <== states[i][4]; - and[10][i].b <== eq[15][i].out; - states[i+1][6] <== and[10][i].out; - state_changed[i].in[5] <== states[i+1][6]; - and[11][i] = AND(); - and[11][i].a <== states[i][6]; - and[11][i].b <== eq[17][i].out; - states[i+1][7] <== and[11][i].out; - state_changed[i].in[6] <== states[i+1][7]; + eq[19][i].in[1] <== 53; eq[20][i] = IsEqual(); eq[20][i].in[0] <== in[i]; - eq[20][i].in[1] <== 101; - and[12][i] = AND(); - and[12][i].a <== states[i][5]; - and[12][i].b <== eq[20][i].out; - states[i+1][8] <== and[12][i].out; - state_changed[i].in[7] <== states[i+1][8]; + eq[20][i].in[1] <== 54; eq[21][i] = IsEqual(); eq[21][i].in[0] <== in[i]; - eq[21][i].in[1] <== 115; - and[13][i] = AND(); - and[13][i].a <== states[i][8]; - and[13][i].b <== eq[21][i].out; - states[i+1][9] <== and[13][i].out; - state_changed[i].in[8] <== states[i+1][9]; - and[14][i] = AND(); - and[14][i].a <== states[i][9]; - and[14][i].b <== eq[21][i].out; - states[i+1][10] <== and[14][i].out; - state_changed[i].in[9] <== states[i+1][10]; + eq[21][i].in[1] <== 55; eq[22][i] = IsEqual(); eq[22][i].in[0] <== in[i]; - eq[22][i].in[1] <== 97; - and[15][i] = AND(); - and[15][i].a <== states[i][10]; - and[15][i].b <== eq[22][i].out; - states[i+1][11] <== and[15][i].out; - state_changed[i].in[10] <== states[i+1][11]; + eq[22][i].in[1] <== 56; eq[23][i] = IsEqual(); eq[23][i].in[0] <== in[i]; - eq[23][i].in[1] <== 103; - and[16][i] = AND(); - and[16][i].a <== states[i][11]; - and[16][i].b <== eq[23][i].out; - states[i+1][12] <== and[16][i].out; - state_changed[i].in[11] <== states[i+1][12]; - and[17][i] = AND(); - and[17][i].a <== states[i][12]; - and[17][i].b <== eq[20][i].out; - states[i+1][13] <== and[17][i].out; - state_changed[i].in[12] <== states[i+1][13]; - and[18][i] = AND(); - and[18][i].a <== states[i][13]; - and[18][i].b <== eq[1][i].out; - states[i+1][14] <== and[18][i].out; - state_changed[i].in[13] <== states[i+1][14]; + eq[23][i].in[1] <== 57; eq[24][i] = IsEqual(); eq[24][i].in[0] <== in[i]; - eq[24][i].in[1] <== 105; - and[19][i] = AND(); - and[19][i].a <== states[i][14]; - and[19][i].b <== eq[24][i].out; - states[i+1][15] <== and[19][i].out; - state_changed[i].in[14] <== states[i+1][15]; + eq[24][i].in[1] <== 61; eq[25][i] = IsEqual(); eq[25][i].in[0] <== in[i]; - eq[25][i].in[1] <== 100; - and[20][i] = AND(); - and[20][i].a <== states[i][15]; - and[20][i].b <== eq[25][i].out; - states[i+1][16] <== and[20][i].out; - state_changed[i].in[15] <== states[i+1][16]; + eq[25][i].in[1] <== 95; + and[17][i] = AND(); + and[17][i].a <== states[i][14]; + multi_or[1][i] = MultiOR(17); + multi_or[1][i].in[0] <== and[15][i].out; + multi_or[1][i].in[1] <== and[16][i].out; + multi_or[1][i].in[2] <== eq[12][i].out; + multi_or[1][i].in[3] <== eq[7][i].out; + multi_or[1][i].in[4] <== eq[13][i].out; + multi_or[1][i].in[5] <== eq[14][i].out; + multi_or[1][i].in[6] <== eq[15][i].out; + multi_or[1][i].in[7] <== eq[16][i].out; + multi_or[1][i].in[8] <== eq[17][i].out; + multi_or[1][i].in[9] <== eq[18][i].out; + multi_or[1][i].in[10] <== eq[19][i].out; + multi_or[1][i].in[11] <== eq[20][i].out; + multi_or[1][i].in[12] <== eq[21][i].out; + multi_or[1][i].in[13] <== eq[22][i].out; + multi_or[1][i].in[14] <== eq[23][i].out; + multi_or[1][i].in[15] <== eq[24][i].out; + multi_or[1][i].in[16] <== eq[25][i].out; + and[17][i].b <== multi_or[1][i].out; + and[18][i] = AND(); + and[18][i].a <== states[i][15]; + and[18][i].b <== multi_or[1][i].out; + multi_or[2][i] = MultiOR(2); + multi_or[2][i].in[0] <== and[17][i].out; + multi_or[2][i].in[1] <== and[18][i].out; + states[i+1][15] <== multi_or[2][i].out; + state_changed[i].in[14] <== states[i+1][15]; eq[26][i] = IsEqual(); eq[26][i].in[0] <== in[i]; - eq[26][i].in[1] <== 58; - and[21][i] = AND(); - and[21][i].a <== states[i][16]; - and[21][i].b <== eq[26][i].out; - states[i+1][17] <== and[21][i].out; + eq[26][i].in[1] <== 62; + and[19][i] = AND(); + and[19][i].a <== states[i][15]; + and[19][i].b <== eq[26][i].out; + states[i+1][16] <== and[19][i].out; + state_changed[i].in[15] <== states[i+1][16]; + and[20][i] = AND(); + and[20][i].a <== states[i][16]; + and[20][i].b <== eq[1][i].out; + states[i+1][17] <== and[20][i].out; state_changed[i].in[16] <== states[i+1][17]; - eq[27][i] = IsEqual(); - eq[27][i].in[0] <== in[i]; - eq[27][i].in[1] <== 60; - and[22][i] = AND(); - and[22][i].a <== states[i][17]; - and[22][i].b <== eq[27][i].out; - states[i+1][18] <== and[22][i].out; + and[21][i] = AND(); + and[21][i].a <== states[i][17]; + and[21][i].b <== eq[3][i].out; + states[i+1][18] <== and[21][i].out; state_changed[i].in[17] <== states[i+1][18]; states[i+1][0] <== 1 - state_changed[i].out; } component final_state_result = MultiOR(num_bytes+1); for (var i = 0; i <= num_bytes; i++) { - final_state_result.in[i] <== states[i][7]; + final_state_result.in[i] <== states[i][18]; } out <== final_state_result.out; signal is_consecutive[msg_bytes+1][2]; is_consecutive[msg_bytes][1] <== 1; for (var i = 0; i < msg_bytes; i++) { - is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][7] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; + is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][18] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; is_consecutive[msg_bytes-1-i][1] <== state_changed[msg_bytes-i].out * is_consecutive[msg_bytes-1-i][0]; } signal is_substr0[msg_bytes][5]; @@ -275,10 +266,10 @@ template MessageIdRegex(msg_bytes) { signal output reveal0[msg_bytes]; for (var i = 0; i < msg_bytes; i++) { is_substr0[i][0] <== 0; - is_substr0[i][1] <== is_substr0[i][0] + states[i+1][1] * states[i+2][1]; - is_substr0[i][2] <== is_substr0[i][1] + states[i+1][1] * states[i+2][4]; - is_substr0[i][3] <== is_substr0[i][2] + states[i+1][17] * states[i+2][18]; - is_substr0[i][4] <== is_substr0[i][3] + states[i+1][18] * states[i+2][1]; + is_substr0[i][1] <== is_substr0[i][0] + states[i+1][13] * states[i+2][14]; + is_substr0[i][2] <== is_substr0[i][1] + states[i+1][14] * states[i+2][15]; + is_substr0[i][3] <== is_substr0[i][2] + states[i+1][15] * states[i+2][15]; + is_substr0[i][4] <== is_substr0[i][3] + states[i+1][15] * states[i+2][16]; is_reveal0[i] <== is_substr0[i][4] * is_consecutive[i][1]; reveal0[i] <== in[i+1] * is_reveal0[i]; } diff --git a/packages/circom/tests/circuits/simple_regex.circom b/packages/circom/tests/circuits/simple_regex.circom index 3f499c1..f24bced 100644 --- a/packages/circom/tests/circuits/simple_regex.circom +++ b/packages/circom/tests/circuits/simple_regex.circom @@ -2,7 +2,6 @@ pragma circom 2.1.5; include "@zk-email/zk-regex-circom/circuits/regex_helpers.circom"; -// regex: 1=(a|b) (2=(b|c)+ )+d template SimpleRegex(msg_bytes) { signal input msg[msg_bytes]; signal output out; @@ -129,7 +128,6 @@ template SimpleRegex(msg_bytes) { is_consecutive[msg_bytes-1-i][0] <== states[num_bytes-i][9] * (1 - is_consecutive[msg_bytes-i][1]) + is_consecutive[msg_bytes-i][1]; is_consecutive[msg_bytes-1-i][1] <== state_changed[msg_bytes-i].out * is_consecutive[msg_bytes-1-i][0]; } - // substrings calculated: [{(2, 3)}, {(6, 7), (7, 7)}, {(8, 9)}] signal is_substr0[msg_bytes][2]; signal is_reveal0[msg_bytes]; signal output reveal0[msg_bytes]; diff --git a/packages/compiler/Cargo.toml b/packages/compiler/Cargo.toml index 2af7683..c5e7232 100644 --- a/packages/compiler/Cargo.toml +++ b/packages/compiler/Cargo.toml @@ -31,6 +31,8 @@ serde = { version = "1.0.159", features = ["derive"] } js-sandbox = { version = "0.2.0-rc.2", git = "https://github.com/Bromeon/js-sandbox.git", tag = "0.2.0-rc.2" } itertools = "0.10.3" clap = { version = "=4.2.1", features = ["derive"] } +regex-automata = "0.4.5" +regex = "1.10.3" [dependencies.neon] version = "0.10" diff --git a/packages/compiler/package.json b/packages/compiler/package.json index 5948931..9b76a20 100644 --- a/packages/compiler/package.json +++ b/packages/compiler/package.json @@ -17,7 +17,7 @@ "build": "npx tsc && cargo-cp-artifact -nc index.node -- cargo build --message-format=json-render-diagnostics", "build-debug": "npx tsc && npm run build --", "build-release": "npx tsc && npm run build -- --release", - "install": "npx tsc && node-pre-gyp install --fallback-to-build=false || npm run build-release", + "install": "npm run build-release", "test": "npx tsc && cargo test", "package": "npx tsc && node-pre-gyp package", "upload-binary": "npx tsc && npm run package && node-pre-gyp-github publish" @@ -36,4 +36,4 @@ "package_name": "compiler-{node_abi}-{platform}-{arch}.tar.gz", "module_path": "./" } -} \ No newline at end of file +} diff --git a/packages/compiler/src/js_caller.rs b/packages/compiler/src/js_caller.rs index e054006..b185e09 100644 --- a/packages/compiler/src/js_caller.rs +++ b/packages/compiler/src/js_caller.rs @@ -1,5 +1,3 @@ - - use js_sandbox::{JsError, Script}; use serde_json::Value; @@ -40,12 +38,12 @@ pub fn text_context_prefix() -> Result { // Ok(result) // } -pub fn regex_to_dfa(regex: &str) -> Result, JsCallerError> { - let code: &'static str = include_str!("regex.js"); - let mut script = Script::from_string(code)?; - let result: String = script.call("regexToDfa", (regex,))?; - Ok(serde_json::from_str(&result)?) -} +// pub fn regex_to_dfa(regex: &str) -> Result, JsCallerError> { +// let code: &'static str = include_str!("regex.js"); +// let mut script = Script::from_string(code)?; +// let result: String = script.call("regexToDfa", (regex,))?; +// Ok(serde_json::from_str(&result)?) +// } pub fn gen_circom_allstr(graph: &[Value], template_name: &str) -> Result { let code: &'static str = include_str!("gen_circom.js"); diff --git a/packages/compiler/src/lib.rs b/packages/compiler/src/lib.rs index 259b4cc..798aabd 100644 --- a/packages/compiler/src/lib.rs +++ b/packages/compiler/src/lib.rs @@ -3,9 +3,11 @@ use std::{collections::HashMap, fs::File}; pub mod circom; pub mod halo2; pub mod js_caller; +pub mod regex; pub mod node; use crate::node::*; +use crate::regex::*; use neon; use crate::js_caller::*; @@ -88,7 +90,7 @@ impl DecomposedRegexConfig { for config in part_configs.iter() { all_regex += &config.regex_def; } - let dfa_val = regex_to_dfa(&all_regex)?; + let dfa_val = regex_to_dfa(&all_regex); let substrs_defs = self.extract_substr_ids(&dfa_val)?; Ok(RegexAndDFA { // max_byte_size: self.max_byte_size, @@ -339,7 +341,7 @@ impl RegexAndDFA { regex_str: &str, substrs_defs_json: SubstrsDefsJson, ) -> Result { - let dfa_val = regex_to_dfa(regex_str)?; + let dfa_val = regex_to_dfa(regex_str); let substr_defs_array = substrs_defs_json .transitions .into_iter() diff --git a/packages/compiler/src/regex.rs b/packages/compiler/src/regex.rs new file mode 100644 index 0000000..9e6ba09 --- /dev/null +++ b/packages/compiler/src/regex.rs @@ -0,0 +1,177 @@ +use regex::Regex; +use regex_automata::dfa::{dense::DFA, StartKind}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::{HashMap, HashSet}; + +#[derive(Debug, Clone)] +struct State { + typ: String, + source: u32, + edges: HashMap, +} + +#[derive(Debug)] +struct DFAInfo { + states: Vec, +} + +#[derive(Debug, Serialize, Deserialize)] +struct GraphNode { + #[serde(default)] + r#type: String, + edges: HashMap, +} + +fn parse_dfa_output(output: &str) -> DFAInfo { + let mut dfa_info = DFAInfo { states: Vec::new() }; + + let re = Regex::new(r"\*?(\d+): ((.+?) => (\d+),?)+").unwrap(); + for captures in re.captures_iter(output) { + let src = captures[1].parse::().unwrap(); + let mut state = State { + source: src, + typ: String::new(), + edges: HashMap::new(), + }; + if &captures[0][0..1] == "*" { + state.typ = String::from("accept"); + } + for transition in Regex::new(r"\s+[^=]+\s*=>\s*(\d+)+\s*|\s+=+\s*=>\s*(\d+)+") + .unwrap() + .captures_iter(&captures[0].to_string()) + { + let trimmed_transition = transition[0].trim(); + let transition_vec = trimmed_transition.split("=>").collect::>(); + let mut transition_vec_iter = transition_vec.iter(); + let mut src = transition_vec_iter.next().unwrap().trim().to_string(); + if src.len() > 2 && src.chars().nth(2).unwrap() == '\\' { + src = format!("{}{}", &src[0..2], &src[3..]); + } + let dst = transition_vec_iter.next().unwrap().trim(); + state.edges.insert(src, dst.parse::().unwrap()); + } + dfa_info.states.push(state); + } + + let mut eoi_pointing_states = HashSet::new(); + + for state in &mut dfa_info.states { + if let Some(eoi_target) = state.edges.get("EOI").cloned() { + eoi_pointing_states.insert(eoi_target); + state.typ = String::from("accept"); + state.edges.remove("EOI"); + } + } + + let start_state_re = Regex::new(r"START-GROUP\(anchored\)[\s*\w*\=>]*Text => (\d+)").unwrap(); + let start_state = start_state_re.captures_iter(output).next().unwrap()[1] + .parse::() + .unwrap(); + + // Sort states by order of appearance and rename the sources + let mut sorted_states = DFAInfo { states: Vec::new() }; + let mut sorted_states_set = HashSet::new(); + let mut new_states = HashSet::new(); + new_states.insert(start_state); + while !new_states.is_empty() { + let mut next_states = HashSet::new(); + for state in &new_states { + if let Some(state) = dfa_info.states.iter().find(|s| s.source == *state) { + sorted_states.states.push((*state).clone()); + sorted_states_set.insert(state.source); + for (_, dst) in &state.edges { + if !sorted_states_set.contains(dst) { + next_states.insert(*dst); + } + } + } + } + new_states = next_states; + } + + // Rename the sources + let mut switch_states = HashMap::new(); + for (i, state) in sorted_states.states.iter_mut().enumerate() { + let temp = state.source; + state.source = i as u32; + switch_states.insert(temp, state.source); + } + + // Iterate over all edges of all states + for state in &mut sorted_states.states { + for (_, dst) in &mut state.edges { + *dst = switch_states.get(dst).unwrap().clone(); + } + } + + sorted_states +} + +fn dfa_to_graph(dfa_info: &DFAInfo) -> String { + let mut graph = Vec::new(); + for state in &dfa_info.states { + let mut edges = HashMap::new(); + let mut edges_to_node = HashMap::new(); + for (key, value) in &state.edges { + let re = Regex::new(r"(.+)-(.+)").unwrap(); + if re.is_match(key) { + let capture = re.captures_iter(key).next().unwrap(); + let start = capture[1].parse::().unwrap(); + let end = capture[2].parse::().unwrap(); + let char_range: Vec = (start..=end) + .map(|c| format!("\"{}\"", c as u8 as char)) + .collect(); + if edges_to_node.contains_key(value) { + let edges_to_node_vec: &mut Vec = edges_to_node.get_mut(value).unwrap(); + edges_to_node_vec.push(char_range.join(",")); + } else { + edges_to_node.insert(value, vec![char_range.join(",")]); + } + } else { + if key == "' '" { + if edges_to_node.contains_key(value) { + let edges_to_node_vec: &mut Vec = + edges_to_node.get_mut(value).unwrap(); + edges_to_node_vec.push("\" \"".to_string()); + } else { + edges_to_node.insert(value, vec!["\" \"".to_string()]); + } + continue; + } + if edges_to_node.contains_key(value) { + let edges_to_node_vec: &mut Vec = edges_to_node.get_mut(value).unwrap(); + edges_to_node_vec.push(format!("\"{}\"", key)); + } else { + edges_to_node.insert(value, vec![format!("\"{}\"", key)]); + } + } + } + // Copy edges_to_node to edges + for (value, chars) in edges_to_node { + let result = format!("[{}]", chars.join(",")); + edges.insert(result, *value); + } + graph.push(GraphNode { + r#type: state.typ.clone(), + edges: edges, + }); + } + + let json_string = serde_json::to_string_pretty(&graph).unwrap(); + json_string +} + +pub fn regex_to_dfa(regex: &str) -> Vec { + let mut config = DFA::config().minimize(true); + config = config.start_kind(StartKind::Anchored); + config = config.byte_classes(false); + config = config.accelerate(true); + let re: DFA> = DFA::builder() + .configure(config) + .build(&format!(r"^{}$", regex)) + .unwrap(); + let re_str = format!("{:?}", re); + let json = dfa_to_graph(&parse_dfa_output(&re_str)); + serde_json::from_str(&json).unwrap() +}