From 929288b125469f3b59a828814c19e41f7bcf8ee3 Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Tue, 13 Feb 2024 12:50:06 +0000 Subject: [PATCH 1/9] replace --- lang/LangPrimSource/PyrStringPrim.cpp | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 11feb41577a..1f8ff3d39e8 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -269,6 +269,76 @@ class regex_lru_cache { } + +static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { + if (numArgsPushed != 3) + return errFailed; // assume 'this' is counted + // regexReplace { |find, replace| ... } + + // caches the last 64 boost:regex instances. + static detail::regex_lru_cache regex_lru_cache(boost::regex_constants::ECMAScript | boost::regex_constants::nosubs); + + using namespace boost; + + // TODO: fix const-ness of isKindOfSlot + PyrSlot* slot_this = g->sp - 2; // source string + /*const*/ PyrSlot* slot_regex = g->sp - 1; // find + /*const*/ PyrSlot* slot_replace = g->sp; // replace with + + if (!isKindOfSlot(slot_this, class_string)) { + SetNil(slot_this); + postfl("Error: slot 1 is wrong"); + return errWrongType; + } + if (!isKindOfSlot(slot_regex, class_string)){ + SetNil(slot_this); + postfl("Error: slot 2 is wrong"); + return errWrongType; + } + if (!isKindOfSlot(slot_replace, class_string)){ + SetNil(slot_this); + postfl("Error: slot 4 is wrong"); + return errWrongType; + } + + try { + const auto& pattern = regex_lru_cache.get_regex(slotRawString(slot_regex)->s, slotRawString(slot_regex)->size); + + const char* source_start = slotRawString(slot_this)->s; + const int source_size = slotRawString(slot_this)->size; + postfl("size of string %i", source_size); + + if (source_size < 0) { // size is signed + SetNil(slot_this); + return errFailed; + } else if (source_size == 0) { + return errNone; // do nothing, input is empty + } + const 
char* source_end = source_start + source_size; + + // this allocation is necessary as the result needs to be extendable + std::string out {}; + std::string replace{slotRawString(slot_replace)->s, static_cast(slotRawString(slot_replace)->size)}; + regex_replace(std::back_inserter(out), source_start, source_end, pattern, replace ); + + // now 'out' has been filled, it's data must be copied to avoid being free'ed when 'out' goes out of scope + PyrString* output_string = newPyrStringN(g->gc, static_cast(out.size()), 0, true); + std::copy(out.begin(), out.end(), output_string->s); + + postfl("%s\n", out.c_str()); + + // do we need to free the input string from the garbage collector somehow? + // prString_AsCompileString does not. + // output slot is the first slot. + SetObject(slot_this, output_string); + return errNone; + } catch (const std::exception& e) { + postfl("Warning: Exception in _String_ReplaceRegex -%s\n", e.what()); + SetNil(slot_this); + return errFailed; + }; +} + int prString_Regexp(struct VMGlobals* g, int numArgsPushed) { /* not reentrant */ static detail::regex_lru_cache regex_lru_cache(boost::regex_constants::ECMAScript | boost::regex_constants::nosubs); @@ -1002,4 +1072,5 @@ void initStringPrimitives() { definePrimitive(base, index++, "_String_EscapeChar", prString_EscapeChar, 2, 0); definePrimitive(base, index++, "_String_ParseYAML", prString_ParseYAML, 1, 0); definePrimitive(base, index++, "_String_ParseYAMLFile", prString_ParseYAMLFile, 1, 0); + definePrimitive(base, index++, "_String_ReplaceRegex", prString_ReplaceRegex, 3, 0); } From 292008d9d451c5c1fab060635754357c1f1123e2 Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Tue, 13 Feb 2024 12:50:51 +0000 Subject: [PATCH 2/9] docs --- SCClassLibrary/Common/Collections/String.sc | 1 + 1 file changed, 1 insertion(+) diff --git a/SCClassLibrary/Common/Collections/String.sc b/SCClassLibrary/Common/Collections/String.sc index b9b9ecc7775..ca0e76b3929 100644 --- 
a/SCClassLibrary/Common/Collections/String.sc +++ b/SCClassLibrary/Common/Collections/String.sc @@ -130,6 +130,7 @@ String[char] : RawArray { format { arg ... items; ^this.prFormat( items.collect(_.asString) ) } prFormat { arg items; _String_Format ^this.primitiveFailed } matchRegexp { arg string, start = 0, end; _String_Regexp ^this.primitiveFailed } + replaceRegexp { |regex, with| _String_ReplaceRegex ^this.primitiveFailed } fformat { arg ... args; var str, resArgs, val, func; From b6a84d1b94ae3a8a1c875059477aae4bff5298a8 Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Tue, 13 Feb 2024 12:53:27 +0000 Subject: [PATCH 3/9] docs --- HelpSource/Classes/String.schelp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/HelpSource/Classes/String.schelp b/HelpSource/Classes/String.schelp index 98be127e9c1..9e08894d0cd 100644 --- a/HelpSource/Classes/String.schelp +++ b/HelpSource/Classes/String.schelp @@ -383,6 +383,10 @@ code:: "[xtz]+nd".matchRegexp("xnd"); // true: any combination of x, t, z :: +method::replaceRegex + +TODO + method::findRegexp Perl regular expression search (see link::Classes/String#Regular expressions::). This method searches exhaustively for matches and collects them into an array of pairs, in the format code::[character index, matching string]::. 
From ebd1fca8bbfe97454a0ecce14738e1710188e1f0 Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Tue, 13 Feb 2024 12:58:25 +0000 Subject: [PATCH 4/9] remove debug posts --- lang/LangPrimSource/PyrStringPrim.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 1f8ff3d39e8..5eb42dbabc8 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -287,17 +287,17 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { if (!isKindOfSlot(slot_this, class_string)) { SetNil(slot_this); - postfl("Error: slot 1 is wrong"); + postfl("Error: slot 1 is wrong type, should be a String\n"); return errWrongType; } if (!isKindOfSlot(slot_regex, class_string)){ SetNil(slot_this); - postfl("Error: slot 2 is wrong"); + postfl("Error: slot 2 is wrong type, should be a String\n"); return errWrongType; } if (!isKindOfSlot(slot_replace, class_string)){ SetNil(slot_this); - postfl("Error: slot 4 is wrong"); + postfl("Error: slot 4 is wrong type, should be a String\n"); return errWrongType; } @@ -306,7 +306,6 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { const char* source_start = slotRawString(slot_this)->s; const int source_size = slotRawString(slot_this)->size; - postfl("size of string %i", source_size); if (source_size < 0) { // size is signed SetNil(slot_this); @@ -325,8 +324,6 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { PyrString* output_string = newPyrStringN(g->gc, static_cast(out.size()), 0, true); std::copy(out.begin(), out.end(), output_string->s); - postfl("%s\n", out.c_str()); - // do we need to free the input string from the garbage collector somehow? // prString_AsCompileString does not. // output slot is the first slot. 
From 5d3d66f08f0a2a75dc844d60b18f56ee20adf18f Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Tue, 13 Feb 2024 13:09:37 +0000 Subject: [PATCH 5/9] tidy comments --- lang/LangPrimSource/PyrStringPrim.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 5eb42dbabc8..84f430d20c8 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -271,9 +271,9 @@ class regex_lru_cache { static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { + // String::regexReplace { |regex: String, replaceText: String| ... } if (numArgsPushed != 3) - return errFailed; // assume 'this' is counted - // regexReplace { |find, replace| ... } + return errFailed; // this is counted // caches the last 64 boost:regex instances. static detail::regex_lru_cache regex_lru_cache(boost::regex_constants::ECMAScript | boost::regex_constants::nosubs); @@ -285,11 +285,8 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { /*const*/ PyrSlot* slot_regex = g->sp - 1; // find /*const*/ PyrSlot* slot_replace = g->sp; // replace with - if (!isKindOfSlot(slot_this, class_string)) { - SetNil(slot_this); - postfl("Error: slot 1 is wrong type, should be a String\n"); - return errWrongType; - } + // slot one does not need to be checked as this method should only be called from methods in String, + // or children thereof. 
if (!isKindOfSlot(slot_regex, class_string)){ SetNil(slot_this); postfl("Error: slot 2 is wrong type, should be a String\n"); @@ -310,14 +307,15 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { if (source_size < 0) { // size is signed SetNil(slot_this); return errFailed; - } else if (source_size == 0) { - return errNone; // do nothing, input is empty } + const char* source_end = source_start + source_size; // this allocation is necessary as the result needs to be extendable std::string out {}; + // couldn't get the 'replace' argument in regex_replace to work as the char* isn't (necessarily) null terminated. std::string replace{slotRawString(slot_replace)->s, static_cast(slotRawString(slot_replace)->size)}; + regex_replace(std::back_inserter(out), source_start, source_end, pattern, replace ); // now 'out' has been filled, it's data must be copied to avoid being free'ed when 'out' goes out of scope From e442d7d81861a2480f852a6b74f3669e46af6332 Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Wed, 14 Feb 2024 17:45:48 +0000 Subject: [PATCH 6/9] old clang format --- lang/LangPrimSource/PyrStringPrim.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 84f430d20c8..4a81f3c749e 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -287,12 +287,12 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { // slot one does not need to be checked as this method should only be called from methods in String, // or children thereof. 
- if (!isKindOfSlot(slot_regex, class_string)){ + if (!isKindOfSlot(slot_regex, class_string)) { SetNil(slot_this); postfl("Error: slot 2 is wrong type, should be a String\n"); return errWrongType; } - if (!isKindOfSlot(slot_replace, class_string)){ + if (!isKindOfSlot(slot_replace, class_string)) { SetNil(slot_this); postfl("Error: slot 4 is wrong type, should be a String\n"); return errWrongType; @@ -313,10 +313,12 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { // this allocation is necessary as the result needs to be extendable std::string out {}; - // couldn't get the 'replace' argument in regex_replace to work as the char* isn't (necessarily) null terminated. - std::string replace{slotRawString(slot_replace)->s, static_cast(slotRawString(slot_replace)->size)}; + // couldn't get the 'replace' argument in regex_replace to work as the char* isn't (necessarily) null + // terminated. + std::string replace{ slotRawString(slot_replace)->s, + static_cast(slotRawString(slot_replace)->size) }; - regex_replace(std::back_inserter(out), source_start, source_end, pattern, replace ); + regex_replace(std::back_inserter(out), source_start, source_end, pattern, replace); // now 'out' has been filled, it's data must be copied to avoid being free'ed when 'out' goes out of scope PyrString* output_string = newPyrStringN(g->gc, static_cast(out.size()), 0, true); From 5c29fa638729e6cb08ead768bc6c8c7f5a87ded0 Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Wed, 14 Feb 2024 17:51:22 +0000 Subject: [PATCH 7/9] old clang format2 --- lang/LangPrimSource/PyrStringPrim.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 4a81f3c749e..2be93a3ee55 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -315,7 +315,7 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { std::string out {}; 
// couldn't get the 'replace' argument in regex_replace to work as the char* isn't (necessarily) null // terminated. - std::string replace{ slotRawString(slot_replace)->s, + std::string replace { slotRawString(slot_replace)->s, static_cast(slotRawString(slot_replace)->size) }; regex_replace(std::back_inserter(out), source_start, source_end, pattern, replace); From 39428c0fdd157337a304cf155def234158ea051c Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Fri, 15 Mar 2024 15:09:40 +0000 Subject: [PATCH 8/9] Tidy, add Int overflow error type in primitive, add test, update docs --- HelpSource/Classes/String.schelp | 25 ++++++++++- lang/LangPrimSource/PyrStringPrim.cpp | 45 +++++++------------ lang/LangSource/PyrErrors.h | 1 + .../classlibrary/TestStringReplaceRegex.sc | 30 +++++++++++++ 4 files changed, 71 insertions(+), 30 deletions(-) create mode 100644 testsuite/classlibrary/TestStringReplaceRegex.sc diff --git a/HelpSource/Classes/String.schelp b/HelpSource/Classes/String.schelp index 9e08894d0cd..ada09a86f4b 100644 --- a/HelpSource/Classes/String.schelp +++ b/HelpSource/Classes/String.schelp @@ -383,6 +383,30 @@ code:: "[xtz]+nd".matchRegexp("xnd"); // true: any combination of x, t, z :: -method::replaceRegex +method::replaceRegexp +Replaces every part of the receiver that matches a regular expression with another string. -TODO +argument::regex +A Perl regular expression (see link::Classes/String#Regular expressions::) to match against the receiver. + +argument::with +The link::Classes/String:: that replaces each match. + +returns:: A link::Classes/String::. + +code:: +// remove numbers +"g8et t8ho9se 3num5b89ers ou06t o8f h12er56e!".replaceRegexp("[0-9]", "") +-> get those numbers out of here! + +// convert every letter to lower case +"HelLO WoRlD".replaceRegexp("(\\\w)", "\\\L$1") +-> hello world + +// make every letter lower case except the first character of each word +"HelLO worLD! I weNT tO Paris yeSTErDay.".replaceRegexp("(\\\S)(\\\S*)", "$1\\\L$2") +-> Hello world! I went to Paris yesterday. 
+:: method::findRegexp Perl regular expression search (see link::Classes/String#Regular expressions::). This method searches exhaustively for matches and collects them into an array of pairs, in the format code::[character index, matching string]::. diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 2be93a3ee55..a6203f5b24a 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -270,31 +270,23 @@ class regex_lru_cache { } -static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { - // String::regexReplace { |regex: String, replaceText: String| ... } - if (numArgsPushed != 3) - return errFailed; // this is counted - +int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { // caches the last 64 boost:regex instances. static detail::regex_lru_cache regex_lru_cache(boost::regex_constants::ECMAScript | boost::regex_constants::nosubs); - using namespace boost; - // TODO: fix const-ness of isKindOfSlot PyrSlot* slot_this = g->sp - 2; // source string - /*const*/ PyrSlot* slot_regex = g->sp - 1; // find - /*const*/ PyrSlot* slot_replace = g->sp; // replace with + PyrSlot* slot_regex = g->sp - 1; // find + PyrSlot* slot_replace = g->sp; // replace with // slot one does not need to be checked as this method should only be called from methods in String, // or children thereof. 
if (!isKindOfSlot(slot_regex, class_string)) { SetNil(slot_this); - postfl("Error: slot 2 is wrong type, should be a String\n"); return errWrongType; } if (!isKindOfSlot(slot_replace, class_string)) { SetNil(slot_this); - postfl("Error: slot 4 is wrong type, should be a String\n"); return errWrongType; } @@ -306,28 +298,25 @@ static int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { if (source_size < 0) { // size is signed SetNil(slot_this); - return errFailed; + return errIntegerOverflow; } - const char* source_end = source_start + source_size; - - // this allocation is necessary as the result needs to be extendable std::string out {}; - // couldn't get the 'replace' argument in regex_replace to work as the char* isn't (necessarily) null - // terminated. - std::string replace { slotRawString(slot_replace)->s, - static_cast(slotRawString(slot_replace)->size) }; - - regex_replace(std::back_inserter(out), source_start, source_end, pattern, replace); + // PyrStrings are not null terminated so a copy is needed. + const auto [replaceError, replace] = slotStrStdStrVal(slot_replace); + if(replaceError != errNone){ + SetNil(slot_this); + return replaceError; + } - // now 'out' has been filled, it's data must be copied to avoid being free'ed when 'out' goes out of scope - PyrString* output_string = newPyrStringN(g->gc, static_cast(out.size()), 0, true); - std::copy(out.begin(), out.end(), output_string->s); + boost::regex_replace(std::back_inserter(out), source_start, source_start + source_size, pattern, replace); - // do we need to free the input string from the garbage collector somehow? - // prString_AsCompileString does not. - // output slot is the first slot. 
- SetObject(slot_this, output_string); + if(out.size() > std::numeric_limits::max()){ + SetNil(slot_this); + return errIntegerOverflow; + } + SetObject(slot_this, newPyrStringN(g->gc, static_cast(out.size()), 0, true)); + std::copy(out.begin(), out.end(), slotRawString(slot_this)->s); return errNone; } catch (const std::exception& e) { postfl("Warning: Exception in _String_ReplaceRegex -%s\n", e.what()); diff --git a/lang/LangSource/PyrErrors.h b/lang/LangSource/PyrErrors.h index 29dfd39b571..c70cfac9112 100644 --- a/lang/LangSource/PyrErrors.h +++ b/lang/LangSource/PyrErrors.h @@ -39,6 +39,7 @@ enum { // primitive errors errOutOfMemory, errCantCallOS, errException, + errIntegerOverflow, errPropertyNotFound = 6000, diff --git a/testsuite/classlibrary/TestStringReplaceRegex.sc b/testsuite/classlibrary/TestStringReplaceRegex.sc new file mode 100644 index 00000000000..4a0720cb4db --- /dev/null +++ b/testsuite/classlibrary/TestStringReplaceRegex.sc @@ -0,0 +1,30 @@ +TestStringReplaceRegex : UnitTest { + test_replace_simple { + this.assertEquals( + "foo, foo, bar, foo".replaceRegexp("foo", "car"), + "car, car, bar, car" + ); + } + test_replace_empty { + this.assertEquals( + "".replaceRegexp("foo", "car"), + "" + ); + } + test_replace_more_complex { + this.assertEquals( + "texttextte9xtte823x234t".replaceRegexp("[0-9]", ""), + "texttexttexttext" + ); + this.assertEquals( + "hello--Remove--rem0ve".replaceRegexp("(r|R)em(o|0)ve", ""), + "hello----" + ); + } + test_replace_captures { + this.assertEquals( + "HelLO WoRlD".replaceRegexp("(\\\w)", "\\\L$1"), + "hello world" + ) + } +} \ No newline at end of file From 77cab2ee3d0840a7404f735926fd80e659b163be Mon Sep 17 00:00:00 2001 From: JordanHendersonMusic Date: Fri, 15 Mar 2024 15:12:17 +0000 Subject: [PATCH 9/9] Formatting --- lang/LangPrimSource/PyrStringPrim.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang/LangPrimSource/PyrStringPrim.cpp b/lang/LangPrimSource/PyrStringPrim.cpp index 
a6203f5b24a..992cf267cd8 100644 --- a/lang/LangPrimSource/PyrStringPrim.cpp +++ b/lang/LangPrimSource/PyrStringPrim.cpp @@ -304,14 +304,14 @@ int prString_ReplaceRegex(struct VMGlobals* g, int numArgsPushed) { std::string out {}; // PyrStrings are not null terminated so a copy is needed. const auto [replaceError, replace] = slotStrStdStrVal(slot_replace); - if(replaceError != errNone){ + if (replaceError != errNone) { SetNil(slot_this); return replaceError; } boost::regex_replace(std::back_inserter(out), source_start, source_start + source_size, pattern, replace); - if(out.size() > std::numeric_limits::max()){ + if (out.size() > std::numeric_limits::max()) { SetNil(slot_this); return errIntegerOverflow; }