add is_identifier, ignore non-identifier globals/members (from sublangs/splits), add splitkeys sublang example
thisismypassport · Aug 19, 2022 · 0b46c15 · 0b46c15
1 parent a881ed3
commit 0b46c15
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 24 deletions.
diff --git a/README.md b/README.md
@@ -75,8 +75,6 @@ glob = 123
 ?_ENV[my_key] -- 123
 ```
 
-For more advanced usecases, see the [below section](#advanced---controlling-renaming-of-identifiers).
-
 ### Preserving identifiers across the entire cart
 
 You can instruct the minifier to preserve certain identifiers across the entire cart:
@@ -402,11 +400,14 @@ eval(--[[language::evally]][[
 ```
 
 In the python script, provide a class that handles the language via sublanguage_main:
+(This is a complete example of what sublanguages can do; you can find a simpler example [below](#Example---simple-sub-language-for-table-parsing).)
 ```python
-from pico_process import SubLanguageBase, is_ident_char
+from pico_process import SubLanguageBase, is_identifier
 from collections import Counter
 
 class MySubLanguage(SubLanguageBase):
+    # NOTE: all members are optional.
+
     # called to parse the sub-language from a string
     # (strings consist of raw pico-8 chars ('\0' to '\xff') - not real unicode)
     def __init__(self, str, on_error, **_):
@@ -419,7 +420,7 @@ class MySubLanguage(SubLanguageBase):
 
     def is_global(self, token):
         # is the token a global in our language? e.g. sin / rectfill / g_my_global
-        return all(is_ident_char(ch) for ch in token) and not token[:1].isdigit()
+        return is_identifier(token)
 
     def is_member(self, token):
         # is the token a member in our language? e.g. .my_member / .x
@@ -470,7 +471,10 @@ class MySubLanguage(SubLanguageBase):
                     usages[token[1:]] += 1
         return usages
 
-    # called to rename all uses of globals and members
+    # for very advanced languages only, see test_input/sublang.py for details
+    # def get_local_usages(self, **_):
+
+    # called to rename all uses of globals/members/etc
     def rename(self, globals, members, **_):
         for stmt in self.stmts:
             for i, token in enumerate(stmt):
@@ -488,3 +492,52 @@ def sublanguage_main(lang, **_):
     if lang == "evally":
         return MySubLanguage
 ```
+
+### Example - simple sub-language for table parsing
+
+Often it's useful in pico-8 to define a simple sub-language to parse something like this:
+
+`"key1=val1,key2=val2,val3,val4"`
+
+To:
+
+`{key1="val1",key2="val2","val3","val4"}`
+
+Here, to minify properly, the keys (key1/key2) should be renamed as members, while the values should be left alone.
+
+The custom python script:
+```python
+from pico_process import SubLanguageBase, is_identifier
+from collections import Counter
+
+class SplitKeysSubLang(SubLanguageBase):
+    # parses the string
+    def __init__(self, str, **_):
+        self.data = [item.split("=") for item in str.split(",")]
+
+    # counts usage of keys
+    # (returned keys are ignored if they're not identifiers)
+    def get_member_usages(self, **_):
+        return Counter(item[0] for item in self.data if len(item) > 1)
+
+    # renames the keys
+    def rename(self, members, **_):
+        for item in self.data:
+            if len(item) > 1:
+                item[0] = members.get(item[0], item[0])
+
+    # formats back to string
+    def minify(self, **_):
+        return ",".join("=".join(item) for item in self.data)
+
+def sublanguage_main(lang, **_):
+    if lang == "splitkeys":
+        return SplitKeysSubLang
+```
+
+In the code:
+```lua
+local table = splitkeys(--[[language::splitkeys]]"key1=val1,key2=val2,val3,val4")
+?table.key1 -- "val1"
+?table[1] -- "val3"
+```
diff --git a/pico_process.py b/pico_process.py
@@ -403,6 +403,9 @@ def finish(m, path, code):
 def is_ident_char(ch):
     return '0' <= ch <= '9' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or ch == '_' or ch >= chr(0x80)
 
+def is_identifier(str):
+    return all(is_ident_char(ch) for ch in str) and not str[:1].isdigit() and str not in keywords
+
 def tokenize(source, ctxt=None):
     text = source.text
     idx = 0
@@ -934,17 +937,24 @@ def parse_call(expr, extra_arg=None):
 
         return Node(NodeType.call, tokens, func=expr, args=args)
 
-    def add_const_extra_children(node):
-        token = node.token
+    def parse_const(token):
+        node = Node(NodeType.const, [token], token=token)
+
         if getattr(token, "var_kind", None):
             node.extra_names = token.value[1:-1].split(",")
             for i, value in enumerate(node.extra_names):
-                subtoken = Token.synthetic(TokenType.ident, value, token)
-                subtoken.var_kind = token.var_kind
-                node.add_extra_child(parse_var(token=subtoken, member=True))
+                if is_identifier(value):
+                    subtoken = Token.synthetic(TokenType.ident, value, token)
+                    subtoken.var_kind = token.var_kind
+                    node.add_extra_child(parse_var(token=subtoken, member=True))
+                else:
+                    subtoken = Token.synthetic(TokenType.string, value, token)
+                    node.add_extra_child(parse_const(subtoken))
+
         if hasattr(token, "sublang"):
             sublang_token = Token.synthetic(TokenType.string, "", token)
             node.add_extra_child(Node(NodeType.sublang, (sublang_token,), name=token.sublang_name, lang=token.sublang))
+
         return node
 
     def parse_core_expr():
@@ -953,7 +963,7 @@ def parse_core_expr():
         if value == None:
             add_error("unexpected end of input", fail=True)
         elif value in ("nil", "true", "false") or token.type in (TokenType.number, TokenType.string):
-            return add_const_extra_children(Node(NodeType.const, [token], token=token))
+            return parse_const(token)
         elif value == "{":
             return parse_table()
         elif value == "(":
@@ -1464,7 +1474,7 @@ def preprocess_vars(node):
 
         elif node.type == NodeType.sublang:
             for glob in node.lang.get_defined_globals():
-                if glob not in custom_globals:
+                if glob not in custom_globals and is_identifier(glob):
                     custom_globals.add(glob)
                     vars[glob].append(root.globals[glob])
 
@@ -1676,14 +1686,14 @@ def collect_idents_pre(node):
             # slight dup of compute_effective_kind logic
 
             for name, count in node.lang.get_global_usages().items():
-                if name not in global_knowns:
+                if name not in global_knowns and is_identifier(name):
                     if name in all_globals:
                         global_knowns.add(name)
                     else:
                         global_uses[name] += count
 
             for name, count in node.lang.get_member_usages().items():
-                if name not in member_knowns:
+                if name not in member_knowns and is_identifier(name):
                     member_uses[name] += count
 
             for var, count in node.lang.get_local_usages().items():

diff --git a/pico_utils.py b/pico_utils.py
@@ -172,7 +172,7 @@ def parse_p8scii(str):
 
         start = pos + length
 
-def bytes_to_string_contents(bytes):
+def bytes_to_string_contents(bytes): # TODO: just use format_string_literal... 
     data = []
 
     esc_map = {

diff --git a/test_compare/output.p8 b/test_compare/output.p8
@@ -13,7 +13,7 @@ function f.subfunc()end function f:subfunc()end
 ?f:subfunc()
 local f="o"local d={o=123}
 ?d[f]
-local f=split"c,a,f"local d={c=123,a=234,f=345}
+local f=split"c,a,f,123"local d={c=123,a=234,f=345}
 ?d[f[2]]
 local f="n"n=123
 ?_ENV[f]

diff --git a/test_compare/output_semiob.p8 b/test_compare/output_semiob.p8
@@ -38,7 +38,7 @@ local c = "key"
 local a = {key=123}
 ?a[c]
 
-local c = split "key1,key2,key3"
+local c = split "key1,key2,key3,123"
 local a = {key1=123,key2=234,key3=345}
 ?a[c[2]]
 

diff --git a/test_compare/output_tokens.p8 b/test_compare/output_tokens.p8
@@ -39,7 +39,7 @@ local my_key = --[[member]]"key"
 local my_obj = {key=123}
 ?my_obj[my_key]
 
-local my_keys = split --[[member]]"key1,key2,key3"
+local my_keys = split --[[member]]"key1,key2,key3,123"
 local my_obj = {key1=123,key2=234,key3=345}
 ?my_obj[my_keys[2]]
 

diff --git a/test_compare/sublang.p8 b/test_compare/sublang.p8
@@ -1,7 +1,7 @@
 pico-8 cartridge // http://www.pico-8.com
 version 36
 __lua__
-u=123function f()end f[[circfill 50 50 20 7
+f=123function e()end e[[circfill 50 50 20 7
 n <- pack
-rawset n f u
-rawset n u c]]print(n)print(n.f)f""
+rawset n e f
+rawset n i d]]print(n)print(n.e)e""function i()end i"d=1,f=2,0.5=13,val,f=22,if=bad"
diff --git a/test_input/input.p8 b/test_input/input.p8
@@ -39,7 +39,7 @@ local my_key = --[[member]]"key"
 local my_obj = {key=123}
 ?my_obj[my_key]
 
-local my_keys = split --[[member]]"key1,key2,key3"
+local my_keys = split --[[member]]"key1,key2,key3,123"
 local my_obj = {key1=123,key2=234,key3=345}
 ?my_obj[my_keys[2]]
 

diff --git a/test_input/sublang.p8 b/test_input/sublang.p8
@@ -10,3 +10,5 @@ eval--[[language::evally]][[
 print(g_another_glob)
 print(g_another_glob.some_member)
 eval--[[language::empty]]""
+function splitkeys() end
+splitkeys--[[language::splitkeys]]"key1=1,key2=2,0.5=13,val,key2=22,if=bad"
diff --git a/test_input/sublang.py b/test_input/sublang.py
@@ -1,4 +1,4 @@
-from pico_process import SubLanguageBase, is_ident_char, Local, Scope
+from pico_process import SubLanguageBase, is_identifier, Local, Scope
 from collections import Counter
 
 class MySubLanguage(SubLanguageBase):
@@ -14,7 +14,7 @@ def __init__(self, str, on_error, **_):
 
     def is_global(self, token):
         # is the token a global in our language? e.g. sin / rectfill / g_my_global
-        return all(is_ident_char(ch) for ch in token) and not token[:1].isdigit()
+        return is_identifier(token)
 
     def is_member(self, token):
         # is the token a member in our language? e.g. .my_member / .x
@@ -96,9 +96,31 @@ def rename(self, globals, members, locals, **_):
     def minify(self, **_):
         return "\n".join(" ".join(stmt) for stmt in self.stmts)
 
+class SplitKeysSubLang(SubLanguageBase):
+    # parses the string
+    def __init__(self, str, **_):
+        self.data = [item.split("=") for item in str.split(",")]
+
+    # counts usage of keys
+    # (returned keys are ignored if they're not identifiers)
+    def get_member_usages(self, **_):
+        return Counter(item[0] for item in self.data if len(item) > 1)
+
+    # renames the keys
+    def rename(self, members, **_):
+        for item in self.data:
+            if len(item) > 1:
+                item[0] = members.get(item[0], item[0])
+
+    # formats back to string
+    def minify(self, **_):
+        return ",".join("=".join(item) for item in self.data)
+
 # this is called to get a sub-language class by name
 def sublanguage_main(lang, **_):
     if lang == "evally":
         return MySubLanguage
+    elif lang == "splitkeys":
+        return SplitKeysSubLang
     elif lang == "empty":
         return SubLanguageBase