⚡ Feature: Lexer improvements (#18)

* Created branch * Created branch (removed placeholder) * Lexer - Added tab handling for the presence such as spaces would be in. - Added unit tests for the new tab processing - Resolved issues where whitespace was allowed before and/or after the '.' character - Renamed isSpliter to isSplitter - Some Code styling * Check - Added two new `SymbolType`s for comments - `SINGLE_LINE_COMMENT` (for `//`) and `MULTI_LINE_COMMENT` (for `/*`) * Parser - Added a bogus `parseComment()` which returns nothing, prints out the comment, consumes the `Token` and returns - `parseStatement()` now supports `parseComment()` whenever a single-line or multi-line comment is detected * Parser - Fixed token consumption code in `parseComment()` * BasicLexer - Fixed style mishaps * ArrLexer - Implemented dummy lexer * Parser - Added some comment related functions (for testing) - Added `pushComment(Token)`, `hasCommentsOnStack()` and `getCommentCount()` - `parseComment()` now pushes the current comment-based `Token` onto the comment-stack - Added a comment stack Unit tests - Added testing for various examples of comment-type `Token`s * Lexer - Replaced the characters with Enumerated type - Working Comment lexing, single and multiline - Working escape codes for strings - Working Signage and Size Encoder indicators - Removed floatMode in favour of float lexing function - Added doComment for the comment lexing instead of comment mode - Added doEscapeCode for escape codes in string Testing - Added unit tests for comments - Added unit tests numerical encoders - Added unit tests numerical encoders TODO - ADD unit tests for all valid escape sequences and some invalid * Lexer - Removed stringMode in favour of doString TODO - Decide on multiline strings, currently not supported * Parser - Test comments which appear at a non-Module but rather statement level * Parser - Changed to using `BasicLexer` for comment tests now seeing as it is now implemented therein * Basic - Added `roll()` and `shourt()` to 
mark unittests * Basic - `shout()` now adds line number to print out * Lexer rewrite - flush - underscores in numbers - escape codes - character escapes - bug fixes * Basic - Fixed `shourt(int)` * Basic - Removed crashing (on purpose) unittest * Resolved bug where isSplitter evaluated to true every time * Basic - Removed `goggaWithLineInfo(...)` * Basic - Updated `shout()` to remove rolling - Removed `roll()` - Added function and module name as well * Basic - Documented `shout()` * Lexer Done and 100% coverage * LexerSymbols - Documented - Formatted * Lexer (module) - Added `LS` alias - Added `isOperator(char c)`, `isSplitter(char c)`, `isNumericalEncoder_Size(char character)`, `isNumericalEncoder_Signage(char character)` and `isValidEscape_String(char character)` * BasicLexer - Documented constructor `hasToken()`, `performLex()`, `doIdentOrPath()`, `doChar()`, `doString()`, `doComment()`, `doEscapeCode()`, `doNumber()`, `doEncoder()`, `doFloat()`, `flush()`, `buildAdvance()`, `improvedAdvance()`, `advanceLine()`, `isOperator(char)`, `isSplitter(char)`, `isValidDotPrecede(char character)`, `isNumericalEncoder(char character)`, `isNumericalEncoder_Size(char character)`, `isNumericalEncoder_Signage(char character)` and `isValidEscape_String(char character)` - Tried reformatting some of `doChar()`, `doString()`, `flush()`, `buildAdvance()`, `improvedAdvance()`, `advanceLine()`, `isOperator(char)`, `isSplitter(char)` * Basic - Removed `LS` alias Lexer - Made `LS` alias public * BasicLexer - Removed methods `isValidEscape_String(char character)`, `isNumericalEncoder_Signage(char character)`, `isNumericalEncoder_Size(char character)`, `isNumericalEncoder(char character)`, `isSplitter(char c)` and `isOperator(char c)` Lexer - Added method `isNumericalEncoder(char character)` * BasicLexer - Documented `isValidDotPrecede(char character)` * Lexer - Added method `isValidDotPrecede(char character)` * BasicLexer - Removed method `isValidDotPrecede(char character)` * BasicLexer 
(unittests) - Documented the unittests - Fixed formatting * BasicLexer - Typo fixes * BasicLexer (unittests) - Only compile-in `shourt(...)` when in unittest build mode * BasicLexer - Documented `isForward()` and `isBackward()` - Made `isBackward()` private --------- Co-authored-by: GMeyer <21568499@sun.ac.za> Co-authored-by: GMeyer <gustav.meyer1999@gmail.com>
tbklang · Dec 27, 2023 · ee537f2 · ee537f2
1 parent 4c3a72b
commit ee537f2
Show file tree

Hide file tree

Showing 5 changed files with 1,971 additions and 539 deletions.
diff --git a/source/tlang/compiler/lexer/core/lexer.d b/source/tlang/compiler/lexer/core/lexer.d
@@ -4,6 +4,7 @@
 module tlang.compiler.lexer.core.lexer;
 
 import tlang.compiler.lexer.core.tokens : Token;
+import std.ascii : isDigit, isAlpha, isWhite;
 
 /** 
  * Defines the interface a lexer must provide
@@ -73,4 +74,163 @@ public interface LexerInterface
      * Returns: a `Token[]` containing all tokens
      */
     public Token[] getTokens();
+}
+
+/** 
+ * Symbolic names for the individual
+ * characters the lexer works with, so
+ * that call sites can refer to them by
+ * name rather than by bare character
+ * literal
+ */
+public enum LexerSymbols : char
+{
+    // Brackets, punctuation and operator characters
+    L_PAREN = '(',
+    R_PAREN = ')',
+    SEMI_COLON = ';',
+    COMMA = ',',
+    L_BRACK = '[',
+    R_BRACK = ']',
+    PLUS = '+',
+    MINUS = '-',
+    FORWARD_SLASH = '/',
+    PERCENT = '%',
+    STAR = '*',
+    AMPERSAND = '&',
+    L_BRACE = '{',
+    R_BRACE = '}',
+    EQUALS = '=',
+    SHEFFER_STROKE = '|',
+    CARET = '^',
+    EXCLAMATION = '!',
+    TILDE = '~',
+    DOT = '.',
+    COLON = ':',
+
+    // Whitespace characters
+    SPACE = ' ',
+    TAB = '\t',
+    NEWLINE = '\n',
+
+    // Quotes, backslash, underscore and comparison characters
+    DOUBLE_QUOTE = '"',
+    SINGLE_QUOTE = '\'',
+    BACKSLASH = '\\',
+    UNDERSCORE = '_',
+    LESS_THAN = '<',
+    BIGGER_THAN = '>',
+
+    // Characters that name an escape code (the character after a `\`)
+    ESC_NOTHING = '0',
+    ESC_CARRIAGE_RETURN = 'r',
+    ESC_TAB = 't',
+    ESC_NEWLINE = 'n',
+    ESC_BELL = 'a',
+
+    // Numerical encoder characters: size
+    ENC_BYTE = 'B',
+    ENC_INT = 'I',
+    ENC_LONG = 'L',
+    ENC_WORD = 'W',
+
+    // Numerical encoder characters: signage
+    ENC_UNSIGNED = 'U',
+    ENC_SIGNED = 'S',
+}
+
+/** 
+ * Short alias for `LexerSymbols`, so that
+ * a symbol can be written as, for example,
+ * `LS.DOT`
+ */
+public alias LS = LexerSymbols;
+
+/** 
+ * Determines whether the given character
+ * is one of the operator characters
+ *
+ * Params:
+ *   c = the character to check
+ * Returns: `true` if it is an operator,
+ * `false` otherwise
+ */
+public bool isOperator(char c)
+{
+    switch(c)
+    {
+        case LS.PLUS, LS.TILDE, LS.MINUS,
+             LS.STAR, LS.FORWARD_SLASH, LS.AMPERSAND,
+             LS.CARET, LS.EXCLAMATION, LS.SHEFFER_STROKE,
+             LS.LESS_THAN, LS.BIGGER_THAN:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** 
+ * Determines whether the given character
+ * is a splitter. Besides the punctuation
+ * listed below, every operator character
+ * (see `isOperator(char)`) and every
+ * whitespace character (per `isWhite`)
+ * counts as a splitter.
+ *
+ * Params:
+ *   c = the character to check
+ * Returns: `true` if it is a splitter,
+ * `false` otherwise
+ */
+public bool isSplitter(char c)
+{
+    switch(c)
+    {
+        case LS.SEMI_COLON, LS.COMMA, LS.L_PAREN,
+             LS.R_PAREN, LS.L_BRACK, LS.R_BRACK,
+             LS.PERCENT, LS.L_BRACE, LS.R_BRACE,
+             LS.EQUALS, LS.DOT, LS.COLON:
+            return true;
+        default:
+            // Not plain punctuation: fall back to the broader classes
+            return isOperator(c) || isWhite(c);
+    }
+}
+
+/** 
+ * Determines whether the given character
+ * is a numerical size encoder, that is
+ * one of `ENC_BYTE`, `ENC_WORD`, `ENC_INT`
+ * or `ENC_LONG`
+ *
+ * Params:
+ *   character = the character to check
+ * Returns: `true` if so, `false` otherwise
+ */
+public bool isNumericalEncoder_Size(char character)
+{
+    switch(character)
+    {
+        case LS.ENC_BYTE, LS.ENC_WORD, LS.ENC_INT, LS.ENC_LONG:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** 
+ * Determines whether the given character
+ * is a numerical signage encoder, that is
+ * either `ENC_SIGNED` or `ENC_UNSIGNED`
+ *
+ * Params:
+ *   character = the character to check
+ * Returns: `true` if so, `false` otherwise
+ */
+public bool isNumericalEncoder_Signage(char character)
+{
+    switch(character)
+    {
+        case LS.ENC_SIGNED, LS.ENC_UNSIGNED:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/** 
+ * Determines whether the given character
+ * is a numerical encoder of any kind,
+ * meaning it satisfies either
+ * `isNumericalEncoder_Size(char)` or
+ * `isNumericalEncoder_Signage(char)`
+ *
+ * Params:
+ *   character = the character to check
+ * Returns: `true` if so, `false` otherwise
+ */
+public bool isNumericalEncoder(char character)
+{
+    if(isNumericalEncoder_Size(character))
+    {
+        return true;
+    }
+
+    return isNumericalEncoder_Signage(character);
+}
+
+/** 
+ * Checks if the given character is a valid
+ * escape character (something which would 
+ * have followed a `\`).
+ *
+ * The accepted characters are the backslash,
+ * the double and single quote, and the escape
+ * letters `ESC_NOTHING`, `ESC_NEWLINE`,
+ * `ESC_CARRIAGE_RETURN`, `ESC_TAB` and
+ * `ESC_BELL`.
+ *
+ * Params:
+ *   character = the character to check
+ * Returns: `true` if so, `false` otherwise
+ */
+public bool isValidEscape_String(char character)
+{
+    switch(character)
+    {
+        // The tab escape is identified by the letter `ESC_TAB` ('t'),
+        // not by the literal tab character `TAB` ('\t')
+        case LS.BACKSLASH, LS.DOUBLE_QUOTE, LS.SINGLE_QUOTE,
+             LS.ESC_NOTHING, LS.ESC_NEWLINE, LS.ESC_CARRIAGE_RETURN,
+             LS.ESC_TAB, LS.ESC_BELL:
+            return true;
+        default:
+            return false;
+    }
+}
+
+/**
+ * Determines whether the given character may
+ * directly precede a '.'. Only a closing
+ * parenthesis or a closing square bracket
+ * is accepted.
+ *
+ * Params:
+ *   character = the character to check
+ * Returns: `true` if so, otherwise `false`
+ */
+public bool isValidDotPrecede(char character)
+{
+    // NOTE: a check for `isAlpha(character) || isDigit(character)`
+    // used to be part of this condition but is currently disabled
+    switch(character)
+    {
+        case LS.R_PAREN, LS.R_BRACK:
+            return true;
+        default:
+            return false;
+    }
 }
diff --git a/source/tlang/compiler/lexer/kinds/arr.d b/source/tlang/compiler/lexer/kinds/arr.d
@@ -0,0 +1,124 @@
+module tlang.compiler.lexer.kinds.arr;
+
+import tlang.compiler.lexer.core;
+
+/** 
+ * A dummy lexer backed by a caller-supplied
+ * `Token[]`. No lexing takes place; the
+ * array is simply walked with a cursor,
+ * which is useful for testing parser-only
+ * related things with concrete tokens.
+ */
+public final class ArrLexer : LexerInterface
+{
+    /** 
+     * The tokens handed to the constructor
+     */
+    private Token[] tokens;
+
+    /** 
+     * Cursor: index into `tokens` of the
+     * current token
+     */
+    private ulong tokenPtr = 0;
+
+    /** 
+     * Constructs a new `ArrLexer` (dummy lexer)
+     * over the provided, already concrete,
+     * tokens.
+     *
+     * Params:
+     *   tokens = the `Token[]`
+     */
+    this(Token[] tokens)
+    {
+        this.tokens = tokens;
+    }
+
+    /** 
+     * Returns the token at the current cursor
+     * position. No explicit range check is
+     * made here; `hasTokens()` reports whether
+     * the cursor is still within the array.
+     *
+     * Returns: the `Token`
+     */
+    public Token getCurrentToken()
+    {
+        return this.tokens[this.tokenPtr];
+    }
+
+    /** 
+     * Advances the cursor by one token
+     */
+    public void nextToken()
+    {
+        this.tokenPtr += 1;
+    }
+
+    /** 
+     * Moves the cursor back by one token.
+     * No check against position zero is
+     * made here.
+     */
+    public void previousToken()
+    {
+        this.tokenPtr -= 1;
+    }
+
+    /** 
+     * Places the cursor at the given position
+     *
+     * Params:
+     *   cursor = the new position
+     */
+    public void setCursor(ulong cursor)
+    {
+        tokenPtr = cursor;
+    }
+
+    /** 
+     * Reports where the cursor currently is
+     *
+     * Returns: the position
+     */
+    public ulong getCursor()
+    {
+        return tokenPtr;
+    }
+
+    /** 
+     * Checks whether more tokens are available
+     * or not, i.e. whether the cursor is still
+     * below the length of the token array
+     *
+     * Returns: true if more tokens are available, false otherwise
+     */
+    public bool hasTokens()
+    {
+        return this.tokens.length > this.tokenPtr;
+    }
+
+    /** 
+     * Get the line position of the lexer in the source text.
+     * This lexer always reports `0`.
+     *
+     * Returns: the position
+     */
+    public ulong getLine()
+    {
+        return 0; // TODO: anything meaningful?
+    }
+
+    /** 
+     * Get the column position of the lexer in the source text.
+     * This lexer always reports `0`.
+     *
+     * Returns: the position
+     */
+    public ulong getColumn()
+    {
+        return 0; // TODO: anything meaningful?
+    }
+
+    /** 
+     * Provides the complete list of tokens,
+     * regardless of where the cursor is
+     *
+     * Returns: a `Token[]` containing all tokens
+     */
+    public Token[] getTokens()
+    {
+        return this.tokens;
+    }
+}