In [1]:
# Your original version issue:
import re

code = "int x = 123; double y = 3.14e-5;"

# Original pattern (with capturing group)
bad_pattern = re.compile(r'\b[0-9]+\.?[0-9]*([eE][-+]?[0-9]+)?\b')
print("With capturing group:", bad_pattern.findall(code))
# Output: ['', 'e-5']  # Returns the captured group content!

# Fixed pattern (non-capturing group)
good_pattern = re.compile(r'\b[0-9]+\.?[0-9]*(?:[eE][-+]?[0-9]+)?\b')
print("With non-capturing group:", good_pattern.findall(code))
# Output: ['123', '3.14e-5']  # Returns the full match!

With capturing group: ['', 'e-5']
With non-capturing group: ['123', '3.14e-5']


In [2]:
def java_tokenize_standard(code):
    """
    A standard lexical tokenizer for Java.
    Counts: Keywords, Identifiers, Literals, Operators.
    Ignores: Whitespace, Comments.
    """
    token_pattern = re.compile(
        r'''
        "(?:\\.|[^\\"])*"         |  # String Literal
        '(?:\\.|[^\\'])*'         |  # Char Literal
        //.*?$                    |  # Line Comment (matches but we ignore)
        /\*.*?\*/                 |  # Block Comment (matches but we ignore)
        \b0[xX][0-9a-fA-F]+\b     |  # Hex Number
        \b[0-9]+\.?[0-9]*(?:[eE][-+]?[0-9]+)?\b | # Decimal Number (NON-CAPTURING)
        @[a-zA-Z_$][a-zA-Z0-9_$]* |  # Annotation
        [a-zA-Z_$][a-zA-Z0-9_$]*  |  # Identifier / Keyword
        [(){}\[\],.;:?!~+\-*/%&|^=<>]+ # Operators & Punctuation
        ''',
        re.VERBOSE | re.MULTILINE | re.DOTALL
    )
    
    # Find all matches
    raw_matches = token_pattern.findall(code)
    
    # Filter out comments (start with // or /*)
    clean_tokens = [t for t in raw_matches if not (t.startswith('//') or t.startswith('/*'))]
    
    return clean_tokens

In [3]:
java_code = """
public SecureRandomParameters getObject() throws Exception {
    if (this.isSingleton()) {
        if (instance == null) {
            instance = createInstance();
        }
        return instance;
    } else {
        return createInstance();
    }
}
"""

tokens = java_tokenize_standard(java_code)
print(f"Token count: {len(tokens)}")
print(f"Tokens: {tokens}")

Token count: 37
Tokens: ['public', 'SecureRandomParameters', 'getObject', '()', 'throws', 'Exception', '{', 'if', '(', 'this', '.', 'isSingleton', '())', '{', 'if', '(', 'instance', '==', 'null', ')', '{', 'instance', '=', 'createInstance', '();', '}', 'return', 'instance', ';', '}', 'else', '{', 'return', 'createInstance', '();', '}', '}']


In [4]:
java_code = """
   private void generateJavaClass() {
        setPackage();
        setImports();
        setClassNameAndType();
        setClassFields();
        setSettersAndGettersMethods();
        setCreateConnectorConfigurationMethod();
        setConfigureConnectorClassMethod();
        setValidateConnectorConfiguration();
        setGetConnectorDatabaseType();
    }
"""

tokens = java_tokenize_standard(java_code)
print(f"Token count: {len(tokens)}")
print(f"Tokens: {tokens}")

Token count: 24
Tokens: ['private', 'void', 'generateJavaClass', '()', '{', 'setPackage', '();', 'setImports', '();', 'setClassNameAndType', '();', 'setClassFields', '();', 'setSettersAndGettersMethods', '();', 'setCreateConnectorConfigurationMethod', '();', 'setConfigureConnectorClassMethod', '();', 'setValidateConnectorConfiguration', '();', 'setGetConnectorDatabaseType', '();', '}']
