In [4]:
import re

token_patterns = {
    "KEYWORD": r"\b(int|float|double|char|void|if|else|for|while|do|switch|case|break|continue|return|struct|union|typedef|const|static|enum|sizeof)\b",
    "IDENTIFIER": r"\b[A-Za-z_][A-Za-z0-9_]*\b",
    "NUMBER": r"\b\d+(\.\d+)?\b",
    "CHAR": r"'[^']'",
    "STRING": r'"[^"\n]*"',
    "PREPROCESSOR": r"#\s*[a-zA-Z_]+\b.*",
    "OPERATOR": r"(\+\+|--|\+=|-=|\*=|/=|%=|==|!=|>=|<=|&&|\|\||<<|>>|[+\-*/%=&|^~!<>])",
    "SEPARATOR": r"[{}\[\]();,:]",
    "COMMENT": r"(//.*?$|/\*[\s\S]*?\*/)",
    "WHITESPACE": r"[ \t\n]+"
}


In [5]:
# Print all token patterns to verify
for token_type, pattern in token_patterns.items():
    print(f"{token_type}: {pattern}")


KEYWORD: \b(int|float|double|char|void|if|else|for|while|do|switch|case|break|continue|return|struct|union|typedef|const|static|enum|sizeof)\b
IDENTIFIER: \b[A-Za-z_][A-Za-z0-9_]*\b
NUMBER: \b\d+(\.\d+)?\b
CHAR: '[^']'
STRING: "[^"\n]*"
PREPROCESSOR: #\s*[a-zA-Z_]+\b.*
OPERATOR: (\+\+|--|\+=|-=|\*=|/=|%=|==|!=|>=|<=|&&|\|\||<<|>>|[+\-*/%=&|^~!<>])
SEPARATOR: [{}\[\]();,:]
COMMENT: (//.*?$|/\*[\s\S]*?\*/)
WHITESPACE: [ \t\n]+


In [6]:
import re
token_patterns = {
    "KEYWORD": r"\b(int|float|double|char|void|if|else|for|while|do|switch|case|break|continue|return|struct|union|typedef|const|static|enum|sizeof)\b",
    "IDENTIFIER": r"\b[A-Za-z_][A-Za-z0-9_]*\b",
    "NUMBER": r"\b\d+(\.\d+)?\b",
    "CHAR": r"'[^']'",
    "STRING": r'"[^"\n]*"',
    "PREPROCESSOR": r"#\s*[a-zA-Z_]+\b.*",
    "OPERATOR": r"(\+\+|--|\+=|-=|\*=|/=|%=|==|!=|>=|<=|&&|\|\||<<|>>|[+\-*/%=&|^~!<>])",
    "SEPARATOR": r"[{}\[\]();,:]",
    "COMMENT": r"(//.*?$|/\*[\s\S]*?\*/)",
    "WHITESPACE": r"[ \t\n]+"
}

combined_pattern = '|'.join(f"(?P<{name}>{pattern})" for name, pattern in token_patterns.items())
token_regex = re.compile(combined_pattern, re.MULTILINE)

def tokenize(code):
    tokens = []
    for match in re.finditer(token_regex, code):
        kind = match.lastgroup
        value = match.group()
        if kind == "WHITESPACE" or kind == "COMMENT":
            continue  # skip
        tokens.append((kind, value))
    return tokens


In [7]:
sample_code = """
#include <stdio.h>

int main() {
    char str[10];
    gets(str);
    printf("Hello, %s", str);
    return 0;
}
"""

token_list = tokenize(sample_code)

# Print output
for token_type, token_value in token_list:
    print(f"{token_type:12} → {token_value}")


PREPROCESSOR → #include <stdio.h>
KEYWORD      → int
IDENTIFIER   → main
SEPARATOR    → (
SEPARATOR    → )
SEPARATOR    → {
KEYWORD      → char
IDENTIFIER   → str
SEPARATOR    → [
NUMBER       → 10
SEPARATOR    → ]
SEPARATOR    → ;
IDENTIFIER   → gets
SEPARATOR    → (
IDENTIFIER   → str
SEPARATOR    → )
SEPARATOR    → ;
IDENTIFIER   → printf
SEPARATOR    → (
STRING       → "Hello, %s"
SEPARATOR    → ,
IDENTIFIER   → str
SEPARATOR    → )
SEPARATOR    → ;
KEYWORD      → return
NUMBER       → 0
SEPARATOR    → ;
SEPARATOR    → }


In [8]:
class Parser:
    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0
        self.current_token = self.tokens[self.pos] if self.tokens else None

    def advance(self):
        self.pos += 1
        if self.pos < len(self.tokens):
            self.current_token = self.tokens[self.pos]
        else:
            self.current_token = None

    def match(self, expected_type, expected_value=None):
        if self.current_token is None:
            return False
        token_type, token_value = self.current_token
        if token_type == expected_type and (expected_value is None or token_value == expected_value):
            self.advance()
            return True
        return False

    def parse(self):
        print("Parsing Program...")
        while self.current_token is not None:
            self.parse_function()

    def parse_function(self):
        if self.match("KEYWORD") and self.match("IDENTIFIER"):
            if self.match("SEPARATOR", "(") and self.match("SEPARATOR", ")"):
                if self.match("SEPARATOR", "{"):
                    print("✅ Function Declaration Found")
                    self.parse_statements()
                    if self.match("SEPARATOR", "}"):
                        print("✅ Function Block Closed")
                    else:
                        print("❌ Error: Missing '}'")
                else:
                    print("❌ Error: Missing '{'")
            else:
                print("❌ Error: Invalid function parameters")
        else:
            print("❌ Error: Invalid function definition")

    def parse_statements(self):
        while self.current_token and self.current_token[1] != "}":
            self.parse_statement()

    def parse_statement(self):
        if self.current_token[0] == "KEYWORD":
            self.parse_declaration()
        elif self.current_token[0] == "IDENTIFIER":
            self.parse_assignment_or_call()
        elif self.current_token[1] == "return":
            self.parse_return()
        else:
            print(f"⚠️ Skipping unexpected token: {self.current_token}")
            self.advance()

    def parse_declaration(self):
        self.advance()  # skip type
        if self.match("IDENTIFIER"):
            if self.match("OPERATOR", "="):
                self.parse_expression()
            if self.match("SEPARATOR", ";"):
                print("✅ Variable Declaration")
            else:
                print("❌ Error: Missing ';' in declaration")

    def parse_assignment_or_call(self):
        self.advance()  # skip identifier
        if self.match("OPERATOR", "="):
            self.parse_expression()
            if self.match("SEPARATOR", ";"):
                print("✅ Assignment")
        elif self.match("SEPARATOR", "("):
            while not self.match("SEPARATOR", ")"):
                self.advance()
            if self.match("SEPARATOR", ";"):
                print("✅ Function Call")

    def parse_return(self):
        self.advance()  # skip 'return'
        self.parse_expression()
        if self.match("SEPARATOR", ";"):
            print("✅ Return Statement")
        else:
            print("❌ Error: Missing ';' in return")

    def parse_expression(self):
        if self.current_token[0] in ["NUMBER", "IDENTIFIER", "STRING"]:
            self.advance()
        else:
            print(f"❌ Error: Invalid expression at {self.current_token}")
            self.advance()


In [9]:
sample_code = """
int main() {
    int x = 5;
    printf("Hello, %s", x);
    return 0;
}
"""

tokens = tokenize(sample_code)

parser = Parser(tokens)
parser.parse()


Parsing Program...
✅ Function Declaration Found
✅ Variable Declaration
✅ Function Call
⚠️ Skipping unexpected token: ('NUMBER', '0')
⚠️ Skipping unexpected token: ('SEPARATOR', ';')
✅ Function Block Closed


In [12]:
class Parser:
    def __init__(self, tokens):
        self.tokens = tokens
        self.pos = 0
        self.current_token = self.tokens[self.pos] if self.tokens else None

    def advance(self):
        self.pos += 1
        if self.pos < len(self.tokens):
            self.current_token = self.tokens[self.pos]
        else:
            self.current_token = None

    def match(self, expected_type, expected_value=None):
        if self.current_token is None:
            return False
        token_type, token_value = self.current_token
        if token_type == expected_type and (expected_value is None or token_value == expected_value):
            self.advance()
            return True
        return False

    def parse(self):
        print("🔍 Parsing Program...\n")
        while self.current_token is not None:
            self.parse_function()

    def parse_function(self):
        if self.match("KEYWORD") and self.match("IDENTIFIER"):
            if self.match("SEPARATOR", "(") and self.match("SEPARATOR", ")"):
                if self.match("SEPARATOR", "{"):
                    print("✅ Function Declaration Found\n")
                    self.parse_statements()
                    if self.match("SEPARATOR", "}"):
                        print("✅ Function Block Closed\n")
                    else:
                        print("❌ Error: Missing '}'\n")
                else:
                    print("❌ Error: Missing '{'\n")
            else:
                print("❌ Error: Invalid function parameters\n")
        else:
            print("❌ Error: Invalid function definition\n")

    def parse_statements(self):
        while self.current_token and self.current_token[1] != "}":
            self.parse_statement()

    def parse_statement(self):
        if self.current_token[0] == "KEYWORD":
            self.parse_declaration()
        elif self.current_token[0] == "IDENTIFIER":
            if self.current_token[1] in [
                "gets", "strcpy", "strcat", "scanf", "sprintf", "system", "eval"
            ]:
                self.check_insecure_function()
            self.parse_assignment_or_call()
        elif self.current_token[1] == "return":
            self.parse_return()
        else:
            print(f"⚠️ Skipping unexpected token: {self.current_token}")
            self.advance()

    def parse_declaration(self):
        self.advance()  # skip type
        if self.match("IDENTIFIER"):
            if self.match("OPERATOR", "="):
                if self.current_token[0] == "STRING":
                    if "password" in self.current_token[1].lower():
                        print("❌ Warning: Hardcoded password in string literal. [CWE-259]")
                    if "%n" in self.current_token[1]:
                        print("❌ Warning: Format string contains `%n`, which is dangerous. [CWE-134]")
                self.parse_expression()
            if self.match("SEPARATOR", ";"):
                print("✅ Variable Declaration\n")
            else:
                print("❌ Error: Missing ';' in declaration\n")

    def parse_assignment_or_call(self):
        if self.current_token[0] == "IDENTIFIER" and self.current_token[1] == "malloc":
            print("⚠️ Warning: `malloc()` used — ensure return is checked for NULL. [CWE-690]")

        self.advance()

        if self.match("OPERATOR", "="):
            self.parse_expression()
            if self.match("SEPARATOR", ";"):
                print("✅ Assignment\n")
        elif self.match("SEPARATOR", "("):
            while not self.match("SEPARATOR", ")") and self.current_token:
                self.advance()
            if self.match("SEPARATOR", ";"):
                print("✅ Function Call\n")

    def parse_return(self):
        self.advance()  # skip 'return'
        self.parse_expression()
        if self.match("SEPARATOR", ";"):
            print("✅ Return Statement\n")
        else:
            print("❌ Error: Missing ';' in return\n")

    def parse_expression(self):
        if self.current_token[0] in ["NUMBER", "IDENTIFIER", "STRING"]:
            self.advance()
        else:
            print(f"❌ Error: Invalid expression at {self.current_token}")
            self.advance()

    def check_insecure_function(self):
        func_name = self.current_token[1]
        print(f"❌ Warning: Use of insecure function `{func_name}()` detected.")

        if func_name == "gets":
            print("   💡 Use `fgets()` instead of `gets()` to avoid buffer overflows. [CWE-120]")
        elif func_name == "strcpy":
            print("   💡 Use `strncpy()` instead with size limit. [CWE-121]")
        elif func_name == "strcat":
            print("   💡 Use `strncat()` instead with length checking. [CWE-120]")
        elif func_name == "scanf":
            print("   💡 Always use length specifiers like `%10s` in `scanf`. [CWE-134]")
        elif func_name == "sprintf":
            print("   💡 Use `snprintf()` to avoid overflow. [CWE-120]")
        elif func_name == "system":
            print("   ⚠️ Avoid `system()` or validate/sanitize inputs. [CWE-78]")
        elif func_name == "eval":
            print("   ❌ Never use `eval()` — leads to code injection. [CWE-95]")

        self.advance()


In [16]:
test_code = """
int main() {
    char str[20];
    gets(str);
    strcpy(str, "hello");
    gets(input);
strcat(dest, src);
sprintf(buff, "%s", data);
char *pw = "password123";
malloc(100);

    scanf("%s", str);
    sprintf(str, "data");
    char *pass = "MyPassword123";
    int *mem = malloc(10 * sizeof(int));
    system("rm -rf /");
    return 0;
}
"""

tokens = tokenize(test_code)
parser = Parser(tokens)
parser.parse()


🔍 Parsing Program...

✅ Function Declaration Found

❌ Error: Missing ';' in declaration

⚠️ Skipping unexpected token: ('SEPARATOR', '[')
⚠️ Skipping unexpected token: ('NUMBER', '20')
⚠️ Skipping unexpected token: ('SEPARATOR', ']')
⚠️ Skipping unexpected token: ('SEPARATOR', ';')
   💡 Use `fgets()` instead of `gets()` to avoid buffer overflows. [CWE-120]
⚠️ Skipping unexpected token: ('SEPARATOR', ')')
⚠️ Skipping unexpected token: ('SEPARATOR', ';')
   💡 Use `strncpy()` instead with size limit. [CWE-121]
⚠️ Skipping unexpected token: ('SEPARATOR', ',')
⚠️ Skipping unexpected token: ('STRING', '"hello"')
⚠️ Skipping unexpected token: ('SEPARATOR', ')')
⚠️ Skipping unexpected token: ('SEPARATOR', ';')
   💡 Use `fgets()` instead of `gets()` to avoid buffer overflows. [CWE-120]
⚠️ Skipping unexpected token: ('SEPARATOR', ')')
⚠️ Skipping unexpected token: ('SEPARATOR', ';')
   💡 Use `strncat()` instead with length checking. [CWE-120]
⚠️ Skipping unexpected token: ('SEPARATOR', ',')
⚠️ S