In [1]:
import copy
import time

In [2]:
class LRParser:
    """Build the LR(0) item automaton and an SLR(1) parse table for a grammar.

    The grammar is a dict mapping a rule number to ``(lhs, rhs)`` where ``rhs``
    is a sequence of symbols.  Rule number ``0`` is treated as the augmented
    start production: a reduction by it is recorded as ``"Accept"``.

    Items are stored as ``[lhs, rhs_with_dot]`` lists, e.g.
    ``["E", ["E", dot, "+", "T"]]``.

    Constructing an instance computes the FIRST/FOLLOW tables, generates all
    states, builds the parse table, prints it, and (unless ``csv_path`` is
    ``None``) writes it to a CSV file.

    Args:
        grammar: ``{rule_number: (lhs, rhs_symbols)}``.
        terminals: list of terminal symbols.
        non_terminals: list of non-terminal symbols.
        start: the (augmented) start symbol.
        dot: marker symbol used inside items; must not be a grammar symbol.
        csv_path: where the parse table is written; ``None`` disables the
            file output.  Missing parent directories are created.
    """

    EPSILON = "#"       # right-hand-side marker for the empty string
    END_MARKER = "$"    # end-of-input symbol (a column of the parse table)
    DEFAULT_CSV_PATH = "rules/parse_tables/parsetable1.csv"
    csv_path = DEFAULT_CSV_PATH  # class-level default for bare instances

    def __init__(self, grammar, terminals, non_terminals, start, dot,
                 csv_path=DEFAULT_CSV_PATH):
        self.grammar = grammar
        self.terminals = terminals
        self.non_terminals = non_terminals
        self.start = start
        self.dot = dot
        self.csv_path = csv_path

        self.first_table = {}
        self.follow_table = {}
        self.in_progress = set()     # no longer used; kept for backward compatibility
        self.calculateFirstTable()
        self.calculateFollowTable()

        self.augmented_rules = []    # items with the dot at the start: [lhs, [dot, *rhs]]
        self.state_map = {}          # transitions: (state, symbol) -> target state number
        self.state_dict = {}         # items of each state: state number -> [item, ...]
        self.state_count = 0
        self.initialAugmentation()
        self.generateStates()

        self.parse_table = []
        self.createParseTable()

    def shift(self, current_node, next_state, symbol):
        """Placeholder hook; currently does nothing."""
        pass

    def reduce(self, current_node, rule):
        """Placeholder hook; currently does nothing."""
        pass

    # ------------------------------------------------------------------ #
    # small helpers
    # ------------------------------------------------------------------ #
    def _is_epsilon(self, symbol):
        """Return True when *symbol* is the epsilon marker (and not a declared terminal)."""
        return symbol == self.EPSILON and symbol not in self.terminals

    def _strip_epsilon(self, rhs):
        """Return *rhs* as a list without epsilon markers."""
        return [symbol for symbol in rhs if not self._is_epsilon(symbol)]

    def _pending_symbol(self, rule):
        """Return ``(dot_index, symbol_after_dot)`` or ``None`` if the dot is last."""
        rhs = rule[1]
        if rhs[-1] == self.dot:
            return None
        dot_ind = rhs.index(self.dot)
        return dot_ind, rhs[dot_ind + 1]

    @staticmethod
    def _item_set_key(rules):
        """Order-independent, hashable identity of a set of items."""
        return frozenset((lhs, tuple(rhs)) for lhs, rhs in rules)

    def _in_column_order(self, symbols):
        """Return *symbols* as a list ordered like the terminal columns, then ``$``."""
        ordered = []
        for candidate in list(self.terminals) + [self.END_MARKER]:
            if candidate in symbols and candidate not in ordered:
                ordered.append(candidate)
        return ordered

    # ------------------------------------------------------------------ #
    # item / state construction
    # ------------------------------------------------------------------ #
    def initialAugmentation(self):
        """Create one item per grammar rule with the dot in front of the rhs.

        Epsilon markers are dropped, so ``A -> #`` becomes the complete item
        ``[A, [dot]]``.
        """
        for lhs, rhs in self.grammar.values():
            self.augmented_rules.append([lhs, [self.dot] + self._strip_epsilon(rhs)])

    def generateStates(self):
        """Generate state I0 and then every state reachable through GOTO."""
        first_state = [rule for rule in self.augmented_rules if rule[0] == self.start]
        self.state_dict[0] = self.findClosure(first_state)

        # Keep expanding until a pass finds no state that still needs its
        # transitions computed (GOTO may add states while we work).
        processed = set()
        while True:
            pending = [state for state in self.state_dict if state not in processed]
            if not pending:
                break
            for state in pending:
                self.computeGOTO(state)
                processed.add(state)

    def computeGOTO(self, state):
        """Compute GOTO(state, X) for every symbol X that follows a dot in *state*."""
        symbols = []
        for rule in self.state_dict[state]:
            pending = self._pending_symbol(rule)
            if pending is not None and pending[1] not in symbols:
                symbols.append(pending[1])

        for sym in symbols:
            self.GOTO(state, sym)

    def GOTO(self, state, sym):
        """Record the transition from *state* on *sym*, creating the target state if new."""
        kernel = []
        for rule in self.state_dict[state]:
            pending = self._pending_symbol(rule)
            if pending is None:
                continue
            dot_ind, next_sym = pending
            if next_sym == sym:
                # move the dot over sym on a fresh copy of the item
                rhs = list(rule[1])
                rhs[dot_ind], rhs[dot_ind + 1] = rhs[dot_ind + 1], rhs[dot_ind]
                kernel.append([rule[0], rhs])

        if not kernel:
            return  # no item can advance over sym: there is no transition

        closure_rules = self.findClosure(kernel)

        # A state is identified by its *set* of items, so compare without
        # regard to the order in which the items were discovered.
        target_key = self._item_set_key(closure_rules)
        for state_num, rules in self.state_dict.items():
            if self._item_set_key(rules) == target_key:
                self.state_map[(state, sym)] = state_num
                return

        self.state_count += 1
        self.state_dict[self.state_count] = closure_rules
        self.state_map[(state, sym)] = self.state_count

    def findClosure(self, closure_rules):
        """Extend *closure_rules* in place to its closure and return it.

        For every item with a non-terminal right after the dot, the initial
        items of that non-terminal are appended (if not already present).
        """
        position = 0
        while position < len(closure_rules):  # the list grows while we scan it
            pending = self._pending_symbol(closure_rules[position])
            position += 1
            if pending is None or pending[1] not in self.non_terminals:
                continue
            for augmented_rule in self.augmented_rules:
                if augmented_rule[0] == pending[1] and augmented_rule not in closure_rules:
                    closure_rules.append(augmented_rule)
        return closure_rules

    # ------------------------------------------------------------------ #
    # FIRST / FOLLOW
    # ------------------------------------------------------------------ #
    def _first_of_sequence(self, symbols, first_sets, nullable):
        """Return ``(first_terminals, is_nullable)`` for a sequence of symbols."""
        result = set()
        for symbol in symbols:
            if symbol in self.terminals:
                result.add(symbol)
                return result, False
            if symbol == self.EPSILON:
                continue
            # non-terminal; an undeclared symbol contributes nothing and blocks
            result |= first_sets.get(symbol, set())
            if symbol not in nullable:
                return result, False
        return result, True

    def _compute_first_sets(self):
        """Fixed-point computation of FIRST (without epsilon) and the nullable set."""
        first_sets = {nt: set() for nt in self.non_terminals}
        for lhs, _ in self.grammar.values():
            first_sets.setdefault(lhs, set())
        nullable = set()

        changed = True
        while changed:
            changed = False
            for lhs, rhs in self.grammar.values():
                seq_first, seq_nullable = self._first_of_sequence(rhs, first_sets, nullable)
                if not seq_first <= first_sets[lhs]:
                    first_sets[lhs] |= seq_first
                    changed = True
                if seq_nullable and lhs not in nullable:
                    nullable.add(lhs)
                    changed = True
        return first_sets, nullable

    def _compute_follow_sets(self):
        """Fixed-point computation of FOLLOW for every non-terminal."""
        first_sets, nullable = self._compute_first_sets()
        follow_sets = {nt: set() for nt in first_sets}
        follow_sets.setdefault(self.start, set()).add(self.END_MARKER)

        changed = True
        while changed:
            changed = False
            for lhs, rhs in self.grammar.values():
                for index, symbol in enumerate(rhs):
                    if symbol in self.terminals or symbol not in follow_sets:
                        continue
                    tail_first, tail_nullable = self._first_of_sequence(
                        rhs[index + 1:], first_sets, nullable)
                    addition = set(tail_first)
                    if tail_nullable:
                        # nothing (or only nullable symbols) after it:
                        # FOLLOW(lhs) is a subset of FOLLOW(symbol)
                        addition |= follow_sets[lhs]
                    if not addition <= follow_sets[symbol]:
                        follow_sets[symbol] |= addition
                        changed = True
        return follow_sets

    def calculateFirstTable(self):
        """Fill ``first_table`` with FIRST of every non-terminal that has a rule.

        Each entry is a list of terminals, followed by ``"#"`` when the
        non-terminal can derive the empty string.
        """
        first_sets, nullable = self._compute_first_sets()
        self.first_table = {}
        for lhs, _ in self.grammar.values():
            if lhs in self.first_table:
                continue
            entry = self._in_column_order(first_sets[lhs])
            if lhs in nullable:
                entry.append(self.EPSILON)
            self.first_table[lhs] = entry

    def calculateFollowTable(self):
        """Fill ``follow_table`` with FOLLOW of every declared non-terminal."""
        follow_sets = self._compute_follow_sets()
        for nt in self.non_terminals:
            self.follow_table[nt] = self._in_column_order(follow_sets.get(nt, set()))

    def first(self, rule):
        """Return FIRST of the right-hand side of *rule* (``(lhs, rhs)``) as a list.

        ``"#"`` is included when the whole right-hand side can derive the
        empty string.
        """
        _, rhs = rule
        first_sets, nullable = self._compute_first_sets()
        seq_first, seq_nullable = self._first_of_sequence(rhs, first_sets, nullable)
        result = self._in_column_order(seq_first)
        if seq_nullable:
            result.append(self.EPSILON)
        return result

    def follow(self, nt, visited=None):
        """Return FOLLOW(*nt*) as a list.

        ``visited`` is accepted for backward compatibility and ignored.
        """
        return self._in_column_order(self._compute_follow_sets().get(nt, set()))

    # ------------------------------------------------------------------ #
    # parse table
    # ------------------------------------------------------------------ #
    def createParseTable(self):
        """Build ``parse_table``, print it, and write it to ``csv_path``.

        ``parse_table[row][col]`` is a list of action strings: ``"S<n>"``
        (shift), ``"G<n>"`` (goto), ``"R<rule>"`` (reduce) or ``"Accept"``.
        Rows follow the order of ``state_dict``; columns are the terminals,
        ``"$"``, then the non-terminals.  More than one entry in a cell means
        a conflict.
        """
        rows = list(self.state_dict.keys())
        cols = self.terminals + [self.END_MARKER] + self.non_terminals
        row_of = {state: index for index, state in enumerate(rows)}

        self.parse_table = [[[] for _ in cols] for _ in rows]

        # shift and goto entries come straight from the transitions
        for (state, sym), target in self.state_map.items():
            cell = self.parse_table[row_of[state]][cols.index(sym)]
            if sym in self.terminals:
                cell.append(f"S{target}")
            elif sym in self.non_terminals:
                cell.append(f"G{target}")

        # reduce entries: a complete item A -> alpha . reduces on FOLLOW(A)
        for state, rules in self.state_dict.items():
            for lhs, dotted_rhs in rules:
                if dotted_rhs[-1] != self.dot:
                    continue
                body = dotted_rhs[:-1]
                for rule_num, (rule_lhs, rule_rhs) in self.grammar.items():
                    if rule_lhs != lhs or self._strip_epsilon(rule_rhs) != body:
                        continue
                    # rule 0 is the augmented start production
                    action = "Accept" if rule_num == 0 else f"R{rule_num}"
                    for lookahead in self.follow_table[lhs]:
                        self.parse_table[row_of[state]][cols.index(lookahead)].append(action)

        self._print_parse_table(cols)
        self._write_parse_table_csv(cols)

    def _print_parse_table(self, cols):
        """Print the parse table as fixed-width text."""
        print("\nParsing table:\n")
        print(" ", ("{:>8}" * len(cols)).format(*cols), "\n")
        for row_number, row in enumerate(self.parse_table):
            label = "{:>3}".format(f"I{row_number}")
            print(label + "".join("{:>8}".format("/".join(cell)) for cell in row))

    def _write_parse_table_csv(self, cols):
        """Write the parse table to ``self.csv_path`` (skipped when it is ``None``)."""
        if self.csv_path is None:
            return
        import os

        directory = os.path.dirname(self.csv_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.csv_path, "w", encoding="utf-8") as file:
            file.write("state," + ",".join(cols) + "\n")
            for row_number, row in enumerate(self.parse_table):
                cells = "".join("," + "/".join(cell) for cell in row)
                file.write(f"I{row_number}{cells}\n")

    # ------------------------------------------------------------------ #
    # debugging output
    # ------------------------------------------------------------------ #
    def printResultAndGoto(self):
        """Print the items of every state followed by all GOTO transitions."""
        print("\nStates Generated: \n")
        for st in self.state_dict:
            print(f"State = I{st}")
            self.printResult(self.state_dict[st])
            print()

        print("Result of GOTO computation:\n")
        self.printAllGOTO(self.state_map)

    def printResult(self, rules):
        """Print each item of *rules* as ``lhs -> rhs``."""
        for rule in rules:
            print(f"{rule[0]} ->"
                  f" {' '.join(rule[1])}")

    def printAllGOTO(self, diction):
        """Print every ``(state, symbol) -> state`` entry of *diction*."""
        for itr in diction:
            print(f"GOTO ( I{itr[0]} ,"
                  f" {itr[1]} ) = I{diction[itr]}")


In [7]:
class SLRParser(LRParser):
    """Recognizer that drives the parse table built by ``LRParser``.

    The constructor is inherited unchanged from ``LRParser``.
    """

    def parse(self, input_string):
        """Return True when *input_string* is accepted by the grammar.

        Args:
            input_string: iterable of terminal symbols (a list of tokens or a
                plain string of single-character tokens).  It is not modified.

        Returns:
            ``True`` if the table reaches ``"Accept"`` exactly at the end of
            the input, ``False`` otherwise (unknown symbol, empty table cell,
            or early accept).  When a cell holds a conflict, its first action
            is used.
        """
        rows = list(self.state_dict.keys())
        cols = self.terminals + ["$"] + self.non_terminals
        row_of = {state: index for index, state in enumerate(rows)}

        tokens = list(input_string)
        # Only declared terminals may appear in the input; this also rejects
        # a stray "$" or a non-terminal name, which are table columns too.
        if any(token not in self.terminals for token in tokens):
            return False
        tokens.append("$")

        position = 0              # index of the next unread input token
        lookahead = tokens[0]     # input token, or the lhs of the rule just reduced
        stack = [0]               # stack of state numbers

        while True:
            if lookahead not in cols:
                return False

            actions = self.parse_table[row_of[stack[-1]]][cols.index(lookahead)]
            if not actions:
                return False
            action = actions[0]  # just take the first action of a conflict cell

            if action == "Accept":
                # accepting is only valid once all input has been consumed
                return position == len(tokens) - 1

            kind, argument = action[0], action[1:]

            if kind == "R":
                lhs, rhs = self.grammar[int(argument)]
                # pop one state per rhs symbol; an epsilon marker ("#") that
                # is not a declared terminal stands for no symbol at all
                pop_count = sum(
                    1 for symbol in rhs
                    if symbol != "#" or symbol in self.terminals
                )
                if pop_count:  # stack[:-0] would wrongly empty the stack
                    del stack[-pop_count:]
                lookahead = lhs  # next lookup is the goto on the reduced non-terminal

            elif kind == "G":
                stack.append(int(argument))
                lookahead = tokens[position]

            elif kind == "S":
                stack.append(int(argument))
                position += 1
                if position >= len(tokens):
                    return False
                lookahead = tokens[position]

            else:
                return False  # unrecognized table entry

    
# Example 1 Grammar and Tables
grammar = {
    0: ("E'", ["E"]),                # Rule 0: E'→ E
    1: ("E", ["E", "+", "T"]),       # Rule 1: E → E + T
    2: ("E", ["T"]),                 # Rule 2: E → T
    3: ("T", ["T", "*", "F"]),       # Rule 3: T → T * F
    4: ("T", ["F"]),                 # Rule 4: T → F
    5: ("F", ["(", "E", ")"]),       # Rule 5: F → ( E )
    6: ("F", ["a"]),                 # Rule 6: F → a
}

terminals = ["a", "+", "*", "(", ")"]
non_terminals = ["E'", "E", "T", "F"]
start = "E'"
dot = '·'

# Test the Parser
parser = SLRParser(grammar, terminals, non_terminals, start, dot)
input_string = list("a*a+a*a+a")
res = parser.parse(input_string)
# Single quotes inside the f-strings keep them valid before Python 3.12.
if not res:
    print(f"Input not accepted - {''.join(input_string)}")
else:
    print(f"Input accepted - {''.join(input_string)}")



# # Example 2 Grammar and Tables
# grammar = {
#     0: ("S'", ["S"]),
#     1: ("S", ["L", "=", "R"]),    # Rule 1: S → L = R
#     2: ("S", ["R"]),              # Rule 2: S → R
#     3: ("L", ["*", "R"]),         # Rule 3: L → * R
#     4: ("L", ["a"]),              # Rule 4: L → a
#     5: ("R", ["L"]),              # Rule 5: R → L
# }

# terminals = ["a", "=", "*"]
# non_terminals = ["S'", "S", "L", "R"]
# start = "S'"
# dot = '·'

# # Test the Parser
# parser = SLRParser(grammar, terminals, non_terminals, start, dot)
# # input_string = list("a=a")
# # parser.parse(input_string)





Parsing table:

         a       +       *       (       )       $      E'       E       T       F 

 I0      S5                      S4                              G1      G2      G3
 I1              S6                          Accept                                
 I2              R2      S7              R2      R2                                
 I3              R4      R4              R4      R4                                
 I4      S5                      S4                              G8      G2      G3
 I5              R6      R6              R6      R6                                
 I6      S5                      S4                                      G9      G3
 I7      S5                      S4                                             G10
 I8              S6                     S11                                        
 I9              R1      S7              R1      R1                                
I10              R3      R3              R3      R3       

# 2. Generator

In [4]:
from collections import deque

class GrammarExpander:
    """Enumerate terminal strings derivable from a grammar, breadth first.

    Args:
        grammar: ``{rule_number: (lhs, rhs_symbols)}``.
        terminals: collection of terminal symbols; every other symbol is
            treated as a non-terminal.
        start: symbol the expansion starts from.
    """

    def __init__(self, grammar, terminals, start):
        self.grammar = grammar
        self.terminals = terminals
        self.start_symbol = start

    def is_fully_expanded(self, symbols):
        """Check if all the symbols in the list are terminals."""
        return all(symbol in self.terminals for symbol in symbols)

    def get_productions_for(self, symbol):
        """Return the right-hand sides of every rule whose lhs is *symbol*."""
        return [prod for lhs, prod in self.grammar.values() if lhs == symbol]

    def expand_grammar(self, max_strings, max_depth):
        """Return up to *max_strings* distinct terminal strings.

        Sentential forms are expanded breadth first, always rewriting the
        leftmost non-terminal; a form is not expanded further once it has
        gone through *max_depth* rewrites.  The result list is in discovery
        order, so repeated calls give the same list.
        """
        queue = deque([([self.start_symbol], 0)])  # (sentential form, depth)
        seen_forms = {(self.start_symbol,)}        # forms already queued
        results = {}  # dict used as an insertion-ordered set of strings

        while queue and len(results) < max_strings:
            form, depth = queue.popleft()

            if self.is_fully_expanded(form):
                # different symbol lists may join to the same string
                results.setdefault(''.join(form), None)
                continue
            if depth >= max_depth:
                continue

            # leftmost non-terminal: only one is rewritten per step
            index = next(i for i, symbol in enumerate(form)
                         if symbol not in self.terminals)
            for production in self.get_productions_for(form[index]):
                new_form = form[:index] + list(production) + form[index + 1:]
                key = tuple(new_form)
                if key not in seen_forms:
                    seen_forms.add(key)
                    queue.append((new_form, depth + 1))

        return list(results)


# Example 1 Grammar and Tables
grammar = {
    0: ("E'", ["E"]),                # Rule 0: E'→ E
    1: ("E", ["E", "+", "T"]),       # Rule 1: E → E + T
    2: ("E", ["T"]),                 # Rule 2: E → T
    3: ("T", ["T", "*", "F"]),       # Rule 3: T → T * F
    4: ("T", ["F"]),                 # Rule 4: T → F
    5: ("F", ["(", "E", ")"]),       # Rule 5: F → ( E )
    6: ("F", ["a"]),                 # Rule 6: F → a
}

terminals = ["a", "+", "*", "(", ")"]
non_terminals = ["E'", "E", "T", "F"]
start = "E'"
dot = '·'

parser = SLRParser(grammar, terminals, non_terminals, start, dot)

expander = GrammarExpander(grammar, terminals, start)

num_string = 10000
valid_strings = expander.expand_grammar(max_strings=num_string, max_depth=1000)

# Test the parser: every generated string is valid and should be accepted
ls_false = []
for s in valid_strings:
    res = parser.parse(list(s))
    if not res:
        ls_false.append(s)

# Measure against the number of strings actually generated, which can be
# smaller than the number requested.
tested = len(valid_strings)
if tested:
    print(f"Accuracy: {round((tested - len(ls_false))/tested,5)*100}%")
else:
    print("Accuracy: n/a (no strings generated)")




Parsing table:

         a       +       *       (       )       $      E'       E       T       F 

 I0      S5                      S4                              G1      G2      G3
 I1              S6                          Accept                                
 I2              R2      S7              R2      R2                                
 I3              R4      R4              R4      R4                                
 I4      S5                      S4                              G8      G2      G3
 I5              R6      R6              R6      R6                                
 I6      S5                      S4                                      G9      G3
 I7      S5                      S4                                             G10
 I8              S6                     S11                                        
 I9              R1      S7              R1      R1                                
I10              R3      R3              R3      R3       

In [5]:
# Show each valid string that the parser failed to accept, one per line.
for rejected in ls_false:
    print(rejected)

In [8]:
import random

class InvalidStringGenerator:
    """Corrupt valid strings so that the result is outside the grammar's language.

    Every strategy guarantees that its result contains at least one symbol
    that is not a terminal (an invalid symbol or a non-terminal name), so a
    result can never be an unchanged valid string.

    Args:
        terminals: list of terminal symbols of the grammar.
        non_terminals: list of non-terminal symbols of the grammar.
        invalid_symbols: optional pool of out-of-alphabet symbols; defaults
            to ``DEFAULT_INVALID_SYMBOLS``.  Symbols that are terminals of
            the grammar are dropped from the pool.

    Raises:
        ValueError: if no invalid symbol remains after removing terminals.
    """

    DEFAULT_INVALID_SYMBOLS = ('x', 'y', 'z', '1', '2', '3', '@', '#', '$')

    def __init__(self, terminals, non_terminals, invalid_symbols=None):
        self.terminals = terminals
        self.non_terminals = non_terminals
        candidates = (self.DEFAULT_INVALID_SYMBOLS
                      if invalid_symbols is None else invalid_symbols)
        # a symbol the grammar uses as a terminal is not "invalid"
        self.invalid_symbols = [sym for sym in candidates if sym not in terminals]
        if not self.invalid_symbols:
            raise ValueError("no invalid symbol left: all candidates are terminals")

    def generate_invalid_string(self, valid_string):
        """ Randomly pick one of the corruption strategies """
        strategies = [
            self._symbol_corruption,
            self._random_string,
            self._rule_mutation,
            self._non_terminal_insertion
        ]
        strategy = random.choice(strategies)
        return strategy(valid_string)

    @staticmethod
    def _insert_random(symbols, pool):
        """Insert a random element of *pool* at a random position of *symbols*."""
        symbols.insert(random.randint(0, len(symbols)), random.choice(pool))

    @staticmethod
    def _replace_random(symbols, pool):
        """Overwrite a random position of *symbols* (or append if it is empty)."""
        if symbols:
            symbols[random.randrange(len(symbols))] = random.choice(pool)
        else:
            symbols.append(random.choice(pool))

    def _symbol_corruption(self, valid_string, corruption_rate=0.3):
        """ Replace characters and possibly insert an extra invalid symbol """
        corrupted_string = []
        all_symbols = list(self.terminals) + self.invalid_symbols
        has_invalid = False

        for char in valid_string:
            if random.random() < corruption_rate:  # Decide to corrupt this character
                char = random.choice(all_symbols)  # may pick a terminal again
                if char not in self.terminals:
                    has_invalid = True
            corrupted_string.append(char)

        if random.random() < corruption_rate:
            # Randomly insert an extra symbol
            self._insert_random(corrupted_string, self.invalid_symbols)
            has_invalid = True

        if not has_invalid:
            # nothing out of alphabet was introduced: force one insertion
            self._insert_random(corrupted_string, self.invalid_symbols)

        return ''.join(corrupted_string)

    def _random_string(self, valid_string, min_length=3, max_length=10):
        """ Generate a completely random string (valid_string is ignored) """
        all_symbols = list(self.terminals) + self.invalid_symbols + list(self.non_terminals)
        length = random.randint(min_length, max_length)
        picked = [random.choice(all_symbols) for _ in range(length)]
        if all(sym in self.terminals for sym in picked):
            # only terminals were drawn, which could form a valid string
            self._replace_random(picked, self.invalid_symbols)
        return ''.join(picked)

    def _rule_mutation(self, valid_string, mutation_rate=0.3):
        """ Mutate a valid string by replacing terminals with invalid symbols """
        mutated_string = []
        mutated = False
        for char in valid_string:
            if char in self.terminals and random.random() < mutation_rate:
                mutated_string.append(random.choice(self.invalid_symbols))
                mutated = True
            else:
                mutated_string.append(char)

        if not mutated:
            # no position was hit: force one replacement
            self._replace_random(mutated_string, self.invalid_symbols)

        return ''.join(mutated_string)

    def _non_terminal_insertion(self, valid_string, insertion_rate=0.3):
        """ Insert non-terminals into an otherwise valid string """
        result = []
        inserted = False
        for char in valid_string:
            if random.random() < insertion_rate:
                result.append(random.choice(self.non_terminals))  # Insert a non-terminal
                inserted = True
            result.append(char)

        if not inserted:
            # no position was hit: force one insertion
            self._insert_random(result, self.non_terminals or self.invalid_symbols)

        return ''.join(result)


generator = InvalidStringGenerator(terminals, non_terminals)

# Generate 10 invalid strings, each derived from a randomly chosen valid one
num_string = 10
invalid_strings = [
    generator.generate_invalid_string(random.choice(valid_strings))
    for _ in range(num_string)
]

# Test the parser: every corrupted string should be rejected
ls_true = []
for s in invalid_strings:
    print(s)
    res = parser.parse(list(s))
    if res:
        ls_true.append(s)

print(f"Accuracy: {round((num_string - len(ls_true))/num_string,5)*100}%")


((Fa)E)+a*(a+a)
a*1a+(a*a@2*a*a
$#(a*az+((a)*a)
E12+1
((((y$1z#z)@
y+a*a*(a*(+1))1
@(((3)))*a+a)
TaT*a*a+aT*FaT+E(a)
a+a*Ta*a+a*a*E'a
2ax+@#((@*a))
Accuracy: 100.0%
