In [1]:

def save_file(filename, txt):
    """Write ``txt`` to ``filename``, replacing any existing content.

    Args:
        filename: path of the file to create or overwrite.
        txt: string to write.
    """
    # The context manager closes the handle even when write() raises.
    with open(filename, "w") as f:
        f.write(txt)

In [2]:
class SPPF:
    """Parse-forest node: a tag plus a list of alternative child sequences.

    ``children`` is a list of alternatives; each alternative is itself a list
    of ``SPPF`` nodes.
    """

    def __init__(self, tag="", children=None):
        self.tag = tag
        self.visited = False
        # Build a fresh list per instance so nodes never share a default.
        if children is None:
            children = []
        self.children = children

    def to_dict(self):
        """Return this subtree as nested dicts with "tag" and "children" keys."""
        alternatives = []
        for alternative in self.children:
            alternatives.append([node.to_dict() for node in alternative])
        return {"tag": self.tag, "children": alternatives}

class State:
    """A dotted grammar rule plus the index of the set in which it started.

    Attributes are filled in by the caller after construction.
    """

    def __init__(self):
        self.LHS = ""
        self.RHS = []
        self.pos = -1  # 0 means the beginning
        self.origin = 0
        self.SPPF = SPPF()

    # debug only
    def str(self):
        """Render the state as ``LHS -> a . b | origin``.

        The dot is written before ``RHS[pos]``; when no symbol sits at index
        ``pos`` the dot is written after the last symbol instead.
        """
        res = self.LHS + " -> "
        # Track the dot explicitly: searching the text for "." breaks as soon
        # as a grammar symbol itself contains a period.
        dot_placed = False
        for i, x in enumerate(self.RHS):
            if i == self.pos:
                res += ". "
                dot_placed = True
            res += x + " "
        if not dot_placed:
            res += ". "
        res += "| " + str(self.origin)
        return res

    def __eq__(self, other):
        """Compare rule, dot position and origin; the SPPF is ignored."""
        if not isinstance(other, State):
            # Let Python fall back to its default handling for foreign types.
            return NotImplemented
        return (
            self.LHS == other.LHS
            and self.RHS == other.RHS
            and self.pos == other.pos
            and self.origin == other.origin
        )

In [3]:
def read_rules(string):
    """Parse a grammar written one rule per line as ``HEAD -> sym sym ...``.

    Symbols on the right-hand side are separated by whitespace and may be
    longer than one character. An empty right-hand side gives an empty list.
    Blank lines are skipped.

    Args:
        string: the grammar text, rules separated by newlines.

    Returns:
        dict mapping each head to the list of its right-hand sides, each a
        list of symbol strings, in the order they appear.

    Raises:
        ValueError: if a non-blank line does not contain exactly one "->".
    """
    res = {}
    for line_no, rule in enumerate(string.split("\n"), 1):
        if rule.strip() == "":
            continue
        parts = rule.split("->")
        if len(parts) != 2:
            raise ValueError(
                "rule on line %d must contain exactly one '->': %r" % (line_no, rule)
            )
        head = parts[0].strip()
        # Split on whitespace; iterating the string itself would yield single
        # characters and tear multi-character symbols apart.
        body = parts[1].split()
        if head not in res:
            res[head] = []
        res[head].append(body)
    return res

In [4]:
def all_combs_of_2(list1, list2):
    """Extend every list in ``list1`` by every element of ``list2``.

    ``list1`` is a list of lists and ``list2`` a flat list. The result holds
    one new list per (prefix, element) pair, prefixes varying slowest.
    """
    return [prefix + [item] for prefix in list1 for item in list2]
    
def all_combs(lists):
    """Return every way of picking one element from each list, in order.

    Args:
        lists: a list ``[l0, l1, l2, ...]`` of lists.

    Returns:
        A list of combinations, each a list with one element from ``l0``, one
        from ``l1`` and so on. The first list varies slowest. With no input
        lists the result is the single empty combination ``[[]]``.
    """
    # Start from the one empty combination so an empty ``lists`` needs no
    # special case, then extend each partial combination one list at a time.
    res = [[]]
    for options in lists:
        res = [prefix + [item] for prefix in res for item in options]
    return res

def unpack(tree):
    """List every bracketed string encoded by ``tree``.

    A node without children yields just its tag. Otherwise each alternative
    contributes ``tag[c1 c2 ...]`` for every combination of its children's
    own renderings.
    """
    if len(tree.children) == 0:
        return [tree.tag]

    rendered = []
    for alternative in tree.children:
        # One list of candidate strings per child of this alternative.
        per_child = [unpack(node) for node in alternative]
        for combo in all_combs(per_child):
            rendered.append(tree.tag + "[" + " ".join(combo) + "]")
    return rendered
        

In [5]:
import copy
class Parser:
    """Earley-style chart parser that records derivations on its states.

    ``self.sets[k]`` is the list of states built after ``k`` input tokens;
    it is rebuilt on every call to ``parse``.
    """

    def __init__(self):
        self.sets = []

    def _debug(self, i):
        """Print set ``i``: a separator line, then one dotted rule per state."""
        print("-" * 30, i, "-"*30)
        for s in self.sets[i]:
            print(s.str())

    @staticmethod
    def _attach(node, child):
        """Append ``child`` to every alternative of ``node``, creating the first one if needed."""
        if len(node.children) == 0:
            node.children.append([child])
        else:
            for alternative in node.children:
                alternative.append(child)

    def parse(self, word, rules, head):
        """Run the parser over ``word`` and return the accepting states.

        Args:
            word: sequence of input tokens; may be empty.
            rules: dict mapping a head to a list of right-hand sides
                (lists of symbols).
            head: the start symbol.

        Returns:
            The states of the last set whose LHS is ``head``, whose dot is at
            the end and whose origin is 0. Empty when nothing matches.

        Raises:
            KeyError: if ``head`` has no entry in ``rules``.
        """
        # Index of the final set. It is kept in its own variable because the
        # loop variable below is never bound when ``word`` is empty.
        last = len(word)
        self.sets = [[] for _ in range(last + 1)]

        # add the first states: one per alternative of the start symbol
        for RHS in rules[head]:
            state = State()
            state.LHS = head
            state.RHS = RHS
            state.pos = 0
            state.origin = 0
            state.SPPF.tag = head
            self.sets[0].append(state)

        for i in range(last):
            # Walk by index: predict/complete append to the set being read.
            j = 0
            while j < len(self.sets[i]):
                state = self.sets[i][j]
                if state.pos != len(state.RHS):
                    if state.RHS[state.pos] == word[i]:
                        self.scanner(state, i)
                    else:
                        self.predict(rules, state, i)
                else:
                    self.complete(state, i)
                j += 1

        # Nothing is left to scan in the final set; only complete and predict.
        # The list iterator also visits states appended during the loop.
        for s in self.sets[last]:
            if s.pos == len(s.RHS):
                self.complete(s, last)
            else:
                self.predict(rules, s, last)

        res = []
        for s in self.sets[last]:
            if s.LHS == head and s.pos == len(s.RHS) and s.origin == 0:
                res.append(s)

        return res

    def scanner(self, state, i):
        """Advance ``state`` over the token it expects and file the copy in set ``i + 1``."""
        _state = copy.deepcopy(state)
        _state.pos += 1
        leaf = SPPF(state.RHS[state.pos], [])
        self._attach(_state.SPPF, leaf)

        if _state not in self.sets[i+1]:
            self.sets[i+1].append(_state)

    def predict(self, rules, state, i):
        """Add a fresh state to set ``i`` for each rule of the symbol after the dot.

        A symbol with no entry in ``rules`` adds nothing.
        """
        next_nterm = state.RHS[state.pos]
        for RHS in rules.get(next_nterm, []):
            _state = State()
            _state.LHS = next_nterm
            _state.RHS = RHS
            _state.pos = 0
            _state.origin = i
            #or should we keep the old sppf?
            _state.SPPF = SPPF(next_nterm, [])
            if _state not in self.sets[i]:
                self.sets[i].append(_state)
                # no need to call predict recursively, the new state will be
                # noticed and predicted in the main loop

    def complete(self, state, i):
        """Advance every state of ``state``'s origin set that was waiting for ``state.LHS``.

        Each advanced copy gets ``state.SPPF`` attached as a child and is
        stored in set ``i``. If an equal state is already there, the copy's
        alternatives are merged into it instead.
        """
        for s in self.sets[state.origin]:
            if s.pos < len(s.RHS) and s.RHS[s.pos] == state.LHS:
                _state = copy.deepcopy(s)
                _state.pos += 1

                self._attach(_state.SPPF, state.SPPF)

                if _state not in self.sets[i]:
                    self.sets[i].append(_state)
                else:
                    # we just add the alternative children
                    pos = self.sets[i].index(_state)
                    self.sets[i][pos].SPPF.children.extend(_state.SPPF.children)
                
        

In [6]:
def state_to_cpp(state, i):
    """Render one state as a C++ block that appends it to ``states[i]``.

    The block starts with a comment holding ``state.str()`` and then sets the
    rule's LHS, each RHS symbol, the dot position and the origin.
    """
    lines = [
        "\t//" + state.str(),
        "\t{",
        "\t\tState state;",
        "\t\tstate.rule.LHS = \"" + state.LHS + "\";",
    ]
    for symbol in state.RHS:
        lines.append("\t\tstate.rule.RHS.push_back(\"" + symbol + "\");")
    lines.append("\t\tstate.pos = " + str(state.pos) + ";")
    lines.append("\t\tstate.origin = " + str(state.origin) + ";")
    lines.append("")  # blank line before the push_back
    lines.append("\t\tstates[" + str(i) + "].push_back(state);")
    lines.append("\t}")
    return "\n".join(lines)

def states_to_cpp(states):
    """Print C++ code that rebuilds ``states`` as a vector of state lists.

    ``states`` is a list of state lists. Each inner list is printed as a
    braced block under a ``//states[ k ]`` comment.
    """
    print("std::vector<std::list<State>> states(", len(states), ");")
    for index, bucket in enumerate(states):
        print("//states[", index, "]")
        print ("{")
        for item in bucket:
            print(state_to_cpp(item, index))
        print("}")

def print_states(parser):
    """Dump every set of ``parser`` via ``parser._debug``, then print a closing rule."""
    for index, _ in enumerate(parser.sets):
        parser._debug(index)
    print("-"*63)


In [9]:
def parsing_to_cpp(parsing, offset):
    """Translate one bracketed parse string into C++ that fills ``result_holder``.

    The string looks like ``a[b c[e] d]``. A tag is a maximal run of
    characters other than space, ``[`` and ``]``, so multi-character symbols
    become a single node. ``[`` opens the child list of the most recently
    declared node and ``]`` closes it. Nodes are numbered in order of
    appearance, starting at 0.

    Args:
        parsing: the bracketed parse string.
        offset: written in front of every node index (``offset + k``).

    Returns:
        ``(count, text)``: the number of nodes declared, and the tag
        assignments followed by a blank line and the child links.

    Raises:
        IndexError: if a ``]`` has no matching ``[``.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import re

    DECL_TEMPL = """result_holder[{{offset}} + {{i}}].tag = "{{tag}}";\n"""
    CHILD_TEMPL = """result_holder[{{offset}} + {{i}}].children.push_back(&result_holder[{{offset}} + {{j}}]);\n"""

    curr_id = 0

    declarations = ""
    children = ""

    # Ids of the nodes whose child list is currently open, innermost last.
    stack = []

    # Tokenize into brackets and whole tags. Spaces only separate tokens.
    for token in re.findall(r"\[|\]|[^ \[\]]+", parsing):
        if token == "[":
            stack.append(curr_id-1)
        elif token == "]":
            stack.pop()
        else:
            if stack != []:
                children += CHILD_TEMPL\
                    .replace("{{offset}}", str(offset))\
                    .replace("{{i}}", str(stack[-1]))\
                    .replace("{{j}}", str(curr_id))

            declarations += DECL_TEMPL\
                .replace("{{offset}}", str(offset))\
                .replace("{{i}}", str(curr_id))\
                .replace("{{tag}}", token)

            curr_id += 1

    return (curr_id, declarations + "\n" + children)

def parsings_to_cpp(parsings):
    """Emit C++ that builds every parse string in ``parsings`` inside one ``result_holder``.

    Each parse gets its own index range in the holder, and a pointer to its
    first node is pushed onto ``results``. The returned text begins with the
    holder declaration sized to the total node count.
    """
    pieces = ["std::vector<PNode*> results;\n"]
    total = 0
    for text in parsings:
        node_count, body = parsing_to_cpp(text, total)
        pieces.append("\n// " + text + "\n\n" + body)
        pieces.append("\nresults.push_back(&result_holder[{{i}}]);\n\n".replace("{{i}}", str(total)))
        total += node_count

    header = "PNodes result_holder("+str(total)+");\n\n"
    return header + "".join(pieces)
    
#                  0 1 2 3 4 5   6 7      8 9 10   11 12 
#parsings_to_cpp(["A[B a[b b[A]] g[f]]", "c[a b]", "A[a]"])

In [11]:
def test(rules_str, head, input):
    """Parse ``input`` with the given grammar and print the C++ fixture for its parsings.

    Args:
        rules_str: grammar text understood by ``read_rules``.
        head: the start symbol.
        input: list of input tokens.

    Parsings from every accepting state are printed, not only the first. A
    rejected input prints an empty fixture instead of raising IndexError.
    """
    rules = read_rules(rules_str)
    p = Parser()
    accepted = p.parse(input, rules, head)

    #states_to_cpp(p.sets)
    #print_states(p)
    parsings = []
    for state in accepted:
        parsings.extend(unpack(state.SPPF))
    #print (parsings)
    print(parsings_to_cpp(parsings))

In [14]:
# Check parsings_to_cpp on a hand-written bracket string. The replace() renames
# every "result" substring of the generated text to "result1".
print(parsings_to_cpp(["a[b c[e] d]"]).replace("result", "result1"))

PNodes result1_holder(5);

std::vector<PNode*> result1s;

// a[b c[e] d]

result1_holder[0 + 0].tag = "a";
result1_holder[0 + 1].tag = "b";
result1_holder[0 + 2].tag = "c";
result1_holder[0 + 3].tag = "e";
result1_holder[0 + 4].tag = "d";

result1_holder[0 + 0].children.push_back(&result1_holder[0 + 1]);
result1_holder[0 + 0].children.push_back(&result1_holder[0 + 2]);
result1_holder[0 + 2].children.push_back(&result1_holder[0 + 3]);
result1_holder[0 + 0].children.push_back(&result1_holder[0 + 4]);

result1s.push_back(&result1_holder[0]);




In [43]:
# Ambiguous grammar: three "b" tokens can be grouped in more than one way.
rules = """ S -> S S 
S -> b"""
test(rules, "S", ["b","b","b"])

PNodes result_holder(16);

std::vector<PNode*> results;

// S[S[S[b] S[b]] S[b]]

result_holder[0 + 0].tag = "S";
result_holder[0 + 1].tag = "S";
result_holder[0 + 2].tag = "S";
result_holder[0 + 3].tag = "b";
result_holder[0 + 4].tag = "S";
result_holder[0 + 5].tag = "b";
result_holder[0 + 6].tag = "S";
result_holder[0 + 7].tag = "b";

result_holder[0 + 0].children.push_back(&result_holder[0 + 1]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 2]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 3]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 4]);
result_holder[0 + 4].children.push_back(&result_holder[0 + 5]);
result_holder[0 + 0].children.push_back(&result_holder[0 + 6]);
result_holder[0 + 6].children.push_back(&result_holder[0 + 7]);

results.push_back(&result_holder[0]);


// S[S[b] S[S[b] S[b]]]

result_holder[8 + 0].tag = "S";
result_holder[8 + 1].tag = "S";
result_holder[8 + 2].tag = "b";
result_holder[8 + 3].tag = "S";
result_holder[8 + 4

In [30]:
# Expression grammar where "*" is nested below "+" (M sits under S).
# NOTE: the name `input` shadows the builtin input() for the rest of the session.
input = ["x", "+", "x", "*", "x"]
rules = """ P -> S
S -> S + M
S -> M
M -> M * T
M -> T
T -> x """
test(rules, "P", input)

PNodes result_holder(14);

// P[S[S[M[T[x]]] + M[M[T[x]] * T[x]]]]

result_holder[0 + 0].tag = "P";
result_holder[0 + 1].tag = "S";
result_holder[0 + 2].tag = "S";
result_holder[0 + 3].tag = "M";
result_holder[0 + 4].tag = "T";
result_holder[0 + 5].tag = "x";
result_holder[0 + 6].tag = "+";
result_holder[0 + 7].tag = "M";
result_holder[0 + 8].tag = "M";
result_holder[0 + 9].tag = "T";
result_holder[0 + 10].tag = "x";
result_holder[0 + 11].tag = "*";
result_holder[0 + 12].tag = "T";
result_holder[0 + 13].tag = "x";

result_holder[0 + 0].children.push_back(&result_holder[0 + 1]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 2]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 3]);
result_holder[0 + 3].children.push_back(&result_holder[0 + 4]);
result_holder[0 + 4].children.push_back(&result_holder[0 + 5]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 6]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 7]);
result_holder[0 + 7].children.p

In [31]:
# Right-recursive grammar: S appears last in its own rule.
rules = """ S -> a S
S -> a """
input = ["a","a","a","a"]
test(rules, "S", input)

PNodes result_holder(8);

// S[a S[a S[a S[a]]]]

result_holder[0 + 0].tag = "S";
result_holder[0 + 1].tag = "a";
result_holder[0 + 2].tag = "S";
result_holder[0 + 3].tag = "a";
result_holder[0 + 4].tag = "S";
result_holder[0 + 5].tag = "a";
result_holder[0 + 6].tag = "S";
result_holder[0 + 7].tag = "a";

result_holder[0 + 0].children.push_back(&result_holder[0 + 1]);
result_holder[0 + 0].children.push_back(&result_holder[0 + 2]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 3]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 4]);
result_holder[0 + 4].children.push_back(&result_holder[0 + 5]);
result_holder[0 + 4].children.push_back(&result_holder[0 + 6]);
result_holder[0 + 6].children.push_back(&result_holder[0 + 7]);



In [32]:
# Left-recursive grammar: S appears first in its own rule.
rules = """ S ->  S a
S -> a """
input = ["a","a","a","a"]
test(rules, "S", input)

PNodes result_holder(8);

// S[S[S[S[a] a] a] a]

result_holder[0 + 0].tag = "S";
result_holder[0 + 1].tag = "S";
result_holder[0 + 2].tag = "S";
result_holder[0 + 3].tag = "S";
result_holder[0 + 4].tag = "a";
result_holder[0 + 5].tag = "a";
result_holder[0 + 6].tag = "a";
result_holder[0 + 7].tag = "a";

result_holder[0 + 0].children.push_back(&result_holder[0 + 1]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 2]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 3]);
result_holder[0 + 3].children.push_back(&result_holder[0 + 4]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 5]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 6]);
result_holder[0 + 0].children.push_back(&result_holder[0 + 7]);



In [33]:
# Ambiguous grammar: A matches either one or two "a" tokens, so the four
# tokens can be split in several ways.
rules = """ S ->  S A
S -> A
A -> a 
A -> a a"""
input = ["a","a","a","a"]
test(rules, "S", input)

PNodes result_holder(50);

// S[S[A[a a]] A[a a]]

result_holder[0 + 0].tag = "S";
result_holder[0 + 1].tag = "S";
result_holder[0 + 2].tag = "A";
result_holder[0 + 3].tag = "a";
result_holder[0 + 4].tag = "a";
result_holder[0 + 5].tag = "A";
result_holder[0 + 6].tag = "a";
result_holder[0 + 7].tag = "a";

result_holder[0 + 0].children.push_back(&result_holder[0 + 1]);
result_holder[0 + 1].children.push_back(&result_holder[0 + 2]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 3]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 4]);
result_holder[0 + 0].children.push_back(&result_holder[0 + 5]);
result_holder[0 + 5].children.push_back(&result_holder[0 + 6]);
result_holder[0 + 5].children.push_back(&result_holder[0 + 7]);

// S[S[S[A[a]] A[a]] A[a a]]

result_holder[8 + 0].tag = "S";
result_holder[8 + 1].tag = "S";
result_holder[8 + 2].tag = "S";
result_holder[8 + 3].tag = "A";
result_holder[8 + 4].tag = "a";
result_holder[8 + 5].tag = "A";
result_holder[8 + 6].

In [34]:
# Grammar with an empty production: the last rule gives B an empty right-hand side.
rules = """ S ->  a S B B
S -> a
B -> b
B -> """
input = ["a","a","a","b"]
test(rules, "S", input)

PNodes result_holder(11);

// S[a S[a S[a] B B] B[b] B]

result_holder[0 + 0].tag = "S";
result_holder[0 + 1].tag = "a";
result_holder[0 + 2].tag = "S";
result_holder[0 + 3].tag = "a";
result_holder[0 + 4].tag = "S";
result_holder[0 + 5].tag = "a";
result_holder[0 + 6].tag = "B";
result_holder[0 + 7].tag = "B";
result_holder[0 + 8].tag = "B";
result_holder[0 + 9].tag = "b";
result_holder[0 + 10].tag = "B";

result_holder[0 + 0].children.push_back(&result_holder[0 + 1]);
result_holder[0 + 0].children.push_back(&result_holder[0 + 2]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 3]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 4]);
result_holder[0 + 4].children.push_back(&result_holder[0 + 5]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 6]);
result_holder[0 + 2].children.push_back(&result_holder[0 + 7]);
result_holder[0 + 0].children.push_back(&result_holder[0 + 8]);
result_holder[0 + 8].children.push_back(&result_holder[0 + 9]);
result_holder

In [55]:
# Grammar with a cycle of unit rules: A -> B and B -> A.
# NOTE(review): the saved output of this cell is a state dump, which test() as
# currently defined does not print -- presumably it came from an earlier
# version of test(); confirm by re-running.
rules = """S -> A
A -> B
B -> A
B -> a """
input = ["a"]
test(rules, "S", input)

std::vector<std::list<State>> states( 2 );
//states[ 0 ]
{
	//S -> . A | 0
	{
		State state;
		state.rule.LHS = "S";
		state.rule.RHS.push_back("A");
		state.pos = 0;
		state.origin = 0;

		states[0].push_back(state);
	}
	//A -> . B | 0
	{
		State state;
		state.rule.LHS = "A";
		state.rule.RHS.push_back("B");
		state.pos = 0;
		state.origin = 0;

		states[0].push_back(state);
	}
	//B -> . A | 0
	{
		State state;
		state.rule.LHS = "B";
		state.rule.RHS.push_back("A");
		state.pos = 0;
		state.origin = 0;

		states[0].push_back(state);
	}
	//B -> . a | 0
	{
		State state;
		state.rule.LHS = "B";
		state.rule.RHS.push_back("a");
		state.pos = 0;
		state.origin = 0;

		states[0].push_back(state);
	}
}
//states[ 1 ]
{
	//B -> a . | 0
	{
		State state;
		state.rule.LHS = "B";
		state.rule.RHS.push_back("a");
		state.pos = 1;
		state.origin = 0;

		states[1].push_back(state);
	}
	//A -> B . | 0
	{
		State state;
		state.rule.LHS = "A";
		state.rule.RHS.push_back("B");
		state.pos = 1;
		sta

In [58]:
# Inspect the alternatives stored deep inside the first result's forest.
# NOTE(review): no visible cell assigns `res` at module level (test() only has
# a local of that name) -- presumably left over from an interactive session;
# confirm this cell survives Restart & Run All.
res[0].SPPF.children[0][0].children[1][0].children[0][0].children

[[<__main__.SPPF at 0x266c0d62e10>], [<__main__.SPPF at 0x266bf078650>]]