Correct conditional probability computations and change output format

Signed-off-by: Rafael Lopez <rafael@case.edu>
timtadh · Apr 13, 2012 · b3b8ffc · b3b8ffc
1 parent f2987e3
commit b3b8ffc
Showing 1 changed file with 28 additions and 19 deletions.
diff --git a/artifacts/tables/counts.py b/artifacts/tables/counts.py
@@ -119,15 +119,15 @@ def production_probability(path, oldtable, tables, conf):
 def conditional_probabilities(path, oldtable, tables, conf):
     grammar = dict((row[0], tuple(row[1:])) for row in tables['infer_grammar'])
 
-    counts = dict()
+    counts = dict() #counts how many times a RULE is reached by a specific prevTuple (e.g. counts[NT => A:B:C][(NT1,NT2)] == 5)
+    terminalCounts = dict() #counts how many times a prevTuple reaches a specific NONTERMINAL (e.g. terminalCounts[(NT1,NT2)][NT] == 78)
     stack = list()
-    lookBack = 2
+    lookBack = 2 #how many items in prevTuple?
     def callback(grammar, node, depth):
-        #if this is a new ast then we want to reset our information
+        #if this is a new ast then we want to clear our stack
         if node.label == "Start":
             while stack:
                 stack.pop()
-            counts.clear()
             initStack = (tuple(None for x in range(lookBack)), False)
             stack.append(initStack)
 
@@ -147,20 +147,30 @@ def callback(grammar, node, depth):
         productions = grammar[node.label]
         p = productions.index(':'.join(kid.label for kid in node.children)) + 1
 
-        #build up our dictionary of production counts
-        if not counts.has_key(node.label):
-            counts[node.label] = {prevAsTuple : 1}
+        chosenRule = node.label + " => " + grammar[node.label][p-1]
+
+
+        if not counts.has_key(chosenRule):
+            counts[chosenRule] = {prevAsTuple : 1}
         else:
-            if not counts.get(node.label).has_key(prevAsTuple):
-                counts[node.label][prevAsTuple] = 1
+            if not counts.get(chosenRule).has_key(prevAsTuple):
+                counts[chosenRule][prevAsTuple] = 1
             else:
-                counts[node.label][prevAsTuple] += 1
+                counts[chosenRule][prevAsTuple] += 1
+
+        if not terminalCounts.has_key(prevAsTuple):
+            terminalCounts[prevAsTuple] = {node.label : 1}
+        else:
+            if not terminalCounts.get(prevAsTuple).has_key(node.label):
+                terminalCounts[prevAsTuple][node.label] = 1
+            else:
+                terminalCounts[prevAsTuple][node.label] += 1
 
         #append this new rule to the stack as our new "most previous"
         if grammar[node.label][p-1].count(":") > 1:
             stack.append(
                 (
-                    tuple(prev[x+1] for x in range(lookBack-1)) + (grammar[node.label][p-1],),
+                    tuple(prev[x+1] for x in range(lookBack-1)) + (node.label,),
                     False
                 )
             )
@@ -169,33 +179,32 @@ def callback(grammar, node, depth):
               #e.g. with NT:NT2:NT3, when we get to NT2, we don't want prev to include the history from when we went down NT's productions
             stack.append(
                 (
-                    tuple(prev[x+1] for x in range(lookBack-1)) + (grammar[node.label][p-1],),
+                    tuple(prev[x+1] for x in range(lookBack-1)) + (node.label,),
                     True
                  )
             )
     walktrees(conf['trees'], functools.partial(callback, grammar))
 
 
-    #now we normalize
     probabilities = dict(
         (
-          nonterm,
+          rule,
           dict(
             (
               prev,
-              float(num)/float(sum(num for num in myCounts.itervalues()))
+              float(num)/float(terminalCounts[prev][rule.split("=>")[0].strip()]) #P[rule | prev]
             )
             for prev, num in myCounts.iteritems()
           )
         )
-        for nonterm, myCounts in counts.iteritems()
+        for rule, myCounts in counts.iteritems()
     )
 
     table = tuple(
-        (nonterm, prev, probability)
-        for nonterm, myCounts in probabilities.iteritems()
+        (lookBack, prod) + tuple(nt for nt in prev) + (probability,)
+        for prod, myCounts in probabilities.iteritems()
             for prev, probability in myCounts.iteritems()
     )
 
     save(path, table)
-    return table
+    return table