/
table_lp.py
executable file
·156 lines (120 loc) · 4.29 KB
/
table_lp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
#coding=utf-8
from pulp import *
import table
import math
def colVar(x, ctype):
    """Return the LP variable name meaning "column ``x`` has type ``ctype``"."""
    # ``!s`` forces str() on both arguments, matching ``%s`` formatting.
    template = "col{!s}_is_{!s}"
    return template.format(x, ctype)
def rowVar(y, rtypes):
    """Return the LP variable name meaning "row ``y`` has type ``rtypes``"."""
    # ``!s`` forces str() on both arguments, matching ``%s`` formatting.
    template = "row{!s}_is_{!s}"
    return template.format(y, rtypes)
def tokVar(t, ttypes):
    """Return the LP variable name meaning "token ``t`` has type ``ttypes``"."""
    # ``!s`` forces str() on both arguments, matching ``%s`` formatting.
    template = "tok{!s}_is_{!s}"
    return template.format(t, ttypes)
class TableModel:
    """Integer linear program that labels the columns, rows and tokens of a table.

    Each column is assigned exactly one type from ``ctypes``, each row exactly
    one of ``rtypes`` and each token exactly one of ``ttypes`` (either
    ``"unclassified"`` or ``<ctype>_dec`` / ``<ctype>_val``).  Every decision
    variable is a 0/1 integer stored in ``self.v`` under the name produced by
    ``colVar``, ``rowVar`` or ``tokVar``.
    """

    def __init__(self, ctypes, table):
        """Create all variables and constraints for ``table``.

        ctypes -- list of column type names (strings).
        table  -- object providing ``width``, ``height`` and ``get_tokens()``;
                  ``get_tokens()`` must yield ``(x_start, x_end, y, word)``
                  tuples.  It is also asked for ``setColType``, ``setRowType``
                  and ``setTokType`` when ``solve`` writes the result back.
        """
        self.ctypes = ctypes
        self.rtypes = ["unknown", "product", "header"]
        #token types: "unclassified" plus a declaration/value pair per column type
        self.ttypes = ["unclassified"]
        self.ttypes_dec = []
        self.ttypes_val = []
        for ctype in ctypes:
            dec_name = ctype + "_dec"
            val_name = ctype + "_val"
            self.ttypes_dec.append(dec_name)
            self.ttypes_val.append(val_name)
            self.ttypes.extend((dec_name, val_name))
        self.table = table

        #build LP problem
        self.lp = LpProblem("table", LpMaximize)
        self.v = {}

        #build column type variables and constrain only one-of-n is active
        for x in range(table.width):
            self._addExactlyOne([colVar(x, ctype) for ctype in self.ctypes])

        #build row type variables and constrain only one-of-n is active
        for y in range(table.height):
            self._addExactlyOne([rowVar(y, rtype) for rtype in self.rtypes])

        #NOTE: the original source carried a disabled constraint here that
        #required at least one of the first three rows to be a header:
        #    lpSum(v[rowVar(y, "header")] for y in range(min(3, height))) >= 1
        #It stays disabled.

        #build token type variables and constrain only one-of-n is active
        for tid, (x_s, x_e, y, word) in enumerate(table.get_tokens()):
            print(word)
            self._addExactlyOne([tokVar(tid, ttype) for ttype in self.ttypes])

            #tie the token to its row type: because these are equalities, a
            #token in a header row must be a declaration, a token in a product
            #row must be a value, and a token in an "unknown" row can only be
            #unclassified.
            #NOTE(review): the original comment said unclassified tokens may
            #also appear in header rows; the equality does not allow that -
            #confirm which one is intended.
            decs = [self.v[tokVar(tid, ttype)] for ttype in self.ttypes_dec]
            vals = [self.v[tokVar(tid, ttype)] for ttype in self.ttypes_val]
            self.lp += lpSum(decs) == self.v[rowVar(y, "header")]
            self.lp += lpSum(vals) == self.v[rowVar(y, "product")]

            #for every column in range(x_s, x_e): if the column has type
            #ctype, the token must be that type's declaration or value, or be
            #unclassified
            for x in range(x_s, x_e):
                for ctype in self.ctypes:
                    self.lp += (
                        self.v[tokVar(tid, ctype + "_dec")]
                        + self.v[tokVar(tid, ctype + "_val")]
                        + self.v[tokVar(tid, "unclassified")]
                        >= self.v[colVar(x, ctype)]
                    )

    def solve(self, token_probability_fn):
        """Set the objective, run the solver and write the labels to the table.

        token_probability_fn -- callable ``(ttype, word) -> probability``.
            Its natural logarithm is used as the objective weight, so
            ``math.log`` raises ``ValueError`` if it returns 0 or a negative
            number.

        The chosen types are reported on stdout and stored through
        ``setColType``, ``setRowType`` and ``setTokType`` of ``self.table``.
        """
        #maximize log probability of function that maps words to types
        objective = []
        #use the table this model was built from, not whatever the global
        #name ``table`` happens to be bound to
        for tid, (x_s, x_e, y, word) in enumerate(self.table.get_tokens()):
            for ttype in self.ttypes:
                weight = math.log(token_probability_fn(ttype, word))
                objective.append(weight * self.v[tokVar(tid, ttype)])
        self.lp += lpSum(objective)

        print("solving")
        self.lp.solve()
        print("%s %s" % ("Status:", LpStatus[self.lp.status]))

        for x in range(self.table.width):
            for ctype in self.ctypes:
                name = colVar(x, ctype)
                if self._isActive(name):
                    self.table.setColType(x, ctype)
                    print("%s %s" % (name, " is true"))

        for y in range(self.table.height):
            for rtype in self.rtypes:
                name = rowVar(y, rtype)
                if self._isActive(name):
                    self.table.setRowType(y, rtype)
                    print("%s %s" % (name, " is true"))

        for tid, (x_s, x_e, y, word) in enumerate(self.table.get_tokens()):
            for ttype in self.ttypes:
                name = tokVar(tid, ttype)
                if self._isActive(name):
                    self.table.setTokType(tid, ttype)
                    print("%s %s %s" % (word, name, " is true"))

    def addVariable(self, name, LB=None, UB=None, cat='Continuous'):
        """Create an ``LpVariable``, register it in ``self.v`` and return it."""
        variable = LpVariable(name, LB, UB, cat)
        self.v[name] = variable
        return variable

    def _addExactlyOne(self, names):
        """Create a 0/1 integer variable per name and require exactly one to be 1."""
        choices = [self.addVariable(name, 0, 1, cat='Integer') for name in names]
        self.lp += lpSum(choices) == 1

    def _isActive(self, name):
        """Return True when the solved value of variable ``name`` is above 0.5."""
        value = self.v[name].varValue
        #varValue is None when the solver assigned no value; treat as inactive
        return value is not None and value > 0.5
def probabilityCatagoryGivenWord(catagory, word):
    """Return a hand-tuned probability that ``word`` belongs to ``catagory``.

    The rules are tried in order and the first match wins:
    the literal "Digi-Key" as an order-code declaration (0.9), a word ending
    in "-ND" as an order-code value (0.8), a word containing "-" as a part
    number value (0.4), any of the unclassified/unknown categories (0.3),
    and 0.1 for everything else.
    """
    is_digikey_heading = word == "Digi-Key" and catagory == "ordercode_dec"
    if is_digikey_heading:
        return 0.9

    is_order_code = word.endswith("-ND") and catagory == "ordercode_val"
    if is_order_code:
        return 0.8

    is_part_number = "-" in word and catagory == "partnum_val"
    if is_part_number:
        return 0.4

    fallback_categories = ("unclassified", "unknown_val", "unknown_dec")
    return 0.3 if catagory in fallback_categories else 0.1
if __name__ == "__main__":
    #Script entry point: read the table named on the command line, label it
    #with the LP model and print the result.

    #No ``import sys`` is visible at the top of this file; presumably the name
    #was only available through ``from pulp import *``.  Import it explicitly.
    import sys

    if len(sys.argv) < 2:
        sys.exit("usage: %s TABLE_FILE" % sys.argv[0])

    #NOTE: this rebinds the module-level name ``table`` from the imported
    #module to the parsed table object.  The name is kept as-is because code
    #in this file may look up the global ``table`` at run time.
    table = table.read_table(sys.argv[1])
    model = TableModel(["unknown", "partnum", "ordercode"], table)
    model.solve(probabilityCatagoryGivenWord)
    print(table)