# vivekn/nlp-sandbox

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
57 lines (43 sloc) 1.73 KB
 """ Solution to second problem using letter trigram """ import re, collections def product(xs): return reduce(lambda x, y: x*y, xs) def two_letters(text): return re.findall('..', text.lower()) def train(lett_seq): corp = collections.defaultdict(int) for n, word in enumerate(lett_seq[2:]): corp[(lett_seq[n], lett_seq[n+1], word)] += 1 corp['sum'] += 1 return corp def prob(corpus, pair): return float((corpus[pair] + 1)) / (len(corpus) + corpus['sum']) CORPUS = train((file('big.txt').read())) def transpose(array): width = len(array[0]) height = len(array) return [[array[j][i] for j in range(height)] for i in range(width)] def strip_corr(left, right): return product(prob(CORPUS, (l[-2], l[-1], r[0])) for l, r in zip(left, right)) def combine_strips(strips): return ''.join(map(''.join, transpose(strips))) def reorder(strips): closest_pair = max([(x,y) for x in strips for y in strips if x!= y], key=lambda (a, b): max(strip_corr(a, b), strip_corr(b, a))) c1, c2 = closest_pair strips.remove(c1) strips.remove(c2) correct = [c1, c2] if strip_corr(c1, c2) > strip_corr(c2, c1) else [c2, c1] while strips: left_max = max(strips, key=lambda x:strip_corr(x, correct[0])) right_max = max(strips, key=lambda x:strip_corr(correct[-1], x)) if strip_corr(correct[-1], right_max) > strip_corr(left_max, correct[0]): q = right_max correct.append(q) else: q = left_max correct.insert(0, q) strips.remove(q) return correct def main(): f = open('input.txt') lines = [l.strip().strip('|').split('|') for l in f] print combine_strips(reorder(transpose(lines))) if __name__ == '__main__': main()