In [6]:
#!/usr/bin/env python
import optparse
import sys
import models
from collections import namedtuple

optparser = optparse.OptionParser()
optparser.add_option("-i", "--input", dest="input", default="data/input", help="File containing sentences to translate (default=data/input)")
optparser.add_option("-t", "--translation-model", dest="tm", default="data/tm", help="File containing translation model (default=data/tm)")
optparser.add_option("-l", "--language-model", dest="lm", default="data/lm", help="File containing ARPA-format language model (default=data/lm)")
optparser.add_option("-n", "--num_sentences", dest="num_sents", default=10000000, type="int", help="Number of sentences to decode (default=no limit)")
optparser.add_option("-k", "--translations-per-phrase", dest="k", default=1, type="int", help="Limit on number of translations to consider per phrase (default=1)")
optparser.add_option("-s", "--stack-size", dest="s", default=10000, type="int", help="Maximum stack size (default=1)")
optparser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,  help="Verbose mode (default=off)")
opts = optparser.parse_args()[0]

# open the translation model
# tm cis indexed by any source phrase, like "wi mussen" and it returns a Phrase object with its english translation and
# logprob.
tm = models.TM(opts.tm, opts.k)

# open the language model, which is a trigram/bigram language model
lm = models.LM(opts.lm)


french = [tuple(line.strip().split()) for line in open(opts.input).readlines()[:opts.num_sents]]

# tm should translate unknown words as-is with probability 1
for word in set(sum(french,())):
  if (word,) not in tm:
    tm[(word,)] = [models.phrase(word, 0.0)]

sys.stderr.write("Decoding %s...\n" % (opts.input,))
for f in french:
  # The following code implements a monotone decoding
  # algorithm (one that doesn't permute the target phrases).
  # Hence all hypotheses in stacks[i] represent translations of 
  # the first i words of the input sentence. You should generalize
  # this so that they can represent translations of *any* i words.
  print('Source Sentence: {}'.format(f))
  hypothesis = namedtuple("hypothesis", "logprob, lm_state, predecessor, phrase")
  initial_hypothesis = hypothesis(0.0, lm.begin(), None, None)
  stacks = [{} for _ in f] + [{}]
  stacks[0][lm.begin()] = initial_hypothesis
  for i, stack in enumerate(stacks[:-1]):
    for h in sorted(stack.itervalues(),key=lambda h: -h.logprob)[:opts.s]: # prune
      for j in xrange(i+1,len(f)+1):
        print("Current Source Phrase:\n{}".format(f[i:j]))
        if f[i:j] in tm:
          print('translation(s) of {} --> {}'.format(f[i:j], tm[f[i:j]]))
          for phrase in tm[f[i:j]]:
            logprob = h.logprob + phrase.logprob
            lm_state = h.lm_state
            for word in phrase.english.split():
              (lm_state, word_logprob) = lm.score(lm_state, word)
              logprob += word_logprob
            logprob += lm.end(lm_state) if j == len(f) else 0.0
            new_hypothesis = hypothesis(logprob, lm_state, h, phrase)
            if lm_state not in stacks[j] or stacks[j][lm_state].logprob < logprob: # second case is recombination
              stacks[j][lm_state] = new_hypothesis 
  winner = max(stacks[-1].itervalues(), key=lambda h: h.logprob)
  def extract_english(h): 
    return "" if h.predecessor is None else "%s%s " % (extract_english(h.predecessor), h.phrase.english)
  print(extract_english(winner))

Usage: ipykernel_launcher.py [options]

ipykernel_launcher.py: error: no such option: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [7]:
for i in range(0,10):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [12]:
# open the language model, which is a trigram/bigram language model
lm = models.LM('./data/lm')

Reading language model from ./data/lm...


In [14]:
lm.table

{('government', 'than', 'it'): ngram_stats(logprob=-1.246138, backoff=0.0),
 ('little', 'note'): ngram_stats(logprob=-2.910431, backoff=-0.02105464),
 ('<s>', 'things'): ngram_stats(logprob=-4.01113, backoff=-0.07642649),
 ('thank', 'first'): ngram_stats(logprob=-2.45918, backoff=-0.02105464),
 ('education', ',', 'we'): ngram_stats(logprob=-1.833844, backoff=0.0),
 ('Quebec', 'as', 'well'): ngram_stats(logprob=-0.9909095, backoff=0.0),
 ('that', 'much', 'younger'): ngram_stats(logprob=-2.432601, backoff=0.0),
 ('permit', 'this'): ngram_stats(logprob=-2.025688, backoff=0.0),
 ('at', 'the', 'beginning'): ngram_stats(logprob=-1.600901, backoff=0.0),
 ('must', 'work', 'with'): ngram_stats(logprob=-0.8320117, backoff=0.0),
 ('party', 'line'): ngram_stats(logprob=-2.59109, backoff=-0.02105463),
 ('I', 'accept', 'the'): ngram_stats(logprob=-0.7693377, backoff=0.0),
 ('that', 'is', 'very'): ngram_stats(logprob=-1.842617, backoff=0.0),
 ('not', 'unique', 'in'): ngram_stats(logprob=-1.256101, ba

In [25]:
import itertools
candidate = ['a', 'committee', 'selection', 'was', 'achievement', '.']
chart = dict()
permutations= {}
for idx,permut in enumerate(itertools.permutations(candidate)):
    permut = ['<s>'] + list(permut)
    print(permut)
    chart[idx] = 0
    permutations[idx] = permut
    for i in range(0, len(permut)):
        for j in range(i + 1, len(permut)):
        
            if j == len(permut) - 1:
                # if we're at the end of sentence
                if (permut[i], permut[j]) in lm.table:
                    chart[idx] += lm.end((permut[i], permut[j]))
            else:
                # if we're somewhere in the middle
                if (permut[i], permut[j]) in lm.table:
                    (_lm, word_logprob) = lm.score((permut[i], permut[j]), permut[j+1])
                    chart[idx] += word_logprob


['<s>', 'a', 'committee', 'selection', 'was', 'achievement', '.']
['<s>', 'a', 'committee', 'selection', 'was', '.', 'achievement']
['<s>', 'a', 'committee', 'selection', 'achievement', 'was', '.']
['<s>', 'a', 'committee', 'selection', 'achievement', '.', 'was']
['<s>', 'a', 'committee', 'selection', '.', 'was', 'achievement']
['<s>', 'a', 'committee', 'selection', '.', 'achievement', 'was']
['<s>', 'a', 'committee', 'was', 'selection', 'achievement', '.']
['<s>', 'a', 'committee', 'was', 'selection', '.', 'achievement']
['<s>', 'a', 'committee', 'was', 'achievement', 'selection', '.']
['<s>', 'a', 'committee', 'was', 'achievement', '.', 'selection']
['<s>', 'a', 'committee', 'was', '.', 'selection', 'achievement']
['<s>', 'a', 'committee', 'was', '.', 'achievement', 'selection']
['<s>', 'a', 'committee', 'achievement', 'selection', 'was', '.']
['<s>', 'a', 'committee', 'achievement', 'selection', '.', 'was']
['<s>', 'a', 'committee', 'achievement', 'was', 'selection', '.']
['<s>', 'a

In [None]:
import itertools
for permut in itertools.permutations(list(lm_state) + [word]):
    print(permut)
    wd = permut[-1]
    st = permut[0:-1]
    print('current permutation: {}'.format(permut))
    if st in lm.table:
        print(lm.score(st, wd))
    else:
        print('{} not in lm'.format(st))


In [28]:
permutations[max(chart, key=chart.get)]

['<s>', 'selection', 'achievement', 'a', 'was', 'committee', '.']

In [None]:
if opts.verbose:
    def extract_tm_logprob(h):
      return 0.0 if h.predecessor is None else h.phrase.logprob + extract_tm_logprob(h.predecessor)
    tm_logprob = extract_tm_logprob(winner)
    sys.stderr.write("LM = %f, TM = %f, Total = %f\n" % 
      (winner.logprob - tm_logprob, tm_logprob, winner.logprob))