## data preparing

In [1]:
import math
import operator
from pprint import pprint
from collections import defaultdict, OrderedDict

PRONS = set([line.strip('\n') for line in open('prons.txt')])

with open('HiFreWords') as f:
    HiFreWords = set(f.readline().split('\t'))


In [2]:
def create_sentence_pattern_list(input_pat):
    pattern = []
    final = []
    for i in input_pat:
        if i != '':
            pattern.append(i)
        else:
            final.append(pattern.copy())
            pattern.clear()

    # Last one
    final.append(pattern)
    return final


In [3]:
# English
english_corpus = open('corpus.txt', 'r').read().strip('\n').split('\n')
english_corpus = create_sentence_pattern_list(english_corpus)

In [4]:
# English correct sentences
english_sent = open('UM-Corpus.en.200k.txt', 'r').read().split('\n')

In [5]:
# Chinese
chinese_corpus = open('UM-Corpus.ch.200k.tagged.ch.txt', 'r').read().split('\n')

# Align
aligns = open('align.final.200k', 'r').read().split('\n')

In [6]:
def pattern_pos(sent1, sent2):
    if not isinstance(sent1, list):
        sent1 = sent1.split()

    if not isinstance(sent2, list):
        sent2 = sent2.split()

    if len(sent1) < len(sent2):
        sent1, sent2 = sent2, sent1

    # sent1 is the whole sentence
    # sent2 is the sub sentence

    count = 0
    n = len(sent2)
    for i in range(len(sent1)):
        count = 0
        for j in range(n):
            if sent1[i] == sent2[j]:
                count += 1
                i += 1
                if count == n:
                    return (i - n, i)
            else:
                i -= count
                break
    return (-1, -1)

In [7]:
def compute_score(word, sent):
    global PRONS
    global HiFreWords

    word = word.lower()
    sent = sent.lower().split()
    length = len(sent)

    locationOfWord = -1 if word not in sent else sent.index(word)
    hiFreWordsScore = len([w for w in sent if w not in HiFreWords])
    pronsScore = len([w for w in sent if w in PRONS])

    return locationOfWord - hiFreWordsScore - pronsScore


In [8]:
from orderedset import OrderedSet

In [9]:
def extract_ch_grammar(ch_pat):
    ch_grammar = []
    
    # "莊稼_N 了_ASP 收割_V 莊稼_N" -> ['N', 'ASP', 'V', 'N']
    ch_grammar = ch_pat.split()
    
    ch_grammar = [cg.split('_')[1] for cg in ch_grammar if '_' in cg]
    ch_grammar = [cg for cg in ch_grammar if cg == 'V' or cg == 'P' or cg == 'N']
    
    if ch_grammar == ['V', 'V']:
        ch_grammar = 'V v'
    else:
        ch_grammar = OrderedSet(ch_grammar)
        ch_grammar = ' '.join(ch_grammar).lower().replace('v', 'V')

    return ch_grammar

In [10]:
def extract_pattern():
    count = 0
    noisy_channel = defaultdict(lambda: defaultdict(list))
    for english, chinese, align in zip(english_corpus, chinese_corpus, aligns):
        count += 1
        en_sent = english[0].split()
        ch_sent = chinese.split()
        align = align.split()
        en_ch = OrderedDict()
        index = 0

        try:
            for a in align:
                en_pos, ch_pos = a.split('-')
                en_pos = int(en_pos)
                ch_pos = int(ch_pos)
                en = en_sent[en_pos]
                ch = ch_sent[ch_pos]
                en_ch[index, en_pos, en] = ch
                index += 1

            for _ in english[1:]:
                _, en_grammar, en_pat = _.split('\t')
                start, end = pattern_pos(en_sent, en_pat)
                ch_pat = ""
                for en, ch_term in en_ch.items():
                    _, en_pos, en_term = en
                    if en_pos >= start and en_pos < end:
                        ch_pat += "%s " % ch_term
                    elif en_pos >= end:
                        break
                if 'V' in ch_pat:
                    ch_grammar = extract_ch_grammar(ch_pat)
                    noisy_channel_pattern = "%s | %s" % (en_pat, ch_pat)
                    noisy_channel[en_grammar][ch_grammar].append(noisy_channel_pattern)

        except Exception as e:
            print("line %d: %s" % (count, str(e)))
    return noisy_channel

In [11]:
noisy_channel = extract_pattern()

line 1108: list index out of range
line 1365: list index out of range
line 1643: list index out of range
line 1995: list index out of range
line 2526: list index out of range
line 2592: list index out of range
line 2666: list index out of range
line 2722: list index out of range
line 3154: list index out of range
line 3707: list index out of range
line 4320: list index out of range
line 4451: list index out of range
line 4570: list index out of range
line 4610: list index out of range
line 4768: list index out of range
line 6265: list index out of range
line 6453: list index out of range
line 6989: list index out of range
line 7071: list index out of range
line 7318: list index out of range
line 7319: list index out of range
line 7413: list index out of range
line 7524: list index out of range
line 8129: list index out of range
line 8672: list index out of range
line 8883: list index out of range
line 8968: list index out of range
line 9297: list index out of range
line 9953: list inde

line 18274: list index out of range
line 18369: list index out of range
line 18385: list index out of range
line 18390: list index out of range
line 18395: list index out of range
line 18397: list index out of range
line 18399: list index out of range
line 18400: list index out of range
line 18401: list index out of range
line 18405: list index out of range
line 18445: list index out of range
line 18487: list index out of range
line 18489: list index out of range
line 18549: list index out of range
line 18555: list index out of range
line 18556: list index out of range
line 18558: list index out of range
line 18572: list index out of range
line 18603: list index out of range
line 18616: list index out of range
line 18618: list index out of range
line 18629: list index out of range
line 18639: list index out of range
line 18654: list index out of range
line 18677: list index out of range
line 18678: list index out of range
line 18702: list index out of range
line 18704: list index out o

line 25481: list index out of range
line 25497: list index out of range
line 25502: list index out of range
line 25517: list index out of range
line 25533: list index out of range
line 25541: list index out of range
line 25578: list index out of range
line 25660: list index out of range
line 25668: list index out of range
line 25696: list index out of range
line 25710: list index out of range
line 25714: list index out of range
line 25730: list index out of range
line 25734: list index out of range
line 25739: list index out of range
line 25751: list index out of range
line 25756: list index out of range
line 25773: list index out of range
line 25787: list index out of range
line 25796: list index out of range
line 25804: list index out of range
line 25814: list index out of range
line 25840: list index out of range
line 25861: list index out of range
line 25879: list index out of range
line 25894: list index out of range
line 25902: list index out of range
line 25922: list index out o

line 31783: list index out of range
line 31784: list index out of range
line 31785: list index out of range
line 31786: list index out of range
line 31787: list index out of range
line 31788: list index out of range
line 31789: list index out of range
line 31791: list index out of range
line 31794: list index out of range
line 31795: list index out of range
line 31796: list index out of range
line 31797: list index out of range
line 31799: list index out of range
line 31804: list index out of range
line 31805: list index out of range
line 31806: list index out of range
line 31807: list index out of range
line 31812: list index out of range
line 31813: list index out of range
line 31815: list index out of range
line 31816: list index out of range
line 31817: list index out of range
line 31819: list index out of range
line 31821: list index out of range
line 31822: list index out of range
line 31823: list index out of range
line 31824: list index out of range
line 31825: list index out o

line 35615: list index out of range
line 35618: list index out of range
line 35631: list index out of range
line 35632: list index out of range
line 35640: list index out of range
line 35657: list index out of range
line 35675: list index out of range
line 35679: list index out of range
line 35680: list index out of range
line 35682: list index out of range
line 35688: list index out of range
line 35718: list index out of range
line 35744: list index out of range
line 35749: list index out of range
line 35765: list index out of range
line 35768: list index out of range
line 35789: list index out of range
line 35799: list index out of range
line 35811: list index out of range
line 35812: list index out of range
line 35818: list index out of range
line 35832: list index out of range
line 35846: list index out of range
line 35871: list index out of range
line 35880: list index out of range
line 35884: list index out of range
line 35886: list index out of range
line 35887: list index out o

line 114763: list index out of range
line 114813: list index out of range
line 114836: list index out of range
line 114843: list index out of range
line 114999: list index out of range
line 115089: list index out of range
line 115210: list index out of range
line 115221: list index out of range
line 115250: list index out of range
line 115266: list index out of range
line 115393: list index out of range
line 115530: list index out of range
line 115552: list index out of range
line 115675: list index out of range
line 115683: list index out of range
line 115721: list index out of range
line 115812: list index out of range
line 115875: list index out of range
line 115913: list index out of range
line 115914: list index out of range
line 115916: list index out of range
line 115917: list index out of range
line 115932: list index out of range
line 115941: list index out of range
line 115973: list index out of range
line 116054: list index out of range
line 116056: list index out of range
l

line 132288: list index out of range
line 132361: list index out of range
line 132363: list index out of range
line 132366: list index out of range
line 132382: list index out of range
line 132390: list index out of range
line 132439: list index out of range
line 132469: list index out of range
line 132478: list index out of range
line 132541: list index out of range
line 132550: list index out of range
line 132555: list index out of range
line 132558: list index out of range
line 132562: list index out of range
line 132612: list index out of range
line 132801: list index out of range
line 132809: list index out of range
line 132823: list index out of range
line 132891: list index out of range
line 132896: list index out of range
line 132986: list index out of range
line 133173: list index out of range
line 133175: list index out of range
line 133177: list index out of range
line 133182: list index out of range
line 133184: list index out of range
line 133196: list index out of range
l

line 136587: list index out of range
line 136591: list index out of range
line 136595: list index out of range
line 136600: list index out of range
line 136611: list index out of range
line 136612: list index out of range
line 136624: list index out of range
line 136625: list index out of range
line 136627: list index out of range
line 136634: list index out of range
line 136640: list index out of range
line 136661: list index out of range
line 136665: list index out of range
line 136671: list index out of range
line 136674: list index out of range
line 136677: list index out of range
line 136680: list index out of range
line 136687: list index out of range
line 136691: list index out of range
line 136695: list index out of range
line 136697: list index out of range
line 136709: list index out of range
line 136721: list index out of range
line 136727: list index out of range
line 136731: list index out of range
line 136734: list index out of range
line 136737: list index out of range
l

line 139790: list index out of range
line 139793: list index out of range
line 139794: list index out of range
line 139800: list index out of range
line 139806: list index out of range
line 139813: list index out of range
line 139814: list index out of range
line 139819: list index out of range
line 139820: list index out of range
line 139823: list index out of range
line 139825: list index out of range
line 139828: list index out of range
line 139834: list index out of range
line 139844: list index out of range
line 139845: list index out of range
line 139850: list index out of range
line 139936: list index out of range
line 139938: list index out of range
line 140271: list index out of range
line 140428: list index out of range
line 140429: list index out of range
line 140430: list index out of range
line 140536: list index out of range
line 140539: list index out of range
line 140541: list index out of range
line 140544: list index out of range
line 140545: list index out of range
l

line 158344: list index out of range
line 158554: list index out of range
line 158631: list index out of range
line 158743: list index out of range
line 158775: list index out of range
line 158779: list index out of range
line 159900: list index out of range
line 160519: list index out of range
line 160602: list index out of range
line 160655: list index out of range
line 160700: list index out of range
line 160701: list index out of range
line 160725: list index out of range
line 160917: list index out of range
line 160948: list index out of range
line 160967: list index out of range
line 160985: list index out of range
line 161052: list index out of range
line 161074: list index out of range
line 161090: list index out of range
line 161265: list index out of range
line 161339: list index out of range
line 161340: list index out of range
line 161349: list index out of range
line 161356: list index out of range
line 161617: list index out of range
line 161664: list index out of range
l

line 164423: list index out of range
line 164424: list index out of range
line 164426: list index out of range
line 164427: list index out of range
line 164429: list index out of range
line 164430: list index out of range
line 164431: list index out of range
line 164432: list index out of range
line 164433: list index out of range
line 164434: list index out of range
line 164435: list index out of range
line 164436: list index out of range
line 164437: list index out of range
line 164440: list index out of range
line 164443: list index out of range
line 164444: list index out of range
line 164449: list index out of range
line 164450: list index out of range
line 164451: list index out of range
line 164453: list index out of range
line 164454: list index out of range
line 164455: list index out of range
line 164456: list index out of range
line 164459: list index out of range
line 164460: list index out of range
line 164461: list index out of range
line 164465: list index out of range
l

line 166805: list index out of range
line 166806: list index out of range
line 166807: list index out of range
line 166808: list index out of range
line 166809: list index out of range
line 166810: list index out of range
line 166811: list index out of range
line 166812: list index out of range
line 166813: list index out of range
line 166814: list index out of range
line 166815: list index out of range
line 166817: list index out of range
line 166818: list index out of range
line 166819: list index out of range
line 166822: list index out of range
line 166824: list index out of range
line 166825: list index out of range
line 166827: list index out of range
line 166832: list index out of range
line 166833: list index out of range
line 166834: list index out of range
line 166835: list index out of range
line 166836: list index out of range
line 166837: list index out of range
line 166838: list index out of range
line 166839: list index out of range
line 166840: list index out of range
l

line 169029: list index out of range
line 169030: list index out of range
line 169033: list index out of range
line 169036: list index out of range
line 169037: list index out of range
line 169038: list index out of range
line 169039: list index out of range
line 169042: list index out of range
line 169043: list index out of range
line 169044: list index out of range
line 169045: list index out of range
line 169047: list index out of range
line 169049: list index out of range
line 169050: list index out of range
line 169052: list index out of range
line 169058: list index out of range
line 169059: list index out of range
line 169060: list index out of range
line 169061: list index out of range
line 169062: list index out of range
line 169064: list index out of range
line 169065: list index out of range
line 169066: list index out of range
line 169068: list index out of range
line 169069: list index out of range
line 169070: list index out of range
line 169071: list index out of range
l

line 170424: list index out of range
line 170425: list index out of range
line 170426: list index out of range
line 170427: list index out of range
line 170428: list index out of range
line 170429: list index out of range
line 170430: list index out of range
line 170433: list index out of range
line 170434: list index out of range
line 170436: list index out of range
line 170437: list index out of range
line 170438: list index out of range
line 170439: list index out of range
line 170440: list index out of range
line 170441: list index out of range
line 170442: list index out of range
line 170443: list index out of range
line 170444: list index out of range
line 170445: list index out of range
line 170446: list index out of range
line 170447: list index out of range
line 170448: list index out of range
line 170449: list index out of range
line 170451: list index out of range
line 170452: list index out of range
line 170453: list index out of range
line 170454: list index out of range
l

line 172799: list index out of range
line 172800: list index out of range
line 172801: list index out of range
line 172803: list index out of range
line 172808: list index out of range
line 172809: list index out of range
line 172810: list index out of range
line 172811: list index out of range
line 172812: list index out of range
line 172813: list index out of range
line 172814: list index out of range
line 172815: list index out of range
line 172816: list index out of range
line 172818: list index out of range
line 172819: list index out of range
line 172821: list index out of range
line 172824: list index out of range
line 172825: list index out of range
line 172830: list index out of range
line 172832: list index out of range
line 172833: list index out of range
line 172834: list index out of range
line 172836: list index out of range
line 172837: list index out of range
line 172838: list index out of range
line 172839: list index out of range
line 172840: list index out of range
l

line 174800: list index out of range
line 174801: list index out of range
line 174802: list index out of range
line 174803: list index out of range
line 174805: list index out of range
line 174807: list index out of range
line 174808: list index out of range
line 174809: list index out of range
line 174810: list index out of range
line 174814: list index out of range
line 174815: list index out of range
line 174816: list index out of range
line 174817: list index out of range
line 174818: list index out of range
line 174819: list index out of range
line 174821: list index out of range
line 174822: list index out of range
line 174823: list index out of range
line 174824: list index out of range
line 174827: list index out of range
line 174828: list index out of range
line 174829: list index out of range
line 174831: list index out of range
line 174832: list index out of range
line 174834: list index out of range
line 174835: list index out of range
line 174836: list index out of range
l

line 176545: list index out of range
line 176547: list index out of range
line 176550: list index out of range
line 176551: list index out of range
line 176554: list index out of range
line 176555: list index out of range
line 176556: list index out of range
line 176559: list index out of range
line 176560: list index out of range
line 176562: list index out of range
line 176563: list index out of range
line 176565: list index out of range
line 176566: list index out of range
line 176568: list index out of range
line 176571: list index out of range
line 176573: list index out of range
line 176577: list index out of range
line 176578: list index out of range
line 176579: list index out of range
line 176580: list index out of range
line 176583: list index out of range
line 176584: list index out of range
line 176585: list index out of range
line 176586: list index out of range
line 176588: list index out of range
line 176589: list index out of range
line 176591: list index out of range
l

line 178284: list index out of range
line 178285: list index out of range
line 178286: list index out of range
line 178287: list index out of range
line 178288: list index out of range
line 178289: list index out of range
line 178296: list index out of range
line 178297: list index out of range
line 178299: list index out of range
line 178300: list index out of range
line 178301: list index out of range
line 178307: list index out of range
line 178308: list index out of range
line 178309: list index out of range
line 178314: list index out of range
line 178315: list index out of range
line 178316: list index out of range
line 178317: list index out of range
line 178319: list index out of range
line 178320: list index out of range
line 178321: list index out of range
line 178323: list index out of range
line 178325: list index out of range
line 178326: list index out of range
line 178329: list index out of range
line 178330: list index out of range
line 178331: list index out of range
l

line 180304: list index out of range
line 180305: list index out of range
line 180306: list index out of range
line 180307: list index out of range
line 180308: list index out of range
line 180310: list index out of range
line 180311: list index out of range
line 180312: list index out of range
line 180313: list index out of range
line 180314: list index out of range
line 180316: list index out of range
line 180317: list index out of range
line 180318: list index out of range
line 180319: list index out of range
line 180321: list index out of range
line 180322: list index out of range
line 180323: list index out of range
line 180324: list index out of range
line 180325: list index out of range
line 180326: list index out of range
line 180327: list index out of range
line 180328: list index out of range
line 180329: list index out of range
line 180330: list index out of range
line 180331: list index out of range
line 180332: list index out of range
line 180333: list index out of range
l

line 181716: list index out of range
line 181718: list index out of range
line 181719: list index out of range
line 181720: list index out of range
line 181722: list index out of range
line 181723: list index out of range
line 181724: list index out of range
line 181725: list index out of range
line 181726: list index out of range
line 181727: list index out of range
line 181728: list index out of range
line 181729: list index out of range
line 181731: list index out of range
line 181733: list index out of range
line 181735: list index out of range
line 181736: list index out of range
line 181737: list index out of range
line 181738: list index out of range
line 181739: list index out of range
line 181740: list index out of range
line 181741: list index out of range
line 181742: list index out of range
line 181744: list index out of range
line 181745: list index out of range
line 181746: list index out of range
line 181747: list index out of range
line 181748: list index out of range
l

line 183433: list index out of range
line 183435: list index out of range
line 183437: list index out of range
line 183438: list index out of range
line 183439: list index out of range
line 183440: list index out of range
line 183442: list index out of range
line 183444: list index out of range
line 183445: list index out of range
line 183446: list index out of range
line 183447: list index out of range
line 183451: list index out of range
line 183453: list index out of range
line 183455: list index out of range
line 183457: list index out of range
line 183458: list index out of range
line 183463: list index out of range
line 183464: list index out of range
line 183465: list index out of range
line 183466: list index out of range
line 183469: list index out of range
line 183471: list index out of range
line 183472: list index out of range
line 183473: list index out of range
line 183474: list index out of range
line 183475: list index out of range
line 183476: list index out of range
l

line 184775: list index out of range
line 184777: list index out of range
line 184778: list index out of range
line 184779: list index out of range
line 184781: list index out of range
line 184783: list index out of range
line 184785: list index out of range
line 184787: list index out of range
line 184788: list index out of range
line 184789: list index out of range
line 184794: list index out of range
line 184800: list index out of range
line 184801: list index out of range
line 184802: list index out of range
line 184803: list index out of range
line 184804: list index out of range
line 184805: list index out of range
line 184806: list index out of range
line 184809: list index out of range
line 184810: list index out of range
line 184814: list index out of range
line 184818: list index out of range
line 184819: list index out of range
line 184820: list index out of range
line 184823: list index out of range
line 184824: list index out of range
line 184827: list index out of range
l

line 186955: list index out of range
line 186958: list index out of range
line 186959: list index out of range
line 186962: list index out of range
line 186964: list index out of range
line 186965: list index out of range
line 186966: list index out of range
line 186967: list index out of range
line 186968: list index out of range
line 186970: list index out of range
line 186971: list index out of range
line 186973: list index out of range
line 186974: list index out of range
line 186975: list index out of range
line 186976: list index out of range
line 186977: list index out of range
line 186978: list index out of range
line 186979: list index out of range
line 186980: list index out of range
line 186981: list index out of range
line 186983: list index out of range
line 186984: list index out of range
line 186985: list index out of range
line 186986: list index out of range
line 186987: list index out of range
line 186988: list index out of range
line 186989: list index out of range
l

line 188336: list index out of range
line 188338: list index out of range
line 188339: list index out of range
line 188341: list index out of range
line 188343: list index out of range
line 188344: list index out of range
line 188346: list index out of range
line 188347: list index out of range
line 188348: list index out of range
line 188349: list index out of range
line 188350: list index out of range
line 188351: list index out of range
line 188352: list index out of range
line 188353: list index out of range
line 188354: list index out of range
line 188359: list index out of range
line 188362: list index out of range
line 188364: list index out of range
line 188369: list index out of range
line 188372: list index out of range
line 188373: list index out of range
line 188374: list index out of range
line 188375: list index out of range
line 188380: list index out of range
line 188382: list index out of range
line 188383: list index out of range
line 188384: list index out of range
l

line 189771: list index out of range
line 189775: list index out of range
line 189776: list index out of range
line 189778: list index out of range
line 189781: list index out of range
line 189783: list index out of range
line 189789: list index out of range
line 189791: list index out of range
line 189792: list index out of range
line 189793: list index out of range
line 189794: list index out of range
line 189795: list index out of range
line 189797: list index out of range
line 189798: list index out of range
line 189799: list index out of range
line 189801: list index out of range
line 189803: list index out of range
line 189804: list index out of range
line 189806: list index out of range
line 189807: list index out of range
line 189808: list index out of range
line 189810: list index out of range
line 189811: list index out of range
line 189812: list index out of range
line 189813: list index out of range
line 189814: list index out of range
line 189815: list index out of range
l

line 192129: list index out of range
line 192130: list index out of range
line 192131: list index out of range
line 192132: list index out of range
line 192133: list index out of range
line 192134: list index out of range
line 192135: list index out of range
line 192136: list index out of range
line 192138: list index out of range
line 192140: list index out of range
line 192143: list index out of range
line 192144: list index out of range
line 192145: list index out of range
line 192146: list index out of range
line 192147: list index out of range
line 192148: list index out of range
line 192149: list index out of range
line 192151: list index out of range
line 192152: list index out of range
line 192154: list index out of range
line 192156: list index out of range
line 192157: list index out of range
line 192161: list index out of range
line 192162: list index out of range
line 192163: list index out of range
line 192165: list index out of range
line 192166: list index out of range
l

line 193973: list index out of range
line 193977: list index out of range
line 193978: list index out of range
line 193979: list index out of range
line 193980: list index out of range
line 193981: list index out of range
line 193983: list index out of range
line 193984: list index out of range
line 193986: list index out of range
line 193988: list index out of range
line 193989: list index out of range
line 193990: list index out of range
line 193993: list index out of range
line 193997: list index out of range
line 193998: list index out of range
line 193999: list index out of range
line 194001: list index out of range
line 194003: list index out of range
line 194004: list index out of range
line 194009: list index out of range
line 194011: list index out of range
line 194012: list index out of range
line 194013: list index out of range
line 194014: list index out of range
line 194015: list index out of range
line 194016: list index out of range
line 194017: list index out of range
l

line 196183: list index out of range
line 196184: list index out of range
line 196188: list index out of range
line 196190: list index out of range
line 196192: list index out of range
line 196196: list index out of range
line 196197: list index out of range
line 196198: list index out of range
line 196199: list index out of range
line 196200: list index out of range
line 196201: list index out of range
line 196202: list index out of range
line 196203: list index out of range
line 196204: list index out of range
line 196205: list index out of range
line 196207: list index out of range
line 196208: list index out of range
line 196209: list index out of range
line 196210: list index out of range
line 196211: list index out of range
line 196213: list index out of range
line 196214: list index out of range
line 196216: list index out of range
line 196217: list index out of range
line 196218: list index out of range
line 196220: list index out of range
line 196221: list index out of range
l

line 198448: list index out of range
line 198449: list index out of range
line 198450: list index out of range
line 198451: list index out of range
line 198454: list index out of range
line 198457: list index out of range
line 198458: list index out of range
line 198461: list index out of range
line 198463: list index out of range
line 198465: list index out of range
line 198470: list index out of range
line 198471: list index out of range
line 198472: list index out of range
line 198473: list index out of range
line 198474: list index out of range
line 198476: list index out of range
line 198477: list index out of range
line 198478: list index out of range
line 198479: list index out of range
line 198482: list index out of range
line 198485: list index out of range
line 198486: list index out of range
line 198487: list index out of range
line 198492: list index out of range
line 198494: list index out of range
line 198495: list index out of range
line 198496: list index out of range
l

In [12]:
def get_pattern(input_pat):
    _sum = 0
    stddev = 0.0
    k0 = 0.001

    N = len(noisy_channel[input_pat])
    if N == 0:
        return "NO RESULT"
    
    for k, v in noisy_channel[input_pat].items():
        _sum += len(v)
    avg = _sum / N

    print("%s (%d)" % (input_pat, _sum))

    for k, v in noisy_channel[input_pat].items():
        stddev += (len(v) - avg) ** 2
    stddev = math.sqrt(stddev / N - 1)
    
    final_result = {}
    
    # Filter good grammar
    for grammar, sentences in noisy_channel[input_pat].items():
        best_sentences = [(-999.9,''), (-999.9,'')]
        freqi = len(sentences)
        strength = (freqi - avg) / stddev
        if not strength > k0:
            continue

        # Find Good Dictionary Example
        for sentence in sentences:
            score = compute_score(input_pat, sentence)
            if score > best_sentences[0][0]:
                best_sentences.pop(0)
                best_sentences.append((score, sentence))
                best_sentences.sort()

        final_result[(grammar, freqi)] = best_sentences

    # Print the result
    for key in sorted(final_result, key=lambda x: x[1], reverse=True):
        values = final_result[key]
        print('-> %s (%d)' % (key[0], key[1]))
        for value in values:
            en, ch = value[1].split(" | ")
            print('     %s %s' % (en, ch))

In [15]:
get_pattern('V')

V (13422)
-> V (7914)
     barge in 打斷_V 
     lock in 鎖定_V 
-> V v (1187)
     bed down 上床_V 睡_V 
     bed down 睡覺_V 沉_V 
->  (1021)
     came in 來_ADV 
     cash in 可以_ADV 
-> n V (950)
     batter down 守軍_N 壓倒_V 
     lock down 墊片_N 擰緊_V 
