## 1. preprocessing o.txt -> list

In [1]:
def create_sentence_pattern_list(input_pat):
    """Split a flat sequence of lines into groups separated by empty strings.

    Args:
        input_pat: iterable of lines; an item equal to '' marks the end of
            the current group.

    Returns:
        A list of lists. Every '' closes the group collected so far (possibly
        empty), and whatever follows the last '' is always appended as a final
        group, so the result has one more group than there are separators.
    """
    groups = []
    current = []
    for line in input_pat:
        if line == '':
            # Separator: close the running group and start a fresh one.
            groups.append(current)
            current = []
            continue
        current.append(line)

    # The trailing group has no separator after it.
    groups.append(current)
    return groups

In [2]:
# English: o.txt holds one group of lines per sentence, groups separated by
# blank lines. The file is read inside `with` so the handle is closed promptly.
with open('o.txt', 'r') as f:
    english_corpus = f.read().strip('\n').split('\n')
english_corpus = create_sentence_pattern_list(english_corpus)

# English correct sentences, one per line.
with open('UM-Corpus.en.200k.txt', 'r') as f:
    english_sent = f.read().split('\n')

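- Some sentences in o.txt contain weird (mis-encoded) words, so the first line of each group is replaced with the clean sentence from UM-Corpus.en.200k.txt.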

**e.g. abbot âs → abbot ’s**

In [3]:
# Overwrite the first entry of every group with the sentence at the same
# position in english_sent whenever the two differ.
for idx, group in enumerate(english_corpus):
    stored = group[0]
    clean = english_sent[idx]
    if stored != clean:
        group[0] = clean

In [4]:
# Chinese: one tagged sentence per line. Read inside `with` so the file
# handle is closed instead of being left to the garbage collector.
with open('UM-Corpus.ch.200k.tagged.txt', 'r') as f:
    chinese_corpus = f.read().split('\n')

In [5]:
# Word alignments, one line per sentence pair. Read inside `with` so the file
# handle is closed instead of being left to the garbage collector.
with open('align.final.200k', 'r') as f:
    aligns = f.read().split('\n')

In [6]:
from collections import defaultdict, OrderedDict

In [7]:
def pattern_pos(sent1, sent2):
    """Locate the shorter token sequence inside the longer one.

    Args:
        sent1, sent2: each either a list of tokens or a string, which is
            split on whitespace. If sent1 has fewer tokens than sent2 the
            two are swapped, so the order of the arguments does not matter.

    Returns:
        (start, end): half-open token span of the first occurrence of the
        shorter sequence inside the longer one, or (-1, -1) when it does not
        occur or the shorter sequence is empty.
    """
    if not isinstance(sent1, list):
        sent1 = sent1.split()

    if not isinstance(sent2, list):
        sent2 = sent2.split()

    if len(sent1) < len(sent2):
        sent1, sent2 = sent2, sent1

    # sent1 is the whole sentence
    # sent2 is the sub sentence
    n = len(sent2)
    if n == 0:
        return (-1, -1)

    # Only start positions that leave room for n tokens are tried. The earlier
    # hand-rolled scan advanced its index past the end of sent1 when a partial
    # match reached the last token and raised IndexError.
    for start in range(len(sent1) - n + 1):
        if sent1[start:start + n] == sent2:
            return (start, start + n)
    return (-1, -1)

In [8]:
#from orderedset import OrderedSet 

- 從 UM-Corpus.ch.200k.tagged.txt 取出 中文文法

In [9]:
def extract_ch_grammar(ch_pat):
    """Reduce a tagged Chinese phrase to its grammar string.

    Args:
        ch_pat: space-separated tokens; only tokens containing '_' are
            considered, and the text between the first and second '_' is
            taken as the tag.

    Returns:
        The tags V, P and N in order of appearance, joined by spaces, with
        V kept upper-case and P/N lower-cased (e.g. 'V p n'). Exactly two V
        tags and nothing else gives the special form 'V v'. No matching tags
        gives ''.
    """
    kept = []
    for token in ch_pat.split(' '):
        if '_' not in token:
            continue
        tag = token.split('_')[1]
        if tag in ('V', 'P', 'N'):
            kept.append(tag)

    if kept == ['V', 'V']:
        return 'V v'

    # Lower-case everything, then restore the verbs to upper-case.
    return ' '.join(kept).lower().replace('v', 'V')

- 計算 english grammar -> chinese grammar 的次數
    - noisy_channel[en_gram][ch_gram] = list(en_sen->ch_sen pair)

In [10]:
import re
def extract_pattern():
    """Collect English-grammar -> Chinese-grammar example pairs.

    Walks english_corpus, chinese_corpus and aligns in parallel. For each
    sentence the alignment links ("en_index-ch_index") map English token
    positions to Chinese tokens. For every pattern line of the sentence
    (three tab-separated fields; the last two are the grammar and the
    pattern text) the Chinese tokens aligned to the pattern's span are
    concatenated. Patterns whose Chinese side contains 'V' are recorded.

    Returns:
        noisy_channel[en_grammar][ch_grammar] -> list of "en_pat | ch_pat"
        strings.

    Any exception raised while processing a sentence is printed together
    with its 1-based line number, and the rest of that sentence is skipped.
    """
    noisy_channel = defaultdict(lambda: defaultdict(list))
    sentence_triples = zip(english_corpus, chinese_corpus, aligns)

    for line_no, (english, chinese, align) in enumerate(sentence_triples, 1):
        en_sent = english[0].split()
        ch_sent = chinese.split()
        links = align.split()
        # Keyed by (link order, English position, English token) so that
        # repeated tokens and positions stay distinct and ordered.
        en_ch = OrderedDict()

        try:
            for index, link in enumerate(links):
                en_pos, ch_pos = link.split('-')
                en_pos = int(en_pos)
                ch_pos = int(ch_pos)
                en = en_sent[en_pos]
                ch = ch_sent[ch_pos]
                en_ch[index, en_pos, en] = ch

            for entry in english[1:]:
                _label, en_grammar, en_pat = entry.split('\t')
                # NOTE(review): no word boundaries, so these prepositions are
                # also replaced when they occur inside a longer token.
                en_grammar = re.sub('about|in|on|to|for|with', 'p', en_grammar)
                start, end = pattern_pos(en_sent, en_pat)

                pieces = []
                for (_order, en_pos, _en_term), ch_term in en_ch.items():
                    if start <= en_pos < end:
                        pieces.append("%s " % ch_term)
                    elif en_pos >= end:
                        # Stop at the first link past the span.
                        break
                ch_pat = "".join(pieces)

                if 'V' in ch_pat:
                    ch_grammar = extract_ch_grammar(ch_pat)
                    noisy_channel_pattern = "%s | %s" % (en_pat, ch_pat)
                    noisy_channel[en_grammar][ch_grammar].append(noisy_channel_pattern)

        except Exception as e:
            print("line %d: %s" % (line_no, str(e)))
    return noisy_channel

In [11]:
# Build the table en_grammar -> ch_grammar -> ["en_pat | ch_pat", ...] once;
# get_pattern reads it through this module-level name.
en_ch_pat = extract_pattern()

line 166219: list index out of range
line 180500: list index out of range
line 180503: list index out of range
line 186533: list index out of range
line 199209: list index out of range


In [13]:
import math
import operator
from pprint import pprint

# Pronoun list, one entry per line. Read inside `with` so the handle is closed.
with open('prons.txt') as f:
    PRONS = set(line.strip('\n') for line in f)

# High-frequency words: tab-separated on the first line of the file.
# The line terminator is stripped before splitting; otherwise the last word
# would be stored as "word\n" and could never match a sentence token.
with open('HiFreWords') as f:
    HiFreWords = set(f.readline().rstrip('\n').split('\t'))

def compute_score(word, sent):
    """Score a sentence as a candidate example for `word`.

    Both arguments are lower-cased and `sent` is split on whitespace.

    Returns:
        (index of the first token equal to `word`, or -1 if absent)
        minus the number of tokens not found in HiFreWords
        minus the number of tokens found in PRONS.
    """
    target = word.lower()
    tokens = sent.lower().split()

    position = tokens.index(target) if target in tokens else -1
    uncommon_count = sum(1 for token in tokens if token not in HiFreWords)
    pronoun_count = sum(1 for token in tokens if token in PRONS)

    return position - uncommon_count - pronoun_count


In [17]:
def get_pattern(input_pat):
    """Print the Chinese grammars that stand out for an English grammar.

    For the English grammar `input_pat` (e.g. 'V p n'), each Chinese grammar
    in en_ch_pat[input_pat] gets a strength
    (frequency - mean) / sample standard deviation; grammars whose strength
    exceeds k0 are kept. For each kept grammar, up to three example pairs
    with the highest compute_score are printed, grammars ordered by
    descending frequency.

    Args:
        input_pat: English grammar string used as key into en_ch_pat.

    Returns:
        "NO RESULT" when the pattern is unknown; otherwise None (results are
        printed).
    """
    k0 = 0.5        # minimum strength required to keep a grammar
    max_examples = 3

    # .get() instead of indexing: en_ch_pat is a defaultdict, and indexing it
    # with an unknown pattern would silently insert an empty entry.
    grammars = en_ch_pat.get(input_pat)
    if not grammars:
        return "NO RESULT"

    N = len(grammars)
    total = sum(len(examples) for examples in grammars.values())
    avg = total / N

    print("%s (%d)" % (input_pat, total))

    # Sample standard deviation. The divisor must be parenthesised:
    # `x / N - 1` computes (x / N) - 1, which can be negative (math domain
    # error) or zero (division by zero below).
    squared_dev = sum((len(examples) - avg) ** 2 for examples in grammars.values())
    stddev = math.sqrt(squared_dev / (N - 1)) if N > 1 else 0.0

    final_result = {}

    # Filter good grammar
    for grammar, sentences in grammars.items():
        freqi = len(sentences)
        # With zero spread (a single grammar, or all equally frequent) no
        # grammar can be told apart from the others, so all of them are kept.
        if stddev > 0:
            strength = (freqi - avg) / stddev
            if not strength > k0:
                continue

        # Find Good Dictionary Example: keep the best `max_examples`, sorted
        # ascending so best_sentences[0] is always the weakest one kept.
        # NOTE(review): compute_score receives the grammar string as `word`
        # and the whole "en | ch" pair as the sentence — confirm intent.
        best_sentences = []
        for sentence in sentences:
            score = compute_score(input_pat, sentence)
            if len(best_sentences) < max_examples:
                best_sentences.append((score, sentence))
                best_sentences.sort()
            elif score > best_sentences[0][0]:
                best_sentences[0] = (score, sentence)
                best_sentences.sort()

        final_result[(grammar, freqi)] = best_sentences

    # Print the result
    for key in sorted(final_result, key=lambda x: x[1], reverse=True):
        print('-> %s (%d)' % (key[0], key[1]))
        for _score, pair in final_result[key]:
            # Split on the first separator only, so the Chinese side is kept
            # whole even if it happens to contain " | ".
            en, ch = pair.split(" | ", 1)
            print('     %s %s' % (en, ch))

In [18]:
# Print the Chinese grammars that stand out for the English pattern 'V n',
# each followed by its example pairs.
get_pattern('V n')

V n (121119)
-> V n (26889)
     authorized me 授權_V 我_N 
     keeping abreast 與時俱進_V 國際_N 
     leave any fingerprints 留下_V 指紋_N 
-> V (13617)
     adore everything 崇拜_V 
     be an aggregate 服務_V 
     constitute a crime 構成_V 
-> V v (10849)
     controlling aspects 上癮_V 上癮_V 
     explain everything 說明_V 詳細_V 
     give directions 指路_V 指路_V 
-> V n n (10192)
     buy insurance 買_V 保險_N 保險_N 
     decrease body fat 減少_V 脂肪_N 脂肪_N 
     don aboriginal dress 穿上_V 土著_N 居民_N 
-> V V n (8953)
     has abundant experience 擁有_V 豐富_V 經驗_N 
     having a love 有_V 有_V 愛_N 
     helped anchor Asia 幫助_V 掌握_V 亞洲_N 
-> V V V (3600)
     is a master 是_V 是_V 貼切_V 
     is gaining extensive attention 受到_V 廣泛_V 重視_V 
     make a house call 浴_V 出診_V 出診_V 
-> n V (3188)
     has clinical importance 糖尿病_N 重要_V 
     have access 他們_N 訪問_V 
     have control 你_N 控制_V 
-> n V n (2719)
     be an administrator 管理員_N 冒充_V 管理員_N 
     became a citizen 奧巴馬_N 加入_V 印尼_N 
     have both 你們_N 需要_V 你們_N 
-> V V n n (

In [16]:
# Print the Chinese grammars that stand out for the English pattern 'V p n',
# each followed by its example pairs.
get_pattern('V p n')

V p n (27246)
-> V n (2379)
     is manufactured in China 單挑_V 中國_N 
     lay for hours 長_V 時間_N 
     listen for frogs 傾聽_V 蛙鳴_N 
-> V V n (2160)
     calling for elections 體面_V 舉行_V 選舉_N 
     intend for me 打算_V 讓_V 我_N 
     look for an apartment 指望_V 是_V 公寓_N 
-> V (1793)
     is in conformity 符合_V 
     keep in check 努力_V 
     know for a certainty 知道_V 
-> V v (1649)
     be in charge 主導_V 主導_V 
     be in charge 負責_V 負責_V 
     be in line 適合_V 適合_V 
-> V n n (1199)
     are in charge 是_V 負責人_N 負責人_N 
     fall in love 墜入_V 愛河_N 愛河_N 
     is for defence 是_V 費用_N 國防_N 
-> V V V (1100)
     ask for compensation 要求_V 賠償_V 賠償_V 
     be found in certain degrees 奢_V 惰_V 惰_V 
     go in for expansion 搞_V 搞_V 擴張_V 
-> V V n n (993)
     applied for college admission 申請_V 申請_V 大學_N 大學_N 
     look for a job 尋找_V 尋找_V 工作_N 工作_N 
     looking for a bone 翻找_V 翻找_V 骨頭_N 骨頭_N 
-> V p n (775)
     buy in China 買_V 在_P 中國_N 
     live in London 住_V 在_P 倫敦_N 
     lived in Canada 住_V 在_P 加拿大_N 