In [396]:
original_prica = [1, 5, 8, 9, 10, 17, 17, 20, 24, 30, 35]

In [397]:
from collections import defaultdict

In [398]:
price = defaultdict(int)

In [399]:
for i,p in enumerate(original_prica):
    price[i+1] = p

In [400]:
price[11]

35

## Get the max splitting by enumerate

In [401]:
price

defaultdict(int,
            {1: 1,
             2: 5,
             3: 8,
             4: 9,
             5: 10,
             6: 17,
             7: 17,
             8: 20,
             9: 24,
             10: 30,
             11: 35})

In [402]:
def r(n):
    
    return max(
        [price[n]] + [r(i) + r(n-i) for i in range(1,n)]
    )

In [403]:
r(10)

30

In [404]:
r(15)

45

In [405]:
import time

In [406]:
#@get_time
def fibonacci(n):
    if n  <= 2:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

In [407]:
start = time.time()
print(fibonacci(34))
end = time.time()
print(end-start)

5702887
1.0000231266021729


In [408]:
mem = defaultdict()
def fibonacci_op(n):
    if n in mem:
        return mem[n]
    else: 
        if n <= 2:
            mem[n] = 1
            return n
        else:
            result = fibonacci_op(n-1) + fibonacci_op(n-2)
            mem[n] = result
            return result

In [14]:
start = time.time()
print(fibonacci_op(64))
end = time.time()
print(end-start)

14662949395604
0.00024008750915527344


# Analysis: How to optimize

## A Simpler Problem

### Decorator

In [409]:
def get_time(func):
    def wrapper(*args):
        start = time.time()
        func(*args)
        end = time.time()
        print('used time : {}'.format(end-start))
    return wrapper

In [410]:
def f1(func):
    def wrapper(*args,**kwargs):
        print('Started')
        func(*args,**kwargs)
        print('Ended')
    return wrapper

In [411]:
def f():
    print('HELLO')

In [412]:
f = f1(f)

In [413]:
print(f.__name__)

wrapper


In [414]:
@f1
def g(a):
    print(a)

In [415]:
g('hello')

Started
hello
Ended


In [416]:
def k(*arg,**kwargs):
    print(kwargs)

In [417]:
k(6,b=5)

{'b': 5}


In [418]:
from functools import wraps

In [419]:
def memo(f):
    memo.already_computed = {}
    @wraps(f)
    def _wrap(arg):
        if arg in memo.already_computed:
            result = memo.already_computed[arg]
        else:
            result = f(arg)
            memo.already_computed[arg] = result
        return result
    return _wrap

# We use this method to solve Cut Rod probelm¶

In [420]:
solution = {}

In [421]:
@memo
def r(n):
    """
    Args: n is the iron length
    Return: the max revenue 
    """
    max_price, max_split = max(
        [(price[n], 0)] + [(r(i) + r(n-i), i) for i in range(1, n)], key=lambda x: x[0]
    )

    solution[n] = (n - max_split, max_split)
    
    return max_price

In [422]:
r(20)

60

In [423]:
price

defaultdict(int,
            {1: 1,
             2: 5,
             3: 8,
             4: 9,
             5: 10,
             6: 17,
             7: 17,
             8: 20,
             9: 24,
             10: 30,
             11: 35,
             15: 0,
             14: 0,
             13: 0,
             12: 0,
             20: 0,
             19: 0,
             18: 0,
             17: 0,
             16: 0})

In [424]:
solution

{1: (1, 0),
 2: (2, 0),
 3: (3, 0),
 4: (2, 2),
 5: (3, 2),
 6: (6, 0),
 7: (6, 1),
 8: (6, 2),
 9: (6, 3),
 10: (10, 0),
 11: (11, 0),
 12: (11, 1),
 13: (11, 2),
 14: (11, 3),
 15: (13, 2),
 16: (14, 2),
 17: (11, 6),
 18: (17, 1),
 19: (17, 2),
 20: (17, 3)}

# How do we parse solution?¶

In [425]:
def parse_solution(n):
    left_split, right_split = solution[n]
    
    if right_split == 0: return [left_split]
    
    return parse_solution(left_split) + parse_solution(right_split)

In [426]:
r(24)

75

In [427]:
parse_solution(20)

[11, 6, 3]

# Edit Distance

In [460]:
solution = {}

In [461]:
from functools import lru_cache

In [462]:
@lru_cache(maxsize=2**10)
def edit_distance(string1, string2):
    
    if len(string1) == 0: return len(string2)
    if len(string2) == 0: return len(string1)
    
    tail_s1 = string1[-1]
    tail_s2 = string2[-1]
    
    candidates = [
        (edit_distance(string1[:-1], string2) + 1, 'DEL {}'.format(tail_s1)),  
        # string 1 delete tail
        (edit_distance(string1, string2[:-1]) + 1, 'ADD {}'.format(tail_s2)),  
        # string 1 add tail of string2
    ]
    
    if tail_s1 == tail_s2:
        both_forward = (edit_distance(string1[:-1], string2[:-1]) + 0, '')
    else:
        both_forward = (edit_distance(string1[:-1], string2[:-1]) + 1, 'SUB {} => {}'.format(tail_s1, tail_s2))

    candidates.append(both_forward)
    
    min_distance, operation = min(candidates, key=lambda x: x[0])
    
    solution[(string1, string2)] = operation 
    
    return min_distance

In [463]:
edit_distance('intention','execution')

5

In [464]:
solution

{('i', 'e'): 'SUB i => e',
 ('i', 'ex'): 'ADD x',
 ('i', 'exe'): 'ADD e',
 ('i', 'exec'): 'ADD c',
 ('i', 'execu'): 'ADD u',
 ('i', 'execut'): 'ADD t',
 ('i', 'executi'): '',
 ('i', 'executio'): 'ADD o',
 ('i', 'execution'): 'ADD n',
 ('in', 'e'): 'DEL n',
 ('in', 'ex'): 'SUB n => x',
 ('in', 'exe'): 'ADD e',
 ('in', 'exec'): 'ADD c',
 ('in', 'execu'): 'ADD u',
 ('in', 'execut'): 'ADD t',
 ('in', 'executi'): 'DEL n',
 ('in', 'executio'): 'SUB n => o',
 ('in', 'execution'): '',
 ('int', 'e'): 'DEL t',
 ('int', 'ex'): 'DEL t',
 ('int', 'exe'): 'SUB t => e',
 ('int', 'exec'): 'ADD c',
 ('int', 'execu'): 'ADD u',
 ('int', 'execut'): '',
 ('int', 'executi'): 'ADD i',
 ('int', 'executio'): 'ADD o',
 ('int', 'execution'): 'DEL t',
 ('inte', 'e'): '',
 ('inte', 'ex'): 'DEL e',
 ('inte', 'exe'): '',
 ('inte', 'exec'): 'ADD c',
 ('inte', 'execu'): 'ADD u',
 ('inte', 'execut'): 'DEL e',
 ('inte', 'executi'): 'SUB e => i',
 ('inte', 'executio'): 'ADD o',
 ('inte', 'execution'): 'ADD n',
 ('inten',

## Todo: Parse Solution is our homework

In [509]:
def print_solution(solution):
    start = max(solution, key=lambda x: len(x[0]) + len(x[1]))
    s1 = start[0]
    s2 = start[1]
    es1 = s1
    es2 = s2
    print('({}, {}): start'.format(s1, s2))
    while start in solution:
#         print(start)
        action = solution[start]
        if 'ADD' in action:
            letter = action.split(' ')[-1]
#             print(s1[:len(es1)])
#             print(s1[len(es1):])
#             print("*")
            s1 = s1[:len(es1)] + letter + s1[len(es1):]
            print('({}, {}): ADD {}'.format(s1, s2,letter))
            es2 = es2[:-1]
        elif 'DEL' in action:
            letter = action.split(' ')[-1]
            s1 = s1[:len(es1)-1] + s1[len(es1):]
            print('({}, {}): DEL {}'.format(s1, s2, letter))
            es1 = es1[:-1]
        elif 'SUB' in action:
            s1 = es1[:-1] + es2[-1] + s1[len(es1):]
            print('({}, {}): SUB {} => {}'.format(s1, s2, es1[-1], es2[-1]))
            es1 = es1[:-1]
            es2 = es2[:-1]
        else:
            es1 = es1[:-1]
            es2 = es2[:-1]
        start = (es1, es2)

In [510]:
print_solution(solution)

(intention, execution): start
(intenution, execution): ADD u
(intecution, execution): SUB n => c
(inecution, execution): DEL t
(ixecution, execution): SUB n => x
(execution, execution): SUB i => e


# Problem Case 3: Pinyin Auto Correction Problem

In [27]:
chinese_dataset = 'article_9k.txt'

In [28]:
CHINESE_CHARATERS = open(chinese_dataset).read()

In [30]:
CHINESE_CHARATERS[:40]

'此外自本周6月12日起除小米手机6等15款机型外其余机型已暂停更新发布含开发版体'

In [1]:
import pinyin

In [57]:
pinyin.get('你好，中国',format='strip',delimiter=' ')

'ni hao ， zhong guo'

In [34]:
def chinese_to_pinyin(character):
    return pinyin.get(character, format='strip', delimiter=' ')

In [35]:
CHINESE_CHARATERS_COPYS = chinese_to_pinyin(CHINESE_CHARATERS)

In [36]:
len(CHINESE_CHARATERS_COPYS)

129433034

In [37]:
import re

In [38]:
def tokens(text):
    "List all the pinyin characters"
    return re.findall('[a-z]+',text.lower())

In [39]:
CHINESE_CHARATERS_COPYS[:100]

'ci wai zi ben zhou 6 yue 1 2 ri qi chu xiao mi shou ji 6 deng 1 5 kuan ji xing wai qi yu ji xing yi '

In [40]:
tokens(CHINESE_CHARATERS_COPYS[:100])

['ci',
 'wai',
 'zi',
 'ben',
 'zhou',
 'yue',
 'ri',
 'qi',
 'chu',
 'xiao',
 'mi',
 'shou',
 'ji',
 'deng',
 'kuan',
 'ji',
 'xing',
 'wai',
 'qi',
 'yu',
 'ji',
 'xing',
 'yi']

In [41]:
from collections import Counter, defaultdict

In [42]:
PINYIN_COUNT = Counter(tokens(CHINESE_CHARATERS_COPYS))

In [43]:
def correct(word):
    'Find the most possible pinyin based on edit distance'
    # Prefer edit distance 0, then 1, then 2; otherwist default to word itself
    candidates = (known(edits0(word)) or
                  known(edits1(word)) or
                  known(edits2(word)) or
                  [word])
    return max(candidates,key=PINYIN_COUNT.get)

In [44]:
def known(words):
    'Return the pinyin in our data'
    return {w for w in words if w in PINYIN_COUNT}

def edits0(word):
    'Return all strings that are zero edits away from word (i.e., just word itself).'
    return {word}

def edits2(word):
    'Return all strings that are two edits away from this pinyin.'
    return {e2 for e1 in edits1(word) for e2 in edits1(e1)}

In [58]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def splits(word):
    'Return a list of all possible (first, rest) pairs that comprise pinyin.'
    return [(word[:i], word[i:])
           for i in range(len(word)+1)]

def edits1(word):
    'Return all strings that are one edit away from this pinyin.'
    pairs = splits(word)
    deletes = [a+b[1:] for (a,b) in pairs if b]
    transposes = [a+b[1]+b[0]+b[2:] for (a,b) in pairs if len(b) > 1]
    replaces = [a+c+b[1:] for (a,b) in pairs for c in alphabet if b]
    inserts = [a+c+b for (a,b) in pairs for c in alphabet]
    return set(deletes + transposes + replaces + inserts)

In [46]:
splits('pinyin')

[('', 'pinyin'),
 ('p', 'inyin'),
 ('pi', 'nyin'),
 ('pin', 'yin'),
 ('piny', 'in'),
 ('pinyi', 'n'),
 ('pinyin', '')]

In [48]:
print(edits0('pinyin'))

{'pinyin'}


In [49]:
print(edits1('pinyin'))

{'piwyin', 'winyin', 'pinyon', 'pinpin', 'pinyia', 'pwinyin', 'piqyin', 'pinlyin', 'pinyino', 'pinyxn', 'pinwyin', 'pinyio', 'pinayin', 'pinyikn', 'pincin', 'pinyiy', 'pinyinc', 'jinyin', 'pinyien', 'pinyi', 'pinyine', 'pinyqn', 'pifnyin', 'pinbyin', 'pindyin', 'xinyin', 'pinynin', 'pinyzn', 'pinyis', 'pinyimn', 'pinyih', 'ppnyin', 'pisyin', 'pvinyin', 'tinyin', 'qinyin', 'pinyifn', 'pinyik', 'pinwin', 'pinzyin', 'yinyin', 'pinysn', 'pinyini', 'pinyjin', 'pinrin', 'pinyinx', 'pinnin', 'piniyin', 'pzinyin', 'pinfyin', 'pinygin', 'pinuin', 'puinyin', 'ponyin', 'piznyin', 'kinyin', 'piyyin', 'cpinyin', 'vinyin', 'pidyin', 'pinyitn', 'pinyink', 'pinyvin', 'pinydin', 'pnnyin', 'ninyin', 'qpinyin', 'pinxin', 'pinyins', 'jpinyin', 'piiyin', 'pinycn', 'ginyin', 'bpinyin', 'pinryin', 'pinqin', 'pbnyin', 'ypinyin', 'pifyin', 'ipnyin', 'pinyinf', 'pineyin', 'pinyiw', 'pinymn', 'psinyin', 'pinqyin', 'pinyic', 'pijnyin', 'pinyirn', 'pingyin', 'pinyan', 'cinyin', 'pinypin', 'pinyrin', 'ptinyin', 'gp

# Test

In [50]:
correct('yin')

'yin'

In [51]:
correct('yign')

'ying'

In [52]:
correct('yinn')

'ying'

In [55]:
def correct_sequence_pinyin(text_pinyin):
    return ' '.join(map(correct, text_pinyin.split()))

In [56]:
correct_sequence_pinyin('zhe sih yi ge ce sho')

'zhe shi yi ge ce shi'

In [57]:
correct_sequence_pinyin('wo xiang shagn qinng hua da xue')

'wo xiang shang qing hua da xue'

# 思考题-homework？    
#### 如何在不带空格的时候完成自动修整？--> 如何完成拼音的自动分割？   
###### 提示：使用第一节课提到的语言模型!

woyaoshangqinghua
w yaoshangqinghua
wo yaoshangqinghua
woyao shangqinghua

-> DP

## n_gram构建

In [15]:
import pandas as pd
import re
import thulac
import pickle
import jieba
import pinyin

In [6]:
def token(string):return re.findall('\w+', string)


def cut(string): 
    return list(jieba.cut(string))

In [35]:
#考虑使用n-gram，先构造n-gram,这里用了豆瓣评论语料，效果不好放弃
filename="/Users/junjiexie/Downloads/movie_comments.csv"
content = pd.read_csv(filename, encoding = 'utf_8')
articles = content['comment'].tolist()
articles_clean = [''.join(token(str(a)))for a in articles]

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
#这里使用汉语新闻语料
han_filename = "/Users/junjiexie/Documents/NLP学习/nlp文本摘要项目/sqlResult_1558435.csv"
data = pd.read_csv(han_filename,encoding="GB18030")
articles = data["content"].tolist()
articles_clean = [''.join(token(str(a)))for a in articles]

In [8]:
len(articles_clean)

89611

In [560]:
TOKEN = []
for i,line in enumerate(articles_clean):
    if i % 10000 ==0 and i !=0 : 
        print(i)
    
    TOKEN += cut(line.strip())

10000
20000
30000
40000
50000
60000
70000
80000


In [53]:
# import pickle
# with open("/Users/junjiexie/Documents/NLP学习/nlp第十课/Token",'wb') as f:
#     pickle.dump(Token,f)

In [76]:
# import pickle
# with open("/Users/junjiexie/Documents/NLP学习/nlp第十课/Token",'rb') as f:
#     Token = pickle.load(f)

In [572]:
# import pickle
# with open("/Users/junjiexie/Documents/NLP学习/nlp第十课/Token_second",'wb') as f:
#     pickle.dump(TOKEN,f)

In [573]:
# import pickle
# with open("/Users/junjiexie/Documents/NLP学习/nlp第十课/Token_second",'rb') as f:
#     Token_second = pickle.load(f)

In [125]:
#经过测试词和字的n-gram，最后使用字的n-gram，并使用中文新闻语料作为数据源
Token_third = []
for i,line in enumerate(articles_clean):
    if i % 10000 ==0 and i !=0 : 
        print(i)  
    Token_third += [i for i in line]

10000
20000
30000
40000
50000
60000
70000
80000


In [16]:
def chinese_to_pinyin(character):
    return pinyin.get(character, format='strip', delimiter="")

In [126]:
TOKEN_spell = [chinese_to_pinyin(i) for i in Token_third]

In [186]:
len(TOKEN_spell)

33336215

In [127]:
TOKEN_1_GRAM = [str(t) for t in TOKEN_spell] #为了构造出2gram作准备
TOKEN_2_GRAM = [''.join(TOKEN_1_GRAM[i:i+2]) for i in range(len(TOKEN_1_GRAM[:-2]))]

In [128]:
from collections import Counter
word_count = Counter(TOKEN_1_GRAM)
words_count_2 = Counter(TOKEN_2_GRAM)

In [95]:
#使用lidstone法则平滑，感觉效果不显著，换回+1平滑
def prob_1(word):
    
    λ = 0.5
    N = len(TOKEN_1_GRAM)
    B = len(word_count)
    
    if str(word) in word_count:
        origin = word_count[word] / len(TOKEN_spell)
        return (origin + λ) / (N + B*λ)
    else: 
        origin = 1 / len(TOKEN_spell)
        return (origin + λ) / (N + B*λ)

def prob_2(word1, word2):
    
    λ = 0.3
    N = len(TOKEN_2_GRAM)
    B = len(words_count_2)
    
    if str(word1) + str(word2) in words_count_2:
        if word_count[word2] == 0:
            origin = 1 / (len(TOKEN_spell)*1000000)#例如"zhe","nde",不清楚有没有别的解决方法,目前就强制让它的概率降0
        else:
            origin = words_count_2[word1+word2] / (word_count[word2])
        return (origin + λ) / (N + B*λ)
    else:
        origin = 1 / len(TOKEN_2_GRAM)
        return (origin + λ) / (N + B*λ)

In [129]:
#目前使用+1平滑
def prob_1(word):
    
    if str(word) in word_count:
        origin = word_count[word] / len(TOKEN_spell)
        return origin 
    else: 
        origin = 1 / len(TOKEN_spell)
        return origin
    
def prob_2(word1, word2):
    μ = 5000000
    if str(word1) + str(word2) in words_count_2:
        if word_count[word2] == 0:
            origin = 1 / (len(TOKEN_spell)*1000000) #例如"zhe","nde",不清楚有没有别的解决方法,目前就强制让它的概率降0
        else:
            origin = words_count_2[word1+word2] / (word_count[word2])
        return origin
    else:
        origin = 1 / len(TOKEN_2_GRAM)
        return origin

In [132]:
prob_2("hao","nan") - prob_2("xiao","hua")

-0.0016309748730558354

In [133]:
prob_2("zhe","nde") - prob_2("zhen","de")

-0.002652221853142112

## 拼音自动分割

In [176]:
"""这个是使用了1-gram,2-gram,运用了类似beam-search的思想，第一次切分保留三种，
第二次切分各保留三种，最后共有九种，从中选择n-gram认为最有可能性的,然后保留一个最有可能的第一次、第二次切割组合，
依次递归，但问题是n-gram数据稀疏，即使平滑也很容易有偏，不知道别的同学有没有好方法解决，感觉效果还是很差"""

def spell_split(string,number=3,stay=[]):
        #这里是模糊判断是不是最后一个拼音了，不然就会强行分下去
        if prob_1(string) !=  1 / len(TOKEN_spell) and len(string) <= 3:
            stay.append(string)
            return stay
    
    
        temprorary_stay=[]
        
        possibility_split = [(string[0:i+1],string[i+1:]) for i in range(len(string)-1)]
        save_prob = sorted(possibility_split,key=lambda x: prob_1(x[0]),reverse=True)[:number]
        
        first_possible_split = [i[0] for i in save_prob]
        next_need_split = [i[1] for i in save_prob]
        
        #beam-search 第二步
        for first,next in enumerate(next_need_split):
            
            possibility_split = [(next[0:i+1],next[i+1:]) for i in range(len(next)-1)] 
            save_prob = sorted(possibility_split,key=lambda x: prob_1(x[0]),reverse=True)[:number]
            second_possible_split = [i[0] for i in save_prob]
            next_next_need_split = [i[1] for i in save_prob]
            
            for second,element in enumerate(second_possible_split):
                
                a = first_possible_split[first]
                b = second_possible_split[second]
                c = prob_2(a,b)
                #汇总9种组合
                temprorary_stay.append([a,b,c])
        #这里从9种排序，是因为有些“a”，这种拼音，频率很高，要筛掉先
        need_select = sorted(temprorary_stay,key=lambda x: x[2],reverse=True)
        

#         print(temprorary_stay)
#         print("__9个元素__")
#         print(need_select)
#         print("__3个选择元素__")
        print(sorted(temprorary_stay,key=lambda x: x[2],reverse=True))
        print("__9个元素排序__")
        
        output = []
        #筛掉“a”这种，然后取概率第一的
        for i in need_select:
            if len(i[0]) != 1 and len(i[1]) != 1:
                output.append(i)
        print(output)
        print("__输出__")
                
        #递归出口
        if len(output) >= 1:
            first_spell = output[0][0]
            second_spell = output[0][1]
            
#             print(first_spell)
#             print(second_spell)
            
            stay.append(first_spell)
            stay.append(second_spell)
        
            next_string = string[len(first_spell+second_spell):]
#             print(next_string)
        else:
            return stay
        
        return spell_split(string=next_string,stay=stay)     

In [177]:
spell_split("youdianmafanle",stay=[])

[['y', 'o', 0.009765028989929814], ['you', 'dian', 0.009097226104805796], ['you', 'di', 0.003583921614708089], ['yo', 'u', 0.0005520287054926856], ['you', 'd', 0.00029568302779420464], ['y', 'ou', 5.703530485370444e-05], ['y', 'oud', 2.999740672418869e-14], ['yo', 'ud', 2.999740672418869e-14], ['yo', 'udi', 2.999740672418869e-14]]
__9个元素排序__
[['you', 'dian', 0.009097226104805796], ['you', 'di', 0.003583921614708089], ['yo', 'ud', 2.999740672418869e-14], ['yo', 'udi', 2.999740672418869e-14]]
__输出__
[['m', 'a', 0.013257526812249645], ['ma', 'fan', 0.0052692069666696135], ['maf', 'an', 0.003298065430076586], ['maf', 'a', 0.0013955291381315415], ['ma', 'f', 0.0006622516556291391], ['ma', 'fa', 0.000231623466030703], ['maf', 'anl', 2.999740852387762e-08], ['m', 'af', 2.999740672418869e-14], ['m', 'afa', 2.999740672418869e-14]]
__9个元素排序__
[['ma', 'fan', 0.0052692069666696135], ['maf', 'an', 0.003298065430076586], ['ma', 'fa', 0.000231623466030703], ['maf', 'anl', 2.999740852387762e-08]]
__输出

['you', 'dian', 'ma', 'fan', 'le']

In [178]:
spell_split("nidongbudongwodexin",stay=[])

[['n', 'i', 0.04423787658192407], ['ni', 'dong', 0.0007845884413309982], ['n', 'id', 2.999740852387762e-08], ['n', 'ido', 2.999740852387762e-08], ['ni', 'd', 2.999740852387762e-08], ['ni', 'do', 2.999740852387762e-08], ['nid', 'o', 2.999740852387762e-08], ['nid', 'on', 2.999740852387762e-08], ['nid', 'ong', 2.999740672418869e-14]]
__9个元素排序__
[['ni', 'dong', 0.0007845884413309982], ['ni', 'do', 2.999740852387762e-08], ['nid', 'on', 2.999740852387762e-08], ['nid', 'ong', 2.999740672418869e-14]]
__输出__
[['b', 'u', 0.01849296163400497], ['bu', 'dong', 0.0061506129597197895], ['bu', 'd', 0.00029568302779420464], ['bu', 'do', 2.999740852387762e-08], ['b', 'udo', 2.999740852387762e-08], ['bud', 'o', 2.999740852387762e-08], ['bud', 'on', 2.999740852387762e-08], ['b', 'ud', 2.999740672418869e-14], ['bud', 'ong', 2.999740672418869e-14]]
__9个元素排序__
[['bu', 'dong', 0.0061506129597197895], ['bu', 'do', 2.999740852387762e-08], ['bud', 'on', 2.999740852387762e-08], ['bud', 'ong', 2.999740672418869e-1

['ni', 'dong', 'bu', 'dong', 'wo', 'de', 'xin']

In [187]:
spell_split("zhendehaonan",number=3,stay=[]) #通过number控制n*n种beam——search

[['z', 'hen', 0.09779446269357109], ['z', 'h', 0.010834236186348862], ['zhe', 'n', 0.003048695746308723], ['zhen', 'de', 0.002652221853172109], ['z', 'he', 3.500309777415301e-06], ['zhe', 'nd', 2.999740852387762e-08], ['zhen', 'd', 2.999740852387762e-08], ['zhen', 'deh', 2.999740852387762e-08], ['zhe', 'nde', 2.999740672418869e-14]]
__9个元素排序__
[['zhen', 'de', 0.002652221853172109], ['zhe', 'nd', 2.999740852387762e-08], ['zhen', 'deh', 2.999740852387762e-08], ['zhe', 'nde', 2.999740672418869e-14]]
__输出__
[['h', 'a', 0.01452384028944308], ['hao', 'n', 0.001976385774118563], ['hao', 'na', 0.0011761656998185914], ['ha', 'o', 2.999740852387762e-08], ['h', 'ao', 2.999740852387762e-08], ['ha', 'on', 2.999740672418869e-14], ['ha', 'ona', 2.999740672418869e-14], ['h', 'aon', 2.999740672418869e-14]]
__9个元素排序__
[['hao', 'na', 0.0011761656998185914], ['ha', 'on', 2.999740672418869e-14], ['ha', 'ona', 2.999740672418869e-14]]
__输出__


['zhen', 'de', 'hao', 'na', 'n']

In [189]:
spell_split("jintianzhongguorenminzhanqilaile",number=6,stay=[])

[['jin', 'tian', 0.04257555847568988], ['jint', 'i', 0.03438234964721693], ['jinti', 'an', 0.03047943689406348], ['ji', 'n', 0.013249538087484695], ['jintia', 'n', 0.004028842651312007], ['j', 'i', 0.0031358494792249974], ['jin', 'ti', 0.0017359049606170097], ['ji', 'nt', 2.999740852387762e-08], ['ji', 'ntia', 2.999740852387762e-08], ['ji', 'ntianz', 2.999740852387762e-08], ['jin', 't', 2.999740852387762e-08], ['jin', 'tia', 2.999740852387762e-08], ['jin', 'tianz', 2.999740852387762e-08], ['jin', 'tianzh', 2.999740852387762e-08], ['j', 'int', 2.999740852387762e-08], ['j', 'intia', 2.999740852387762e-08], ['jint', 'ia', 2.999740852387762e-08], ['jint', 'ianz', 2.999740852387762e-08], ['jint', 'ianzh', 2.999740852387762e-08], ['jint', 'ianzho', 2.999740852387762e-08], ['jinti', 'a', 2.999740852387762e-08], ['jinti', 'anz', 2.999740852387762e-08], ['jinti', 'anzh', 2.999740852387762e-08], ['jinti', 'anzho', 2.999740852387762e-08], ['jinti', 'anzhon', 2.999740852387762e-08], ['jintia', 'nz

['jin', 'tian', 'zhong', 'guo', 'ren', 'min', 'zhan', 'qi', 'la', 'il', 'e']