## Language Model using News corpus.

In [1]:
filename = '../Data_source/80k_articles.txt'

In [2]:
all_content = open(filename, encoding='UTF-8').read()

In [3]:
len(all_content)

34475997

In [4]:
string = all_content[0:200]

In [5]:
import re

In [6]:
def tokenize(string):
    return ''.join(re.findall('[\w]+',string))

In [7]:
tokenize(string)

'新华社照片东莞广东2017年4月7日n体育9篮球CBA总决赛第四场广东对阵新疆n4月7日广东东莞银行队球员易建联在比赛中扣篮n当日在20162017赛季中国男子篮球职业联赛CBA总决赛第四场比赛中广东东莞银行队主场迎战新疆喀什古城队n新华社记者孟永民摄n新华社北京４月１４日新媒体专电记者杨烨作为国民经济的重要支柱央企一季度交上了一份漂亮的'

## Unigram

$P(W_oW_1W_2Wn) = P(W_o) \cdot P(W_1) \cdot P(W_2) \cdot P(W_n) $

In [8]:
all_character = tokenize(all_content)

In [9]:
from collections import Counter

In [10]:
all_character_counts = Counter(all_character)

In [11]:
all_character_counts.most_common(10)

[('的', 635684),
 ('n', 605563),
 ('国', 303683),
 ('1', 285430),
 ('在', 273451),
 ('一', 255874),
 ('中', 249541),
 ('日', 248419),
 ('2', 247140),
 ('新', 243975)]

In [12]:
def get_probability_from_counts(counts):
    all_occurences = sum(counts.values())
    def get_prob(items):
        return counts[items] / all_occurences
    return get_prob  

In [13]:
get_char_prob = get_probability_from_counts(all_character_counts)

In [14]:
from functools import reduce

In [15]:
from operator import mul

In [16]:
def prob_of_string(string):
    return reduce(mul,[get_char_prob(c) for c in string])

In [17]:
pair = """前天晚上吃晚饭的时候
前天晚上吃早饭的时候""".split('\n')

pair2 = """正是一个好看的小猫
真是一个好看的小猫""".split('\n')

pair3 = """我无言以对，简直
我简直无言以对""".split('\n')

pairs = [pair, pair2, pair3]

In [18]:
def get_probability_prefromance(language_model_func, pairs):
    for (p1, p2) in pairs:
        print('*'*18)
        print('\t\t {} with probability {}'.format(p1, language_model_func(tokenize(p1))))
        print('\t\t {} with probability {}'.format(p2, language_model_func(tokenize(p2))))

In [19]:
get_probability_prefromance(prob_of_string, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 1.2207058723774045e-31
		 前天晚上吃早饭的时候 with probability 1.420433440421635e-31
******************
		 正是一个好看的小猫 with probability 3.2528612289150613e-25
		 真是一个好看的小猫 with probability 1.0220793879946632e-25
******************
		 我无言以对，简直 with probability 3.7425390630342124e-22
		 我简直无言以对 with probability 3.742539063034212e-22


In [20]:
print(prob_of_string('广州有一个地方叫做沥窖'))
print(prob_of_string('杭州有一个地方叫做西湖'))

1.2745292803369746e-36
2.0995356460752042e-33


functional programming in python

In [21]:
def add_some_num(num1):
    def _add(num2):
        return num1 + num2
    return _add
    

In [22]:
add_ten = add_some_num(10)
add_twenty = add_some_num(20)

In [23]:
print(add_ten(11))
print(add_twenty(11))

21
31


## Bigram

$$ Pr(w_ow_1w_2...w_n) = Pr(w_1 | w_0) \cdot Pr(w_2 | w_1) ... \cdot Pr(w_n | w_{n-1})  $$

$$ Pr(w_1 | w_0) = \frac{Pr(w_1 w_0)}{Pr(w_0)} $$

In [24]:
gram_length = 2
two_gram_counts = Counter([all_character[i:i+gram_length] for i in range(len(all_character)-gram_length)])

In [25]:
two_gram_counts.most_common(10)

[('新华', 135490),
 ('华社', 129104),
 ('20', 123427),
 ('nn', 118789),
 ('01', 102583),
 ('17', 81801),
 ('n新', 78433),
 ('中国', 77776),
 ('外代', 74795),
 ('7年', 59051)]

In [26]:
get_pair_prob = get_probability_from_counts(two_gram_counts)

In [27]:
def get_2gram_prob(prev,word):
    if get_pair_prob(prev+word) > 0:
        return get_pair_prob(prev+word)/get_char_prob(prev)
    else:
        return get_char_prob(prev) 

In [28]:
def get_2gram_string_prob(string):
    probability = []
    for i,c in enumerate(string):
        prev = 'n' if i == 0 else string[i-1]
        probability.append(get_2gram_prob(prev,c))
    return reduce(mul,probability)
    #return probability

In [29]:
string_pair = ['发表了重要的讲话', '发表了重要的僵化']

In [30]:
get_2gram_string_prob(string_pair[0])

6.204612960989359e-16

In [31]:
get_2gram_string_prob(string_pair[1])

5.303640669954108e-18

In [32]:
get_probability_prefromance(get_2gram_string_prob, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 2.6234499089924518e-23
		 前天晚上吃早饭的时候 with probability 6.698347779288344e-23
******************
		 正是一个好看的小猫 with probability 6.899582699458634e-20
		 真是一个好看的小猫 with probability 8.457208248607783e-21
******************
		 我无言以对，简直 with probability 8.241868374267051e-20
		 我简直无言以对 with probability 7.969727269316052e-20


## Language Model using Wikipedia data

### Data Processing

In [33]:
import re
import os
from opencc import OpenCC

In [34]:
def is_CN_char(ch):
    return ch >= u'\u4e00' and ch <= u'\u9fa5'

In [35]:
def convert2simple(word):
    openCC = OpenCC('tw2sp')
    return openCC.convert(word)

In [36]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    return cleantext

In [37]:
def cleansymbol(words):
    result = ['$']
    for word in words:
        if is_CN_char(word) or word in '，、；：':
            result.append(word)
        if word in '。？！':
            result.append(word)
            result.append('$')
    return result       

In [38]:
def data_process(data):
    clean = cleanhtml(content)
    converted = convert2simple(clean)
    line = cleansymbol(converted)
    return ''.join(line)   

In [39]:
def wiki2txt(filename):
    output = open('wiki_seg.txt', 'a', encoding='utf-8')
    for root, dirs, files in os.walk(filename):
        for filename in files:
            file_path = root + '/' + filename
            content = open(file_path, encoding='UTF-8').read()
            result = data_process(content)
            output.write(result)
    output.close()

In [40]:
def wiki2txt(filename):
    output = open('wiki_seg.txt', 'a', encoding='utf-8')
    for root, dirs, files in os.walk(filename):
        for filename in files:
            file_path = root + '/' + filename
            content = open(file_path, encoding='UTF-8').read()
            clean = cleanhtml(content)
            converted = convert2simple(clean)
            line = cleansymbol(converted)
            output.write(''.join(line))
    output.close()

In [None]:
wiki2txt("../Data_source/wiki_zh/AA")

### Unigram        

In [41]:
content = open('wiki_seg.txt', encoding='UTF-8').read()

In [42]:
char_counts = Counter(content)

In [43]:
char_counts.most_common(10)

[('，', 1733047),
 ('的', 1247374),
 ('$', 959624),
 ('。', 953526),
 ('、', 442447),
 ('年', 382463),
 ('在', 365549),
 ('为', 350941),
 ('一', 350413),
 ('国', 347783)]

In [44]:
def get_probability_from_count(counts):
    all_occurences = sum(counts.values())
    def get_prob(item):
        return counts[item]/all_occurences
    return get_prob   

In [45]:
get_char_prob = get_probability_from_count(char_counts)

In [46]:
def get_prob_of_sentence(string):
    return reduce(mul,[get_char_prob(c) for c in string])

In [47]:
get_probability_prefromance(get_prob_of_sentence, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 3.4006591032847233e-32
		 前天晚上吃早饭的时候 with probability 9.504178445822157e-32
******************
		 正是一个好看的小猫 with probability 9.242154922773662e-26
		 真是一个好看的小猫 with probability 2.883752707827827e-26
******************
		 我无言以对，简直 with probability 8.936504790621565e-22
		 我简直无言以对 with probability 8.936504790621565e-22


### Bigram

In [48]:
gram_length = 2
two_gram_counts = Counter([content[i:i+gram_length] for i in range(len(content)-gram_length)])

In [49]:
two_gram_counts.most_common(10)

[('。$', 953526),
 ('$年', 102911),
 ('年月', 79523),
 ('月日', 63989),
 ('中国', 60797),
 ('年，', 57733),
 ('一个', 55836),
 ('，并', 49650),
 ('$在', 48974),
 ('，但', 48552)]

In [50]:
get_pair_prob = get_probability_from_counts(two_gram_counts)

In [51]:
def get_2gram_prob(prev,word):
    if get_pair_prob(prev+word) > 0:
        return get_pair_prob(prev+word)/get_char_prob(prev)
    else:
        return get_char_prob(prev)

In [52]:
def get_2gram_prob_of_sentence(string):
    probability = []
    for i in range(len(string)-1):
        probability.append(get_2gram_prob(string[i],string[i+1]))
    return reduce(mul,probability)

In [53]:
get_probability_prefromance(get_2gram_prob_of_sentence, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 3.9332378822471356e-21
		 前天晚上吃早饭的时候 with probability 6.823461930353542e-22
******************
		 正是一个好看的小猫 with probability 2.787742595269777e-17
		 真是一个好看的小猫 with probability 9.6313544690318e-18
******************
		 我无言以对，简直 with probability 1.0727257212925853e-16
		 我简直无言以对 with probability 8.572784175554936e-18


### Word level bigram

In [54]:
import jieba

In [55]:
def cut(string): return list(jieba.cut(string))

In [56]:
words = cut(content)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Anan\AppData\Local\Temp\jieba.cache
Loading model cost 0.739 seconds.
Prefix dict has been built succesfully.


In [57]:
gram_length = 2

In [58]:
WordLevel_2gram_counts = Counter([words[i]+words[i+gram_length] for i in range(len(words)-gram_length)])

In [59]:
WordLevel_2gram_counts.most_common(10)

[('的，', 192291),
 ('的。', 148494),
 ('、、', 129461),
 ('$，', 97978),
 ('。年', 73416),
 ('。在', 47928),
 ('，的', 47868),
 ('$的', 46121),
 ('，在', 43134),
 ('，是', 31226)]

In [60]:
word_counts = Counter(words)

In [61]:
word_counts.most_common(10)

[('，', 1733047),
 ('的', 1236718),
 ('$', 959624),
 ('。', 953526),
 ('、', 442447),
 ('在', 321409),
 ('是', 230373),
 ('年', 207907),
 ('和', 194577),
 ('了', 160284)]

In [62]:
get_word_prob = get_probability_from_counts(word_counts)
get_pair_prob = get_probability_from_counts(WordLevel_2gram_counts)

In [63]:
def get_WordLevel_2gram_prob(prev,word):
    if get_pair_prob(prev+word) > 0:
        return get_pair_prob(prev+word)/get_word_prob(prev)
    else:
        return get_word_prob(prev) 

In [64]:
def get_WordLevel_2gram_prob_of_sentence(string):
    probability = []
    for i in range(len(string)-1):
        probability.append(get_WordLevel_2gram_prob(string[i],string[i+1]))
    return reduce(mul,probability)
    #return probability

In [65]:
def get_probability_prefromance(language_model_func, pairs):
    for (p1, p2) in pairs:
        print('*'*18)
        print('\t\t {} with probability {}'.format(p1, language_model_func(cut(p1))))
        print('\t\t {} with probability {}'.format(p2, language_model_func(cut(p2))))

In [66]:
get_probability_prefromance(get_WordLevel_2gram_prob_of_sentence, pairs)

******************
		 前天晚上吃晚饭的时候 with probability 2.7891100676209766e-22
		 前天晚上吃早饭的时候 with probability 2.5980227553882673e-26
******************
		 正是一个好看的小猫 with probability 3.3219131668687725e-18
		 真是一个好看的小猫 with probability 1.6929894627554645e-20
******************
		 我无言以对，简直 with probability 1.552385392458882e-15
		 我简直无言以对 with probability 1.0248977613776488e-09


In [None]:
string_pair = ['发表了重要的讲话', '发表了重要的僵化']

In [68]:
print(get_WordLevel_2gram_prob_of_sentence(string_pair[0]))
print(get_WordLevel_2gram_prob_of_sentence(string_pair[1]))

1.2768872047325275e-23
3.169456250569292e-30
