In [5]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [6]:
file_name = "../80k_articles.txt"

In [7]:
# Read the whole corpus into memory; the with-block closes the file handle
# as soon as the read finishes instead of leaving it to the garbage collector.
with open(file_name, encoding="utf8") as corpus_file:
    all_strings = corpus_file.read()

In [8]:
len(all_strings)

34475997

In [9]:
def tokenize(strings):
    """Return ``strings`` with everything except word characters removed.

    Args:
        strings: Text to clean.

    Returns:
        One string made of the runs of word characters found in ``strings``
        (letters, digits, underscore, CJK characters), concatenated in
        order with nothing between them. Whitespace and punctuation are dropped.
    """
    # Drop escaped newlines stored as the two characters backslash + "n";
    # otherwise the regex below would strip the backslash but keep the "n".
    strings = strings.replace("\\n", "")
    # Raw string: "\w" / "\d" in a plain literal are invalid escape sequences
    # and trigger a DeprecationWarning (SyntaxWarning on newer Pythons).
    return "".join(re.findall(r"[\w\d]+", strings))
tokenize(all_strings)[:100]

'新华社照片东莞广东2017年4月7日体育9篮球CBA总决赛第四场广东对阵新疆4月7日广东东莞银行队球员易建联在比赛中扣篮当日在20162017赛季中国男子篮球职业联赛CBA总决赛第四场比赛中广东东莞银'

In [10]:
from collections import Counter
ALL_CHARACTERS = tokenize(all_strings)
one_gram_count = Counter(ALL_CHARACTERS)

In [11]:
# Count every overlapping two-character window of the corpus.
# A generator feeds Counter one bigram at a time, so the full list of
# slices is never held in memory alongside the counts.
two_gram_count = Counter(
    ALL_CHARACTERS[i:i + 2] for i in range(len(ALL_CHARACTERS) - 1)
)

In [12]:
two_gram_count.most_common(4)

[('新华', 135496), ('华社', 129108), ('20', 123427), ('01', 102612)]

In [13]:
one_gram_count.most_common(4)

[('的', 635684), ('国', 303683), ('1', 285430), ('在', 273451)]

In [14]:
def get_count_prob(counts):
    """Build a probability lookup function from a mapping of item counts.

    The total and the minimum count are computed once, here, so every call
    to the returned function only does a lookup and a division.

    Args:
        counts: Mapping from item to occurrence count (e.g. a Counter).

    Returns:
        A function ``get_item_prob(item)`` that returns ``count / total`` for
        items present in ``counts`` and ``min_count / total`` for any item
        that is absent.
    """
    total = sum(counts.values())
    floor_count = min(counts.values())

    def get_item_prob(item):
        # Unseen items fall back to the rarest observed count rather than
        # zero, so products of probabilities never collapse to 0.
        if item not in counts:
            return floor_count * 1.0 / total
        return counts[item] * 1.0 / total

    return get_item_prob

def one_gram_get_prob_bad_example(item):
    """Return the unigram probability of ``item``, the slow way.

    Deliberately recomputes the sum over all of ``one_gram_count`` on every
    call; it serves as the slow baseline that the notebook times against the
    closure returned by ``get_count_prob``.
    """
    total = sum(one_gram_count.values())
    return one_gram_count[item] * 1.0 / total

In [15]:
one_gram_get_prob = get_count_prob(one_gram_count)
two_gram_get_prob = get_count_prob(two_gram_count)

In [16]:
one_gram_get_prob("的")

0.02181773151797367

In [17]:
one_gram_get_prob("四年话")

3.4321662206337854e-08

In [18]:
two_gram_get_prob("新华")

0.0046504481019210625

In [19]:
two_gram_get_prob("四")

3.432166338431439e-08

In [20]:
import time
def time_consume(func, arg, times):
    """Call ``func(arg)`` ``times`` times and print the total elapsed time.

    Args:
        func: Callable taking one positional argument.
        arg: The argument passed on every call.
        times: Number of calls to make.

    Returns:
        None; the elapsed time in seconds is printed, not returned.
    """
    # perf_counter is monotonic and high-resolution, so the measurement is
    # not disturbed by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    for _ in range(times):
        func(arg)
    elapsed = time.perf_counter() - start_time
    # Not every callable has __name__ (functools.partial objects, callable
    # instances); fall back to repr so the report never raises.
    label = getattr(func, "__name__", repr(func))
    print("\t\t {} total running time is: {}".format(label, elapsed))

In [21]:
time_consume(one_gram_get_prob, "是", 10000)

		 get_item_prob total running time is: 0.010029792785644531


In [22]:
time_consume(one_gram_get_prob_bad_example, "是", 10000)

		 one_gram_get_prob_bad_example total running time is: 1.3999018669128418


# Compute the probability of sentences

In [23]:
from functools import reduce
from operator import mul, add
def one_gram_sentence_prob(sentence):
    """Score ``sentence`` under the unigram model.

    Args:
        sentence: String whose characters are scored independently.

    Returns:
        The product of ``one_gram_get_prob(char)`` over every character.
        An empty sentence yields 1.0 (the empty product).
    """
    # The explicit initial value keeps reduce() from raising TypeError on an
    # empty sentence, e.g. when tokenize() strips a punctuation-only string.
    # Multiplying by 1.0 leaves non-empty results unchanged.
    return reduce(mul, (one_gram_get_prob(char) for char in sentence), 1.0)

In [44]:
def two_gram_sentence_prob(sentence):
    """Score ``sentence`` under the bigram model with a unigram fallback.

    For each character, if the bigram (previous char + char) was seen in the
    corpus, the factor is ``two_gram_get_prob(bigram) / one_gram_get_prob(prev)``,
    an estimate of P(char | prev). Otherwise the factor falls back to
    ``one_gram_get_prob(char)``.

    Args:
        sentence: String to score.

    Returns:
        The product of the per-character factors. An empty sentence yields
        1.0 (the empty product).
    """
    # "," is prepended as a start-of-sentence placeholder so that the first
    # real character also has a predecessor. The counts are built from
    # tokenize() output, which keeps only word characters, so a bigram
    # starting with "," is not expected in two_gram_count and the first
    # character ends up scored by its unigram probability.
    sentence = "," + sentence
    probs = []
    for i in range(1, len(sentence)):
        bigram = sentence[i - 1:i + 1]
        if bigram in two_gram_count:
            probs.append(two_gram_get_prob(bigram) / one_gram_get_prob(sentence[i - 1]))
        else:
            probs.append(one_gram_get_prob(sentence[i]))
    # The initial value 1.0 keeps reduce() from raising TypeError when the
    # sentence is empty; it does not change the product otherwise.
    return reduce(mul, probs, 1.0)

In [46]:
# Uses two_gram_sentence_prob, the bigram scorer defined in this notebook
# (the previously referenced get_2_gram_string_prob is not defined here).
# NOTE: sentence_prob_estimation_comparison and sample_sentences are defined
# in cells further down; those cells must be run before this one.
sentence_prob_estimation_comparison(two_gram_sentence_prob, sample_sentences)

**********
		 前天晚上吃晚饭的时候: 1.563021270632768e-22
		 前天晚上吃早饭的时候: 3.9907985363991065e-22
**********
		 正是一个好看的小猫: 1.3976690481561745e-19
		 真是一个好看的小猫: 3.003169925034457e-20
**********
		 我无言以对，简直: 3.690883096893807e-20
		 我简直无言以对: 3.4840360609470335e-20


In [35]:
one_gram_sentence_prob("你说的对")

2.8526143980136535e-11

In [36]:
two_gram_sentence_prob("你说的对")

[0.0002513375323370121, 0.006008466681708752, 0.010366480096731563, 0.0025216932548524523]


3.947703975809707e-11

In [37]:
# Raw text: one comparison pair per line, the two sentences separated by a space.
# Kept under its own name so sample_sentences is only ever bound to the parsed list.
SAMPLE_SENTENCE_TEXT = """前天晚上吃晚饭的时候 前天晚上吃早饭的时候
正是一个好看的小猫 真是一个好看的小猫
我无言以对，简直 我简直无言以对"""
# Parse into a list of [sentence_a, sentence_b] pairs.
sample_sentences = [line.split() for line in SAMPLE_SENTENCE_TEXT.split("\n")]
sample_sentences

[['前天晚上吃晚饭的时候', '前天晚上吃早饭的时候'],
 ['正是一个好看的小猫', '真是一个好看的小猫'],
 ['我无言以对，简直', '我简直无言以对']]

In [38]:
def sentence_prob_estimation_comparison(func, pairs):
    """Print the score ``func`` assigns to each sentence of every pair.

    Args:
        func: Scoring function applied to the tokenized sentence.
        pairs: Iterable of two-element sequences of sentences to compare.

    Each pair is preceded by a line of ten asterisks; each sentence is
    printed in its original form next to the score of its tokenized form.
    """
    separator = "*" * 10
    for first, second in pairs:
        print(separator)
        for sentence in (first, second):
            score = func(tokenize(sentence))
            print("\t\t {}: {}".format(sentence, score))


In [39]:
sentence_prob_estimation_comparison(one_gram_sentence_prob, sample_sentences)

**********
		 前天晚上吃晚饭的时候: 1.495549535936482e-31
		 前天晚上吃早饭的时候: 1.7402460500283889e-31
**********
		 正是一个好看的小猫: 3.9051374808390024e-25
		 真是一个好看的小猫: 1.2270306802427597e-25
**********
		 我无言以对，简直: 4.31418912791861e-22
		 我简直无言以对: 4.31418912791861e-22


In [47]:
sentence_prob_estimation_comparison(two_gram_sentence_prob, sample_sentences)

**********
		 前天晚上吃晚饭的时候: 1.563021270632768e-22
		 前天晚上吃早饭的时候: 3.9907985363991065e-22
**********
		 正是一个好看的小猫: 1.3976690481561745e-19
		 真是一个好看的小猫: 3.003169925034457e-20
**********
		 我无言以对，简直: 3.690883096893807e-20
		 我简直无言以对: 3.4840360609470335e-20
