In [5]:
# tutorial00
# Count how often each whitespace-separated word occurs in a text file.
from collections import defaultdict
DEBUG = True

# Missing keys start at 0, so a count can be incremented without a membership check.
d = defaultdict(int)

# In debug mode a short sample text is used instead of the full corpus.
if DEBUG:
    input_path = "../../test/00-input.txt"
else:
    input_path = "../../data/wiki-en-train.word"

with open(input_path) as corpus:
    # Walk the file line by line and flatten each line into its words.
    for token in (w for row in corpus for w in row.split()):
        d[token] += 1

# Print one "<word> <count>" pair per line, in first-seen order.
for word, count in d.items():
    print(word, count)

a 1
b 2
c 2
d 1


In [6]:
# tutorial01 train
# Estimate unigram probabilities (count / total tokens) and save them to a model file.
from collections import defaultdict
DEBUG = False

# Missing keys start at 0.
d = defaultdict(int)

# Data paths: DEBUG switches both the training input and the model output file.
if DEBUG:
    train_data_path = "../../test/01-train-input.txt"
    save_model_path = "train-input-model.txt"
else:
    train_data_path = "../../data/wiki-en-train.word"
    save_model_path = "wiki-en-train-model.txt"

total_count = 0

# Read the training text.
with open(train_data_path) as corpus:
    for sentence in corpus:
        # Each line contributes its words plus one end-of-sentence token.
        tokens = sentence.split() + ["</s>"]
        total_count += len(tokens)
        for token in tokens:
            d[token] += 1

# Write one "<word> <relative frequency>" pair per line.
with open(save_model_path, mode='w') as model_file:
    model_file.writelines(
        "{} {}\n".format(token, count / total_count)
        for token, count in d.items()
    )
print("Saved model in {}".format(save_model_path))

Saved model in wiki-en-train-model.txt


In [7]:
# tutorial01 test
# Evaluate the saved unigram model on test text: per-token entropy and coverage.
import math
from collections import defaultdict
DEBUG = False
UNKNOWN_RATE = 0.05  # interpolation weight given to the unknown-word term
N = 1000000          # unknown words get the uniform probability 1/N

# word -> model probability as a float; words absent from the model read back as 0.0.
d = defaultdict(float)

# Data paths: DEBUG selects the small fixtures instead of the Wikipedia data.
model_path = "train-input-model.txt" if DEBUG else "wiki-en-train-model.txt"
test_data_path =  "../../test/01-test-input.txt" if DEBUG else "../../data/wiki-en-test.word"

# Load the model: one "<word> <probability>" pair per line.
with open(model_path) as f:
    for line in f:
        word, prob = line.split()
        # Parse once here instead of calling float() on every test token.
        d[word] = float(prob)

# Evaluate and report.
total_word_count = 0
total_unknown_count = 0
h = 0
# Loop-invariant parts of P(w) = (1 - UNKNOWN_RATE) * P_ML(w) + UNKNOWN_RATE / N.
unknown_p = UNKNOWN_RATE / N
known_weight = 1 - UNKNOWN_RATE
with open(test_data_path) as f:
    for line in f:
        words = line.split()
        # The end-of-sentence symbol is scored like any other token.
        words.append("</s>")
        for word in words:
            total_word_count += 1
            p = unknown_p
            # Membership test rather than d[word]: indexing a defaultdict with a
            # missing key inserts it, which would add every unknown test word
            # to the model dictionary.
            if word in d:
                p += known_weight * d[word]
            else:
                total_unknown_count += 1
            h += -math.log(p, 2)

# A test file with no lines yields zero tokens; fail with a clear message
# instead of a bare ZeroDivisionError.
if total_word_count == 0:
    raise ValueError("no tokens found in {}".format(test_data_path))

print("entropy = {}".format(h/total_word_count))
print("coverage = {}".format((total_word_count - total_unknown_count)/total_word_count))

entropy = 10.527337238682652
coverage = 0.895226024503591


In [9]:
# Display the contents of ../../test/01-test-answer.txt via the shell.
!cat ../../test/01-test-answer.txt

entropy = 6.709899
coverage = 0.800000

# Let's check this answer!
#
# The general probability is:
#  P(x) = lambda_1 P_ML(x) + lambda_unk 1/1000000
# 
# So the probability for each word is
#  P(a) = 0.95*0.25 + 0.05*1/1000000    = .23750005
#  P(c) = 0.95*0.125 + 0.05*1/1000000   = .11875005
#  P(e) = 0.95*0 + 0.05*1/1000000       = .00000005
#  P(</s>) = 0.95*0.25 + 0.05*1/1000000 = .23750005
#
# Thus, the entropy for a single word is:
#  H(a)     = -log_2( .23750005 ) = 2.07400
#  H(c)     = -log_2( .11875005 ) = 3.07399
#  H(e)     = -log_2( .00000005 ) = 24.2534
#  H(</s>)  = -log_2( .23750005 ) = 2.07400
#
# Taking the entropy of the total corpus, we get:
#  ( H(a) + H(c) + H(</s>) + H(e) + H(</s>) ) / 5
#  = 6.7098
#
# For the coverage, only one of the words "e" out of 5 is missing, so our
# coverage is 4/5 = 0.8
#
# Note that we are dividing by 5, including the sentence final symbol. It is
# also possible (and maybe more correct) to divide by 3, the total number of
# actual wor

0.5