In [2]:
# 80. コーパスの整形
import re

with open("./enwiki-20150112-400-r100-10576.txt", "r") as rf, open("./enwiki-cleaned.txt", "w") as wf:
    for line in rf:
        tokens = [re.sub(r"^[\.,!?;:\(\)\[\]\'\"]+|[\.,!?;:\(\)\[\]\'\"]+$", "", token) for token in line.split() if re.sub(r"^[\.,!?;:\(\)\[\]\'\"]+|[\.,!?;:\(\)\[\]\'\"]+$", "", token) is not ""]
        wf.write(" ".join(tokens) + "\n")

In [42]:
# 81. 複合語からなる国名への対処
with open("Countries.txt") as f:
    country_dict = {}
    for line in f:
        country_dict[line.replace("\n", "")] = line.replace(" ", "_").replace("\n", "")
        
# 以降の問いでも使うのでテキストを配列にする
sentences = []
with open("enwiki-cleaned.txt") as f:
    for line in f:
        rep_line = line
        for key in country_dict:
            rep_line = rep_line.replace(key, country_dict[key])
        sentences.append(rep_line.split())

In [43]:
# 82. 文脈の抽出
import random

with open("context_words.txt", "w") as f:
    for sentence in sentences:
        for pos, token in enumerate(sentence):
            context_word = []
            window_size = random.randint(1, 5)
            for idx in range(1, window_size + 1):
                # 単語tより前の単語
                p = pos - idx
                if p >= 0:
                    context_word.append(sentence[p])

                #単語tより後の単語
                p = pos + idx
                if p < len(sentence):
                    context_word.append(sentence[p])
            f.write("{0}\t{1}\n".format(token, "\t".join(context_word)))

In [44]:
# 83. 単語／文脈の頻度の計測
co_occur_dict = {} # "t:::c"をkeyにして共起回数を保持する辞書
word_dict = {} # tをkeyにして出現回数を保持する
con_word_dict = {} # cをkeyにして出現回数を保持する
with open("context_words.txt") as f:
    for line in f:
        tokens = line.replace("\n", "").split("\t")
        # tの出現回数を計算
        if tokens[0] not in word_dict:
            word_dict[tokens[0]] = 1
        else:
            word_dict[tokens[0]] += 1
        
        for con_word in tokens[1:]:
            # cの出現回数を計算
            if con_word not in con_word_dict:
                con_word_dict[con_word] = 1
            else:
                con_word_dict[con_word] += 1
                
            # tとcの共起回数を計算
            if "{}:::{}".format(tokens[0], con_word) not in co_occur_dict:
                co_occur_dict["{}:::{}".format(tokens[0], con_word)] = 1
            else:
                co_occur_dict["{}:::{}".format(tokens[0], con_word)] += 1
                
N = len(co_occur_dict)

In [45]:
print(N)

21314489


In [53]:
# 84. 単語文脈行列の作成
import math

# 行にt,列にcが並んだ行列
# ti成分にはPPMI値が入る
# {t1: [[c1, 1.44],[c2, 0.031]], t2: [[c2, 9.14], [c3, 3.22]]...}
word_context_matrix = {}

for co_occur in co_occur_dict:
    t, c = co_occur.split(":::")
    f_tc = co_occur_dict[co_occur]
    f_t = word_dict[t]
    f_c = con_word_dict[c]
    if f_tc >= 10:
        PPMI = max([math.log((N * f_tc) / (f_t * f_c)), 0])
        if PPMI > 0:
            if t not in word_context_matrix:
                word_context_matrix[t] = [[c, PPMI]]
            else:
                word_context_matrix[t].append([c, PPMI])

In [54]:
# 列全てが0でない列を抽出
column_list = []
for key in word_context_matrix:
    for column in word_context_matrix[key]:
        if column[0] not in column_list:
            column_list.append(column[0])
print(len(column_list))

32574


In [55]:
# どの単語が列の何番目に対応するのかを保持する
word2idx = {}
for idx, column in enumerate(column_list):
    word2idx[column] = idx

In [67]:
# 各行が32462列のデータを保持するようにする
# {t1: [0, 4.2, 0, 1.2,....,7.12], t2: [0, 0, 0,..]...}
matrix = {}

for key in word_context_matrix:
    row_list = [0] * len(column_list)
    for c_list in word_context_matrix[key]:
        row_list[word2idx[c_list[0]]] = c_list[1]
    matrix[key] = row_list

In [72]:
# 85. 主成分分析による次元圧縮
# 主成分分析をかけるために行列を用意する
X = []
for key in matrix:
    X.append(matrix[key])
print(len(X))

32620


In [73]:
print(len(X[0]))

32574


In [None]:
# 計算が終わらなかったため断念
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)

In [74]:
# 86. 単語ベクトルの表示
# 主成分分析による次元圧縮がうまくいかなかったため、32574次元の意味ベクトルで進める
print(matrix["United_States"])

[0, 0, 0.5838847055311968, 0.4169088804015991, 0.03981118027898801, 0.03189168989156353, 0.5672482219469697, 0, 1.4705071396155291, 0.4712612506528778, 0.21099697915524232, 0, 0.2689451291002665, 0, 0, 1.3249174059155036, 0.609438936936924, 0, 0, 0.770879188525087, 0, 0.7063748000487534, 0, 0.6131630465667256, 0, 0.12591209551881394, 0, 0, 0, 0.11409436288645651, 0.038132913121730116, 0, 0, 1.5635786711479525, 0.2498348716276897, 1.169190135911999, 0, 0.1440293296039631, 0.3021603305587908, 0.11079575491953074, 1.132011798671005, 0, 0.13530115826132022, 0, 0.2581763399342427, 0, 0.5067645266549139, 0, 1.3804555234305682, 0, 0, 0.3569699013202161, 0.5022331298502186, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3717324625495729, 0, 0, 0, 0, 0, 0, 0, 0, 0.3123326287870508, 0.07984516977999997, 0, 0, 0, 0, 0, 0.05841945483841541, 0.3075306211630111, 0, 0, 0.1618651432314188, 0, 0.9514734790919326, 0, 0.2604465644850462, 0, 0.45539962211301294, 2.6329920896741412, 0, 0, 0, 2.082520670

In [77]:
# 87. 単語の類似度
# やはり次元圧縮しないといい数値にならない

def calc_cos_similarity(v1, v2):
    denominator = sum([a * b for a,b in zip(v1, v2)])
    numerator = (math.pow(sum([math.pow(a, 2) for a in v1]), 0.5)) * (math.pow(sum([math.pow(b, 2) for b in v2]), 0.5))
    return denominator / numerator

v_united_states = matrix["United_States"]
v_US = matrix["U.S"]
print(calc_cos_similarity(v_united_states, v_US))

0.5086421961697336


In [80]:
# 88. 類似度の高い単語10件
v_England = matrix["England"]
v_list = []
for key in matrix:
    similarity = calc_cos_similarity(v_England, matrix[key])
    v_list.append([key, similarity])
sorted_list = sorted(v_list, key=lambda x:x[1], reverse=True)

for vector in sorted_list[:10]:
    print("{}\t{}".format(vector[0], vector[1]))

England	1.0
Australia	0.3201335523954833
Wales	0.2820730440527197
Ireland	0.2608013360723768
Scotland	0.245248969741504
United_Kingdom	0.24514218074143942
France	0.23502350981647657
Italy	0.22617367621532639
Germany	0.2167831269599782
Japan	0.20928290108115288


In [83]:
# 89. 加法構成性によるアナロジー

def calc_vector(v1, v2, operand="add"):
    result = []
    if operand == "add":
        for a, b in zip(v1, v2):
            result.append(a+b)
    elif operand == "substract":
        for a, b in zip(v1, v2):
            result.append(a-b)
    return result
        
v_Spain = matrix["Spain"]
v_Madrid = matrix["Madrid"]
v_Athens = matrix["Athens"]

result = calc_vector(calc_vector(v_Spain, v_Madrid, operand="substract"), v_Athens, operand="add")

In [84]:
v_list = []
for key in matrix:
    similarity = calc_cos_similarity(result, matrix[key])
    v_list.append([key, similarity])
sorted_list = sorted(v_list, key=lambda x:x[1], reverse=True)

for vector in sorted_list[:10]:
    print("{}\t{}".format(vector[0], vector[1]))

Spain	0.6085779567838507
Athens	0.5061297508515469
Macedonians	0.3042732904012962
Lisbon	0.28986176010836134
Finland	0.2860736197533424
Yugoslavia	0.2775023750070958
Denmark	0.2711560350430096
Netherlands	0.2704541724609527
Turkey	0.2551952993733486
Isabella	0.22902135683231228
