# 第9章: ベクトル空間法 (I)

In [42]:
import sys, os
from pathlib import Path
from tqdm import tqdm_notebook, tqdm
import re

### 80. コーパスの整形

文を単語列に変換する最も単純な方法は，空白文字で単語に区切ることである．ただ，この方法では文末のピリオドや括弧などの記号が単語に含まれてしまう． そこで，コーパスの各行のテキストを空白文字でトークンのリストに分割した後，各トークンに以下の処理を施し，単語から記号を除去せよ．
- トークンの先頭と末尾に出現する次の文字を削除: `.,!?;:()[]'"`
- 空文字列となったトークンは削除

In [3]:
token_list = []
def get_corpus():
    if Path('./data10/corpus80.txt').exists():
        return
    with open('./data10/enwiki-20150112-400-r10-105752.txt', 'r') as f_in, \
         open('./data10/corpus80.txt', 'w') as f_out:
        corpus = []
        for line in f_in.readlines():
            line = [word.strip('.,!?:;()[]\'\"') for word in line.split() \
                       if word.strip('.,!?:;()[]\'\"')!='']
            if len(line) >= 1:
                corpus + line
                f_out.write(' '.join(line)+'\n')
get_corpus()

In [5]:
for line in zip(range(5), open('./data/corpus100_80.txt')):
    print(*line, end='')

0 Anarchism
1 Anarchism is a political philosophy that advocates stateless societies often defined as self-governed voluntary institutions but that several authors have defined as more specific institutions based on non-hierarchical free associations Anarchism holds the state to be undesirable unnecessary or harmful While anti-statism is central anarchism entails opposing authority or hierarchical organisation in the conduct of human relations including but not limited to the state system
2 As a subtle and anti-dogmatic philosophy anarchism draws on many currents of thought and strategy Anarchism does not offer a fixed body of doctrine from a single particular world view instead fluxing and flowing as a philosophy There are many types and traditions of anarchism not all of which are mutually exclusive Anarchist schools of thought can differ fundamentally supporting anything from extreme individualism to complete collectivism Strains of anarchism have often been divided into the categor

### 81. 複合語からなる国名への対処
英語では，複数の語の連接が意味を成すことがある．例えば，アメリカ合衆国は"United States"，イギリスは"United Kingdom"と表現されるが，"United"や"States"，"Kingdom"という単語だけでは，指し示している概念・実体が曖昧である．そこで，コーパス中に含まれる複合語を認識し，複合語を1語として扱うことで，複合語の意味を推定したい．しかしながら，複合語を正確に認定するのは大変むずかしいので，ここでは複合語からなる国名を認定したい

~~国名データは[ここ](https://www.worldometers.info/geography/alphabetical-list-of-countries/)から手に入れる~~
マン島がなくてダメだった．

~~腹が立ったけど[ISO 3166-1に存在する国](http://www.asahi-net.or.jp/~ax2s-kmtn/ref/iso3166-1.html)から取得することにした．~~


結局 `pip install pycounter`で対処

In [18]:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

def get_countriy_name():
    url = 'https://www.listofcountriesoftheworld.com/'
    save_path = './data/countries.txt'
    if os.path.exists(save_path):
        return
    # Setup Driver
    print('Setup WebDriver')
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--blink-settings=imageEnabled=false')
    browser = webdriver.Chrome(executable_path='./chromedriver', options=options)
    browser.set_page_load_timeout(10*62)
    
    # Scraping
    try:
        browser.get(url)
        table_contents = browser.find_element_by_xpath('//*[@id="left-col"]')
        rows = table_contents.find_elements_by_xpath('//*[@id="ctry"]') # //*[@id="ctry"]
        with open(save_path, 'w') as f:
            for row in rows:
                content = row.find_element_by_tag_name('a')
                print(content)
                f.write(content.text+'\n')
#        countries = '\n'.join([row.find_elements_by_tag_name('td')[1].text for row in rows])
        print('Scraping is success.')
    except Exception as ERROR:
        print(f'[TimeoutException]: {ERROR}')
    browser.close()

get_countriy_name()

In [7]:
import pycountry
# from tqdm import tqdm_notebook, tqdm
countries = list(pycountry.countries)
countries_compound = [country.name for country in countries if ' ' in country.name]
with open('./data10/countries.txt', 'w') as f:
    f.write('\n'.join(['_'.join(name.split()) for name in countries_compound]))

In [8]:
with open('./data10/corpus81.txt', 'w')as f_81:
    for line in open('./data10/corpus80.txt'):
        for name in countries_compound:
            if name in line:
                line = line.replace(name, '_'.join(name.split()))
        f_81.write(line)

In [10]:
n = 0
for line in open('./data10/corpus81.txt'):
    for ctry in open('./data10/countries.txt'):
        if ctry in line:
            print(line)
            n += 1
            continue
    if n == 7:
        break

The song is often included in songbooks in a wide variety of religious congregations in the United_States

Politics of American_Samoa

United_States

New_Zealand

Nevertheless his patronage led to the expansion of Buddhism in the Mauryan empire and other kingdoms during his rule and worldwide from about 250 BCE Prominent in this cause were his son Mahinda Mahendra and daughter Sanghamitra whose name means friend of the Sangha who established Buddhism in Ceylon now Sri_Lanka

Other common names are ear shells sea ears and muttonfish or muttonshells in Australia ormer in Great Britain and in New_Zealand

The haliotid family has a worldwide distribution along the coastal waters of every continent except the Pacific coast of South America the East Coast of the United_States the Arctic and Antarctica The majority of abalone species are found in cold waters such as off the coasts of New_Zealand South_Africa Australia Western North America and Japan



### 82. 文脈の抽出
81で作成したコーパス中に出現するすべての単語tに関して，単語tと文脈語cのペアをタブ区切り形式ですべて書き出せ．ただし，文脈語の定義は次の通りとする．

ある単語tの前後d単語を文脈語cとして抽出する（ただし，文脈語に単語tそのものは含まない）
単語tを選ぶ度に，文脈幅dは{1,2,3,4,5}の範囲でランダムに決める．

In [11]:
from tqdm import tqdm_notebook
import random

with open('./data10/answer82.txt', 'w') as f_82:
    for line in tqdm_notebook(open('./data10/corpus81.txt')):
        words = line.split()
        for index, t in enumerate(words):
            d = random.randint(1, 5)
            start = max(index - d, 0)
            end = index + d + 1
            for c in words[start:index] + words[index+1:end]:
                f_82.write(f'{t}\t{c}\n')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [12]:
for i, context in enumerate(open('./data10/answer82.txt')):
    if i == 10:
        break
    print(context)

Anarchism	is

Anarchism	a

Anarchism	political

is	Anarchism

is	a

is	political

a	Anarchism

a	is

a	political

a	philosophy



### 83. 単語／文脈の頻度の計測
82の出力を利用し，以下の出現分布，および定数を求めよ．

- f(t,c): 単語tと文脈語cの共起回数
- f(t,∗): 単語tの出現回数
- f(∗,c): 文脈語cの出現回数
- N: 単語と文脈語のペアの総出現回数

In [13]:
from collections import Counter
import pickle
from tqdm import tqdm_notebook

# f(t,c)
f_tc_counter = Counter()
f_t_counter = Counter()
f_c_counter = Counter()

tc_list = []
t_list = []
c_list = []
num = 1000000
for i ,line in enumerate(tqdm_notebook(open('./data10/answer82.txt'))):
    tc = line.strip()
    t, c = tc.split('\t')
    tc_list.append(tc)
    t_list.append(t)
    c_list.append(c)
    
    if i % num == 0:
        f_tc_counter.update(tc_list)
        tc_list = []
        f_t_counter.update(t_list)
        t_list = []
        f_c_counter.update(c_list)
        c_list = []
f_tc_counter.update(tc_list)
f_t_counter.update(t_list)
f_c_counter.update(c_list)

with open('./data10/f_tc_counter.pkl', 'wb') as counter_file:
    pickle.dump(f_tc_counter, counter_file)
with open('./data10/f_t_counter.pkl', 'wb') as counter_file:
    pickle.dump(f_t_counter, counter_file)
with open('./data10/f_c_counter.pkl', 'wb') as counter_file:
    pickle.dump(f_c_counter, counter_file)
print(f'N = {sum([i for i in f_t_counter.values()])}')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




OSError: [Errno 22] Invalid argument

mac環境下で `pickle.dump`で4GB以上のデータを保存しようとするとエラーが出てしまったので， [Qiita](https://qiita.com/NomuraS/items/da3fd3a1ecd76175e5f8) を丸パクリすることで回避

In [16]:
class MacOSFile(object):

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        # print("reading total_bytes=%s" % n, flush=True)
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                batch_size = min(n - idx, 1 << 31 - 1)
                # print("reading bytes [%s,%s)..." % (idx, idx + batch_size), end="", flush=True)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                # print("done.", flush=True)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, 1 << 31 - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size

def pickle_dump(obj, file_path):
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)
def pickle_load(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))

"""
with open('./data10/f_tc_counter.pkl', 'wb') as counter_file:
    pickle_dump(f_tc_counter, counter_file)
with open('./data10/f_t_counter.pkl', 'wb') as counter_file:
    pickle_dump(f_t_counter, counter_file)
with open('./data10/f_c_counter.pkl', 'wb') as counter_file:
    pickle_dump(f_c_counter, counter_file)
"""
pickle_dump(f_tc_counter, './data10/f_tc_counter.pkl')
pickle_dump(f_t_counter, './data10/f_t_counter.pkl')
pickle_dump(f_c_counter, './data10/f_c_counter.pkl')
print(f'N = {sum([i for i in f_t_counter.values()])}')

writing total_bytes=2491441306...
writing bytes [0, 1073741824)... done.
writing bytes [1073741824, 2147483648)... done.
writing bytes [2147483648, 2491441306)... done.
writing total_bytes=25914858...
writing bytes [0, 25914858)... done.
writing total_bytes=25914780...
writing bytes [0, 25914780)... done.
N = 689461055


### 84. 単語文脈行列の作成
83の出力を利用し，単語文脈行列XXを作成せよ．ただし，行列XXの各要素XtcXtcは次のように定義する．

- $f(t,c) \geq 10$ ならば， $X_{tc} = PPMI(t,c) = \max\{\log {\frac{N \times f(t,c)}{f(t,∗)\times f(∗,c)}},0\}$
- $f(t,c)<10f(t,c)<10ならば，Xtc=0Xtc=0 $
ここで，PPMI(tt,cc)はPositive Pointwise Mutual Information（正の相互情報量）と呼ばれる統計量である．なお，行列XXの行数・列数は数百万オーダとなり，行列のすべての要素を主記憶上に載せることは無理なので注意すること．幸い，行列XXのほとんどの要素は0になるので，非0の要素だけを書き出せばよい．

In [17]:
from collections import Counter, OrderedDict
from scipy import sparse, io
import pickle
import numpy as np
with open('./data10/f_tc_counter.pkl', 'rb') as counter_file:
    f_tc_counter = pickle_load(counter_file)
with open('./data10/f_t_counter.pkl', 'rb') as counter_file:
    f_t_counter = pickle_load(counter_file)
with open('./data10/f_c_counter.pkl', 'rb') as counter_file:
    f_c_counter = pickle_load(counter_file)
N = sum([i for i in f_t_counter.values()])

index_t = OrderedDict((key, i) for i, key in enumerate(f_t_counter.keys()))
index_c = OrderedDict((key, i) for i, key in enumerate(f_c_counter.keys()))

size_t = len(index_t)
size_c = len(index_c)
matrix = sparse.lil_matrix((size_t, size_c))

for k, f_tc in f_tc_counter.items():
    if f_tc >= 10:
        t, c = k.split('\t')
        ppmi = max(np.log((N * f_tc) / (f_t_counter[t] * f_c_counter[c])), 0)
        matrix[index_t[t], index_c[c]] = ppmi
matrix = matrix.tocsc()
sparse.save_npz('./data10/matrix.npz', matrix)

In [18]:
with open('./data10/index_t.pkl', 'wb') as index_file:
    pickle.dump(index_t, index_file)

### 85. 主成分分析による次元削減

In [19]:
from pathlib import Path
from sklearn.decomposition import TruncatedSVD

load_path = './data10/matrix.npz'
save_path = './data10/matrix_low85'

if not Path(save_path).exists():
    word_matrix = sparse.load_npz(load_path)
    pca = TruncatedSVD(n_components=300)
    word_matrix300 = pca.fit_transform(word_matrix)
    np.save(save_path, word_matrix300)

In [20]:
# word_matrix300 = np.load(save_path+'.npy')
print(word_matrix300.shape)

(1808580, 300)


### 86. 単語ベクトルの表示

85で得た単語の意味ベクトルを読み込み，"United States"のベクトルを表示せよ．ただし，"United States"は内部的には"United_States"と表示されていることに注意せよ．

In [None]:
with open('./data10/index_t.pkl', 'rb') as index_file:
    index_t = pickle.load(index_file)
word_vector = np.load('./data10/matrix_low85.npy')

In [32]:
index = index_t['United_States']
United_States_vector = word_vector[index]
print(United_States_vector.shape)

(300,)


### 87. 単語の類似度

85で得た単語の意味ベクトルを読み込み，"United States"と"U.S."のコサイン類似度を計算せよ．ただし，"U.S."は内部的に"U.S"と表現されていることに注意せよ．

> 同じような文脈で使われていることがわかる

In [36]:
import numpy as np
def cosine_simlarity(vec1, vec2):
    norm_ab = np.linalg.norm(vec1)*np.linalg.norm(vec2)
    if norm_ab != 0:
        return np.dot(vec1, vec2)/norm_ab
    else: return -1

In [37]:
United_States_vector = word_vector[index_t['United_States']]
US_vector = word_vector[index_t['U.S']]
cosine_simlarity(United_States_vector, US_vector)

0.8494296352198178

### 88. 類似度の高い単語10件
85で得た単語の意味ベクトルを読み込み，"England"とコサイン類似度が高い10語と，その類似度を出力せよ．

>イングランドの地名が多い．オーストラリアやニュージーランドはなぜ？  
>きちんとスコットランド，ウェールズ，アイルランドが入っているのがすごい．



In [40]:
import numpy as np
England_vector = word_vector[index_t['England']]
# cosine_simularities = [cosine_simularity(England_vector, word_vector[index]) or 0 for index in index_t.values()]
cosine_similarities = []
for t in index_t:
    cosine_similarities.append((t, cosine_simlarity(England_vector, word_vector[index_t[t]])))
sorted(cosine_similarities, key=lambda x: x[1], reverse=True)[1:11]

[('Scotland', 0.7758510008207012),
 ('Wales', 0.7127774052389219),
 ('Australia', 0.6510417216056822),
 ('Ireland', 0.6438686528431701),
 ("Yard's", 0.6185058232994105),
 ('Yorkshire', 0.6118639381145149),
 ('Somerset', 0.6001176415423898),
 ('Britain', 0.5865217724281352),
 ('New_Zealand', 0.5751876740255513),
 ('Wiltshire', 0.5570767464552019)]

### 89. 加法構成性によるアナロジー
85で得た単語の意味ベクトルを読み込み，vec("Spain") - vec("Madrid") + vec("Athens")を計算し，そのベクトルと類似度の高い10語とその類似度を出力せよ．

>**スペイン（国名） - マドリード（首都） $\fallingdotseq$ ギリシャ - アテネ（ギリシャの首都）**  
>なる関係が成り立つと考えられるので，結果に **Greece**が欲しかった．

In [41]:
import numpy as np
wvector = word_vector[index_t['Spain']] + word_vector[index_t['Madrid']] + word_vector[index_t['Athens']]
cosine_similarities = []
for t in index_t:
    cosine_similarities.append((t, cosine_simlarity(wvector, word_vector[index_t[t]])))
sorted(cosine_similarities, key=lambda x: x[1], reverse=True)[:10]    


[('Spain', 0.9047012984375865),
 ('Portugal', 0.8242625537918804),
 ('Sweden', 0.7904240780085151),
 ('Italy', 0.7859304303471885),
 ('Netherlands', 0.7766299141193055),
 ('France', 0.7760652060704972),
 ('Norway', 0.7730416508507796),
 ('Belgium', 0.7718286086726056),
 ('Argentina', 0.7701562582024696),
 ('Denmark', 0.765747133804346)]

### ゴミ置き場

In [4]:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

def get_countriy_name():
    url = 'https://www.worldometers.info/geography/alphabetical-list-of-countries/'
    save_path = './data/countries.txt'
    if os.path.exists(save_path):
        return
    # Setup Driver
    print('Setup WebDriver')
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--blink-settings=imageEnabled=false')
    browser = webdriver.Chrome(executable_path='./chromedriver', options=options)
    browser.set_page_load_timeout(10*62)
    
    # Scraping
    try:
        browser.get(url)
        table_contents = browser.find_element_by_xpath('//table[@class="table table-hover table-condensed"]/tbody')
        rows = table_contents.find_elements_by_tag_name('tr')
        countries = '\n'.join([row.find_elements_by_tag_name('td')[1].text for row in rows])
        with open(save_path, 'w') as f:
            f.write(countries)
        print('Scraping is success.')
        print(f'Save files at "{save_path}"')
    except Exception as ERROR:
        print(f'[TimeoutException]: {ERROR}')
    browser.close()

get_countriy_name()