In [92]:
# install
import nltk
import numpy as np
import pandas as pd

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

from nltk.metrics import jaccard_distance
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

from itertools import combinations


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kondoutaisyou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kondoutaisyou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kondoutaisyou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [93]:
# https://docs.google.com/spreadsheets/d/1j0d_LVpF_3EWX8xgFIfE6LxGUGC4Ieq1wQ8ROFsbOyo/edit#gid=1294886113
# sampleの文書を利用した。
df = pd.read_csv("wiki.csv")
df = df[:10]



In [94]:
# 文章を集合化
stop_words = set(stopwords.words('english')) 
def to_set(tmp):
    return set([w for w in word_tokenize(tmp) if not w in stop_words])
df['text_set'] = df['Abstract'].map(to_set)

In [95]:
# 全部は必要ないので、10個のみ対象とする
tmp_df = df[:10]
category = ["アジア","北アメリカ","ヨーロッパ","アジア","アジア","アジア","ヨーロッパ","ユーラシア","ヨーロッパ","ヨーロッパ"]
tmp_df['category'] = category
tmp_df

Unnamed: 0,Name,Abstract,text_set,category
0,Japan,Japan is an island country in East Asia. Locat...,"{influential, Upper, fourth-largest, maintaine...",アジア
1,United States,"The United States of America (USA), commonly k...","{organizations, culminating, left, 1964, PPP, ...",北アメリカ
2,England,England is a country that is part of the Unite...,"{inhabited, 7, Upper, 14, However, 1801, Angli...",ヨーロッパ
3,China,"China, officially the People's Republic of Chi...","{disputed, replaced, trillion, fourth-largest,...",アジア
4,India,"India, also known as the Republic of India,[19...","{Maldives, Bengal, era, exchange, pluralistic,...",アジア
5,Korea,Korea is a region in East Asia.[3] Since 1948 ...,"{collapsed, Sino-Japanese, 7, era, However, mo...",アジア
6,Germany,"Germany, officially the Federal Republic of Ge...","{inhabited, replaced, care, influential, fourt...",ヨーロッパ
7,Russia,"Russia, or the Russian Federation[12], is a tr...","{wide, Republic, 19, 12, twelfth, number, Coll...",ユーラシア
8,France,"France, officially the French Republic, is a c...","{7, fourth-largest, 843, culminating, Republic...",ヨーロッパ
9,Italy,"Italy, officially the Italian Republic,[10][11...","{Marino, influential, location, Italian, homel...",ヨーロッパ


In [103]:
result_dic = {}

count = 0
for one, two in combinations(range(10), 2):
    distance = jaccard_distance(tmp_df['text_set'][one],tmp_df['text_set'][two])
    # print("{}:{} 類似度:{}".format(tmp_df['Name'][one],tmp_df['Name'][two], distance))
    result_dic[count] = {
        "Jaccard係数": distance,
        "groups": "{}:{}".format(tmp_df['category'][one],tmp_df['category'][two])
    }
    count += 1

print(result_dic)


{0: {'Jaccard係数': 0.884083044982699, 'groups': 'アジア:北アメリカ'}, 1: {'Jaccard係数': 0.8910675381263616, 'groups': 'アジア:ヨーロッパ'}, 2: {'Jaccard係数': 0.8610567514677103, 'groups': 'アジア:アジア'}, 3: {'Jaccard係数': 0.904862579281184, 'groups': 'アジア:アジア'}, 4: {'Jaccard係数': 0.8667992047713717, 'groups': 'アジア:アジア'}, 5: {'Jaccard係数': 0.8531073446327684, 'groups': 'アジア:ヨーロッパ'}, 6: {'Jaccard係数': 0.8647260273972602, 'groups': 'アジア:ユーラシア'}, 7: {'Jaccard係数': 0.8451730418943534, 'groups': 'アジア:ヨーロッパ'}, 8: {'Jaccard係数': 0.8363636363636363, 'groups': 'アジア:ヨーロッパ'}, 9: {'Jaccard係数': 0.9012345679012346, 'groups': '北アメリカ:ヨーロッパ'}, 10: {'Jaccard係数': 0.8438095238095238, 'groups': '北アメリカ:アジア'}, 11: {'Jaccard係数': 0.8985801217038539, 'groups': '北アメリカ:アジア'}, 12: {'Jaccard係数': 0.8899253731343284, 'groups': '北アメリカ:アジア'}, 13: {'Jaccard係数': 0.8637992831541219, 'groups': '北アメリカ:ヨーロッパ'}, 14: {'Jaccard係数': 0.8158347676419966, 'groups': '北アメリカ:ユーラシア'}, 15: {'Jaccard係数': 0.8539130434782609, 'groups': '北アメリカ:ヨーロッパ'}, 16: {'Jaccard係数':

In [104]:
"""
ユークリッド距離を計算するメソッド
値が小さければ似ていると考えることができる
"""
def euclidean_distance(list_a, list_b):
  diff_vec = np.array(list_a) - np.array(list_b)
  return np.linalg.norm(diff_vec)

In [105]:
# 文章をベクトル化
docs = tmp_df['Abstract'].to_numpy()

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bag = count.fit_transform(docs)

In [107]:
bag = count.fit_transform(docs)
vector_list = bag.toarray()


In [108]:
count = 0
for one, two in combinations(range(10), 2):
    distance = euclidean_distance(vector_list[one],vector_list[two])
    print("{}:{} 類似度:{}".format(tmp_df['Name'][one],tmp_df['Name'][two], distance))
    result_dic[count]['ユークリッド距離'] = distance
    count += 1

Japan:United States 類似度:60.81940479813988
Japan:England 類似度:42.30839160261236
Japan:China 類似度:42.8485705712571
Japan:India 類似度:48.249352327259274
Japan:Korea 類似度:48.23898838076935
Japan:Germany 類似度:43.255057507764334
Japan:Russia 類似度:55.48873759602033
Japan:France 類似度:43.393547907494266
Japan:Italy 類似度:60.63002556489647
United States:England 類似度:57.92236183029832
United States:China 類似度:67.22350779303324
United States:India 類似度:79.55501241279521
United States:Korea 類似度:57.463031594234565
United States:Germany 類似度:53.75872022286245
United States:Russia 類似度:49.01020301937138
United States:France 類似度:56.815490845367165
United States:Italy 類似度:56.418082207746124
England:China 類似度:42.14261501141095
England:India 類似度:45.9782557302906
England:Korea 類似度:42.67317658670374
England:Germany 類似度:41.46082488325576
England:Russia 類似度:54.17564028232615
England:France 類似度:42.930175867331364
England:Italy 類似度:59.07622195096772
China:India 類似度:42.261093218230876
China:Korea 類似度:50.547007824400445
China:G

In [113]:
result_df = pd.DataFrame.from_dict(result_dic, orient='index')
result_df.groupby('groups').mean()

Unnamed: 0_level_0,Jaccard係数,ユークリッド距離
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
アジア:アジア,0.893588,48.228771
アジア:ユーラシア,0.859853,60.71658
アジア:ヨーロッパ,0.876185,53.418151
アジア:北アメリカ,0.884083,60.819405
ユーラシア:ヨーロッパ,0.840627,46.982521
ヨーロッパ:アジア,0.900316,43.598016
ヨーロッパ:ユーラシア,0.865534,49.470849
ヨーロッパ:ヨーロッパ,0.83741,46.359353
北アメリカ:アジア,0.877438,68.080517
北アメリカ:ユーラシア,0.815835,49.010203
