In [None]:
# load data

In [29]:
import sys

# pandas is imported here because the display options below need it; without this
# import the cell only works if a later cell has already been executed.
import pandas as pd

# Raise the interpreter recursion limit (presumably for the recursive tree
# traversal used later in the notebook) and echo the new value.
sys.setrecursionlimit(5000)
print(f"新しい制限: {sys.getrecursionlimit()}")

# Maximum number of rows to display (None means no limit).
pd.set_option('display.max_rows', None)

# Maximum number of columns to display (None means no limit).
pd.set_option('display.max_columns', None)

# Display width of a DataFrame, in characters (up to 500).
# This matters for columns that hold lists, such as 'OxfordWords'.
pd.set_option('display.width', 500)

新しい制限: 5000


## load 100000 data

In [52]:
# Read the Oxford word list from CSV into a DataFrame.
import pandas as pd
import pickle
oxford_data = pd.read_csv('oxford-5000.csv')

# Read the 100000-entry word list: one word per line, UTF-8 encoded.
with open('processed_data/w2v_words_100000.txt', 'r', encoding='utf-8') as words_file:
    words = words_file.read().splitlines()

# Unpickle the condensed tree object saved by the clustering run.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
tree_file_path = '../18_rapids/result/20251112_044404/condensed_tree_object.pkl'
with open(tree_file_path, 'rb') as tree_file:
    condensed_tree = pickle.load(tree_file)

# Print the size of everything that was loaded as a quick sanity check.
print(f'Oxford data length: {len(oxford_data)}')
print(f'Words length: {len(words)}')
print(f'Condensed tree length: {len(condensed_tree._raw_tree)}')


Oxford data length: 5948
Words length: 100000
Condensed tree length: 100344


## load 500000 data

In [56]:
# Replace `words` with the 500000-entry word list (one word per line, UTF-8).
with open('processed_data/w2v_words_500000.txt', 'r', encoding='utf-8') as words_file:
    words = words_file.read().splitlines()

# Replace `condensed_tree` with the tree pickled by the matching clustering run.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
tree_file_path = '../18_rapids/result/20251030_185048/condensed_tree_object.pkl'
with open(tree_file_path, 'rb') as tree_file:
    condensed_tree = pickle.load(tree_file)

# Print the sizes again; `oxford_data` is reused from the earlier load cell.
print(f'Oxford data length: {len(oxford_data)}')
print(f'Words length: {len(words)}')
print(f'Condensed tree length: {len(condensed_tree._raw_tree)}')

Oxford data length: 5948
Words length: 500000
Condensed tree length: 501890


# Cluster membership analysis

In [57]:
# Select the condensed-tree rows whose child_size is at least 2, report how many
# there are, show the first one, and keep the `child` id of each selected row.
tree_rows = condensed_tree._raw_tree
parent = list(tree_rows[tree_rows['child_size'] >= 2])
print(f'Number of nodes with child size >= 2: {len(parent)}')
print(f'Example node: {parent[0]}')
parent_ids = [row['child'] for row in parent]

Number of nodes with child size >= 2: 1890
Example node: (500000, 500001, 0.91182387, 493404)


In [None]:
# NOTE(review): `_recur` is not defined anywhere in the visible notebook and this
# cell has no execution count, so it presumably raises NameError on a fresh run.
# It looks like leftover scratch work — TODO confirm the intended name or remove the cell.
print(len(_recur))

In [13]:
# Goal of this cell: build a dictionary from each cluster id to the ids of the
# leaf nodes found beneath it in the condensed tree.
# Bind the tree's underlying row array (the private `_raw_tree` attribute) to a short name.
raw_tree = condensed_tree._raw_tree
def _recurse_leaf_dfs(cluster_tree, current_node):
  children = cluster_tree[cluster_tree['parent'] == current_node]['child']
  if len(children) == 0:
      return [current_node,]
  else:
      return sum([_recurse_leaf_dfs(cluster_tree, child) for child in children], [])
# For every id in parent_ids, collect the leaf nodes found beneath it.
cluster_point_map = {
    node_id: _recurse_leaf_dfs(raw_tree, node_id)
    for node_id in parent_ids
}

# Quick check kept for debugging:
# print(len(_recurse_leaf_dfs(raw_tree, parent_ids[0])))




In [59]:
# Take the rows with child_size == 1 and map each row's `child` id to its `parent` id.
raw_tree = condensed_tree._raw_tree
leaf_rows = raw_tree[raw_tree['child_size'] == 1]
point_cluster_map = {
    int(point_id): int(cluster_id)
    for point_id, cluster_id in zip(leaf_rows['child'], leaf_rows['parent'])
}

# Check that the smallest and largest keys are 0 and len(words) - 1.
print(f'Point indices range: {min(point_cluster_map.keys())} to {max(point_cluster_map.keys())}')

Point indices range: 0 to 499999


In [60]:
import pandas as pd
import numpy as np
import os

# Label every word index with the id of its parent cluster.
n_words = len(words)
cluster_labels = [point_cluster_map[word_idx] for word_idx in range(n_words)]

# Report how many distinct cluster ids occur among the labels.
unique_clusters = set(cluster_labels)
print(f"Number of unique clusters: {len(unique_clusters)}")

print(f"length of words: {len(words)}")
print(len(range(len(words))))

# --- 1. Prepare the Oxford word list ---
# Lower-case the Oxford words and keep them in a set for fast membership tests.
lowercased_oxford = oxford_data['word'].str.lower()
oxford_words_set = set(lowercased_oxford)

# --- 2. Identify the Word2Vec words that are Oxford words ---

# One row per Word2Vec word, with its cluster id and its position in `words`.
word_df = pd.DataFrame(dict(
    Word=words,
    ClusterID=cluster_labels,
    WordIndex=range(len(words)),
))

# Flag the rows whose lower-cased word appears in the Oxford set.
lowercased_w2v = word_df['Word'].str.lower()
word_df['IsOxford'] = lowercased_w2v.isin(oxford_words_set)

# Count the flagged rows. The match is case-insensitive and counted per row, so
# case variants of the same word are counted separately.
oxford_in_w2v_count = word_df['IsOxford'].sum()
print("\n--- 分析結果 (Oxford in W2V) ---")
print(f"Oxford-3000 (約3000語) の中でWord2Vecに含まれる単語の数: {oxford_in_w2v_count}語")

# --- 3. Examine how the Oxford words are distributed over the clusters ---


clustered_df = word_df

# For each cluster id, record its size, its Oxford-word count, the Oxford words
# themselves (the full list; truncation happens only when displaying) and the ratio.
cluster_analysis = {}
for cluster_id, group in clustered_df.groupby('ClusterID'):
    cluster_size = len(group)
    oxford_members = group.loc[group['IsOxford'], 'Word'].tolist()
    oxford_count = len(oxford_members)

    cluster_analysis[cluster_id] = {
        'TotalWordsInCluster': cluster_size,
        'OxfordCount': oxford_count,
        'OxfordWords': oxford_members,
        'OxfordRatio': oxford_count / cluster_size if cluster_size > 0 else 0
    }

# Total number of Oxford words over all clusters.
total_clustered_oxford_words = sum(
    entry['OxfordCount'] for entry in cluster_analysis.values()
)

print("\n--- 分析結果 (クラスタごとのOxford単語の分布) ---")
print(f"クラスタ化されたデータポイントに含まれるOxford単語の総数: {total_clustered_oxford_words}語\n")

# Gather the per-cluster records into a DataFrame indexed by cluster id,
# ordered from the most to the fewest Oxford words.
analysis_results = (
    pd.DataFrame.from_dict(cluster_analysis, orient='index')
    .rename_axis('ClusterID')
    .sort_values(by='OxfordCount', ascending=False)
)

print(analysis_results[['TotalWordsInCluster', 'OxfordCount', 'OxfordRatio']])

# Number of clusters to print in detail. The header announces the "top 3 clusters",
# so only that many are shown rather than every cluster.
TOP_CLUSTER_COUNT = 3
# Display limits per cluster: Oxford words listed, and example words of any kind.
MAX_OXFORD_WORDS_SHOWN = 50
MAX_EXAMPLE_WORDS_SHOWN = 10

print(f"\n--- クラスタごとの具体的なOxford単語 (上位{TOP_CLUSTER_COUNT}クラスタ) ---")
# The loop takes the first TOP_CLUSTER_COUNT ids in the index order of analysis_results.
for cluster_id in analysis_results.index[:TOP_CLUSTER_COUNT]:
    info = cluster_analysis[cluster_id]

    # Show only the first part of the list when there are too many words.
    words_list = info['OxfordWords']
    display_words = words_list[:MAX_OXFORD_WORDS_SHOWN]  # slicing copies, so the stored list is untouched
    hidden_count = len(words_list) - MAX_OXFORD_WORDS_SHOWN
    if hidden_count > 0:
        display_words.append('... (他 ' + str(hidden_count) + '語)')

    # First few words of the cluster, whether or not they are Oxford words.
    example_words = (
        word_df.loc[word_df['ClusterID'] == cluster_id, 'Word']
        .head(MAX_EXAMPLE_WORDS_SHOWN)
        .tolist()
    )

    print(f"\n### 📚 Cluster ID: {cluster_id}")
    print(f"  - クラスタ内の総単語数: {info['TotalWordsInCluster']}")
    print(f"  - Oxford単語の数: {info['OxfordCount']} ({info['OxfordRatio']:.2%})")
    print(f"  - Oxford単語リスト: {', '.join(display_words)}")
    print(f"  - クラスタ内の単語例(Oxford以外も含む): {', '.join(example_words)} ...")

# --- 4. About labelling the data points ---
# The list of words in a cluster (OxfordWords) can serve as a "summary label" for the cluster.
# For example, if animal words such as "cat", "dog" and "pet" were concentrated in Cluster ID 5,
# that cluster could be summarised with the label "animals" or "pets".
# This requires manual inspection (the lists above) or a more advanced labelling method (e.g. TF-IDF).

Number of unique clusters: 1880
length of words: 500000
500000

--- 分析結果 (Oxford in W2V) ---
Oxford-3000 (約3000語) の中でWord2Vecに含まれる単語の数: 2491語

--- 分析結果 (クラスタごとのOxford単語の分布) ---
クラスタ化されたデータポイントに含まれるOxford単語の総数: 2491語

           TotalWordsInCluster  OxfordCount  OxfordRatio
ClusterID                                               
500001                   74441          957     0.012856
500000                    6589          122     0.018516
500018                   20316          100     0.004922
500008                   10158           65     0.006399
500032                   18941           54     0.002851
500004                   11870           54     0.004549
500030                   13562           47     0.003466
500027                   11970           40     0.003342
500003                    4702           32     0.006806
500024                    6725           29     0.004312
500014                    6004           28     0.004664
500052                    8896           2

In [46]:
import hdbscan
def exemplars(cluster_id, condensed_tree):
    """Return the exemplar point ids of ``cluster_id``.

    For every leaf cluster found beneath ``cluster_id``, the exemplars are the
    children of that leaf whose ``lambda_val`` equals the leaf's maximum.

    Parameters
    ----------
    cluster_id :
        Id of the cluster to inspect.
    condensed_tree :
        Object exposing the condensed tree rows as ``_raw_tree``, with the fields
        ``'parent'``, ``'child'``, ``'lambda_val'`` and ``'child_size'``.

    Returns
    -------
    numpy.ndarray
        Integer array of exemplar ids, grouped leaf by leaf in the order the
        leaves are returned by ``hdbscan.plots._recurse_leaf_dfs``.

    Raises
    ------
    ValueError
        If a leaf has no rows in the tree (the maximum of an empty selection).
    """
    raw_tree = condensed_tree._raw_tree
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)

    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
    parent_column = raw_tree['parent']
    per_leaf_points = []
    for leaf in leaves:
        # Select the leaf's rows once and reuse them for both the maximum and the filter.
        leaf_rows = raw_tree[parent_column == leaf]
        leaf_lambdas = leaf_rows['lambda_val']
        per_leaf_points.append(leaf_rows['child'][leaf_lambdas == leaf_lambdas.max()])

    if not per_leaf_points:
        return np.array([], dtype=int)
    # `np.int` was only an alias of the builtin `int` and no longer exists in current
    # NumPy releases; `int` selects the same dtype.
    return np.concatenate(per_leaf_points).astype(int)

In [61]:
# Iterate in sorted order: `unique_clusters` is a set, whose iteration order is not
# guaranteed, so sorting makes the printed "Cluster {i}" index reproducible.
for i, c in enumerate(sorted(unique_clusters)):
    # Exemplar ids for this cluster (see `exemplars`), used as indices into `words`.
    c_exemplars = exemplars(c, condensed_tree)
    # print(f'Cluster {i} (ID: {c}) exemplars: {c_exemplars}')
    word_examples = [words[idx] for idx in c_exemplars]

    # Keep only the exemplar words that are Oxford words (case-insensitive match).
    oxford_word_examples = [w for w in word_examples if w.lower() in oxford_words_set]
    print(f'Cluster {i} (ID: {c}) Oxford exemplars: {oxford_word_examples}')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return result.astype(np.int)


Cluster 0 (ID: 500000) Oxford exemplars: ['tempt', 'persuade', 'reluctant', 'difficult', 'complicated', 'eleven', 'twelve', 'sixty', 'thirty', 'nineteen', 'Unfortunately', 'Although', 'Thus', 'LETTER', 'bike', 'motorcycle', 'bicycle', 'dad', 'husband', 'son', 'gay', 'misleading', 'annoyed', 'disappointed', 'illustrate', 'reinforce']
Cluster 1 (ID: 500001) Oxford exemplars: ['tempt', 'persuade', 'reluctant', 'difficult', 'complicated', 'eleven', 'twelve', 'sixty', 'thirty', 'nineteen', 'Unfortunately', 'Although', 'Thus', 'LETTER', 'bike', 'motorcycle', 'bicycle', 'dad', 'husband', 'son', 'gay', 'misleading', 'annoyed', 'disappointed', 'illustrate', 'reinforce']
Cluster 2 (ID: 500002) Oxford exemplars: []
Cluster 3 (ID: 500003) Oxford exemplars: ['tempt', 'persuade', 'reluctant', 'difficult', 'complicated', 'eleven', 'twelve', 'sixty', 'thirty', 'nineteen', 'Unfortunately', 'Although', 'Thus', 'LETTER', 'bike', 'motorcycle', 'bicycle', 'dad', 'husband', 'son', 'gay', 'misleading', 'anno

KeyboardInterrupt: 