In [1]:
import numpy as np

# Read every document's tf-idf unit vector file and collect the vectors in a list.
all_tf_idf_unit_vector = []
for i in range(1095):
    path = f"./output/{i + 1}.txt"
    with open(path, 'r') as file:
        content = file.read().split()[2:]  # drop the first two whitespace-separated tokens
    # The remaining tokens are consumed pairwise: an integer index followed by its float weight.
    tf_idf_unit_vector = {}
    for j in range(0, len(content), 2):
        tf_idf_unit_vector[int(content[j])] = float(content[j + 1])
    all_tf_idf_unit_vector.append(tf_idf_unit_vector)

# Cosine similarity between two sparse tf-idf vectors.
def cosine(doc_x, doc_y):
    """Return the dot product of two sparse vectors given as ``{index: weight}`` dicts.

    The value equals the cosine similarity only when both inputs are already
    unit-length; this function relies on that and does not normalise or check.
    """
    # Gather every index that occurs in either document.
    shared_axes = list(set(list(doc_x) + list(doc_y)))

    # Expand both documents onto those indices, using 0.0 where an index is absent.
    dense_x = np.array([doc_x.get(axis, 0.00) for axis in shared_axes])
    dense_y = np.array([doc_y.get(axis, 0.00) for axis in shared_axes])

    # Unit vectors: the inner product is the cosine, no division by norms needed.
    return np.dot(dense_x, dense_y)

# Build the full pairwise similarity matrix C.
n = len(all_tf_idf_unit_vector)
C = np.zeros((n, n))

# Each pair is evaluated once (j >= i, diagonal included) and mirrored into the
# lower triangle, which halves the number of cosine() calls.
for i, doc_i in enumerate(all_tf_idf_unit_vector):
    for j in range(i, n):
        sim = cosine(doc_i, all_tf_idf_unit_vector[j])
        C[i, j] = C[j, i] = sim

# Merge one cluster into another inside the cluster list.
def update_cluster_hierarchy(cluster, idx1, idx2):
    """Append the members of ``cluster[idx2]`` to ``cluster[idx1]`` in place,
    then leave a fresh empty list at position ``idx2``.
    """
    cluster[idx1].extend(cluster[idx2])  # absorb the members of the second cluster
    cluster[idx2] = []  # the absorbed slot stays in the list, but empty
    
# Refresh the similarity matrix after two clusters have been merged.
def update_similarity_matrix(similarity_matrix, idx1, idx2):
    """Fold cluster ``idx2`` into cluster ``idx1`` in ``similarity_matrix`` (in place).

    Complete-link rule: the merged cluster's similarity to any other cluster is
    the smaller of the two similarities the merged clusters had to it.
    Row/column ``idx2`` and the ``(idx1, idx1)`` cell are set to ``-inf`` so a
    later argmax cannot select them.
    """
    linkage = np.minimum(similarity_matrix[idx1], similarity_matrix[idx2])
    # Mask the merged cluster against itself and against the absorbed slot.
    linkage[[idx1, idx2]] = -np.inf

    similarity_matrix[idx1, :] = linkage
    similarity_matrix[:, idx1] = linkage

    # Retire the absorbed cluster entirely.
    similarity_matrix[idx2, :] = -np.inf
    similarity_matrix[:, idx2] = -np.inf
    
# Locate the largest entry of the similarity matrix.
def find_max_position(test_C):
    """Return the index tuple of the maximum of ``test_C``.

    ``np.argmax`` works on the flattened array, so ties resolve to the first
    occurrence in row-major order.
    """
    flat_idx = np.argmax(test_C)
    return np.unravel_index(flat_idx, test_C.shape)

# Write a clustering result to a text file.
def write_to_file(cluster, k):
    """Write the non-empty groups of ``cluster`` to ``./<k>.txt``.

    Every group is emitted as: a newline, its members in ascending order one
    per line, and a closing newline. Empty groups are skipped.
    """
    file_name = f"./{k}.txt"
    with open(file_name, "w") as f:
        for group in cluster:
            if not group:
                continue  # slots emptied by merging produce no output
            lines = '\n'.join(str(doc_id) for doc_id in sorted(group))
            f.write('\n' + lines + '\n')

# Cut the hierarchy at 8, 13 and 20 clusters and write each result to "./<k>.txt".
for k in [8, 13, 20]:
    test_C = np.copy(C)  # work on a copy so every k starts from the original similarities

    # Mask self-similarity with -inf rather than 0. With a 0 diagonal, once the
    # best remaining inter-cluster similarity is also 0, argmax can return a
    # diagonal cell (i, i); "merging" cluster i with itself would then empty it
    # and silently drop its document from the output.
    np.fill_diagonal(test_C, -np.inf)

    clusters = [[i + 1] for i in range(len(test_C))]  # one singleton cluster per document, ids are 1-based

    # Simple HAC: every merge removes exactly one cluster, so going from
    # len(test_C) singletons down to k clusters takes len(test_C) - k merges
    # (range() is empty when k is already >= the number of documents).
    for _ in range(len(test_C) - k):
        # Locate the most similar pair once per merge (each call is a full matrix scan).
        row, col = find_max_position(test_C)
        # The lower index survives and absorbs the higher one.
        min_pos, max_pos = min(row, col), max(row, col)
        update_cluster_hierarchy(clusters, min_pos, max_pos)  # merge the member lists
        update_similarity_matrix(test_C, min_pos, max_pos)  # complete-link update of the matrix

    clusters = [group for group in clusters if group]  # drop the slots emptied by merging

    for group in clusters:
        group.sort()  # ascending document ids inside each cluster

    # Persist this cut of the hierarchy.
    write_to_file(clusters, k)

In [2]:
# Display the clusters list left by the final pass of the loop in the previous cell (k = 20).
clusters

[[1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  30,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  43,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  69,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  84,
  85,
  87,
  88,
  89,
  90,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  101,
  103,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  114,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  124,
  125,
  126,
  127,
  128,
  183,
  226,
  232,
  259,
  631,
  632,
  649,
  664,
  688,
  689,
  712,
  729,
  773],
 [11,
  19,
  29,
  113,
  115,
  169,
  278,
  301,
  316,
  317,
  321,
  324,
  325,
  338,
  341,
  357,
  369,
  372,
  377,
  381,
  383,
  384,
  386,
  388,
  389,
  396,
  400,
  402,
  405,
  419,
  422,
  423,
  425,
  429,