# 伪代码

```python
# Main training loop: every later indented snippet in this note is part of this loop body.
# G is presumably one batch of graphs yielded by the loader — TODO confirm.
for G in dataloader:
```
### &emsp;数据增强
```python
    # Build two augmented views of the same batch with the augmentations T1 and T2.
    G1, G2 = T1(G), T2(G)
```

### &emsp;表示计算
```python
    # Encode both views with a single pass through the GNN encoder.
    # NOTE(review): later slicing ([:N] / [-N:]) presumes the view-1 rows come first in z — confirm cat's axis.
    z = GNN(cat(G1, G2))
```

### &emsp;prototypes打分计算
```python
    # Score every embedding against the prototypes, then split the rows into the two views.
    # NOTE(review): DBSCAN(...).fit(model.prototypes) and
    # torch.mm(model.prototypes, model.prototypes.t()) further down both treat
    # model.prototypes as one prototype per row; if that is the layout, this
    # product needs model.prototypes.t() — confirm the intended shape.
    prot_scores = torch.mm(z, model.prototypes)
    prot_scores1, prot_scores2 = prot_scores[:N], prot_scores[-N:]
```

### &emsp;根据Sinkhorn计算Q1, Q2
```python
    # The Sinkhorn codes Q1/Q2 are computed with gradient tracking disabled.
    with torch.no_grad():
        Q1 = Sinkhorn(prot_scores1)
        Q2 = Sinkhorn(prot_scores2)
```

### &emsp;根据打分归一化得到预测概率P1, P2；为各图选择概率最大的prototypes作为原型assignment，即得到prot_assign
```python
    # Predicted distribution of each view: softmax of the temperature-scaled prototype scores.
    # NOTE(review): softmax is written without an axis — presumably it normalises over prototypes; confirm.
    P1 = softmax(prot_scores1/ temp)
    # Disabled hard prototype assignment: prot_assign1 = torch.argmax(P1, dim=1)
    P2 = softmax(prot_scores2/ temp)
    # Disabled hard prototype assignment: prot_assign2 = torch.argmax(P2, dim=1)
```

### &emsp;采用DBSCAN对model.prototypes进行聚类，得到各样本的聚类assignment，即得到cluster_assign
```python
    # NOTE: the label array returned by DBSCAN may contain -1; -1 marks the sample at
    # that position as noisy (it cannot be clustered with any other sample under the
    # chosen eps and min_samples).
    # prot_2_cluster_dict is assumed here to hold cluster indices that are consecutive
    # integers in 0..(cluster_num-1).
    # NOTE(review): if DBSCAN is scikit-learn's, fit() returns the fitted estimator
    # (labels live in .labels_), so the tuple unpacking below is shorthand for some
    # post-processing step — confirm.
    prot_2_cluster_dict, cluster_num = DBSCAN(eps=epsilon, min_samples=min_samples).fit(model.prototypes) 
    
    # Disabled alternative that aggregates probabilities instead of scores:
    # cluster_P1 = prob_agg(prot_2_cluster_dict, P1, cluster_num)
    # cluster_P2 = prob_agg(prot_2_cluster_dict, P2, cluster_num)
    # Aggregate prototype-level scores into cluster-level scores, one result per view.
    cluster_score1 = score_agg(prot_2_cluster_dict, prot_scores1, cluster_num)
    cluster_score2 = score_agg(prot_2_cluster_dict, prot_scores2, cluster_num)
    cluster_score = cat([cluster_score1, cluster_score2])
    # Cluster-level probabilities: softmax of the temperature-scaled cluster scores.
    cluster_P1 = softmax(cluster_score1/ temp)
    cluster_P2 = softmax(cluster_score2/ temp)
    cluster_P = cat([cluster_P1, cluster_P2])
    # Hard cluster assignment: the most probable cluster of every sample.
    cluster_assign1 = torch.argmax(cluster_P1, dim=1)
    cluster_assign2 = torch.argmax(cluster_P2, dim=1)
    # Disabled alternative that maps the hard prototype assignment through the dict:
    # cluster_assign1 = get_cluster_assign(prot_2_cluster_dict, prot_assign1)
    # cluster_assign2 = get_cluster_assign(prot_2_cluster_dict, prot_assign2)
    cluster_assign = cat([cluster_assign1, cluster_assign2])

    # Compute cluster_Q1 and cluster_Q2 with Sinkhorn (no gradient tracking).
    with torch.no_grad():
        cluster_Q1 = Sinkhorn(cluster_score1)
        cluster_Q2 = Sinkhorn(cluster_score2)
```

### &emsp;计算clustering consistency loss，基于（P,Q）或者（cluster_P， cluster_Q）
```python
    # Consistency loss in math shorthand (not executable Python): each view's Sinkhorn
    # code is paired with the log of the OTHER view's predicted distribution, and the
    # two directions are averaged. The trailing comment is the prototype-level variant.
    L_cluster_consistency = -0.5(cluster_Q1 log cluster_P2 + cluster_Q2 log cluster_P1) # L_cluster_consistency = -0.5(Q1 logP2 + Q2 logP1)
```

### &emsp;根据prototypes/聚类概率，计算各样本的信息熵并判断各样本是否可信
```python
    # Information entropy of each sample's cluster-level distribution, one result per view.
    # The trailing comments show the prototype-level alternative.
    info_ent1 = get_info_ent(cluster_P1) # info_ent1 = get_info_ent(P1)
    info_ent2 = get_info_ent(cluster_P2) # info_ent2 = get_info_ent(P2)
    info_ent = cat([info_ent1, info_ent2])
    info_ent_avg = mean(info_ent)

    # The pacing function turns the average entropy and the progress t/T into the
    # number of samples to treat as reliable.
    reliab_num = reliab_pacing(info_ent_avg, args.info_ent_threshold, args.reliab_pacing_type, sample_num, t, T) # pacing function
    # Reliability mask over all samples of both views, plus the indices of the reliable ones.
    reliab_mask, reliab_idx = get_relib_mask(info_ent1, info_ent2, reliab_num)
```

### &emsp;根据样本是否可信、聚类结果以及是否来自同个锚图，选定【待选正样本对】pos_cand_mask; 进一步根据正样本对的【双向KL散度】或【JS散度】确定要选取的正样本对reliab_pos_mask
```python
    # Number of anchor graphs; integer division so the value can be used as a
    # slice bound and in range().
    graph_num = sample_num // 2
    # A positive pair is only trustworthy when BOTH views of the anchor graph are reliable.
    reliab_mask = torch.as_tensor(reliab_mask, dtype=torch.bool)
    pos_cand_mask = reliab_mask[:graph_num] & reliab_mask[-graph_num:]

    # ... and when both views of the anchor graph fall into the same cluster.
    # (Prototype-level alternative: compare prot_assign1 with prot_assign2.)
    assgin_mask = torch.as_tensor(cluster_assign1 == cluster_assign2, dtype=torch.bool)
    pos_cand_mask = pos_cand_mask & assgin_mask
    pos_cand_num = int(pos_cand_mask.count_nonzero())

    # Divergence (bidirectional KL or JS) between the two views of every candidate
    # positive pair; positions that are NOT candidate positive pairs are set to inf.
    # get_div presumably returns a scalar — it is converted with float() because the
    # value is only used for ranking, not for back-propagation.
    is_candidate = pos_cand_mask.tolist()
    pos_div_values = [
        float(get_div(cluster_P1[i], cluster_P2[i])) if is_candidate[i] else float("inf")
        for i in range(graph_num)
    ]
    pos_div = torch.tensor(pos_div_values)

    if pos_cand_num > 0:
        # Average over candidates only; multiplying inf by a 0 mask would give NaN.
        cand_div_sum = sum(d for d, keep in zip(pos_div_values, is_candidate) if keep)
        pos_cand_div_avg = cand_div_sum / pos_cand_num
        reliab_pos_num = int(reliab_pacing(pos_cand_div_avg, args.pos_div_threshold, args.pos_reliab_pacing_type, pos_cand_num, t, T))
    else:
        # No candidates: nothing to average and nothing to select.
        pos_cand_div_avg = 0.0
        reliab_pos_num = 0
    reliab_pos_mask, reliab_pos_idx = get_reliab_pos_mask(pos_div, reliab_pos_num)
```

### &emsp;根据样本是否可信、是否被选作正样本、聚类结果是否不同以及是否来自不同的锚图，选定【待选负样本对】neg_cand_mask; 进一步根据负样本对的【双向KL散度】或【JS散度】确定要选取的负样本对reliab_neg_mask
```python
    # Candidate-negative mask, initialised to all False.
    neg_cand_mask = torch.zeros(sample_num, sample_num).bool()
    # Divergence (bidirectional KL or JS) of every candidate negative pair; positions
    # that are not candidate negative pairs stay 0.
    neg_div = torch.zeros(sample_num, sample_num)
    # Negatives are only chosen for anchors i that own a reliable positive pair.
    for i in reliab_pos_idx:
        i = int(i)
        i_cluster_idx = int(cluster_assign[i])  # cluster of sample i
        # Only reliable samples j are eligible as negatives.
        for j in reliab_idx:
            j = int(j)
            j_cluster_idx = int(cluster_assign[j])  # cluster of sample j
            not_self = (i != j)
            # The two views of one graph sit graph_num rows apart, in either direction.
            not_same_graph = (abs(i - j) != graph_num)
            # Prototype-level alternative: prot_assign[i] != prot_assign[j]
            not_same_cluster = (i_cluster_idx != j_cluster_idx)
            if not_self and not_same_graph and not_same_cluster:
                neg_cand_mask[i, j] = True
                # Only the probabilities on the two samples' own clusters matter
                # for the divergence of a negative pair.
                ij_cluster_idx = [i_cluster_idx, j_cluster_idx]
                score_i = cluster_score[i][ij_cluster_idx]
                score_j = cluster_score[j][ij_cluster_idx]
                p_i = softmax(score_i/ temp)
                p_j = softmax(score_j/ temp)
                # get_div presumably returns a scalar; the value is only used for ranking.
                neg_div[i, j] = float(get_div(p_i, p_j))

    neg_cand_num = int(neg_cand_mask.count_nonzero())
    if neg_cand_num > 0:
        # Tensor.sum() gives the scalar total; the builtin sum() would add rows.
        neg_cand_div_avg = neg_div.sum() / neg_cand_num
        reliab_neg_num = int(reliab_neg_pacing(neg_cand_div_avg, args.neg_div_threshold, args.neg_reliab_pacing_type,neg_cand_num, t, T))
    else:
        # No candidates: nothing to average and nothing to select.
        neg_cand_div_avg = 0.0
        reliab_neg_num = 0
    reliab_neg_mask, reliab_neg_row_idx, reliab_neg_col_idx = get_reliab_neg_mask(neg_div, reliab_neg_num)
```

### &emsp;计算负样本权重：使得距离适中的负样本权重最大
```python
    # Two ways to judge whether a negative pair sits at a "moderate" distance:
    #   1) (not used) take the per-row mean / std over ALL sample pairs, compute the
    #      Gaussian weight from them and mask it with reliab_neg_mask afterwards;
    #   2) (used below) take the per-row mean / std over the reliable negative pairs only.
    eps = 1e-12  # guards the divisions flagged as divide-by-zero risks

    # Pairwise dissimilarity matrix.
    sample_dist = 1 - torch.mm(z, z.t().contiguous())
    neg_mask = torch.as_tensor(reliab_neg_mask).to(sample_dist)  # same dtype/device as the distances
    neg_count = neg_mask.sum(dim=1, keepdim=True)
    safe_count = neg_count.clamp(min=1)  # rows without negatives would otherwise be 0/0

    # Per-row statistics, kept as column vectors so that row i uses mu[i] / std[i].
    mu = (sample_dist * neg_mask).sum(dim=1, keepdim=True) / safe_count
    sq_dev = torch.pow(sample_dist - mu, 2)
    std = torch.sqrt((sq_dev * neg_mask).sum(dim=1, keepdim=True) / safe_count)
    reweight = torch.exp(-sq_dev / (2 * torch.pow(std, 2)).clamp(min=eps)).to(device)
    reweight = reweight * neg_mask

    # Per-row normalisation coefficient of the weight matrix, shape (rows, 1).
    # NOTE(review): the numerator is the TOTAL number of reliable negatives, as in the
    # original; a per-row count (neg_count) may have been intended — confirm.
    reweight_normalize = neg_mask.sum() / reweight.sum(dim=1, keepdim=True).clamp(min=eps)
    reweight = reweight * reweight_normalize

    # Weighted similarity of the selected negative pairs.
    sim_matrix = torch.exp(torch.mm(z, z.t().contiguous()) / temperature)
    sim_matrix = (sim_matrix * reweight) * neg_mask

    # Similarity of the positive pairs: row i of the first half of z is paired with
    # row i of the second half (the two views of the same graph).
    half = z.size(0) // 2
    pos_sim = torch.exp(torch.sum(z[:half] * z[-half:], dim=-1) / temperature)
    pos_sim = pos_sim * reliab_pos_mask
    pos_sim = torch.cat([pos_sim, pos_sim], dim=0)

    # Contrastive loss L_contrastive, evaluated on the anchors that own a reliable positive pair.
    L_contrastive = -(torch.log((pos_sim / (pos_sim + sim_matrix.sum(dim=-1)))[reliab_pos_idx])).mean()


```

### &emsp;计算聚类正则化项L_cluster_reg
```python
    # Invert the prototype -> cluster mapping into cluster -> list of prototypes.
    cluster_2_protList_dict = {cluster: [] for cluster in range(cluster_num)}
    for prot, cluster in prot_2_cluster_dict.items():
        cluster_2_protList_dict[cluster].append(prot)

    # prot_pos_mask / prot_neg_mask are both prototype_num x prototype_num
    # (prototype_num: total number of prototypes).
    prot_pos_mask = torch.zeros(prototype_num, prototype_num)
    for cluster in range(cluster_num):
        prot_idx = torch.tensor(cluster_2_protList_dict[cluster], dtype=torch.long)
        # Prototypes of the same cluster are positives of each other (the diagonal is
        # included here). One combined index is required: chaining [rows][:, cols]
        # would assign into a temporary copy and leave the mask unchanged.
        prot_pos_mask[prot_idx[:, None], prot_idx] = 1
    # Prototypes that belong to different clusters are negatives of each other.
    prot_neg_mask = torch.ones(prototype_num, prototype_num) - prot_pos_mask

    prot_pos_mask = prot_pos_mask.bool()
    diag = torch.arange(prototype_num)
    prot_pos_mask[diag, diag] = False  # a prototype is not its own positive
    prot_neg_mask = prot_neg_mask.bool()

    # Weighting as in option 2 of the sample-level loss: per-row mean / std over the
    # negative prototype pairs only (option 1, statistics over all pairs, would also work).
    eps = 1e-12  # guards the divisions flagged as divide-by-zero risks
    prot_dist = 1 - torch.mm(model.prototypes, model.prototypes.t().contiguous())
    neg_mask_ = prot_neg_mask.to(prot_dist)  # same dtype/device as the distances
    pos_mask_ = prot_pos_mask.to(prot_dist)
    safe_count_ = neg_mask_.sum(dim=1, keepdim=True).clamp(min=1)

    # Per-row statistics, kept as column vectors so that row i uses mu_[i] / std_[i].
    mu_ = (prot_dist * neg_mask_).sum(dim=1, keepdim=True) / safe_count_
    sq_dev_ = torch.pow(prot_dist - mu_, 2)
    std_ = torch.sqrt((sq_dev_ * neg_mask_).sum(dim=1, keepdim=True) / safe_count_)
    prot_reweight = torch.exp(-sq_dev_ / (2 * torch.pow(std_, 2)).clamp(min=eps)).to(device)
    prot_reweight = prot_reweight * neg_mask_

    # Per-row normalisation coefficient of the weight matrix, shape (rows, 1).
    # NOTE(review): the numerator is the TOTAL number of negative pairs, as in the
    # original; a per-row count may have been intended — confirm.
    prot_reweight_normalize = neg_mask_.sum() / prot_reweight.sum(dim=1, keepdim=True).clamp(min=eps)
    prot_reweight = prot_reweight * prot_reweight_normalize

    # Prototype similarity, split into the weighted negative part and the positive part.
    prot_sim_matrix = torch.exp(torch.mm(model.prototypes, model.prototypes.t().contiguous()) / temperature)
    prot_neg_sim_matrix = (prot_sim_matrix * prot_reweight) * neg_mask_
    prot_pos_sim_matrix = prot_sim_matrix * pos_mask_

    # L_cluster_reg: the numerator sums several positive pairs, so it is divided by
    # the number of positives of the row. Rows without any positive (a cluster made
    # of a single prototype) are skipped, otherwise they would contribute 0/0 = NaN.
    pos_count = pos_mask_.sum(dim=-1)
    has_pos = pos_count > 0
    pos_sum = prot_pos_sim_matrix.sum(dim=-1)
    neg_sum = prot_neg_sim_matrix.sum(dim=-1)
    if bool(has_pos.any()):
        ratio = (pos_sum[has_pos] / pos_count[has_pos]) / (pos_sum[has_pos] + neg_sum[has_pos])
        L_cluster_reg = -(torch.log(ratio)).mean()
    else:
        # No prototype has a positive: the term is zero but stays attached to the graph.
        L_cluster_reg = prot_sim_matrix.sum() * 0
    
    
```

### &emsp;计算最终的损失函数L
```python
# Total objective: the contrastive loss plus the consistency and cluster-regularisation
# terms, weighted by args.lambda_1 and args.lambda_2.
# NOTE(review): this line sits at column 0 while the other per-batch snippets are
# indented under the dataloader loop — presumably it belongs inside the loop; confirm.
L = L_contrastive + args.lambda_1 * L_cluster_consistency + args.lambda_2 * L_cluster_reg
```

### 函数定义
```python
# def get_cluster_assign(prot_2_cluster_dict, prot_assign):
#     cluster_assign = [prot_2_cluster_dict[i] for i in prot_assign]
#     return cluster_assign

# def prob_agg(prot_2_cluster_dict, prot_prob, cluster_num):
#     cluster_prob = [0 for i in cluster_num]
#     for prot in range(len(prot_prob)):
#         cluster = prot_2_cluster_dict[prot]
#         cluster_prob[cluster] = max(cluster_prob[cluster], prot_prob[prot])
#     return cluster_prob

def score_agg(prot_2_cluster_dict, prot_score, cluster_num):
    """Aggregate prototype scores into cluster scores by taking the maximum.

    Args:
        prot_2_cluster_dict: mapping prototype index -> cluster index, with
            cluster indices in 0..cluster_num-1.
        prot_score: one score per prototype (a 1-D sequence), indexed by
            prototype index.
        cluster_num: number of clusters.

    Returns:
        A list of length ``cluster_num`` whose entry c is the largest score
        among the prototypes mapped to cluster c. A cluster without any
        prototype keeps ``-inf``.

    NOTE(review): the call sites in this note appear to pass a whole score
    matrix; this helper aggregates a single score vector — confirm whether it
    is meant to be applied per sample.
    """
    # Start from -inf, not 0, so that negative scores are not clamped to 0.
    cluster_score = [float("-inf")] * cluster_num
    for prot, score in enumerate(prot_score):
        cluster = prot_2_cluster_dict[prot]
        if score > cluster_score[cluster]:
            cluster_score[cluster] = score
    return cluster_score

def reliab_pacing(avg, threshold, pacing_type, sample_num, t, T):
    """Return how many samples are treated as reliable at step ``t`` of ``T``.

    Args:
        avg: average uncertainty measure of the current batch (entropy or
            divergence); lower means a more confident model.
        threshold: reference value ``avg`` is compared against.
        pacing_type: one of "logarithmic", "polynomial_1", "polynomial_2",
            "polynomial_3".
        sample_num: number of samples to choose from.
        t: current step.
        T: total number of steps; must be positive.

    Returns:
        An int in ``[0, sample_num]``, so it can be used directly as a slice bound.

    Raises:
        ValueError: if ``T`` is not positive or ``pacing_type`` is unknown.
    """
    import math

    if T <= 0:
        raise ValueError("T must be positive")
    avg = float(avg)
    sample_num = int(sample_num)

    # Relative capacity of the model: at least 1, larger when the average is
    # below the threshold. A non-positive average counts as unlimited capacity
    # instead of dividing by zero.
    relative_cap = max(1, threshold / avg) if avg > 0 else math.inf

    step = t / T
    # Evaluate the product only for step > 0 so that inf * 0 cannot yield NaN.
    progress = relative_cap * step if step > 0 else 0.0

    if pacing_type == "logarithmic":
        fraction = 1 + 0.1 * math.log(progress + math.exp(-10))
    elif pacing_type == "polynomial_1":
        fraction = progress
    elif pacing_type == "polynomial_2":
        fraction = progress ** 2
    elif pacing_type == "polynomial_3":
        fraction = progress ** 3
    else:
        raise ValueError(f"unknown pacing_type: {pacing_type!r}")

    # Never select fewer than zero or more than the available samples.
    fraction = min(1.0, max(0.0, fraction))
    return int(fraction * sample_num)

def get_relib_mask(info_ent1, info_ent2, reliab_num):
    """Mark the ``reliab_num`` samples with the LOWEST information entropy as reliable.

    Args:
        info_ent1: 1-D per-sample entropies of the first view.
        info_ent2: 1-D per-sample entropies of the second view.
        reliab_num: number of samples to select; clamped to the valid range.

    Returns:
        ``(reliab_mask, reliab_idx)``: a boolean tensor over the concatenated
        samples (first view, then second view) that is True at the reliable
        positions, and the indices of those positions ordered by increasing
        entropy.
    """
    import torch

    info_ent = torch.cat([torch.as_tensor(info_ent1), torch.as_tensor(info_ent2)])
    sample_num = info_ent.numel()
    keep = max(0, min(int(reliab_num), sample_num))

    # argsort is ascending, so the lowest entropies come FIRST. Slicing from the
    # front also gives an empty selection for keep == 0, which [-0:] does not.
    reliab_idx = torch.argsort(info_ent)[:keep]

    reliab_mask = torch.zeros(sample_num, dtype=torch.bool, device=info_ent.device)
    reliab_mask[reliab_idx] = True  # reliable samples are set to True

    # A pair is only reliable when both views of the anchor graph are reliable;
    # that combination is left to the caller.
    return reliab_mask, reliab_idx

def get_reliab_pos_mask(pos_div, reliab_pos_num):
    """Select the ``reliab_pos_num`` positive pairs with the LOWEST divergence.

    Mirrors ``get_relib_mask``: the most consistent pairs are the reliable ones.

    Args:
        pos_div: 1-D divergences, one per anchor graph; positions that are not
            candidate positive pairs hold ``inf``.
        reliab_pos_num: number of pairs to select; clamped so that ``inf``
            (non-candidate) entries are never selected.

    Returns:
        ``(reliab_pos_mask, reliab_pos_idx)``: a boolean tensor that is True at
        the selected positions, and their indices ordered by increasing
        divergence.
    """
    import torch

    pos_div = torch.as_tensor(pos_div, dtype=torch.float)
    sample_num = pos_div.numel()
    finite_num = int(torch.isfinite(pos_div).sum())
    keep = max(0, min(int(reliab_pos_num), finite_num))

    # argsort is ascending: the smallest divergences come first, inf comes last.
    reliab_pos_idx = torch.argsort(pos_div)[:keep]

    reliab_pos_mask = torch.zeros(sample_num, dtype=torch.bool, device=pos_div.device)
    reliab_pos_mask[reliab_pos_idx] = True  # reliable pairs are set to True

    return reliab_pos_mask, reliab_pos_idx

def reliab_neg_pacing(neg_cand_div_avg, neg_div_threshold, pacing_type, sample_num, t, T):
    """Return how many negative pairs are treated as reliable at step ``t`` of ``T``.

    Counterpart of ``reliab_pacing`` for negatives: here a HIGHER average
    divergence raises the relative capacity.

    Args:
        neg_cand_div_avg: average divergence of the candidate negative pairs.
        neg_div_threshold: reference divergence; must be positive.
        pacing_type: one of "logarithmic", "polynomial_1", "polynomial_2",
            "polynomial_3".
        sample_num: number of candidate negative pairs to choose from.
        t: current step.
        T: total number of steps; must be positive.

    Returns:
        An int in ``[0, sample_num]``, so it can be used directly as a slice bound.

    Raises:
        ValueError: if ``T`` or ``neg_div_threshold`` is not positive, or if
            ``pacing_type`` is unknown.
    """
    import math

    if T <= 0:
        raise ValueError("T must be positive")
    if neg_div_threshold <= 0:
        raise ValueError("neg_div_threshold must be positive")
    sample_num = int(sample_num)

    # Relative capacity of the model: at least 1, larger when the candidates
    # are more divergent than the threshold.
    relative_cap = max(1, float(neg_cand_div_avg) / neg_div_threshold)

    progress = relative_cap * (t / T)

    if pacing_type == "logarithmic":
        # max(..., 0) keeps the logarithm defined if t is negative.
        fraction = 1 + 0.1 * math.log(max(progress, 0.0) + math.exp(-10))
    elif pacing_type == "polynomial_1":
        fraction = progress
    elif pacing_type == "polynomial_2":
        fraction = progress ** 2
    elif pacing_type == "polynomial_3":
        fraction = progress ** 3
    else:
        raise ValueError(f"unknown pacing_type: {pacing_type!r}")

    # Never select fewer than zero or more than the available pairs.
    fraction = min(1.0, max(0.0, fraction))
    return int(fraction * sample_num)

def get_reliab_neg_mask(neg_div, reliab_neg_num):
    """Select the ``reliab_neg_num`` negative pairs with the HIGHEST divergence.

    Args:
        neg_div: 2-D tensor of pairwise divergences; positions that are not
            candidate negative pairs hold 0.
        reliab_neg_num: number of pairs to select; clamped so that entries
            with a non-positive divergence (the non-candidates) are never
            selected.

    Returns:
        ``(reliab_neg_mask, reliab_neg_row_idx, reliab_neg_col_idx)``: a boolean
        tensor shaped like ``neg_div`` that is True at the selected pairs, and
        the row / column indices of those pairs as lists of ints, ordered by
        decreasing divergence.
    """
    import torch

    row_num, col_num = neg_div.shape
    reliab_neg_mask = torch.zeros(row_num, col_num).bool()

    flat_div = neg_div.reshape(-1)
    candidate_num = int((flat_div > 0).sum())
    keep = max(0, min(int(reliab_neg_num), candidate_num))

    # Descending order: the most divergent pairs come first and the zero-filled
    # non-candidate positions come last.
    reliab_neg_idx = torch.argsort(flat_div, descending=True)[:keep]

    reliab_neg_row_idx = []
    reliab_neg_col_idx = []
    for flat_idx in reliab_neg_idx.tolist():
        # Integer quotient and remainder recover the (row, column) of a flat index.
        i, j = divmod(flat_idx, col_num)
        reliab_neg_mask[i, j] = True
        reliab_neg_row_idx.append(i)
        reliab_neg_col_idx.append(j)
    return reliab_neg_mask, reliab_neg_row_idx, reliab_neg_col_idx

def get_cluster_num(prot_2_cluster):
    """Count the clusters in a label sequence, treating every noisy sample as its own cluster.

    Args:
        prot_2_cluster: iterable of cluster labels, one per prototype; regular
            clusters are labelled 0, 1, 2, ... and noisy samples are labelled -1.

    Returns:
        The number of regular clusters (largest label + 1) plus the number of
        -1 entries. An empty input yields 0.
    """
    labels = list(prot_2_cluster)
    # default=-1 makes an empty input count as zero regular clusters.
    unnoisy_cluster_num = max(labels, default=-1) + 1
    noisy_cluster_num = sum(1 for label in labels if label == -1)
    return unnoisy_cluster_num + noisy_cluster_num
```

In [25]:
# prot_2_cluster = [0,1,1,2,1]
# prot_2_cluster_dict=dict()
# for i in range(len(prot_2_cluster)):
#     prot_2_cluster_dict[i]=prot_2_cluster[i]
# for key,value in prot_2_cluster_dict.items():
#     print(key)
#     print(value)
#     print("\n")

# prot_2_cluster_dict = dict(sorted(prot_2_cluster_dict.items(),key=lambda x:x[1]))
# for key,value in prot_2_cluster_dict.items():
#     print(key)
#     print(value)
#     print("\n")

# Scratch cell: give every noisy (-1) prototype a fresh cluster id of its own.
# With these labels there are 1 + 1 + 2 = 4 clusters in total.
# (An earlier variant of this cell only counted the clusters; see get_cluster_num.)
prot_2_cluster = [0, -1, 1, 1, -1]
unnoisy_cluster_num = max(prot_2_cluster) + 1
# New ids for noisy prototypes start right after the regular cluster ids.
noisy_cluster_idx = unnoisy_cluster_num
noisy_cluster_num = sum(1 for label in prot_2_cluster if label < 0)
print(noisy_cluster_num)
for i, label in enumerate(prot_2_cluster):
    print(i)
    if label == -1:
        prot_2_cluster[i] = noisy_cluster_idx
        noisy_cluster_idx += 1
        noisy_cluster_num -= 1
        if not noisy_cluster_num:
            # Every -1 in prot_2_cluster has been relabelled.
            print("break")
            break
print(prot_2_cluster)
print(unnoisy_cluster_num)
print(noisy_cluster_num)


2
0
1
2
3
4
break
[0, 2, 1, 1, 3]
2
0


In [27]:
# Scratch cell: same relabelling as before on a list with several noisy prototypes.
prot_2_cluster = [0, -1, 1, -1, -1, -1]
prot_num = len(prot_2_cluster)
unnoisy_cluster_num = max(prot_2_cluster) + 1
noisy_cluster_num = prot_2_cluster.count(-1)
# New ids for noisy prototypes start right after the regular cluster ids.
noisy_cluster_idx = unnoisy_cluster_num
for i, label in enumerate(prot_2_cluster):
    if label != -1:
        continue
    prot_2_cluster[i] = noisy_cluster_idx
    noisy_cluster_idx += 1
    noisy_cluster_num -= 1
    if not noisy_cluster_num:
        # Every -1 in prot_2_cluster has been relabelled.
        break
print(prot_2_cluster)

[0, 2, 1, 3, 4, 5]


not in


-1 in
6
