In [1]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

In [2]:
# Prepare the input data: five numeric vectors of equal length.
sequences = [
    [1, 2, 3, 4, 5],
    [1, 2, 3, 4, 6],
    [10, 11, 12, 13, 14],
    [10, 11, 12, 13, 15],
    [20, 21, 22, 23, 24]
]

# Compute the pairwise distance matrix (Euclidean distance between vectors).
# Note: this is a distance (dissimilarity) matrix, not a similarity matrix.
distance_matrix = pairwise_distances(sequences, metric='euclidean')

# Number of clusters to extract.
num_clusters = 2

# Run agglomerative (hierarchical) clustering on the precomputed distances.
# scikit-learn 1.2 renamed the `affinity` argument to `metric`, and 1.4
# removed `affinity` entirely. Try the current keyword first; on older
# releases the unknown keyword raises TypeError, so fall back to `affinity`.
try:
    clustering = AgglomerativeClustering(
        n_clusters=num_clusters, metric='precomputed', linkage='average'
    )
except TypeError:
    clustering = AgglomerativeClustering(
        n_clusters=num_clusters, affinity='precomputed', linkage='average'
    )
labels = clustering.fit_predict(distance_matrix)

# Print the resulting cluster label of each input vector.
print("Cluster labels:", labels)

Cluster labels: [0 0 0 0 1]




In [3]:
def pad_sequence(sequence, regions, fixed_lengths, padding_char='-'):
    """Pad consecutive slices of ``sequence`` up to per-region target lengths.

    The sequence is walked left to right. For each entry of
    ``fixed_lengths`` a slice as wide as the matching entry of ``regions``
    is taken. A slice shorter than its target length is right-padded with
    ``padding_char``; a slice that is already long enough is kept unchanged
    (it is never truncated).

    Args:
        sequence: String to split and pad.
        regions: Indexable collection of sized items; only ``len()`` of each
            item is used, as the width of the corresponding slice.
        fixed_lengths: Target length for each region. This drives the
            iteration, so regions beyond ``len(fixed_lengths)`` are ignored,
            and having more entries here than in ``regions`` raises
            ``IndexError``.
        padding_char: Fill string, repeated once per missing position.

    Returns:
        The concatenation of all (possibly padded) slices as one string.
    """
    pieces = []
    offset = 0
    for idx, target_len in enumerate(fixed_lengths):
        width = len(regions[idx])
        chunk = sequence[offset:offset + width]
        # The slice can be shorter than `width` when the sequence runs out.
        shortfall = target_len - len(chunk)
        if shortfall > 0:
            chunk = chunk + padding_char * shortfall
        pieces.append(chunk)
        offset += width
    return ''.join(pieces)

# Input sequence to be split into regions and padded.
sequence = 'QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDINWVRQATGQGLEWMGWMNPNSGNIGYAQKFQGRVTMTRNTSISTAYMELSSLRSEDTAVYYCARGLSSSGWYYPHYYYYGMDVWGQGTTVTVSS'

# Region segmentation of the sequence, in order. pad_sequence only reads the
# length of each entry to decide how wide the corresponding slice is.
regions = [
    'QVQLVQSGAEVKKPGASV', 'KVSCKASGYTFTSYDINW',
    'VRQATGQGLEWMGWMNPN', 'SGNIGYAQKFQGRVTMTR',
    'NTSISTAYMELSSLRSED', 'TAVYYCARGLSSSGWYYP',
    'HYYYYGMDVWGQGTTVTV', 'SS',
]

# Target length of each region after padding.
# NOTE(review): there are 8 regions but only 7 target lengths; pad_sequence
# iterates over fixed_lengths, so the final region ('SS') is not part of the
# printed result. Confirm whether an eighth length is missing.
fixed_lengths = [26, 12, 17, 10, 38, 22, 11]

# Pad the sequence region by region and show the result.
padded_sequence = pad_sequence(sequence, regions, fixed_lengths)
print(padded_sequence)

QVQLVQSGAEVKKPGASV--------KVSCKASGYTFTSYDINWVRQATGQGLEWMGWMNPNSGNIGYAQKFQGRVTMTRNTSISTAYMELSSLRSED--------------------TAVYYCARGLSSSGWYYP----HYYYYGMDVWGQGTTVTV
