In [1]:
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd

In [2]:
def perform_kmeans_clustering(data, num_clusters):
    """
    Cluster samples with K-Means after flattening each sample into one vector.

    Parameters:
    data: array-like with at least two dimensions. The first axis indexes the
        samples; all remaining axes are flattened into a single feature vector
        per sample, e.g. (n_samples, n_features, n_dimensions).
    num_clusters: number of clusters to form.

    Returns:
    labels: cluster index assigned to each sample.
    centers: cluster centroids, expressed in the flattened feature space.

    Raises:
    ValueError: if data has fewer than two dimensions or contains no samples.
    """
    # Accept lists / DataFrames as well as ndarrays.
    data = np.asarray(data)
    if data.ndim < 2:
        raise ValueError(
            f"data must have at least 2 dimensions (samples first), got shape {data.shape}"
        )

    n_samples = data.shape[0]
    if n_samples == 0:
        raise ValueError("data contains no samples to cluster")

    # K-Means expects a 2-D (n_samples, n_features) matrix.
    flattened_data = data.reshape(n_samples, -1)

    # n_init is pinned explicitly: the library default changed between
    # scikit-learn releases, which would otherwise alter the result for the
    # same random_state.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

    labels = kmeans.fit_predict(flattened_data)
    centers = kmeans.cluster_centers_

    return labels, centers


In [3]:
# Read the two header-less workbooks: fluorescence values first, promoter data second.
y1, X = (
    pd.read_excel(workbook, header=None)
    for workbook in ('fluorescence_w_20799.xlsx', 'promoter_w_20799.xlsx')
)


In [4]:
# Put the fluorescence frame and the promoter frame side by side (column-wise).
concatenated_df = pd.concat((y1, X), axis='columns')

In [5]:
# Name the columns: Feature1 comes from the fluorescence file, Feature2 from the promoter file.
feature_names = ['Feature1', 'Feature2']
concatenated_df.columns = feature_names
concatenated_df.head(n=5)

Unnamed: 0,Feature1,Feature2
0,1.284074,TTGATAATTAATCATCCGGCTCGTATAAT
1,1.038824,TTGACAATTAATCATCCGGCTCGTATAAT
2,1.742736,TTGGTAATTAATCATCCGGCTCGTATAAT
3,0.892131,TTGATAATTAATCATCCGGCTCGTAAAAT
4,1.542778,TTGGCAATTAATCATCCGGCTCGTATAAT


In [6]:
# Nucleotide alphabet; a base's position in this string is its one-hot channel index.
dna_alphabet = 'ACGT'
char_to_int = dict(zip(dna_alphabet, range(len(dna_alphabet))))
# Column of sequence strings that the next cell encodes.
sequences = concatenated_df['Feature2']

In [7]:
# One-hot encode the DNA sequences.

num_sequences = len(sequences)
# Positional lookup: `sequences[0]` is label-based and breaks on a non-default index.
sequence_length = len(sequences.iloc[0])
num_chars = len(dna_alphabet)

# The fixed-width encoding below requires equal-length sequences. Without this
# check a shorter sequence is silently zero-padded and a longer one fails with
# an unrelated IndexError inside the loop.
sequence_lengths = sequences.str.len()
if not (sequence_lengths == sequence_length).all():
    raise ValueError(
        f"all sequences must have length {sequence_length}; "
        f"found lengths {sorted(sequence_lengths.dropna().unique().tolist())}"
    )

# Start from an all-zero tensor of shape (num_sequences, sequence_length, num_chars).
one_hot_matrix = np.zeros((num_sequences, sequence_length, num_chars))

# Set the channel of each base to 1; characters outside the alphabet are
# skipped, leaving an all-zero row at that position.
for i, sequence in enumerate(sequences):
    for j, char in enumerate(sequence):
        if char in char_to_int:
            index = char_to_int[char]
            one_hot_matrix[i, j, index] = 1

# Flatten to one row per sequence, with one column per (position, base) pair.
columns = [f"Position_{i+1}_{char}" for i in range(sequence_length) for char in dna_alphabet]
one_hot_df = pd.DataFrame(one_hot_matrix.reshape(num_sequences, -1), columns=columns)

# Print the one-hot DataFrame: each row is one sequence, each column one
# position/base combination, and a known base yields exactly one 1 per position.
print(one_hot_df)

# Rebuild from the two original columns only, so re-running this cell does not
# append a second copy of the one-hot columns.
concatenated_df = pd.concat([concatenated_df[['Feature1', 'Feature2']], one_hot_df], axis=1)

       Position_1_A  Position_1_C  Position_1_G  Position_1_T  Position_2_A  \
0               0.0           0.0           0.0           1.0           0.0   
1               0.0           0.0           0.0           1.0           0.0   
2               0.0           0.0           0.0           1.0           0.0   
3               0.0           0.0           0.0           1.0           0.0   
4               0.0           0.0           0.0           1.0           0.0   
...             ...           ...           ...           ...           ...   
20794           0.0           1.0           0.0           0.0           1.0   
20795           0.0           0.0           0.0           1.0           1.0   
20796           0.0           0.0           0.0           1.0           0.0   
20797           0.0           1.0           0.0           0.0           0.0   
20798           0.0           1.0           0.0           0.0           0.0   

       Position_2_C  Position_2_G  Position_2_T  Po

In [8]:
# Preview the first five rows of the frame, now including the one-hot columns.
concatenated_df.head(n=5)

Unnamed: 0,Feature1,Feature2,Position_1_A,Position_1_C,Position_1_G,Position_1_T,Position_2_A,Position_2_C,Position_2_G,Position_2_T,...,Position_27_G,Position_27_T,Position_28_A,Position_28_C,Position_28_G,Position_28_T,Position_29_A,Position_29_C,Position_29_G,Position_29_T
0,1.284074,TTGATAATTAATCATCCGGCTCGTATAAT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.038824,TTGACAATTAATCATCCGGCTCGTATAAT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.742736,TTGGTAATTAATCATCCGGCTCGTATAAT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.892131,TTGATAATTAATCATCCGGCTCGTAAAAT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.542778,TTGGCAATTAATCATCCGGCTCGTATAAT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Split into target (Feature1) and model inputs (everything except the two
# original columns, i.e. the one-hot columns).
y = concatenated_df['Feature1']
X = concatenated_df.drop(columns=['Feature1', 'Feature2'])
X.head()

Unnamed: 0,Position_1_A,Position_1_C,Position_1_G,Position_1_T,Position_2_A,Position_2_C,Position_2_G,Position_2_T,Position_3_A,Position_3_C,...,Position_27_G,Position_27_T,Position_28_A,Position_28_C,Position_28_G,Position_28_T,Position_29_A,Position_29_C,Position_29_G,Position_29_T
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Reshape the flat one-hot columns to (n_samples, n_positions, n_bases) and
# convert the target to a float array.
# np.asarray works on both DataFrames/Series and ndarrays, so this cell can be
# re-run safely (`.values` / `.to_numpy` fail once X and y are already ndarrays).
num_bases = len(dna_alphabet)  # one channel per base; replaces the literal 4
flat_features = np.asarray(X).reshape(len(X), -1)
new_shape = (flat_features.shape[0], flat_features.shape[1] // num_bases, num_bases)
X = flat_features.reshape(new_shape)
y = np.asarray(y, dtype=float)
X.shape, y.shape

((20799, 29, 4), (20799,))

In [11]:
# Group the encoded sequences into ten K-Means clusters.
num_clusters = 10
labels, centers = perform_kmeans_clustering(data=X, num_clusters=num_clusters)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f2050217f70>
Traceback (most recent call last):
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f2050217f70>
Traceback (most recent call last):
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/ly534/anaconda3/envs/EDA/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_w

In [12]:
# Print the per-sample cluster labels, then the cluster centers, each under its heading.
for heading, values in (("聚类结果：", labels), ("簇中心点坐标：", centers)):
    print(heading)
    print(values)

聚类结果：
[0 7 4 ... 8 6 6]
簇中心点坐标：
[[ 1.72496984e-01  1.21833534e-01  1.48974668e-01 ...  1.50784077e-02
   2.95536791e-02  9.55367913e-01]
 [ 1.94749403e-01  8.40095465e-02  1.96181384e-01 ...  5.25059666e-03
   1.52744630e-02  9.79474940e-01]
 [ 1.94306931e-01  9.46782178e-02  1.81930693e-01 ...  4.51028104e-17
  -4.16333634e-17  9.57301980e-01]
 ...
 [ 2.28404669e-01  1.24513619e-01  2.21789883e-01 ...  7.63278329e-17
  -7.63278329e-17  1.00000000e+00]
 [ 1.28611370e-01  2.77555756e-17  7.96831314e-02 ...  3.72786580e-02
   5.35880708e-02  8.34109972e-01]
 [ 2.08103131e-01  1.07734807e-01  1.95211786e-01 ...  6.24500451e-17
  -7.63278329e-17  1.00000000e+00]]
