In [7]:
import os
from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from tqdm.auto import tqdm, trange

In [8]:
"""reconstruct train/val, keep test_data
concat (train, val)
-> KMeans (1000)
-> select 1/10 per class
-> reconstruct
"""

'reconstruct train/val, keep test_data\nconcat (train, val)\n-> KMeans (1000)\n-> select 1/10 per class\n-> reconstruct\n'

In [115]:
# Inputs: the fold-0 train / validation CSVs under AOMP_ALL_RAW.
train_path, val_path = (
    '../data/AOMP_ALL_RAW/train_data_fold0.csv',
    '../data/AOMP_ALL_RAW/val_data_fold0.csv',
)
# Outputs: where the re-split train / validation CSVs are written.
new_train_path, new_val_path = (
    '../data/AOMP_KMEANS/train.csv',
    '../data/AOMP_KMEANS/val.csv',
)

In [10]:
# Residue vocabulary: each character's id is its position in this string,
# so the padding symbol 'X' gets id 0.
_ALPHABET = 'XGAVLIPFYWSTCMNQDEKRH'
token2int = dict(zip(_ALPHABET, range(len(_ALPHABET))))


def str2int(seq: str, max_len: int = 15) -> List[int]:
    """Encode a sequence as exactly ``max_len`` integer token ids.

    Sequences longer than ``max_len`` are cut off on the right; shorter ones
    are right-padded with 'X' (id 0).

    Args:
        seq: Sequence made of characters present in ``token2int``.
        max_len: Length of the returned list.

    Returns:
        List of ``max_len`` token ids.

    Raises:
        KeyError: If ``seq`` contains a character missing from ``token2int``.
    """
    # Truncating first and padding second gives the same string as padding
    # first and truncating second.
    fixed = seq[:max_len].ljust(max_len, 'X')
    return [token2int[residue] for residue in fixed]

In [12]:
# Pool the original train and validation folds into one frame. Each file's
# first column is used as its index and concat keeps those labels as-is
# (ignore_index is not set).
df = pd.concat([pd.read_csv(path, index_col=0) for path in (train_path, val_path)])

In [13]:
# Peek at the first rows of the pooled train + validation frame.
df.head()

Unnamed: 0,peptide,length,HLA,label,HLA_sequence
0,FLTGTFVTA,9,HLA-A*24:02,0,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY
1,HRKVMSQNF,9,HLA-B*27:05,1,YHTEYREICAKTDEDTLYLNYHDYTWAVLAYEWY
2,VMDKYILDN,9,HLA-B*44:03,0,YYTKYREISTNTYENTAYIRYDDYTWAVLAYLSY
3,MTFDGEVKT,9,HLA-B*15:01,0,YYAMYREISTNTYESNLYLRYDSYTWAEWAYLWY
4,CLLTPGVQG,9,HLA-A*03:01,0,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY


In [14]:
# Distinct peptide strings (first occurrence of each is kept); the clustering
# below is fit on these rather than on every row of df.
unique_peptides = df['peptide'].drop_duplicates()  # still a pandas Series

In [15]:
# Encode every distinct peptide as a fixed-length list of token ids,
# with a progress bar over the Series.
input_ids = [str2int(peptide) for peptide in tqdm(unique_peptides)]

  0%|          | 0/583866 [00:00<?, ?it/s]

In [16]:
# to numpy
X = np.array(input_ids)

In [17]:
# Number of encoded distinct peptides (rows of X).
len(X)

583866

In [19]:
%%time
kmeans = KMeans(n_clusters=1000, random_state=0).fit(X)

CPU times: user 4h 44min 15s, sys: 1h 55min 7s, total: 6h 39min 23s
Wall time: 1h 5min 9s


In [20]:
# Cluster id assigned to each row of X by the fit above.
kmeans.labels_

array([ 44, 300, 692, ..., 613, 235,   8], dtype=int32)

In [21]:
"""获取预测结果"""
# Encode the peptide of every row of df (duplicates included) so that each
# row can be assigned to a cluster.
X2 = np.array([str2int(peptide) for peptide in tqdm(df.peptide)])

  0%|          | 0/718332 [00:00<?, ?it/s]

In [22]:
# Assign every row of df to a cluster of the fitted model.
# NOTE(review): the traceback recorded in this cell's output is reported as
# "Exception ignored" from inside threadpoolctl, and later cells go on to use
# preds, so the prediction itself appears to have completed -- confirm.
preds = kmeans.predict(X2)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f86bfa8bee0>
Traceback (most recent call last):
  File "/home/seeyou/anaconda3/envs/pyg/lib/python3.8/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/seeyou/anaconda3/envs/pyg/lib/python3.8/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/seeyou/anaconda3/envs/pyg/lib/python3.8/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/seeyou/anaconda3/envs/pyg/lib/python3.8/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


In [30]:
# Attach each row's cluster id as a new column; assign() returns a new frame,
# which is rebound to df.
df = df.assign(kmeans_cluster=preds.tolist())

In [31]:
# Cache the clustered frame so the expensive KMeans fit does not have to be
# repeated. The cache directory is created first: to_csv does not create
# missing parent directories and would fail on a fresh checkout.
cache_path = '../.cache/train_val.csv'
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
df.to_csv(cache_path)

In [103]:
# Reload the cached frame (including its kmeans_cluster column), replacing the
# in-memory df; presumably this lets the split below be re-run without
# repeating the clustering.
df = pd.read_csv('../.cache/train_val.csv', index_col=0)

In [104]:
# Check that the reloaded frame carries the kmeans_cluster column.
df.head()

Unnamed: 0,peptide,length,HLA,label,HLA_sequence,kmeans_cluster
0,FLTGTFVTA,9,HLA-A*24:02,0,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY,44
1,HRKVMSQNF,9,HLA-B*27:05,1,YHTEYREICAKTDEDTLYLNYHDYTWAVLAYEWY,300
2,VMDKYILDN,9,HLA-B*44:03,0,YYTKYREISTNTYENTAYIRYDDYTWAVLAYLSY,692
3,MTFDGEVKT,9,HLA-B*15:01,0,YYAMYREISTNTYESNLYLRYDSYTWAEWAYLWY,356
4,CLLTPGVQG,9,HLA-A*03:01,0,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY,664


In [111]:
# Re-split train/validation: inside every KMeans cluster, hold out a fixed
# fraction of the rows for validation and keep the rest for training (this is
# the "select 1/10" step of the plan at the top of the notebook).
#
# The rows are grouped once with groupby instead of re-filtering the whole
# frame for each cluster id, and the mask is no longer stored in a variable
# named `bool`, which shadowed the builtin for the rest of the session.
# Groups are visited in ascending cluster id and keep df's row order, and the
# same seeded shuffle is applied per cluster, so the selected rows match the
# original 0..999 loop.
VAL_FRACTION = 0.1  # share of each cluster that goes to validation
SPLIT_SEED = 42     # the same seed is reused for every cluster

train_sets = []
val_sets = []
clusters = df.groupby('kmeans_cluster', sort=True)
for _, members in tqdm(clusters, total=clusters.ngroups):
    n_members = len(members)
    n_val = int(n_members * VAL_FRACTION)  # rounds down; tiny clusters give 0
    order = shuffle(np.arange(n_members), random_state=SPLIT_SEED)
    # `order` holds positions within this cluster, hence iloc.
    val_sets.append(members.iloc[order[:n_val]])
    train_sets.append(members.iloc[order[n_val:]])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [112]:
# Stitch the per-cluster pieces back together: one training frame and one
# validation frame.
new_train_df, new_val_df = (pd.concat(parts) for parts in (train_sets, val_sets))

In [113]:
# Row counts of the new split, shown as (validation, train).
len(new_val_df), len(new_train_df)

(71387, 646945)

In [116]:
# Write the re-split frames to their output paths. The parent directory is
# created first because to_csv does not create missing directories.
for frame, path in ((new_train_df, new_train_path), (new_val_df, new_val_path)):
    # `or '.'` covers a bare file name, whose dirname is the empty string.
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    frame.to_csv(path)

In [117]:
from src.tools.my_pandas import show_ratio

# Report the 'label' distribution of each new split: train first, then validation.
for split_df in (new_train_df, new_val_df):
    show_ratio(split_df, 'label')

标签 0 比例为: 50.00%, 个数为: 323494
标签 1 比例为: 50.00%, 个数为: 323451
标签 1 比例为: 50.03%, 个数为: 35715
标签 0 比例为: 49.97%, 个数为: 35672
