In [2]:
import sys
from torch import Tensor
import re

sys.path.append('..')
from tqdm import trange
from src.datamodules.pepsmi_datamodule import *
from src.tools.pandas import show_ratio

In [3]:
p = Peptide_smilesModule(vocab_path='../data/AOMP_SMILES/vocab.txt',
                         batch_size=512,
                         train_data_path='../data/AOMP/train_data_fold0.csv',
                         test_data_path='../data/AOMP/independent_set.csv',
                         val_data_path='../data/AOMP/val_data_fold0.csv',
                         toy_data=None,
                         )

p.prepare_data()  # 准备数据获得三个数据集合的 df

In [4]:
p._train_data.head()

Unnamed: 0,peptide,length,HLA,label,HLA_sequence
0,FLTGTFVTA,9,HLA-A*24:02,0,YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY
1,HRKVMSQNF,9,HLA-B*27:05,1,YHTEYREICAKTDEDTLYLNYHDYTWAVLAYEWY
2,VMDKYILDN,9,HLA-B*44:03,0,YYTKYREISTNTYENTAYIRYDDYTWAVLAYLSY
3,MTFDGEVKT,9,HLA-B*15:01,0,YYAMYREISTNTYESNLYLRYDSYTWAEWAYLWY
4,CLLTPGVQG,9,HLA-A*03:01,0,YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY


In [5]:
"""分析数据: 查看去重后的数据"""
deduplicated_df = p._train_data.drop_duplicates(subset=['peptide']).reset_index(drop=True)
print(f'去重前的 len: {len(p._train_data)}')
print(f'去重后的 len: {len(deduplicated_df)}')
show_ratio(deduplicated_df, 'label')

去重前的 len: 574658
去重后的 len: 479539
标签 0 比例为: 58.49%, 个数为: 280479
标签 1 比例为: 41.51%, 个数为: 199060


In [6]:
"""分析数据: HLA"""
hla_df = p._train_data
HLA_class2id = {'A': 0, 'B': 1, 'C': 2}
hla_df['HLA_class'] = hla_df.HLA.apply(lambda x: HLA_class2id[re.search(r'-(.)\*', x).group(1)])
hla_df['HLA_subclass'] = hla_df.HLA.apply(lambda x: int(re.search(r'\*(.+):', x).group(1)))
hla_df['HLA_subsubclass'] = hla_df.HLA.apply(lambda x: int(re.search(r':(.+)$', x).group(1)))

In [7]:
from pandas_profiling import ProfileReport
# 使用 ProfileReport
hla_df_reconstructed = hla_df.loc[:, ['length', 'label', 'HLA_class', 'HLA_subclass', 'HLA_subsubclass']]  # 重组数据
profile = ProfileReport(hla_df_reconstructed, title="Pandas Profiling Report")
profile.to_file("tmp_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [9]:
show_ratio(hla_df, 'HLA_class', sort='value')

标签 1 比例为: 52.17%, 个数为: 299818
标签 0 比例为: 32.01%, 个数为: 183976
标签 2 比例为: 15.81%, 个数为: 90864


In [10]:
show_ratio(hla_df, 'HLA_subclass', sort='value')

标签 27 比例为: 14.94%, 个数为: 85852
标签 2 比例为: 13.11%, 个数为: 75312
标签 7 比例为: 6.12%, 个数为: 35188
标签 3 比例为: 5.27%, 个数为: 30284
标签 15 比例为: 5.09%, 个数为: 29278


In [11]:
show_ratio(hla_df, 'HLA_subsubclass', sort='value')

标签 1 比例为: 49.91%, 个数为: 286836
标签 2 比例为: 22.97%, 个数为: 131986
标签 5 比例为: 10.82%, 个数为: 62188
标签 3 比例为: 6.92%, 个数为: 39756
标签 4 比例为: 2.71%, 个数为: 15570


In [21]:
"""拿到数据量最多的种类"""
hla_df.HLA.value_counts()

HLA-B*27:05    58672
HLA-A*02:01    38452
HLA-B*15:01    22096
HLA-B*07:02    20772
HLA-B*57:01    19134
               ...  
HLA-A*32:07      112
HLA-A*26:03      106
HLA-A*68:23      102
HLA-A*32:15       92
HLA-B*44:27       86
Name: HLA, Length: 112, dtype: int64

HLA-B*27:05    58672
HLA-A*02:01    38452
HLA-B*15:01    22096
HLA-B*07:02    20772
HLA-B*57:01    19134
               ...  
HLA-A*32:07      112
HLA-A*26:03      106
HLA-A*68:23      102
HLA-A*32:15       92
HLA-B*44:27       86
Name: HLA, Length: 112, dtype: int64

In [19]:
hla_df.columns

Index(['peptide', 'length', 'HLA', 'label', 'HLA_sequence', 'HLA_class',
       'HLA_subclass', 'HLA_subsubclass'],
      dtype='object')

In [85]:
sum_i = 0
sum_j = 0
sum_k = 0

for i in list(hla_df.groupby('HLA_class')):
    sum_i += 1
    print(f'class: {i[0]}, num: {len(i[1])}')
    for j in list(hla_df.groupby('HLA_subclass')):
        sum_j += 1
        print(f'  - class: {j[0]}, num: {len(j[1])}')
        for k in list(hla_df.groupby('HLA_subsubclass')):
            sum_k += 1
            print(f'    - class: {k[0]}, num: {len(k[1])}')

class: A, num: 183976
  - class: 01, num: 16640
    - class: 01, num: 286836
    - class: 02, num: 131986
    - class: 03, num: 39756
    - class: 04, num: 15570
    - class: 05, num: 62188
    - class: 06, num: 10832
    - class: 07, num: 6890
    - class: 08, num: 5068
    - class: 09, num: 7236
    - class: 11, num: 1704
    - class: 12, num: 442
    - class: 13, num: 188
    - class: 15, num: 92
    - class: 16, num: 244
    - class: 17, num: 1000
    - class: 18, num: 1378
    - class: 19, num: 316
    - class: 20, num: 1400
    - class: 23, num: 102
    - class: 24, num: 778
    - class: 27, num: 86
    - class: 42, num: 452
    - class: 50, num: 114
  - class: 02, num: 75312
    - class: 01, num: 286836
    - class: 02, num: 131986
    - class: 03, num: 39756
    - class: 04, num: 15570
    - class: 05, num: 62188
    - class: 06, num: 10832
    - class: 07, num: 6890
    - class: 08, num: 5068
    - class: 09, num: 7236
    - class: 11, num: 1704
    - class: 12, num: 442
    -

In [87]:
print('各类数量', sum_i, sum_j, sum_k)
print('总类数量', sum_k * sum_j * sum_i)

各类数量 3 153 3519
总类数量 1615221


In [80]:
list(hla_df.groupby('HLA_class'))

[('A',
              peptide  length          HLA  label  \
  0         FLTGTFVTA       9  HLA-A*24:02      0   
  4         CLLTPGVQG       9  HLA-A*03:01      0   
  6         YNIMVPFGP       9  HLA-A*02:05      0   
  8        ILSEKRKDTI      10  HLA-A*02:03      1   
  11        YLHSLNIVY       9  HLA-A*32:01      1   
  ...             ...     ...          ...    ...   
  574616  ATVLGIAGGVY      11  HLA-A*01:01      1   
  574619   KLGFKVTLPP      10  HLA-A*03:01      0   
  574629    KVQEVIFGL       9  HLA-A*32:01      1   
  574633    LADKGSRPQ       9  HLA-A*68:02      0   
  574648    KITTVIQHV       9  HLA-A*02:20      1   
  
                                HLA_sequence HLA_class HLA_subclass  \
  0       YSAMYEEKVAHTDENIAYLMFHYYTWAVQAYTGY         A           24   
  4       YFAMYQENVAQTDVDTLYIIYRDYTWAELAYTWY         A           03   
  6       YYAMYGEKVAHTHVDTLYLRYHYYTWAVWAYTWY         A           02   
  8       YFAMYGEKVAHTHVDTLYVRYHYYTWAEWAYTWY         A           02   

In [32]:
"""计算多肽 smi 的最大长度"""


def count_max_len(data: Tuple[Tensor]) -> int:
    """计算张量的行的最大非零个数
    """
    max_len = 0
    for i in trange(len(data)):
        c_len = torch.nonzero(data[i][0]).shape[0]
        if c_len > max_len:
            max_len = c_len
        else:
            pass
    return max_len

In [33]:
p.setup(stage='fit')  # 加载 fit 时数据成张量

In [36]:
print(f'训练集最大长度 {count_max_len(p.train_data)}')
print(f'验证集最大长度 {count_max_len(p.val_data)}')
print(f'测试集最大长度 {count_max_len(p.test_data)}')

100%|██████████| 479539/479539 [00:02<00:00, 163579.13it/s]


训练集最大长度 252


100%|██████████| 134745/134745 [00:00<00:00, 164907.76it/s]


验证集最大长度 252


100%|██████████| 158145/158145 [00:00<00:00, 166888.29it/s]

测试集最大长度 246





# 测试 fake data

In [1]:
import sys

sys.path.append('..')
from src.datamodules.pepsmi_datamodule import *

p = Peptide_smilesModuleV0(vocab_path='../data/AOMP_SMILES/vocab.txt')
p.prepare_data()
print('数据大小:', len(p._train_data), len(p._val_data), len(p._test_data))

数据大小: 8000 1000 1000


In [2]:
p._train_data.head()

Unnamed: 0,peptide,label
2000,HGVRYTSACC,0
2001,ERTGIPYHNDEE,1
2002,AEWIWYWWEHYK,1
2003,DLLRYVCQDR,1
2004,EKVNQGQII,0


In [3]:
p._train_data.describe()  # 看均值

Unnamed: 0,label
count,8000.0
mean,0.475125
std,0.499412
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0
