# DeepBind Reproduction
## PBM Dataset
### PBM 数据集  
<b> (1)sequences.tsv.gz </b>  
一列或多列的tsv,每行对应一个短序列(一个probe)   
<b> (2)targets.tsv.gz </b>  
对于sequences.tsv.gz的强度表，每一行对应一个probe,每一列对应一个TF  
<b> (3)tfids.txt </b>  
一行一个TFid  




In [1]:
import numpy as np
import pandas as pd
import gzip

# 1) 看 tfids.txt
with open('../DeepBind/data/dream5/pbm/tfids.txt', 'r') as f:
    tfids = [ln.strip() for ln in f if ln.strip()]
print("Found TF ids (first 20):", tfids[:20])
print("Total TF columns:", len(tfids))

# 2) 读 sequences.tsv.gz（如果每行是序列）
seqs = pd.read_csv('../DeepBind/data/dream5/pbm/sequences.tsv.gz', sep='\t', header=None, compression='gzip', engine='python')
print("sequences shape:", seqs.shape)
print("first rows:\n", seqs.head())

# 常见：如果只有一列，序列在第0列
# seqs[0].head()

# 3) 读 targets.tsv.gz（检查列名/形状）
targets = pd.read_csv('../DeepBind/data/dream5/pbm/targets.tsv.gz', sep='\t', header=0, compression='gzip', engine='python')
print("targets shape:", targets.shape)
print("targets columns:", targets.columns[:20])
print("first rows:\n", targets.head())

# 4) 检查 probe_biases.npz
npz = np.load('../DeepBind/data/dream5/pbm/probe_biases.npz', allow_pickle=True)
print("npz keys:", list(npz.keys()))
for k in npz:
    arr = npz[k]
    print(k, arr.shape, arr.dtype)
    print("sample:", arr[:5])

Found TF ids (first 20): ['TF_1', 'TF_2', 'TF_3', 'TF_4', 'TF_5', 'TF_6', 'TF_7', 'TF_8', 'TF_9', 'TF_10', 'TF_11', 'TF_12', 'TF_13', 'TF_14', 'TF_15', 'TF_16', 'TF_17', 'TF_18', 'TF_19', 'TF_20']
Total TF columns: 86
sequences shape: (80857, 3)
first rows:
          0               1                                         2
0  Fold ID        Event ID                                       seq
1        B  MEreverse14075  TAAAACTATGAGGAAGGATTCAGGGTCGGACAGTGCCTGT
2        B  MEforward19438  CTTATGATCAGAAGCGGCTAGGTGTATTACATGTCCCTGT
3        B  MEforward19439  CCGCCGTAGGCCCCGAAACAGTACCAGACATGTAACCTGT
4        B  MEforward19436  GACCAAACGAGTCCTAGGATTCCAAGCGTTACGACCCTGT
targets shape: (80856, 86)
targets columns: Index(['TF_40', 'TF_41', 'TF_42', 'TF_43', 'TF_44', 'TF_45', 'TF_46', 'TF_47',
       'TF_48', 'TF_49', 'TF_7', 'TF_6', 'TF_5', 'TF_4', 'TF_3', 'TF_2',
       'TF_1', 'TF_9', 'TF_8', 'TF_35'],
      dtype='object')
first rows:
          TF_40         TF_41        TF_42        TF_43 

  arr = npz[k]


用pbm.py处理pbm数据集

In [1]:
from torch.utils.data import DataLoader
from data.pbm import PBMDataset
import torch

seq_file = '../DeepBind/data/dream5/pbm/sequences.tsv.gz'
tgt_file = '../DeepBind/data/dream5/pbm/targets.tsv.gz'
tf_col = 'TF_1'

dataset = PBMDataset(seq_file, tgt_file, tf_col=tf_col, max_len=36)

loader = DataLoader(dataset, batch_size=4, shuffle=True)

for xb, yb in loader:
    print("xb shape:", xb.shape)
    print("yb shape:", yb.shape)
    print("yb:", yb)
    break



xb shape: torch.Size([4, 4, 36])
yb shape: torch.Size([4])
yb: tensor([ 484.5081, 1137.2778,  982.1860, 1511.8501])


In [2]:
with open('../DeepBind/data/dream5/pbm/tfids.txt') as f:
    tfids = [l.strip() for l in f if l.strip()]
    print("Available TFs:", tfids[:50])

Available TFs: ['TF_1', 'TF_2', 'TF_3', 'TF_4', 'TF_5', 'TF_6', 'TF_7', 'TF_8', 'TF_9', 'TF_10', 'TF_11', 'TF_12', 'TF_13', 'TF_14', 'TF_15', 'TF_16', 'TF_17', 'TF_18', 'TF_19', 'TF_20', 'TF_21', 'TF_22', 'TF_23', 'TF_24', 'TF_25', 'TF_26', 'TF_27', 'TF_28', 'TF_29', 'TF_30', 'TF_31', 'TF_32', 'TF_33', 'TF_34', 'TF_35', 'TF_36', 'TF_37', 'TF_38', 'TF_39', 'TF_40', 'TF_41', 'TF_42', 'TF_43', 'TF_44', 'TF_45', 'TF_46', 'TF_47', 'TF_48', 'TF_49', 'TF_50']


In [9]:
%%bash
python train.py \
  --data_type PBM \
  --seq_file ../DeepBind/data/dream5/pbm/sequences.tsv.gz \
  --tgt_file ../DeepBind/data/dream5/pbm/targets.tsv.gz \
  --tf_col TF_1 \
  --max_len 36 \
  --epochs 20 \
  --device cuda

[DEBUG] Fold 0 got NaN score
 labels: mean=-0.0041, std=0.9680, unique=[-1.6008122 -1.5992286 -1.5940433 -1.5902673 -1.5876826 -1.585011
 -1.5823246 -1.5776969 -1.577265  -1.5771194]
 preds : mean=0.0820, std=0.0000, min=0.0820, max=0.0820
[DEBUG] Fold 0 got NaN score
 labels: mean=-0.0041, std=0.9680, unique=[-1.6008122 -1.5992286 -1.5940433 -1.5902673 -1.5876826 -1.585011
 -1.5823246 -1.5776969 -1.577265  -1.5771194]
 preds : mean=0.0161, std=0.0000, min=0.0161, max=0.0161
[Calib 01/30] hp={'num_kernels': 16, 'kernel_size': 24, 'fc_hidden': 0, 'lr': 0.027563155364944907, 'momentum': 0.9858565380848294, 'batch_size': 64, 'checkpoint_steps': [4000, 8000, 12000, 16000, 20000], 'init_scale_motifs': 1.1203730030100611e-05, 'init_scale_nn': 0.003484547305867, 'weight_decay_motifs': 3.884834472354271e-09, 'weight_decay_nn': 1.2609275325509064e-05, 'dropout_rate': 0.5} CV_score=0.1681
[Calib 02/30] hp={'num_kernels': 16, 'kernel_size': 24, 'fc_hidden': 0, 'lr': 0.029603820055128814, 'momentu

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


[Train] Epoch01 loss=0.9774
[Train] Epoch02 loss=0.9749
[Train] Epoch03 loss=0.9748
[Train] Epoch04 loss=0.9750
[Train] Epoch05 loss=0.9749
[Train] Epoch06 loss=0.9748
[Train] Epoch07 loss=0.9750
[Train] Epoch08 loss=0.9748
[Train] Epoch09 loss=0.9750
[Train] Epoch10 loss=0.9749
[Train] Epoch11 loss=0.9750
[Train] Epoch12 loss=0.9750
[Train] Epoch13 loss=0.9752
[Train] Epoch14 loss=0.9748
[Train] Epoch15 loss=0.9747
[Train] Epoch16 loss=0.9754
[Train] Epoch17 loss=0.9752
[Train] Epoch18 loss=0.9749
[Train] Epoch19 loss=0.9751
[Train] Epoch20 loss=0.9753
>> Test Pearson: 0.0826


In [12]:
%%bash
python train.py \
  --data_type PBM \
  --seq_file ../DeepBind/data/dream5/pbm/sequences.tsv.gz \
  --tgt_file ../DeepBind/data/dream5/pbm/targets.tsv.gz \
  --tf_col TF_2 \
  --max_len 36 \
  --epochs 20 \
  --device cuda

[Calib 01/30] hp={'num_kernels': 16, 'kernel_size': 24, 'fc_hidden': 0, 'lr': 0.0011480914295024973, 'momentum': 0.9821899920516143, 'batch_size': 64, 'checkpoint_steps': [4000, 8000, 12000, 16000, 20000], 'init_scale_motifs': 0.0008640157554929006, 'init_scale_nn': 5.867932210863854e-05, 'weight_decay_motifs': 8.590037832094837e-10, 'weight_decay_nn': 2.3268530601033645e-05, 'dropout_rate': 0.0} CV_score=0.7672
[Calib 02/30] hp={'num_kernels': 16, 'kernel_size': 24, 'fc_hidden': 32, 'lr': 0.035106231070142456, 'momentum': 0.9892113028210163, 'batch_size': 64, 'checkpoint_steps': [4000, 8000, 12000, 16000, 20000], 'init_scale_motifs': 5.839050380954199e-06, 'init_scale_nn': 0.0031699176125715273, 'weight_decay_motifs': 3.3458339071104138e-12, 'weight_decay_nn': 0.0002891076155385852, 'dropout_rate': 0.25} CV_score=0.4417
[Calib 03/30] hp={'num_kernels': 16, 'kernel_size': 24, 'fc_hidden': 32, 'lr': 0.02211650232615001, 'momentum': 0.9893317427003273, 'batch_size': 64, 'checkpoint_steps

  return F.mse_loss(input, target, reduction=self.reduction)


[Train] Epoch01 loss=0.9916
[Train] Epoch02 loss=0.9914
[Train] Epoch03 loss=0.9913
[Train] Epoch04 loss=0.9915
[Train] Epoch05 loss=0.9916
[Train] Epoch06 loss=0.9914
[Train] Epoch07 loss=0.9915
[Train] Epoch08 loss=0.9915
[Train] Epoch09 loss=0.9915
[Train] Epoch10 loss=0.9914
[Train] Epoch11 loss=0.9916
[Train] Epoch12 loss=0.9914
[Train] Epoch13 loss=0.9913
[Train] Epoch14 loss=0.9915
[Train] Epoch15 loss=0.9912
[Train] Epoch16 loss=0.9915
[Train] Epoch17 loss=0.9915
[Train] Epoch18 loss=0.9915
[Train] Epoch19 loss=0.9914
[Train] Epoch20 loss=0.9913
>> Test Pearson: nan (constant preds or labels; pearson undefined)


不知为何，模型一直输出常数，Pearson系数非常低。

In [5]:
%%bash
python train.py \
  --data_type PBM \
  --seq_file ../DeepBind/data/dream5/pbm/sequences.tsv.gz \
  --tgt_file ../DeepBind/data/dream5/pbm/targets.tsv.gz \
  --tf_col TF_3 \
  --max_len 36 \
  --epochs 20 \
  --device cuda

[Calib 01/30] hp={'lr': 0.00014835053740083198, 'weight_decay': 1.7413845639000577e-05, 'batch_size': 128, 'num_kernels': 8, 'kernel_size': 36, 'fc_hidden': 64} CV_score=0.3556
[Calib 02/30] hp={'lr': 0.0012096264455458406, 'weight_decay': 0.0008403821697095179, 'batch_size': 64, 'num_kernels': 32, 'kernel_size': 36, 'fc_hidden': 16} CV_score=0.3821
[Calib 03/30] hp={'lr': 0.0062721644073863375, 'weight_decay': 2.7217277511657373e-06, 'batch_size': 32, 'num_kernels': 32, 'kernel_size': 12, 'fc_hidden': 64} CV_score=0.8608
[Calib 04/30] hp={'lr': 0.00013274653952020547, 'weight_decay': 9.464889848951976e-05, 'batch_size': 32, 'num_kernels': 8, 'kernel_size': 12, 'fc_hidden': 16} CV_score=0.3716
[Calib 05/30] hp={'lr': 0.0053032369330071535, 'weight_decay': 0.00010816246773556173, 'batch_size': 64, 'num_kernels': 8, 'kernel_size': 12, 'fc_hidden': 16} CV_score=0.8052
[Calib 06/30] hp={'lr': 0.00015632841683064954, 'weight_decay': 0.0004314290406749512, 'batch_size': 64, 'num_kernels': 8,

再试一下TF_1 & TF_2

In [6]:
%%bash
python train.py \
  --data_type PBM \
  --seq_file ../DeepBind/data/dream5/pbm/sequences.tsv.gz \
  --tgt_file ../DeepBind/data/dream5/pbm/targets.tsv.gz \
  --tf_col TF_1 \
  --max_len 36 \
  --epochs 20 \
  --device cuda

[Calib 01/30] hp={'lr': 0.0010618096556056875, 'weight_decay': 0.00035152882381754515, 'batch_size': 32, 'num_kernels': 16, 'kernel_size': 24, 'fc_hidden': 32} CV_score=0.5597
[Calib 02/30] hp={'lr': 0.00043968942294645276, 'weight_decay': 3.728148514167497e-06, 'batch_size': 128, 'num_kernels': 16, 'kernel_size': 12, 'fc_hidden': 64} CV_score=0.3929
[Calib 03/30] hp={'lr': 0.0015884997200713304, 'weight_decay': 0.00012847898922006124, 'batch_size': 32, 'num_kernels': 32, 'kernel_size': 24, 'fc_hidden': 64} CV_score=0.6449
[Calib 04/30] hp={'lr': 0.0044260304662261055, 'weight_decay': 0.00019402007410656294, 'batch_size': 64, 'num_kernels': 8, 'kernel_size': 12, 'fc_hidden': 16} CV_score=0.6306
[Calib 05/30] hp={'lr': 0.00033176857821930623, 'weight_decay': 0.00020076588824052998, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 12, 'fc_hidden': 16} CV_score=0.4136
[Calib 06/30] hp={'lr': 0.004647656086274488, 'weight_decay': 0.00021269539109714837, 'batch_size': 64, 'num_kernels': 

使用文章里面推荐的超参数采样方法

In [1]:
%%bash
python train.py \
  --data_type PBM \
  --seq_file ../DeepBind/data/dream5/pbm/sequences.tsv.gz \
  --tgt_file ../DeepBind/data/dream5/pbm/targets.tsv.gz \
  --tf_col TF_2 \
  --max_len 36 \
  --epochs 20 \
  --device cuda

[Calib 01/30] hp={'lr': 0.0024657439749198296, 'weight_decay': 1.6606963364231383e-10, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 32} CV_score=0.7985
[Calib 02/30] hp={'lr': 0.0015281954376632535, 'weight_decay': 3.340611409404236e-05, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 64} CV_score=0.7690
[Calib 03/30] hp={'lr': 0.0028940286333845517, 'weight_decay': 0.0002971700846927089, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 32} CV_score=0.7954
[Calib 04/30] hp={'lr': 0.03387200383942684, 'weight_decay': 1.7769653136474043e-08, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 32} CV_score=0.5323
[Calib 05/30] hp={'lr': 0.024063648319758532, 'weight_decay': 5.374363135264486e-07, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 32} CV_score=0.6749


  metrics.append(pearsonr(logits, labels)[0])


[Calib 06/30] hp={'lr': 0.043598211480921134, 'weight_decay': 2.6336348308598094e-08, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 64} CV_score=nan
[Calib 07/30] hp={'lr': 0.0007967778864187729, 'weight_decay': 5.469358463508562e-10, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 32} CV_score=0.6352
[Calib 08/30] hp={'lr': 0.0033936083208001605, 'weight_decay': 1.638609102619102e-05, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 32} CV_score=0.7943
[Calib 09/30] hp={'lr': 0.0012194245506612484, 'weight_decay': 1.3593557516519648e-06, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 64} CV_score=0.7518
[Calib 10/30] hp={'lr': 0.02555887287373827, 'weight_decay': 0.0007224342252935954, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 16, 'fc_hidden': 64} CV_score=0.5766
[Calib 11/30] hp={'lr': 0.026478229790624384, 'weight_decay': 1.965103064448779e-08, 'batch_size': 64, 'num_kernels': 16, 'kernel

In [11]:
%%bash
mkdir -p out
for f in ../DeepBind/data/dream5/chipseq/*.seq; do
  base=$(basename "$f" .tsv)
  echo "Training on $base ..."
  python train.py --data_type ChIPSeq --seq_file "$f" --max_len 101 --epochs 10 --device cuda
  mv deepbind.final.pth out/deepbind_${base}.pth
done

Training on TF_23_CHIP_100_dinuc.seq ...


Traceback (most recent call last):
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 320, in <module>
    main()
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 251, in main
    score = cv_score(rest_ds, hp, device, binary=False)
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 148, in cv_score
    model = DeepBind(
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1369, in to
    return self._apply(convert)
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/nn/modules/module.py", line 928, in _apply
    module._apply(fn)
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/nn/modules/module.py", line 955, in _apply
    param_applied = fn(param)
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1355, in convert
    return t.to(
KeyboardInterrupt


Error while terminating subprocess (pid=84024): 


In [None]:
%%bash
strings $(find $CONDA_PREFIX -name "libstdc++.so.6") | grep CXXABI


CalledProcessError: Command 'b'strings $(find $CONDA_PREFIX -name "libstdc++.so.6") | grep CXXABI\n'' returned non-zero exit status 1.