# DeepBind Reproduction
## PBM Dataset
### PBM 数据集  
<b> (1)sequences.tsv.gz </b>  
一列或多列的tsv,每行对应一个短序列(一个probe)   
<b> (2)targets.tsv.gz </b>  
对于sequences.tsv.gz的强度表，每一行对应一个probe,每一列对应一个TF  
<b> (3)tfids.txt </b>  
一行一个TFid  




In [1]:
import numpy as np
import pandas as pd
import gzip

# 1) 看 tfids.txt
with open('../DeepBind/data/dream5/pbm/tfids.txt', 'r') as f:
    tfids = [ln.strip() for ln in f if ln.strip()]
print("Found TF ids (first 20):", tfids[:20])
print("Total TF columns:", len(tfids))

# 2) 读 sequences.tsv.gz（如果每行是序列）
seqs = pd.read_csv('../DeepBind/data/dream5/pbm/sequences.tsv.gz', sep='\t', header=None, compression='gzip', engine='python')
print("sequences shape:", seqs.shape)
print("first rows:\n", seqs.head())

# 常见：如果只有一列，序列在第0列
# seqs[0].head()

# 3) 读 targets.tsv.gz（检查列名/形状）
targets = pd.read_csv('../DeepBind/data/dream5/pbm/targets.tsv.gz', sep='\t', header=0, compression='gzip', engine='python')
print("targets shape:", targets.shape)
print("targets columns:", targets.columns[:20])
print("first rows:\n", targets.head())

# 4) 检查 probe_biases.npz
npz = np.load('../DeepBind/data/dream5/pbm/probe_biases.npz', allow_pickle=True)
print("npz keys:", list(npz.keys()))
for k in npz:
    arr = npz[k]
    print(k, arr.shape, arr.dtype)
    print("sample:", arr[:5])

Found TF ids (first 20): ['TF_1', 'TF_2', 'TF_3', 'TF_4', 'TF_5', 'TF_6', 'TF_7', 'TF_8', 'TF_9', 'TF_10', 'TF_11', 'TF_12', 'TF_13', 'TF_14', 'TF_15', 'TF_16', 'TF_17', 'TF_18', 'TF_19', 'TF_20']
Total TF columns: 86
sequences shape: (80857, 3)
first rows:
          0               1                                         2
0  Fold ID        Event ID                                       seq
1        B  MEreverse14075  TAAAACTATGAGGAAGGATTCAGGGTCGGACAGTGCCTGT
2        B  MEforward19438  CTTATGATCAGAAGCGGCTAGGTGTATTACATGTCCCTGT
3        B  MEforward19439  CCGCCGTAGGCCCCGAAACAGTACCAGACATGTAACCTGT
4        B  MEforward19436  GACCAAACGAGTCCTAGGATTCCAAGCGTTACGACCCTGT
targets shape: (80856, 86)
targets columns: Index(['TF_40', 'TF_41', 'TF_42', 'TF_43', 'TF_44', 'TF_45', 'TF_46', 'TF_47',
       'TF_48', 'TF_49', 'TF_7', 'TF_6', 'TF_5', 'TF_4', 'TF_3', 'TF_2',
       'TF_1', 'TF_9', 'TF_8', 'TF_35'],
      dtype='object')
first rows:
          TF_40         TF_41        TF_42        TF_43 

  arr = npz[k]


用pbm.py处理pbm数据集

In [3]:
from torch.utils.data import DataLoader
from data.pbm import PBMDataset
import torch

seq_file = '../DeepBind/data/dream5/pbm/sequences.tsv.gz'
tgt_file = '../DeepBind/data/dream5/pbm/targets.tsv.gz'
tf_col = 'TF_1'

dataset = PBMDataset(seq_file, tgt_file, tf_col=tf_col, max_len=36)

loader = DataLoader(dataset, batch_size=4, shuffle=True)

for xb, yb in loader:
    print("xb shape:", xb.shape)
    print("yb shape:", yb.shape)
    print("yb:", yb)
    break



xb shape: torch.Size([4, 4, 36])
yb shape: torch.Size([4])
yb: tensor([ 532.3791, 1436.0115,  712.4257,  546.6063], dtype=torch.float64)


In [5]:
with open('../DeepBind/data/dream5/pbm/tfids.txt') as f:
    tfids = [l.strip() for l in f if l.strip()]
    print("Available TFs:", tfids[:50])

Available TFs: ['TF_1', 'TF_2', 'TF_3', 'TF_4', 'TF_5', 'TF_6', 'TF_7', 'TF_8', 'TF_9', 'TF_10', 'TF_11', 'TF_12', 'TF_13', 'TF_14', 'TF_15', 'TF_16', 'TF_17', 'TF_18', 'TF_19', 'TF_20', 'TF_21', 'TF_22', 'TF_23', 'TF_24', 'TF_25', 'TF_26', 'TF_27', 'TF_28', 'TF_29', 'TF_30', 'TF_31', 'TF_32', 'TF_33', 'TF_34', 'TF_35', 'TF_36', 'TF_37', 'TF_38', 'TF_39', 'TF_40', 'TF_41', 'TF_42', 'TF_43', 'TF_44', 'TF_45', 'TF_46', 'TF_47', 'TF_48', 'TF_49', 'TF_50']


In [5]:
%%bash
python train.py \
  --data_type PBM \
  --seq_file ../DeepBind/data/dream5/pbm/sequences.tsv.gz \
  --tgt_file ../DeepBind/data/dream5/pbm/targets.tsv.gz \
  --tf_col TF_1 \
  --max_len 36 \
  --epochs 20 \
  --device cuda

Traceback (most recent call last):
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 322, in <module>
    main()
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 253, in main
    score = cv_score(rest_ds, hp, device, binary=False)
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 194, in cv_score
    for xb, yb in train_dl:
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
    data = self._next_data()
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 790, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 50, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/dataset.py", line 414, in __getitems_

Error while terminating subprocess (pid=14140): 


In [4]:
%%bash
mkdir -p out
for f in ../DeepBind/data/dream5/chipseq/*.seq; do
  base=$(basename "$f" .tsv)
  echo "Training on $base ..."
  python train.py --data_type ChIPSeq --seq_file "$f" --max_len 101 --epochs 10 --device cuda
  mv deepbind.final.pth out/deepbind_${base}.pth
done

Training on TF_23_CHIP_100_dinuc.seq ...
[Calib 01/30] hp={'lr': 0.0005197010971552333, 'weight_decay': 2.6846351535304474e-05, 'batch_size': 128, 'num_kernels': 8, 'kernel_size': 36, 'fc_hidden': 64} CV_score=0.0366
[Calib 02/30] hp={'lr': 0.0027441525887873237, 'weight_decay': 3.0153015189842353e-06, 'batch_size': 64, 'num_kernels': 32, 'kernel_size': 36, 'fc_hidden': 64} CV_score=0.0734
[Calib 03/30] hp={'lr': 0.00048265158720140806, 'weight_decay': 3.808570266314045e-06, 'batch_size': 64, 'num_kernels': 32, 'kernel_size': 36, 'fc_hidden': 64} CV_score=0.0080
[Calib 04/30] hp={'lr': 0.005254051882727946, 'weight_decay': 0.0006056060838808088, 'batch_size': 64, 'num_kernels': 16, 'kernel_size': 12, 'fc_hidden': 16} CV_score=0.0621
[Calib 05/30] hp={'lr': 0.00953275883311439, 'weight_decay': 0.00017639123243324322, 'batch_size': 128, 'num_kernels': 16, 'kernel_size': 12, 'fc_hidden': 32} CV_score=0.0244
[Calib 06/30] hp={'lr': 0.006721222492655391, 'weight_decay': 0.000376958561780029

Traceback (most recent call last):
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 165, in <module>
    main()
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 117, in main
    score = cv_score(rest_ds, hp, device, binary=False)
  File "/home/syl/TFBS_prediction/my_repro/train.py", line 79, in cv_score
    for xb, yb in train_dl:
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 734, in __next__
    data = self._next_data()
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 790, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 50, in fetch
    data = self.dataset.__getitems__(possibly_batched_index)
  File "/home/syl/anaconda3/envs/tfbs/lib/python3.9/site-packages/torch/utils/data/dataset.py", line 414, in __getitems__

Error while terminating subprocess (pid=33558): 


In [None]:
%%bash
strings $(find $CONDA_PREFIX -name "libstdc++.so.6") | grep CXXABI


CalledProcessError: Command 'b'strings $(find $CONDA_PREFIX -name "libstdc++.so.6") | grep CXXABI\n'' returned non-zero exit status 1.