In [10]:
import sys, os, csv, math, random, itertools, pickle, logging, yaml, psutil
from collections import defaultdict
import concurrent.futures as cf
from tqdm import tqdm
from addict import Dict
import numpy as np, pandas as pd
sys.path.append('/workspace')
from tools.logger import add_stream_handler, get_logger
sys.path.append("/workspace/cplm")
from src.utils.lmdb import new_lmdb
from src.data.lmdb import npy_to_lmdb
# Notebook logger (helpers imported from tools.logger); a stream handler is
# registered with logging.DEBUG as its level argument.
logger = get_logger()
add_stream_handler(logger, logging.DEBUG)


In [3]:
# Heavy import.
import cudf

##  241216 finetune dataの分割(crossdockedの分割に従う)。

In [17]:
# File list of the r4 finetune preprocessing results; the first CSV column is
# used as the DataFrame index.
filepath = "/workspace/cplm/preprocess/results/finetune/r4/filenames.csv"
dffile = pd.read_csv(filepath, index_col=0)

In [None]:
# Sanity check: number of records and the first few rows.
print(len(dffile))
dffile.head()

In [None]:
# Combine the identifying fields of every record into a single string of the
# form "dname/lig_name/protein_name/sdf_idx".
record_fields = zip(tqdm(dffile['dname']), dffile['lig_name'], dffile['protein_name'], dffile['sdf_idx'])
conds = np.array([
    '/'.join((dname, lname, pname, str(sdf_idx)))
    for dname, lname, pname, sdf_idx in record_fields
])

In [31]:
# Lookup table from condition string to its position in conds, so membership
# tests and position lookups are dictionary accesses.
cond2i = dict(zip(conds, range(len(conds))))

In [None]:
# Check that the records are unique ... OK
# (the two printed lengths are equal when no condition string is duplicated)
uconds = np.unique(conds)
print(len(conds), len(uconds))

In [53]:
# CrossDocked split definitions (.types files). They are read as
# space-separated text without a header row, so columns are labelled 0, 1, 2, ...
dfftest = pd.read_csv("/workspace/cheminfodata/crossdocked/types/cdonly_it2_tt_v1.3_0_test0.types", sep=' ', header=None)
dfftrain = pd.read_csv("/workspace/cheminfodata/crossdocked/types/cdonly_it2_tt_v1.3_0_train0.types", sep=' ', header=None)

In [None]:
# Sanity check: row count, the first five values of column 4, and the first rows.
print(len(dfftest))
print(dfftest[4][:5].tolist())
dfftest.head()

In [None]:
# Sanity check: first rows of the train types table.
dfftrain.head()

In [None]:
# For each types table (test, train), look up every receptor/ligand row among
# the records of dffile: hits are tallied per record position in cond_counts,
# rows without a matching record are collected in nofile_conds.
nofile_conds = {}
cond_counts = {}
for split, dff in (('test', dfftest), ('train', dfftrain)):
    missing = nofile_conds[split] = []
    hits = cond_counts[split] = np.zeros(len(conds), dtype=int)
    for idx, rec, lig in zip(dff.index, tqdm(dff[3]), dff[4]):
        # Both paths must have the form "<dir>/<file>" and share the directory.
        rec_dir, rec_file = rec.split('/')
        lig_dir, lig_file = lig.split('/')
        assert rec_dir == lig_dir, idx

        # Both files must carry the .gninatypes extension.
        rec_stem, rec_ext = os.path.splitext(rec_file)
        lig_stem, lig_ext = os.path.splitext(lig_file)
        assert rec_ext == lig_ext == '.gninatypes', idx

        # The receptor stem ends in "_0"; the ligand stem ends in "_<sdf_idx>".
        rec_stem, rec_idx = rec_stem.rsplit('_', maxsplit=1)
        assert rec_idx == '0', idx
        lig_stem, sdf_idx = lig_stem.rsplit('_', maxsplit=1)

        # Same key layout as conds: dname/lig_name/protein_name/sdf_idx.
        cond = '/'.join([rec_dir, lig_stem+'.sdf', rec_stem+'.pdb', sdf_idx])
        pos = cond2i.get(cond)
        if pos is None:
            missing.append(cond)
        else:
            hits[pos] += 1

In [None]:
# What kind of entries are missing?
print(len(nofile_conds['train']))
print(nofile_conds['train'][0])
# ... these files did in fact exist.

In [None]:
# Inspect the records of a single directory.
dft = dffile[dffile['dname'] == "1433B_HUMAN_1_240_pep_0"]
print(len(dft))
dft
# It looks like entries containing _it1_ were not collected. Why?
# Indeed, the code does that too. ... Come to think of it, BindGPT states that only min and docked are used.

In [None]:
# It is unclear which subset BindGPT uses, so check the number of entries here.
# BindGPT had roughly 27M ... fewer.
# Guarded so that re-running this cell does not keep appending the same path.
if "/workspace/cplm" not in sys.path:
    sys.path.append("/workspace/cplm")
from src.utils.lmdb import load_lmdb
env, txn = load_lmdb("/workspace/cplm/preprocess/results/finetune/r4/main.lmdb")
try:
    print(env.stat()['entries'])
finally:
    # Only the entry count is needed, so release the environment instead of
    # leaving it open for the rest of the kernel session.
    # NOTE(review): assumes load_lmdb returns an lmdb-style environment with
    # close(), like the one returned by new_lmdb — confirm.
    env.close()


## 241218 cd2020のtypesファイルに基づいてr4_allを分割
データ容量削減のため, lmdbに保存する。

In [4]:
# Load the r4_all file list with cudf; the first CSV column is used as the index.
df = cudf.read_csv("/workspace/cplm/preprocess/results/finetune/r4_all/filenames.csv", index_col=0)

In [5]:
# Read the train/test .types tables of the it2_tt_v1.3 split for one seed.
seed = 0
# The files are space-separated and have no header row, so names are supplied.
types_columns = ['label', 'pK', 'RMSD', 'Receptor', 'Ligand', 'Vina score']
types_read_kwargs = dict(header=None, names=types_columns, sep=' ')
dftype_train = cudf.read_csv(
    f"/workspace/cheminfodata/crossdocked/types/it2_tt_v1.3_0_train{seed}.types",
    **types_read_kwargs)
dftype_test = cudf.read_csv(
    f"/workspace/cheminfodata/crossdocked/types/it2_tt_v1.3_0_test{seed}.types",
    **types_read_kwargs)


In [6]:
# Directory names (the part of each receptor path before the first '/') that
# occur in the train and in the test types table.
dnames_train = set(path.partition('/')[0] for path in dftype_train['Receptor'].to_pandas())
dnames_test = set(path.partition('/')[0] for path in dftype_test['Receptor'].to_pandas())

In [7]:
# Label every row of df with its split, decided by its directory name (dname):
#    0 -> dname occurs in the train types table
#    1 -> dname occurs in the test types table
#   -1 -> dname occurs in neither
# The test label is written last, so a dname present in both sets ends up as
# test (the same precedence as assigning train first and test second).
# One isin() mask per split replaces a full-column comparison per directory,
# and .loc replaces the chained-indexing assignment df['split'][mask] = value.
df['split'] = -1
df.loc[df['dname'].isin(list(dnames_train)), 'split'] = 0
df.loc[df['dname'].isin(list(dnames_test)), 'split'] = 1

# Number of train rows, test rows, and rows belonging to neither split.
print((df['split'] == 0).sum(), (df['split'] == 1).sum(), (df['split']==-1).sum())

100%|██████████| 1835/1835 [00:17<00:00, 107.22it/s]
100%|██████████| 1065/1065 [00:09<00:00, 111.13it/s]

29517648 14263936 1528868





In [8]:
# Persist the index values of the train rows (split == 0) and the test rows
# (split == 1) as .npy arrays.
os.makedirs("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0", exist_ok=True)
is_train = df['split'] == 0
is_test = df['split'] == 1
train_idx = df.index[is_train].to_pandas().values
test_idx = df.index[is_test].to_pandas().values
np.save("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/train_idxs.npy", train_idx)
np.save("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_idxs.npy", test_idx)

In [None]:
# Copy a saved index array into an LMDB file: entry i is stored under the
# ASCII-encoded key str(i) with the pickled index value as its payload.
# The paths are named npy_path / lmdb_path (not input / output) so the builtin
# input() is not shadowed for the rest of the kernel session.
npy_path = "/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/train_idxs.npy"
lmdb_path = "/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/train_idxs.lmdb"

# To convert the test split instead, use these paths:
# npy_path = "/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_idxs.npy"
# lmdb_path = "/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_idxs.lmdb"

index = np.load(npy_path)
env, txn = new_lmdb(lmdb_path)
try:
    for i, idx in enumerate(tqdm(index)):
        txn.put(str(i).encode('ascii'), pickle.dumps(idx))
    txn.commit()
finally:
    # Close the environment even if a write fails part-way through.
    env.close()

100%|██████████| 29517648/29517648 [01:46<00:00, 276880.32it/s]


In [None]:
# 250524: switched to npy_to_lmdb (helper imported from src.data.lmdb).
npy_to_lmdb("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/train_idxs.npy")
npy_to_lmdb("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_idxs.npy")

100%|██████████| 29517648/29517648 [00:13<00:00, 2252027.67it/s]


## 250524 生成の評価用に, 各ディレクトリから1つずつサンプリング

In [None]:
# Reload the r4_all file list with pandas (gzip-compressed CSV) together with
# the saved test indices.
# NOTE(review): this read does not pass index_col=0, whereas test_idxs was
# taken from the index of a frame read with index_col=0. Positional use of
# test_idxs assumes those index labels equal the row positions — confirm.
df = pd.read_csv("/workspace/cplm/preprocess/results/finetune/r4_all/filenames.csv.gz")
test_idxs = np.load("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_idxs.npy")

In [6]:
# Sanity check: total number of rows vs. number of test indices.
print(len(df), len(test_idxs))

45310452 14263936


In [8]:
# Draw one random test row per directory (dname), using a fixed seed.
rng = np.random.default_rng(0)
dnames = df['dname'].values[test_idxs]

# Visit the distinct directories in a seeded random order.
udnames = np.unique(dnames)
rng.shuffle(udnames)

# For every directory, pick one of its positions within test_idxs.
iidxs = [rng.choice(np.flatnonzero(dnames == dname)) for dname in tqdm(udnames)]

# Translate positions within the test split back into indices of df.
idxs = test_idxs[iidxs]

100%|██████████| 1054/1054 [01:19<00:00, 13.26it/s]


In [10]:
# Save the per-directory sample of test indices as an integer array.
np.save("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_dirwise_idxs.npy", np.array(idxs, dtype=int))

In [None]:
# Sanity check: the sample must contain exactly one row per directory, i.e. the
# number of distinct dnames among the sampled rows equals the sample size.
# A separate name is used so the test-split `dnames` array built by the
# sampling cell is not overwritten with a different array.
print(len(idxs))
sampled_dnames = df['dname'].values[idxs]
n_sampled_dirs = len(set(sampled_dnames.tolist()))
print(n_sampled_dirs)
assert n_sampled_dirs == len(idxs), "a directory was sampled more than once"

1054
1054


In [2]:
# Pass the saved per-directory test indices to npy_to_lmdb (helper from src.data.lmdb).
npy_to_lmdb("/workspace/cplm/preprocess/results/finetune/r4_all/split/it2_0/test_dirwise_idxs.npy")

100%|██████████| 1054/1054 [00:00<00:00, 590305.30it/s]
