# タンパク質についての実験

In [1]:
import os
import pandas as pd
import lmdb
from biopandas.pdb import PandasPdb
from tqdm import tqdm
import pickle
import re
import json
import glob

In [None]:
# Atom names that receive flag 0 in the `side` list built later in this
# notebook; every other (digit-stripped) atom name receives flag 1.
main_atoms = ["N", "CA", "C", "O", "H"]


In [3]:

def load_from_CASF(pdb_id):
    """Load one CASF-2016 protein structure and its pocket residue list.

    Args:
        pdb_id: Entry ID (e.g. ``'2cet'``). It is used both to build the
            ``{pdb_id}_protein.pdb`` file name and as the key into
            ``casf2016.pocket.json``.

    Returns:
        A ``(pmol, pocket_residues)`` tuple: the ``PandasPdb`` object read
        from the protein PDB file, and the value stored under ``pdb_id`` in
        the pocket JSON file.

    Raises:
        KeyError: If ``pdb_id`` has no entry in the pocket JSON file.
    """
    pdb_path = f"/workspace/cheminfodata/unimol/CASF-2016/casf2016/{pdb_id}_protein.pdb"
    pmol = PandasPdb().read_pdb(pdb_path)
    # Open the JSON through a context manager so the file handle is closed
    # even if parsing or the key lookup fails.
    with open("/workspace/cheminfodata/unimol/CASF-2016/casf2016.pocket.json") as fh:
        pocket_residues = json.load(fh)[pdb_id]
    return pmol, pocket_residues


In [5]:
# Try the loader on a single entry and display the returned tuple.
load_from_CASF('2cet')

(<biopandas.pdb.pandas_pdb.PandasPdb at 0x7deb20b37700>,
 ['A121',
  'A404',
  'A17',
  'D281',
  'A20',
  'A168',
  'A180',
  'A406',
  'A403',
  'A398',
  'A169',
  'A173',
  'A295',
  'A166',
  'A405',
  'A293',
  'A414',
  'A77',
  'A322',
  'A223',
  'A222',
  'A351',
  'A122',
  'A324',
  'A165'])

In [6]:

def normalize_atoms(atom):
    """Return ``atom`` with every run of decimal digits removed.

    For example ``"HG11"`` becomes ``"HG"`` and ``"H2"`` becomes ``"H"``.
    A name consisting only of digits becomes the empty string.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"\d+", "", atom)

In [7]:
# Load one example entry and keep handles to its ATOM / HETATM tables.
pdb_id = "2cet"
pname = pdb_id

pmol, pocket_residues = load_from_CASF(pdb_id)
pro_atom, pro_hetatm = pmol.df["ATOM"], pmol.df["HETATM"]

In [12]:
# Check which container type holds the HETATM records.
type(pmol.df['HETATM'])

pandas.core.frame.DataFrame

In [22]:
# Dump every atom name from the ATOM table to see which names occur.
list(pmol.df['ATOM']['atom_name'].values)

['N',
 'CA',
 'C',
 'O',
 'CB',
 'CG1',
 'CG2',
 'H',
 'HA',
 'HB',
 'HG11',
 'HG12',
 'HG13',
 'HG21',
 'HG22',
 'HG23',
 'H2',
 'H3',
 'N',
 'CA',
 'C',
 'O',
 'CB',
 'CG',
 'CD',
 'CE',
 'NZ',
 'H',
 'HA',
 'HB2',
 'HB3',
 'HG2',
 'HG3',
 'HD2',
 'HD3',
 'HE2',
 'HE3',
 'HZ1',
 'HZ2',
 'HZ3',
 'N',
 'CA',
 'C',
 'O',
 'CB',
 'CG',
 'CD',
 'CE',
 'NZ',
 'H',
 'HA',
 'HB2',
 'HB3',
 'HG2',
 'HG3',
 'HD2',
 'HD3',
 'HE2',
 'HE3',
 'HZ1',
 'HZ2',
 'HZ3',
 'N',
 'CA',
 'C',
 'O',
 'CB',
 'CG',
 'CD1',
 'CD2',
 'CE1',
 'CE2',
 'CZ',
 'H',
 'HA',
 'HB2',
 'HB3',
 'HD1',
 'HD2',
 'HE1',
 'HE2',
 'HZ',
 'N',
 'CA',
 'C',
 'O',
 'CB',
 'CG',
 'CD',
 'HA',
 'HB2',
 'HB3',
 'HG2',
 'HG3',
 'HD2',
 'HD3',
 'N',
 'CA',
 'C',
 'O',
 'CB',
 'CG',
 'CD',
 'OE1',
 'OE2',
 'H',
 'HA',
 'HB2',
 'HB3',
 'HG2',
 'HG3',
 'N',
 'CA',
 'C',
 'O',
 'H',
 'HA2',
 'HA3',
 'N',
 'CA',
 'C',
 'O',
 'CB',
 'CG',
 'CD1',
 'CD2',
 'CE1',
 'CE2',
 'CZ',
 'H',
 'HA',
 'HB2',
 'HB3',
 'HD1',
 'HD2',
 'HE1',
 'HE2',
 '

In [13]:
# Tag each record with a "<chain_id><residue_number>" key so rows can be
# matched against the entries of `pocket_residues`.
for frame in (pro_atom, pro_hetatm):
    frame["ID"] = frame["chain_id"].astype(str) + frame["residue_number"].astype(str)

# Keep only the rows whose key is listed as a pocket residue, ATOM rows
# first and HETATM rows after, renumbered from 0.
pocket_parts = [
    frame[frame["ID"].isin(pocket_residues)] for frame in (pro_atom, pro_hetatm)
]
pocket = pd.concat(pocket_parts, axis=0, ignore_index=True)

In [17]:
# Display the atom names of the selected pocket rows.
pocket["atom_name"]

0        N
1       CA
2        C
3        O
4       CB
      ... 
430    HE2
431     HZ
432      O
433     H1
434     H2
Name: atom_name, Length: 435, dtype: object

In [15]:

# Strip digits from the atom names and drop rows whose name becomes empty.
pocket["normalize_atom"] = pocket["atom_name"].map(normalize_atoms)
has_name = pocket["normalize_atom"] != ""
pocket = pocket[has_name]

# Per-atom features of the remaining rows.
patoms = [normalize_atoms(name) for name in pocket["atom_name"]]
xyz = pocket[["x_coord", "y_coord", "z_coord"]]
pcoords = [xyz.values]  # a one-element list wrapping the coordinate array
side = [int(a not in main_atoms) for a in patoms]  # 0 = listed in main_atoms
residue_keys = pocket["chain_id"].astype(str) + pocket["residue_number"].astype(str)
residues = residue_keys.values.tolist()

"""
return pickle.dumps(
    {
        "atoms": patoms,
        "coordinates": pcoords,
        "side": side,
        "residue": residues,
        "pdbid": pname,
    },
    protocol=-1,
)
"""
''

''

In [None]:

def write_lmdb(pdb_id_list, job_name, outpath="./results"):
    """Write one LMDB record per PDB ID into ``<outpath>/<job_name>.lmdb``.

    Any existing file at the target path is removed first, so each call
    starts from an empty database. Records are keyed by the ASCII-encoded
    position of the ID in ``pdb_id_list`` ("0", "1", ...), and all records
    are written in a single transaction.

    Args:
        pdb_id_list: Iterable of PDB IDs; each one is passed to ``parser``.
        job_name: Base name of the output file (".lmdb" is appended).
        outpath: Directory for the output file; created if missing.

    NOTE(review): ``parser`` is not defined in the visible part of this
    notebook. It must return a bytes-like value accepted by ``txn.put``
    (presumably the pickled dict sketched in the cell above; confirm).
    """
    os.makedirs(outpath, exist_ok=True)
    outputfilename = os.path.join(outpath, job_name + ".lmdb")
    try:
        os.remove(outputfilename)
    except FileNotFoundError:
        # Nothing to clean up on a first run. Other errors (e.g. permission
        # problems) are left to propagate instead of silently reusing a
        # stale database file.
        pass
    env_new = lmdb.open(
        outputfilename,
        subdir=False,
        readonly=False,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=1,
        map_size=int(10e9),
    )
    try:
        # The transaction context manager commits on normal exit and aborts
        # if an exception escapes, so a failing entry leaves no partial write.
        with env_new.begin(write=True) as txn_write:
            # Wrap the list itself (not the enumerate object) so tqdm can
            # report the total when the input has a length.
            for i, pdb_id in enumerate(tqdm(pdb_id_list)):
                inner_output = parser(pdb_id)
                txn_write.put(f"{i}".encode("ascii"), inner_output)
    finally:
        env_new.close()

In [None]:
# Run configuration. Only job_name and data_path are used by the code in
# this cell; the remaining settings are not referenced here.
job_name = "get_pocket_repr"  # replace with your custom name
data_path = "./results"  # replace with your data path
weight_path = "pocket_pre_220816.pt"  # replace with your checkpoint path
only_polar = 0  # no h
dict_name = "dict_coarse.txt"
batch_size = 16
results_path = data_path  # replace with your save path

# Collect the unique 4-character prefixes of the entries under casf2016/,
# drop 3qgy (the reason is not recorded here), and write the LMDB file.
entries = os.listdir(os.path.join(CASF_PATH, "casf2016"))
casf_collect = list({entry[:4] for entry in entries})
casf_collect.remove("3qgy")
write_lmdb(casf_collect, job_name=job_name, outpath=data_path)