### Variables and Modules

In [1]:
from data.pub import env
# Paths and executables used throughout the notebook, all read from the project's env() object.
rcsb_cif_gz_dir = env().rcsb_cif_gz_dir # local PDB database: compressed {xx}/{pdbid}.cif.gz files
rcsb_cif_dir    = env().rcsb_cif_dir # local PDB database: decompressed {xx}/{pdbid}.cif files
Add_CIF         = env().Add_CIF   # external program run once per covalent record to extract an adduct
Cov_Miner       = env().Cov_Miner # external program run once per mmCIF file to mine covalent bonds
covalent_raw    = env().covalent_raw  # text file receiving the unfiltered miner output
covalent_csv    = env().covalent_csv # result after filtration  
adduct_cif_dir  = env().adduct_cif_dir   # directory holding the extracted adduct files

import os, time, datetime, shlex, pathlib, subprocess  
from multiprocessing import Pool
from urllib.request import urlopen 
# Make sure the working directories exist before any cell writes into them.
# A plain loop is used instead of a throw-away list comprehension, which also
# avoids rebinding `_` (IPython uses `_` for the last cell output).
for directory in (rcsb_cif_gz_dir, rcsb_cif_dir, adduct_cif_dir):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)



### Delete obsolete PDB entries

In [None]:
# Remove local .cif.gz files for entries that wwPDB lists as obsolete.
# The third whitespace-separated field of each record is used as the PDB ID.
obsolete = 'ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat'
with urlopen(obsolete) as fr:
    _ = fr.readline()  # skip the first line of the listing
    # iter(..., b'') stops at EOF; a read error now propagates instead of
    # being swallowed and retried forever inside a `while True` loop.
    for raw_line in iter(fr.readline, b''):
        fields = raw_line.decode(errors='replace').split()
        if len(fields) < 3:
            continue  # blank or malformed record: nothing to delete
        PDBID = fields[2]
        pdbid = PDBID.lower()
        try:
            os.remove(f'{rcsb_cif_gz_dir}/{pdbid[1:3]}/{pdbid}.cif.gz')
        except FileNotFoundError:
            continue  # entry is not in the local mirror (or was already removed)
        except OSError as err:
            # Best effort: report real failures (permissions, ...) but keep going.
            print('{} could not be deleted: {}'.format(PDBID, err))
            continue
        print('{} is now obselete and deleted successfully.'.format(PDBID))

### Rsync 

In [None]:
!rsync -rlpt -z --delete --port=33444 rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ $rcsb_cif_dir

### Decompress

In [None]:
def unzipper(pdb_id):
    """Decompress one entry from the .cif.gz tree into the plain .cif tree.

    Reads  ``{rcsb_cif_gz_dir}/{pdb_id[1:3]}/{pdb_id}.cif.gz`` and writes
    ``{rcsb_cif_dir}/{pdb_id[1:3]}/{pdb_id}.cif``, creating the target
    subdirectory when it does not exist yet.

    If ``gzip`` reports an error, a message is printed and no output file is
    written, so a failed decompression no longer leaves an empty or truncated
    .cif behind. Returns None.
    """
    cif_subdir = f'{rcsb_cif_dir}/{pdb_id[1:3]}'
    pathlib.Path(cif_subdir).mkdir(parents=True, exist_ok=True)
    cif        = f'{cif_subdir}/{pdb_id}.cif'
    gz         = f'{rcsb_cif_gz_dir}/{pdb_id[1:3]}/{pdb_id}.cif.gz'
    proc = subprocess.run(args=['gzip', '-dc', gz],
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if proc.returncode != 0:
        message = proc.stderr.decode(errors='replace').strip()
        print(f'gzip exited with {proc.returncode} for {gz}: {message}')
        if proc.returncode != 2:  # gzip uses 2 for warnings; the output is still usable
            return
    with open(cif, 'wb') as fw:
        fw.write(proc.stdout)

# First four characters of every file name found one level below the .cif.gz root.
pdb_ids = [
    gz_entry.name[:4]
    for shard in os.scandir(rcsb_cif_gz_dir) if shard.is_dir()
    for gz_entry in os.scandir(shard.path) if gz_entry.is_file()
]

nCores = 20
# Decompress with nCores worker processes; the third map() argument is the chunk size.
with Pool(nCores) as pool:
    mining_result = pool.map(unzipper, pdb_ids, nCores)

### Mining

In [None]:
def cov_miner(fname): # miner, the c++ program
    """Run the Cov_Miner executable on one file and return its decoded stdout.

    stdout and stderr are both captured; only stdout is used. When the program
    produced no output, the file name is printed so the case is visible.
    """
    completed = subprocess.run([Cov_Miner, fname],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    output = completed.stdout.decode()
    if output == '':
        print(fname, output)
    return output

# Every *.cif file anywhere below the decompressed mmCIF root.
cifFiles = [
    '{}/{}'.format(dirName, fname)
    for dirName, subdirList, fileList in os.walk(rcsb_cif_dir)
    for fname in fileList
    if fname.endswith('.cif')
]

# Mine with nCores worker processes (nCores also serves as the map() chunk size).
with Pool(nCores) as pool:
    mining_result = pool.map(cov_miner, cifFiles, nCores)

# Concatenate the per-file outputs, split them into lines, and write one line per row.
mining_result = ''.join(mining_result).split('\n')
with open(covalent_raw, 'w') as f:
    f.writelines(line + '\n' for line in mining_result)

In [None]:
import hashlib
def md5(name_path_pair): 
    """Return ``(name, md5_hexdigest)`` for a ``(name, path)`` pair.

    The file at ``path`` is opened in binary mode inside a ``with`` block so the
    handle is always closed, and it is hashed in 1 MiB chunks so large files are
    not loaded into memory at once. ``name`` is passed through unchanged.
    """
    name, path = name_path_pair[0], name_path_pair[1]
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b''):
            digest.update(chunk)
    return name, digest.hexdigest()

# (file name, full path) for every file one level below the .cif.gz root.
collector = []
for subdir in os.scandir(rcsb_cif_gz_dir): 
    if subdir.is_dir():
        for gz in os.scandir(subdir.path):
            if gz.is_file():
                collector.append((gz.name, gz.path))

nCores = 20
# Use the pool as a context manager so the worker processes are shut down
# once the checksums are computed (a bare Pool(...).map(...) leaks them).
with Pool(nCores) as p:
    md5_collector = p.map(md5, collector, nCores)

import json, datetime 
# Store {file name: md5} in a JSON file whose name carries today's date.
with open('data/RCSB_PDB_mmCIF_gzip_md5_'+datetime.datetime.now().strftime('%b_%d_%Y')+'.json','w') as fw:
    json.dump(dict(md5_collector), fw)  

In [None]:
#import shlex, subprocess, os, pickle, pathlib, time, datetime, multiprocessing   
 

## Filter 1 & 2: Only a few nucleophilic amino-acid residues can be modified

In [2]:
# Keep only the raw miner lines that look like comma-separated records.
with open(covalent_raw, 'r') as fr:
    entries = [line for line in fr.read().splitlines() if ',' in line]

# (residue name, atom name) pairs accepted as the protein-side nucleophile.
Accepted_Nucleophile = [('ASP', 'OD1'), ('ASP', 'OD2'), ('CYS', 'SG'), ('GLU', 'OE2'), ('HIS', 'ND1'), ('HIS', 'NE2'), ('LYS', 'NZ'), ('MET', 'SD'),  ('THR', 'OG1'), ('SER', 'OG'), ('TYR', 'OH')]

# Field layout of one record (partner 1 = fields 1-10, partner 2 = fields 11-20):
#    0 1 2   3   4   5   6 7 8 9      10 1 2   3   4   5   6 7 8 9      20
# 2XAZ,A,A,LEU,LEU,729,729,?,C,?,polymer,A,A,NIY,NIY,730,730,?,N,?,polymer
Covalent = [] # records whose nucleophilic residue is stored as partner 1
for record in entries:
    fields = record.split(',')
    if (fields[4], fields[8]) in Accepted_Nucleophile and fields[10] == 'polymer':
        # The nucleophile is already the first partner: keep the record as is.
        Covalent.append(fields)
    elif (fields[14], fields[18]) in Accepted_Nucleophile and fields[20] == 'polymer':
        # The nucleophile is the second partner: swap the two halves.
        Covalent.append([fields[0]] + fields[11:] + fields[1:11])
    # Anything else is not an accepted covalent modification and is dropped.
len(Covalent)

29608

## Filter 3:  Only (C, H, O, N, P, S, F, Cl, Br, I, B) are allowed in Adduct

In [3]:
# We know some ligands already

# Three-letter residue names of the 20 amino acids excluded as ligands.
AAs = ("ALA ARG ASN ASP CYS GLU GLN GLY HIS ILE "
       "LEU LYS MET PHE PRO SER THR TRP TYR VAL").split()
# Nucleotide residue names excluded as ligands (D-prefixed names first).
NUCs = "DA DC DG DT DI A C G U".split()
# Ligand (chemical component) IDs mapped to the element symbol(s) that disqualify them.
# Only the keys are consumed when the exclusion list is built; the values appear to be
# informational. NOTE(review): value formatting is inconsistent (mixed upper/lower case,
# 'UMS' has a trailing space, 'OWK' is 'CO') - confirm before relying on the values.
HavingInvalidElement = {'1TW':'Se', '23S':'Se', '2MO':'Mo', '2ST':'Se', '30F':'Se', 
                        '30V':'Se', '31Q':'Hg', '32S':'Se', '3UQ':'Ru', '5SE':'Se', 
                        '6BR':'V', '6KY':'Se', '6SE':'Se', '6WF':'Fe', '6Y1':'Se', 
                        '6Y3':'Se', '75B':'Se', '8CY':'Ru', 'A71':'Pt', 'A72':'Pt', 
                        'AF3':'Al', 'AG':'Ag', 'ALF':'Al', 'AMM':'Si', 'AUC':'Au', 
                        'AU':'Au', 'AVC':'V', 'BCL':'Mg', 'BEF':'Be', 'BF2':'Be', 
                        'BFD':'Be', 'BS3':'Bi', 'C4R':'Rh', 'CAC':'As', 'CAF':'As', 
                        'CAS':'As', 'CEQ':'Si', 'CLA':'Mg', 'CMH':'Hg', 'CN1':'Fe', 
                        'CNC':'Co', 'CNF':'Fe', 'CR':'Cr', 'CRL':'Si', 'CSB':'Pb', 
                        'CSK':'Se', 'CSL':'Se', 'CSR':'As', 'CSZ':'Se', 'CUO':'Cu', 
                        'CVC':'V', 'CZ2':'As', 'CZZ':'As', 'DEU':'Co', 'DOD':'D', 
                        'DOS':'Os', 'DRU':'Ru', 'EU3':'Eu', 'EU':'Eu', 'FEC':'Fe', 
                        'FEM':'Fe', 'FEO':'Fe', 'FNE':'Fe, Ni', 'FSO':'Fe', 'FSX':'Fe', 
                        'GA':'Ga', 'GMS':'Se', 'HCO':'Fe', 'HDE':'Fe', 'HEB':'Fe', 
                        'HEC':'Fe', 'HEM':'Fe', 'HES':'Zn', 'HF3':'Hf', 'HF5':'Hf', 
                        'HFM':'Fe', 'ISW':'Fe', 'IUM':'U', 'LA':'La', 'LOS':'Os',  
                        'OEC': 'Mn, Ca', 'MOO': 'Mo', 'NXC': 'Pt', 'MSE': 'Se', 
                        'REP': 'Re', 'RMD': 'Rh', 'MH0': 'Fe', 'OWK': 'CO',
                        'RU8':'Ru', 'RUA':'Ru', 'RU':'Ru', 'RUD':'Ru', 'RUS':'Se', 
                        'SDG':'Se', 'SE7':'Se', 'SEC':'Se', 'SE':'Se', 'SF4':'Fe', 
                        'SOC':'Se', 'SRM':'Fe', 'SVA':'V', 'SXC':'Pd', 'SYS':'Se', 
                        'T4S':'Se', 'T5S':'Se', 'TB':'Tb', 'TBY':'Sn', 'TTI':'Te', 
                        'UMS':'Se ', 'US3':'Se', 'US5':'Se', 'VG1':'V', 'VO4':'V', 
                        'PK8': 'ZN', 'LQW': 'SE', 'LR2': 'SE', 'TDJ': 'RU', 'LQN': 'SE',
                        'WO4':'W', 'XCC':'Fe, Ni', 'XUA':'Se', 'XUG':'Se', 'YT3':'Y', 
                        'YXX':'Ru', 'ZNH':'Zn', 'ZRC':'Zr', 'F3S': 'FE', 'JY1': 'RH', 
                        'TBR': 'TA', 'CHL': 'MG', 'FUQ': 'MO', 'T9E': 'SE', 'HEA': 'FE', 
                        'IRI': 'IR', 'FES': 'FE', '8M0': 'MO', 'RIR': 'IR', 'KC1': 'MG', 
                        '9JT': 'SE', 'KC2': 'MG', 'EMC': 'HG', 'GXZ': 'W', '6ML': 'FE', 
                        'XCU':'W, Cu','XCO':'W, Co, Si', 'M6O': 'AL'}
Ligand_Excpt = [ # Other exceptional ligands: IDs excluded for reasons other than their elements
'UNL', 'UNX', 'UNK', # Unknown Ligand/Atom
'CO3', 'SO2', 'SO3', 'SO4', 'NO3', 'PO3', 'PO4', # Anion
'MAN',  # Alpha-D-Mannose
'GOL',  # Glycerol
'1PE',  # Pentaethylene Glycol
'NAP',  # NADP+
'BR',   # Element is not ligand 
'CO2',  # Carbon Dioxide   see https://pubs.acs.org/doi/10.1021/bi960424z
'PEG',  # Diethylene Glycol
'PAM',  # PTM: Palmitoylation
'DPN',  # AA: D-Phenylalanine
'PYE',  # Tetrahydropyran 
'BEN',  # A non-covalent inhibitor to trypsin and Xa factor, is often used as a ligand in protein crystallography to prevent proteases from degrading a protein of interest
'GDP',  # Guanosine-5'-Diphosphate
'AMP',  # Adenosine Monophosphate
'ADP',  # Adenosine-5'-Diphosphate
'ATP',  # Adenosine-5'-Triphosphate
'GTP',  # Guanosine-5'-Triphosphate
'NAG',  # GlcNAc PTM
'O',    # Oxygen ??? 1ADL
'IOD',  # Iodine
'Z',    # DNA linking
'CL',   # Chlorine 
'AYE',  # part of UbPA
'MLY',  # methylated LYS / PTM
'MTN',  # MTSL is an organosulfur compound that is used as a nitroxide spin label.
'PEB',   # PHYCOERYTHROBILIN
'PUB',   # PHYCOUROBILIN
'PVN',   # PHYCOVIOLOBILIN
'VRB',   # Phycoviolobilin
'CYC',   # PHYCOCYANOBILIN
'COA',   # cofactor CoA
'BLA',   # BILIVERDINE IX ALPHA
'DBV',   # 15,16-DIHYDROBILIVERDIN
'PCA',   # Modified Residues
'ABA',   # Modified Residues
'CME',   # Modified Residues 
'FAD',   # flavin adenine dinucleotide
'PLP',   # Pyridoxal phosphate cofactors
'PMP',   # pyridoxamine 5'-phosphate cofactors
'BME',   # 2-Mercaptoethanol Reducing Agents. If your protein contains cysteine residues, oxidation could become a problem and cause protein aggregation. To prevent this, keep a reducing agent such as DTT, TCEP, or 2-mercaptoethanol in your buffer.
]  
# Every residue/ligand ID that disqualifies a record, in the same order as before:
# amino acids, nucleotides, ligands with disallowed elements, other exceptions.
invalid_ligands = [*AAs, *NUCs, *HavingInvalidElement, *Ligand_Excpt]

# Drop records whose field 14 (the partner-2 residue name) is an excluded ID,
# then order the surviving records.
Covalent = [entry for entry in Covalent if entry[14] not in invalid_ligands]
Covalent.sort()
len(Covalent)

11559

### What is in adduct

In [10]:
# Record fields that make up an adduct file name:
#    0 1 2   3   4   5   6 7 8 9      10 1 2   3   4   5   6 7 8 9      20
# 2XAZ,A,A,LEU,LEU,729,729,?,C,?,polymer,A,A,NIY,NIY,730,730,?,N,?,polymer
collector = []            # expected adduct file path for every record
vecEntry_collector = []   # records whose adduct file does not exist yet
for vecEntry in Covalent:
    blockName = '_'.join(vecEntry[k] for k in (0, 2, 4, 6, 7, 8, 9, 12, 14, 16, 17, 18, 19))
    adduct_cif_name = '/'.join((adduct_cif_dir, blockName)) + "_cif"
    collector.append(adduct_cif_name)
    if os.path.isfile(adduct_cif_name):
        continue  # already extracted
    vecEntry_collector.append(vecEntry)

In [11]:
home_dir  = os.environ.get('HOME')

def adduct_extractor(vecEntry):
    """Run the Add_CIF executable for one covalent record.

    The record is passed as a comma-joined string, followed by the path of the
    entry's mmCIF file and the adduct output directory. The program's stdout
    and stderr are captured and discarded; nothing is returned.
    """
    entry_id = vecEntry[0].lower()
    source_cif = f'{rcsb_cif_dir}/{entry_id[1:3]}/{entry_id}.cif'
    command = [Add_CIF, ','.join(vecEntry), source_cif, adduct_cif_dir]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# Extract the missing adducts; nCores is both the pool size and the map() chunk size.
nCores = 1
with Pool(nCores) as pool:
    pool.map(adduct_extractor, vecEntry_collector, nCores)

In [13]:
# Covalent contains duplicates because of _struct_conn.ptnr1_symmetry, so compare as sets:
# (expected but missing on disk, on disk but not expected).
new_collector = {adduct_cif.path for adduct_cif in os.scandir(adduct_cif_dir)}
expected_paths = set(collector)
expected_paths - new_collector, new_collector - expected_paths

(set(), set())

### Let's see if any pdb contains invalid elements

In [14]:
from CifFile import ReadCif 
# Element symbols allowed in an adduct, as they are compared against _atom_site.type_symbol.
valid_elements = {'C', 'H', 'D', 'O', 'N', 'P', 'S', 'F', 'CL', 'BR', 'I', 'B'}

# Maps the 9th '_'-separated part of an adduct file name to its disallowed elements.
collector = {}
for adduct_cif in os.scandir(adduct_cif_dir):
    with open(adduct_cif.path) as fr:
        block = ReadCif(fr).first_block()
        invalid_elements = set(block['_atom_site.type_symbol']) - valid_elements
        if not invalid_elements:
            continue
        ligand_key = adduct_cif.name.split('_')[8]
        collector[ligand_key] = ', '.join(invalid_elements)
print('Invalid ligands:')
print(collector)

Invalid ligands:
{}


### No Invalid Elements? Save covalent record

In [8]:
# Write the filtered records as comma-separated lines (no trailing newline),
# then show the record count and the first record.
csv_rows = [','.join(record) for record in Covalent]
with open(covalent_csv, 'w') as f:
    f.write('\n'.join(csv_rows))
print(len(Covalent), Covalent[0])

11559 ['148L', 'A', 'E', 'GLU', 'GLU', '26', '26', '?', 'OE2', '?', 'polymer', 'C', 'A', 'MUB', 'MUB', '.', '1', '?', 'C1', '?', 'branched']
