In [2]:
from pyrosetta import *
from pyrosetta.rosetta import *
from pyrosetta.teaching import *

from pyrosetta.rosetta.protocols.carbohydrates import *
from pyrosetta.rosetta.core.select.residue_selector import *
from pyrosetta.rosetta.core.simple_metrics.metrics import *
from pyrosetta.rosetta.core.simple_metrics.composite_metrics import *
from pyrosetta.rosetta.core.simple_metrics.per_residue_metrics import *

from scipy.spatial.transform import Rotation as R

from utils.carb_utils import *

options = """
-beta
-include_sugars
-alternate_3_letter_codes pdb_sugar

-write_pdb_link_records
-auto_detect_glycan_connections
-ignore_unrecognized_res
-out:level 100
"""

#-out:level 100

init(" ".join(options.split('\n')))

import os
import numpy as np
import pandas as pd
import copy

input_dir = "./"
os.chdir(input_dir)

PyRosetta-4 2021 [Rosetta PyRosetta4.Release.python38.mac 2021.36+release.57ac713a6e1d8ce6f60269b3988b1adac1d96fc6 2021-09-10T13:50:04] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


In [3]:
sc = get_score_function()
fr = pyrosetta.rosetta.protocols.relax.FastRelax()
fr.set_scorefxn(sc)
fr.max_iter(100)

In [4]:
#Get list of proteins
pdb = [];

df = pd.read_csv('./carbbinders_pdblist.txt',header=None).values

print(len(df),df)

6559 [['3old']
 ['3ole']
 ['1pig']
 ...
 ['5DFM']
 ['4A34']
 ['5HQJ']]


In [5]:
ls = os.listdir('./pdbs')
print(len(ls))

6251


In [6]:

        
#Determines if the protein is glycosylated or we have free carbohydrates
def is_glycosylated(pose):
    """
    Gets whether a pose contains glycosylated proteins
    Args:
        pose : pyrosetta pose 
    Returns:
        bool : True / False
    """
    tree_set = pose.glycan_tree_set()
    for start in tree_set.get_start_points():
        parent = tree_set.get_parent(start)
        if parent != 0:
            return True;
    return False
    

In [7]:
os.getcwd()

'/Users/scanner1/Downloads/capdock'

In [8]:
pose = pose_from_pdb('./pdbs/' + ls[0])

In [9]:
ls[0]

'1HGG.pdb'

In [10]:
tree_set = pose.glycan_tree_set()

In [11]:
s = []
for start in tree_set.get_start_points():
    #dont include glycosylated bois
    parent = tree_set.get_parent(start);
    #print(parent)
    if parent != 0:
        continue;
    print(start, pose.pdb_info().pose2pdb(start), pose.residue_type(start).name3(), pose.residue_type(start).name())
    s.append(start)

1525 1 H  Glc ->4)-beta-D-Glcp:reducing_end
1528 1 I  Glc ->4)-beta-D-Glcp:reducing_end
1534 1 K  Glc ->4)-beta-D-Glcp:reducing_end
1537 1 L  Glc ->4)-beta-D-Glcp:reducing_end
1543 1 N  Glc ->4)-beta-D-Glcp:reducing_end
1546 1 O  Glc ->4)-beta-D-Glcp:reducing_end


In [21]:
tree1 = tree_set.get_tree(s[0])

In [22]:
tree1.get_residues()

vector1_unsigned_long[1525, 1526, 1527]

In [23]:
#single test

out = ""

tree_set = pose.glycan_tree_set()
s = []
for start in tree_set.get_start_points():
    #skip the glycosylated glycans
    parent = tree_set.get_parent(start);
    if parent != 0:
        continue;
    #print(start, pose.pdb_info().pose2pdb(start), pose.residue_type(start).name3(), pose.residue_type(start).name())
    s.append(start)

prot = pose.clone()
    
ind = 0;
for i in s:
    tree = tree_set.get_tree(i)
    
    
    res = np.array(tree.get_residues())
    carb = pose.clone()
    for j in range(pose.size()-1,0,-1):
        if j not in res:
            carb.delete_residue_slow(j)
    #print(carb)
    
    
    #save the carb
    carb.dump_pdb('pdb_pre/1bag_carb' + str(ind) + ".pdb")
    
    #relax the carb and save rosetta relax
    #fr.apply(carb)
    #carb.dump_pdb('pdb_pre/1bag_carb' + str(ind) + "_rosRel.pdb")
    
    ind += 1;
    

#have a clean protein file
for j in range(prot.size(),0,-1):
    #print(j)
    if prot.residue(j).is_protein():
        continue;
    print(j)
    prot.delete_residue_slow(j)
    

#output the fasta
for ii in range(1,prot.num_chains()+1):
    out += ">1BAG_" + str(ii) + "\n"
    out += prot.chain_sequence(ii) + "\n"
    
prot.dump_pdb('pdb_pre/1bag_prot.pdb')
#print(out)
    

RuntimeError: 

File: /Volumes/MacintoshHD3/benchmark/W.fujii.release/rosetta.Fujii.release/_commits_/main/source/build/PyRosetta/macos/clang-9.0.0/python-3.8/release/source/src/core/conformation/Conformation.hh:514
[ ERROR ] UtilityExitException
ERROR: Error in core::conformation::Conformation::residue(): The sequence position requested was greater than the number of residues in the pose.



In [36]:
r = pose.residue(3)

In [37]:
r.natoms()

19

In [38]:
r.name3()

'LEU'

In [None]:
pose.

In [19]:
for i in range(1,20):
    print(r.atom_name(i))

 N  
 CA 
 C  
 O  
 CB 
 CG 
 CD1
 CD2
 H  
 HA 
1HB 
2HB 
 HG 
1HD1
2HD1
3HD1
1HD2
2HD2
3HD2


In [34]:
def dump_res_pdb(pose,res,file):
    
    
    anum = 1;
    resnum = 1;
    out = ''

    for ii in res:
        
        r = pose.residue(ii)
        resnum = r.pose2

        for a in range(1,r.natoms()+1):
            
            aname = r.atom_name(a)
            if 'H' in aname:
                continue;
            if 'V' in aname:
                continue;
            
            out += "ATOM  "
            out += str(anum).rjust(5) + ' '
            out += aname
            out += ' ' # no alt location indicator
            out += r.name3().ljust(4)
            out += 'A'
            out += str(ii).rjust(4)
            out += '   '
            #print(m.coor[a,:])
            coor = r.xyz(a)
            c = str( round(coor[0],3) )
            while len( c.split('.')[1] ) < 3:
                c += '0'
            out += c.rjust(8)
            c = str( round(coor[1],3) )
            while len( c.split('.')[1] ) < 3:
                c += '0'
            out += c.rjust(8)
            c = str( round(coor[2],3) )
            while len( c.split('.')[1] ) < 3:
                c += '0'
            out += c.rjust(8)


            #out += str( round(m.coor[a,1],3) ).rjust(8)
            #out += str( round(m.coor[a,2],3) ).rjust(8)

            out += '\n'
            anum += 1
        resnum += 1

    
    return out
    

In [35]:
def output_pdbs(name,pose):

    out = ""

    #get the number of trees
    tree_set = pose.glycan_tree_set()
    s = []
    for start in tree_set.get_start_points():
        #skip the glycosylated glycans
        parent = tree_set.get_parent(start);
        if parent != 0:
            continue;
        #print(start, pose.pdb_info().pose2pdb(start), pose.residue_type(start).name3(), pose.residue_type(start).name())
        s.append(start)

    #print(pose.size())
    #prot = pose.clone()

    ind = 0;
    for i in s:
        tree = tree_set.get_tree(i)
        res = np.array(tree.get_residues())
        #carb = pose.clone()
        

        print( dump_res_pdb(pose,res,'') )

        #print(carb)


        #save the carb
        #carb.dump_pdb('pdb_pre/' + name + '_carb' + str(ind) + ".pdb")

        #relax the carb and save rosetta relax
        #fr.apply(carb)
        #carb.dump_pdb('pdb_pre/' + name + '_carb' + str(ind) + "_rosRel.pdb")

        #ind += 1;

    #print(pose.size(),carb.size(),prot.size())
    #have a clean protein file
    for j in range(prot.size(),0,-1):
        #print(j)
        if j == 0:
            break;
        if prot.residue(j).is_protein():
            continue;
        #print(j)
        prot.delete_residue_slow(j)

    #print(pose.size(),carb.size(),prot.size())
    #output the fasta
    for ii in range(1,prot.num_chains()+1):
        out += ">" + name + "_" + str(ii) + "\n"
        out += prot.chain_sequence(ii) + "\n"

    prot.dump_pdb('pdb_pre/' + name + '_prot.pdb')
    #print(out)
    return out
    
output_pdbs('1hgg',pose)

ATOM      1  C1  Glc A1525    -14.607  70.135  20.516
ATOM      2  C2  Glc A1525    -14.580  69.462  21.901
ATOM      3  C3  Glc A1525    -14.196  70.493  22.969
ATOM      4  C4  Glc A1525    -12.746  70.949  22.641
ATOM      5  O4  Glc A1525    -12.291  71.886  23.657
ATOM      6  C5  Glc A1525    -12.731  71.610  21.194
ATOM      7  O5  Glc A1525    -13.305  70.669  20.222
ATOM      8  O1  Glc A1525    -14.982  69.133  19.574
ATOM      9  O2  Glc A1525    -15.837  68.888  22.183
ATOM     10  O3  Glc A1525    -14.333  69.889  24.241
ATOM     11  C6  Glc A1525    -11.318  72.020  20.625
ATOM     12  O6  Glc A1525    -10.821  73.240  21.148
ATOM     13  C1  Gal A1526    -11.324  71.392  24.592
ATOM     14  C2  Gal A1526    -10.895  72.474  25.575
ATOM     15  C3  Gal A1526     -9.983  71.913  26.639
ATOM     16  O3  Gal A1526     -9.686  72.971  27.579
ATOM     17  C4  Gal A1526    -10.680  70.735  27.319
ATOM     18  C5  Gal A1526    -11.033  69.679  26.276
ATOM     19  O5  Gal A1526  

NameError: name 'prot' is not defined

In [None]:
#output_pdbs('1bag',pyrosetta.toolbox.pose_from_rcsb('1bag'.upper(),ATOM=False))
output_pdbs('1hgg',pose)

In [22]:
fa = ''
for ii in ls[:10]:

    #print(ii[0])
    #try:
        pdb = ii[0]
        print(pdb)
        pose = pyrosetta.toolbox.pose_from_rcsb(pdb.upper(),ATOM=False)
        fa += output_pdbs(pdb,pose)

        f = open('pdb_pre/fasta.fa','w+')
        f.write(fa)
        f.close()
    #except:
    #    print("Unable: ",pdb)
    
    #break;

1e5j



ERROR: Error in core::conformation::Conformation::residue(): The sequence position requested was 0.  Pose numbering starts at 1.
ERROR:: Exit from: /Volumes/scratch/w/rosetta/commits/rosetta/source/build/PyRosetta/macOS-12.7-arm64-arm-64bit/clang-14.0.0/python-3.11/release/source/src/core/conformation/Conformation.hh line: 513


RuntimeError: 

File: /Volumes/scratch/w/rosetta/commits/rosetta/source/build/PyRosetta/macOS-12.7-arm64-arm-64bit/clang-14.0.0/python-3.11/release/source/src/core/conformation/Conformation.hh:513
[ ERROR ] UtilityExitException
ERROR: Error in core::conformation::Conformation::residue(): The sequence position requested was 0.  Pose numbering starts at 1.

