Check for exceptions. For example, in row 1144 of `dssp_sasa.csv` there are no values in columns 'SASA_wt_g1', 'SASA_wt_g2', 'SASA_mu_g1', 'SASA_mu_g2',
and there is more than one character in each of the columns 'Chain_in_G1' and 'Chain_in_G2'.

The column values for that row are:

1b2m	1rgc	10524			10801			CD	EAB


In [21]:
from Bio.PDB import DSSP
from Bio.PDB import PDBParser
import warnings
from Bio import BiopythonWarning
from Bio.PDB import PDBList
from Bio.PDB import PDBIO
import numpy as np

In [29]:
# Initialize the Biopython PDB downloader.
pdb_dl = PDBList()
# PDB IDs to download; only the structure under investigation ('1b2m') is listed.
pdb_list = ['1b2m']
# Download each entry in PDB format into the current directory (pdir='./').
# overwrite=True is passed so an existing local copy is not reused.
# NOTE(review): calculate_sasa reads './pdb<ID>.ent'; presumably that is the
# file name retrieve_pdb_file produces for file_format='pdb' -- confirm.
for i in pdb_list:
    pdb_dl.retrieve_pdb_file(i, pdir='./', file_format='pdb', overwrite=True)

Downloading PDB structure '1b2m'...


In [30]:
# Maximum accessibility per residue, keyed by one-letter amino-acid code.
# calculate_sasa multiplies the DSSP relative accessibility by these values
# to obtain an absolute per-residue accessibility.
# NOTE(review): the numbers look like the Sander & Rost maxima (in A^2) and
# 'X' like a fallback for unknown residues -- confirm the source.
residue_max_acc = {
    'A': 106.0,
    'R': 248.0,
    'N': 157.0,
    'D': 163.0,
    'C': 135.0,
    'Q': 198.0,
    'E': 194.0,
    'G': 84.0,
    'H': 184.0,
    'I': 169.0,
    'L': 164.0,
    'K': 205.0,
    'M': 188.0,
    'F': 197.0,
    'P': 136.0,
    'S': 130.0,
    'T': 142.0,
    'W': 227.0,
    'Y': 222.0,
    'V': 142.0,
    'X': 169.6,
}

class Chain_filter:
    """Selector handed to PDBIO.save that keeps only the requested chains.

    chain_id is any container supporting ``in`` (e.g. a list of chain IDs);
    it is stored unchanged on the instance. Every accept_* hook returns
    1 (keep) or 0 (drop).
    """

    def __init__(self, chain_id):
        self.chain_id = chain_id

    def accept_model(self, model):
        """Keep every model."""
        return 1

    def accept_chain(self, chain):
        """Keep a chain only when its ID is among the requested chain IDs."""
        return 1 if chain.get_id() in self.chain_id else 0

    def accept_residue(self, residue):
        """Keep every residue of an accepted chain."""
        return 1

    def accept_atom(self, atom):
        """Keep every atom."""
        return 1
    
def extract_chain2pdb(structure, chain_ids, filename):
    """Save the chains of `structure` whose IDs are in `chain_ids` to `filename`.

    The structure is written through PDBIO with a Chain_filter selector, so
    only the listed chains end up in the output file.
    """
    writer = PDBIO()
    writer.set_structure(structure)
    writer.save(filename, Chain_filter(chain_ids))

def _run_dssp(model, pdb_file, max_attempts=3):
    """Run DSSP on `model`/`pdb_file`, retrying a bounded number of times.

    An empty result is treated as a transient failure and retried. After
    `max_attempts` empty results a RuntimeError is raised so the caller can
    report the problem instead of looping forever.
    """
    for _ in range(max_attempts):
        dssp = DSSP(model, pdb_file, 'dssp.exe', file_type='PDB')
        if len(dssp) > 0:
            return dssp
    raise RuntimeError(
        "DSSP returned no residues for %s after %d attempts" % (pdb_file, max_attempts)
    )


def _absolute_sasa(dssp):
    """Sum relative accessibility (item[3]) times residue_max_acc, skipping 'X' residues."""
    return np.sum([item[3] * residue_max_acc[item[1]] for item in dssp if item[1] != 'X'])


def _chain_group_sasa(structure, chain_ids, tmp_file='temp.ent'):
    """Write `chain_ids` of `structure` to `tmp_file` and return the SASA DSSP reports for it."""
    extract_chain2pdb(structure, chain_ids, tmp_file)
    return _absolute_sasa(_run_dssp(structure[0], tmp_file))


def calculate_sasa(select_mutation_num=None,select_chain_num=None):
    """
    Compute the total and per-group SASA of the hard-coded wild type '1b2m'.

    The structure is read from './pdb1b2m.ent'. The chains of its first model
    are split, in file order, into two groups: the first ``n // 2`` chains
    and the rest. DSSP is run on the whole complex and on each group.

    Parameters
    ----------
    select_mutation_num, select_chain_num
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list of tuple
        A single tuple ``(wt_name, Total_SASA_wt, SASA_wt_g1, SASA_wt_g2,
        Chain_in_g1, Chain_in_g2)``. A SASA value is NaN when DSSP failed for
        that part; a failure in either group sets both group values to NaN.

    Side effects: silences BiopythonWarning, prints the two chain groups and
    any DSSP failure, and writes the scratch file 'temp.ent'.
    """
    warnings.simplefilter('ignore', BiopythonWarning)
    parser = PDBParser()
    result = []

    wt_name = '1b2m'

    # specify the pdb files directory here
    wt_file = './pdb' + wt_name + '.ent'

    # Parse once; the same structure is reused for the complex and both groups.
    wt_struc = parser.get_structure(wt_name, wt_file)
    model = wt_struc[0]  # first model of the structure

    # Total SASA of the complex. Biopython's DSSP wrapper signals failures
    # with a plain Exception, so nothing narrower can be caught here.
    try:
        wt_dssp = _run_dssp(model, wt_file)
    except Exception:
        Total_SASA_wt = np.nan
        print("The following pair has corrupted pdb:", wt_name)
    else:
        Total_SASA_wt = _absolute_sasa(wt_dssp)

    # Split the chains into two groups; an odd count puts the extra chain in group 2.
    chain_ids = [chain.get_id() for chain in model]
    first_group = len(chain_ids) // 2
    group1 = chain_ids[:first_group]
    group2 = chain_ids[first_group:]

    Chain_in_g1 = ''.join(group1)
    Chain_in_g2 = ''.join(group2)
    print(Chain_in_g1)
    print(Chain_in_g2)

    # SASA of each group in isolation. A group with no protein residues makes
    # DSSP fail; in that case both values are reported as NaN.
    try:
        SASA_wt_g1 = _chain_group_sasa(wt_struc, group1)
        SASA_wt_g2 = _chain_group_sasa(wt_struc, group2)
    except Exception:
        SASA_wt_g1 = np.nan
        SASA_wt_g2 = np.nan
        print("The following pair cause error in DSSP:", wt_name)

    result.append((wt_name, Total_SASA_wt, SASA_wt_g1, SASA_wt_g2,
                   Chain_in_g1, Chain_in_g2))

    return result

# Run the calculation for the hard-coded structure; the bare `res` expression
# makes the notebook display the returned list as the cell output.
res = calculate_sasa()
res

CD
EAB
The following pair cause error in DSSP: 1b2m


empty protein, or no valid complete residues



[('1b2m', 10524.0, nan, nan, 'CD', 'EAB')]

Keep only proteins with 2 chains, and calculate the SASA for the left chain and the right chain.