# Notebook for processing the BSA data generated by run_dssp.ipynb

In [22]:
from Bio.PDB import *
import pandas as pd
import warnings
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)
import csv
import numpy as np
from Bio.PDB.Polypeptide import three_to_one, one_to_three
import math
import multiprocessing as mp


### Please provide the path to the SASA file generated by run_dssp.ipynb.

In [23]:
# Read the SASA table produced by run_dssp and drop every row containing a NaN.
dssp_sasa_file = './dssp/dssp_sasa.csv'
df = pd.read_csv(dssp_sasa_file, index_col=0).dropna()

# Report how many complete rows are left.
print("number of samples:", len(df))

number of samples: 35645


In [24]:
# Preview the first five rows of the cleaned table.
df.head(5)

Unnamed: 0,wt_name,mu_name,Total_SASA_wt,SASA_wt_g1,SASA_wt_g2,Total_SASA_mu,SASA_mu_g1,SASA_mu_g2,Chain_in_G1,Chain_in_G2
0,10gs,1md4,17797,9785.0,9785.0,17597,10090.0,10079.0,A,B
1,10gs,3hjo,17797,9785.0,9785.0,18250,10511.0,10406.0,A,B
2,10gs,3ie3,17797,9785.0,9785.0,18099,10386.0,10325.0,A,B
3,117e,1e6a,22598,12174.0,12468.0,22407,12287.0,12127.0,A,B
4,117e,1huk,22598,12174.0,12468.0,23418,12508.0,12959.0,A,B


In [25]:
# Wild-type and mutant PDB entry names, one per row of the table.
wts, mus = df['wt_name'], df['mu_name']

### Please uncomment this if you haven't downloaded the required PDB files yet

In [26]:
# Optional one-off helper (disabled): fetch every PDB entry in `pid_set` via PDBList.
# NOTE(review): before uncommenting,
#   * define `pid_set` -- it is not created in any of the cells shown in this notebook;
#   * change `pdir` -- it is an absolute, machine-specific path, whereas get_seq_all
#     reads its structure files from './dssp/pdb/'.
# def download_all_pdb(pid_set):
#     pdbl = PDBList()
#     for elem in pid_set:
#         print(elem)
#         assert(len(elem) ==4)
#         pdbl.retrieve_pdb_file(elem, pdir = '/home/zgy_ucla_cs/Research/Protein/PPI-Binding/bsa_data/pdbs/', file_format='pdb')
# download_all_pdb(pid_set)        


### Functions for extracting sequences

In [27]:
def _chain_sequence(chain):
    """Concatenate the sequences of all polypeptides built from ``chain``."""
    sequence = ''
    for pp in PPBuilder().build_peptides(chain):
        sequence += pp.get_sequence()
    return sequence


def get_seq_all(ent_names, chain_ID0, chain_ID1, cache=None, pdb_dir='./dssp/pdb'):
    """Add the sequences of two chains of each PDB entry to a sequence cache.

    For every name in ``ent_names`` the file ``<pdb_dir>/pdb<name lowercased>.ent``
    is parsed, the polypeptides of ``chain_ID0`` and ``chain_ID1`` in the first
    model are concatenated, and the result is stored under the key
    ``'<name>_<chain id>'``. Keys that are already present are left untouched,
    and a structure file is only parsed when at least one of its keys is missing,
    so entries that recur across rows are not re-parsed.

    Parameters
    ----------
    ent_names : iterable of str
        PDB entry names to process.
    chain_ID0, chain_ID1 : str
        Chain identifiers to extract from every entry.
    cache : dict, optional
        Mapping to fill. When omitted, the notebook-level ``seq_dict`` is used,
        which is what the original three-argument call relied on.
    pdb_dir : str, optional
        Directory holding the ``pdb<name>.ent`` files.

    Returns
    -------
    dict
        The mapping that was filled (``cache``, or ``seq_dict`` by default).

    Notes
    -----
    Nothing is caught here: errors raised while parsing a file or looking up a
    chain propagate to the caller.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if cache is None:
        # Backward-compatible default: fall back to the notebook-level dict.
        cache = seq_dict

    parser = None
    for ent_name in ent_names:
        # dict.fromkeys de-duplicates while keeping order (chain_ID0 may equal chain_ID1).
        missing = [
            chain_id
            for chain_id in dict.fromkeys((chain_ID0, chain_ID1))
            if ent_name + '_' + chain_id not in cache
        ]
        if not missing:
            # Both chains are cached already; skip the expensive file parse.
            continue

        if parser is None:
            parser = PDBParser()
        pdb_path = os.path.join(pdb_dir, 'pdb' + ent_name.lower() + '.ent')
        model = parser.get_structure(ent_name, pdb_path)[0]  # first model only

        for chain_id in missing:
            cache[ent_name + '_' + chain_id] = _chain_sequence(model[chain_id])
    return cache

    

### For efficiency, we process only the first 5 samples here.

In [28]:
# Number of leading rows processed in this demo; raise it to cover more of the table.
N_DEMO_ROWS = 5

df_val = df.values
seq_dict = {}
# Slicing (rather than range(5) + indexing) cannot run past the end of a short table.
for i, row in enumerate(df_val[:N_DEMO_ROWS]):
    chain_g1, chain_g2 = row[-2], row[-1]
    # Only rows whose two chain fields are one character long are processed.
    if len(chain_g1) == 1 and len(chain_g2) == 1:
        print(i, row)
        # row[:2] holds the wild-type and mutant entry names.
        seq_dict = get_seq_all(row[:2], chain_g1, chain_g2)
    else:
        print("passing", row)

print(seq_dict)

0 ['10gs' '1md4' 17797 9785.0 9785.0 17597 10090.0 10079.0 'A' 'B']
1 ['10gs' '3hjo' 17797 9785.0 9785.0 18250 10511.0 10406.0 'A' 'B']
2 ['10gs' '3ie3' 17797 9785.0 9785.0 18099 10386.0 10325.0 'A' 'B']
3 ['117e' '1e6a' 22598 12174.0 12468.0 22407 12287.0 12127.0 'A' 'B']
4 ['117e' '1huk' 22598 12174.0 12468.0 23418 12508.0 12959.0 'A' 'B']
{'10gs_A': Seq('PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKF...GKQ'), '10gs_B': Seq('PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKF...GKQ'), '1md4_A': Seq('PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKF...GKQ'), '1md4_B': Seq('PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKF...GKQ'), '3hjo_A': Seq('PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPK...GKQ'), '3hjo_B': Seq('PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPK...GKQ'), '3ie3_A': Seq('PPYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPK...GKQ'), '3ie3_B': Seq('PYTVVYFPVRGRCAALRMLLADQGQSWKEEVVTVETWQEGSLKASCLYGQLPKF...GKQ'), '117e_A': Seq('TYTTRQIG

### Specify the BSA sequence file and the BSA score file.

In [29]:
bsa_seq_file = './bsa_seq.txt'
bsa_score_file ='./bsa_score.txt'
# Create (or empty) both output files, then close the handles straight away.
# Nothing is written through fout1/fout2 in the cells shown here -- results are
# appended by reopening the paths -- so keeping the handles open only leaks them.
with open(bsa_seq_file, 'w+') as fout1, open(bsa_score_file, 'w+') as fout2:
    pass


In [30]:
# Append one "<entry>_<chain><TAB><sequence>" line per collected sequence.
# One handle inside a `with` block replaces opening a new, never-closed file per
# line, and guarantees the data is flushed when the cell finishes.
with open(bsa_seq_file, 'a') as seq_out:
    for k in seq_dict:
        print(k, seq_dict[k], sep='\t', file=seq_out)

In [31]:
# Largest |BSA1 - BSA2| that is kept; pairs with a bigger difference are skipped.
MAX_ABS_DD_BSA = 2000

dd_BSA = []
df_val = df.values

# One append handle for the whole loop instead of a new, never-closed file per row.
with open(bsa_score_file, 'a') as score_out:
    for r in df_val[:5]:  # demo is for first 5 lines
        # Sequence keys: wild-type (r[0]) and mutant (r[1]) entry, each with both chains.
        p1 = r[0] + "_" + r[-2]
        p2 = r[0] + "_" + r[-1]
        p3 = r[1] + "_" + r[-2]
        p4 = r[1] + "_" + r[-1]
        # Skip rows for which any of the four sequences was not collected.
        if any(p not in seq_dict for p in (p1, p2, p3, p4)):
            print("passing", r)
            continue

        BSA1 = r[2] - r[3] - r[4]  # total_sasa - sasa_g1 - sasa_g2 in wildtype
        BSA2 = r[5] - r[6] - r[7]  # same quantity for the mutant
        diff = BSA1 - BSA2
        if abs(diff) > MAX_ABS_DD_BSA:
            continue

        print(p1, p2, p3, p4, BSA1, BSA2, diff, sep='\t', file=score_out)
        dd_BSA.append(diff)