# This notebook serves as an example for processing the binding affinity data from SKEMPI. 

In [1]:
from Bio.PDB import *
import pandas as pd
import warnings
from Bio import BiopythonWarning
warnings.simplefilter('ignore', BiopythonWarning)
import csv
import numpy as np
from Bio.PDB.Polypeptide import three_to_one, one_to_three
import math


In [6]:
def load(fName, toDownload = False):
    """Read the leading fields of every data row from a ';'-delimited file.

    Parameters
    ----------
    fName : str
        Path of the ';'-separated text file. Its first line is treated as a
        header and skipped.
    toDownload : bool, optional
        Accepted for backward compatibility; this function does not use it.

    Returns
    -------
    list of list of str
        One entry per data row, each holding at most the first 14 fields.
    """
    # newline='' lets the csv reader handle line endings itself, so a line
    # break embedded in a quoted field is kept exactly as written.
    with open(fName, newline='') as fin:
        reader = csv.reader(fin, delimiter=';')
        next(reader, None)  # skip the headers
        ppi = [row[:14] for row in reader]

    print("Finish loading", len(ppi), "case")
    return ppi

In [5]:
def load_pd(fName):
    """Load a ';'-separated table and keep only rows with complete key fields.

    Rows with a missing value in 'Affinity_mut_parsed', 'Affinity_wt_parsed'
    or 'Temperature' are dropped; every column is kept.

    Parameters
    ----------
    fName : str
        Path of the ';'-separated file to read.

    Returns
    -------
    pandas.DataFrame
        The filtered table (the original row index is preserved).
    """
    required_columns = ['Affinity_mut_parsed', 'Affinity_wt_parsed', 'Temperature']
    return pd.read_csv(fName, sep = ';').dropna(subset = required_columns)

# df = load_pd(skepiIn)
# df.shape


In [13]:
# Path of the SKEMPI v2 table and the filtered frame loaded from it.
skepiIn = "../skempi_v2/skempi_v2.csv"
ppi = load_pd(skepiIn)
print(ppi.shape)

# From here on `ppi` is a plain array so that fields are addressed by position.
ppi = np.array(ppi)

# Field 13 holds the temperature: strip surrounding whitespace and keep only
# its first three characters.
for i in range(len(ppi)):
    ppi[i, 13] = ppi[i, 13].strip()[:3]


(6794, 29)


In [14]:
# download_all_pdb(ppi)
def _partner_sequence(model, chain_ids, ppb):
    """Concatenate the peptide sequences of every chain in `chain_ids`, in order."""
    seq = ''
    for each_chain in chain_ids:
        for pp in ppb.build_peptides(model[each_chain]):
            seq += pp.get_sequence()
    return seq


def get_seq(ent_name, mutation_pdb, chains, affinity, temp, seq_dict,
            pdb_dir='/home/zgy_ucla_cs/Research/Protein/PPI-Binding/skempi_v2/PDBs/',
            score_path="./skempi_v2.singlemut.ddg.score.txt"):
    """Record wild-type and single-mutant sequences and append a ddG score line.

    The structure `pdb_dir + ent_name + '.pdb'` is parsed, the sequences of the
    two partners are stored in `seq_dict`, the mutated residue is renamed in
    the parsed structure, and the sequences are read again to obtain the
    mutant versions. One line with the four sequence names and the three
    energies is appended to `score_path`.

    Parameters
    ----------
    ent_name : str
        Structure identifier; also the stem of the PDB file name.
    mutation_pdb : str
        Mutation code laid out as wild-type letter, chain id, residue number
        (optionally followed by one letter), mutant letter.
    chains : sequence of two str
        Chain ids of the first and of the second partner; each string is
        iterated one chain id at a time.
    affinity : indexable
        `affinity[0]` is used for the mutant and `affinity[1]` for the wild
        type; both are converted with `float`.
    temp : str or number
        Temperature, converted with `float`.
    seq_dict : dict
        Updated in place: name -> sequence. Existing keys are never overwritten.
    pdb_dir : str, optional
        Directory prefix of the PDB files. It is concatenated as-is, so it must
        end with a path separator. Defaults to the original hard-coded location.
    score_path : str, optional
        File the score line is appended to.

    Returns
    -------
    dict
        The same `seq_dict` object that was passed in.

    Raises
    ------
    AssertionError
        If the structure does not contain exactly one model, or if the residue
        found in the structure does not match the wild-type letter.
    """
    r_w, chain_ID, residue_ID, r_m = mutation_pdb[0], mutation_pdb[1], mutation_pdb[2:-1], mutation_pdb[-1]

    parser = PDBParser()
    structure = parser.get_structure(ent_name, pdb_dir + ent_name + '.pdb')
    assert len(structure) == 1, "expected exactly one model in " + ent_name
    model = structure[0]

    # One builder is enough for every chain; creating it up front also keeps it
    # defined when the first partner has no chains.
    ppb = PPBuilder()

    # Wild-type sequences are read before the structure is modified below.
    w_seq0 = _partner_sequence(model, chains[0], ppb)
    w_seq1 = _partner_sequence(model, chains[1], ppb)

    w_name0 = mut_name0 = ent_name+'_'+chains[0]
    w_name1 = mut_name1 = ent_name+'_'+chains[1]

    if w_name0 not in seq_dict:
        seq_dict[w_name0] = w_seq0
    if w_name1 not in seq_dict:
        seq_dict[w_name1] = w_seq1

    # Locate the mutated residue. A purely numeric id is looked up directly;
    # otherwise the trailing letter (presumably an insertion code) is passed
    # upper-cased as the third element of the full residue id.
    mut_chain = model[chain_ID]
    if residue_ID.isdigit():
        residue = mut_chain[int(residue_ID)]
    else:
        residue = mut_chain[(' ', int(residue_ID[:-1]), residue_ID[-1].upper())]

    assert three_to_one(residue.resname) == r_w, \
        "wild-type residue mismatch for " + ent_name + " " + mutation_pdb

    # Renaming the residue makes the sequences read below reflect the mutation.
    residue.resname = one_to_three(r_m)

    # A partner gets a mutant-specific name only when it contains the mutated chain.
    if any(chain_ID == each_chain for each_chain in chains[0]):
        mut_name0 = ent_name+'_'+chains[0] + '_' + one_to_three(r_w) + str(residue_ID) + one_to_three(r_m)
    if any(chain_ID == each_chain for each_chain in chains[1]):
        mut_name1 = ent_name+'_'+chains[1] + '_' + one_to_three(r_w) + str(residue_ID) + one_to_three(r_m)

    m_seq0 = _partner_sequence(model, chains[0], ppb)
    m_seq1 = _partner_sequence(model, chains[1], ppb)

    if mut_name0 not in seq_dict:
        seq_dict[mut_name0] = m_seq0
    if mut_name1 not in seq_dict:
        seq_dict[mut_name1] = m_seq1

    # 8.314/4184 is presumably the gas constant converted from J to kcal, so
    # the energies would be in kcal/mol -- TODO confirm the intended units.
    rt = (8.314/4184)*(float(temp))
    dG_w = rt * math.log(float(affinity[1]))
    dG_m = rt * math.log(float(affinity[0]))
    ddG = dG_m - dG_w
    print("dG_w, dG_m, ddG",dG_w, dG_m, ddG)

    # Open, append and close in one step so no file handle is left behind per call.
    with open(score_path, "a") as fout:
        print(w_name0, w_name1,mut_name0, mut_name1,dG_w, dG_m, ddG, file=fout)
    return seq_dict

In [15]:
# download_all_pdb(ppi)
def get_score(ent_name, mutation_pdb, chains, affinity, temp,
              score_path="./skempi_v2.singlemut.ddg.score.txt"):
    """Compute dG (wild type), dG (mutant) and ddG for one single mutation.

    The energies are printed, and one line with the four sequence names and
    the three energies is appended to `score_path`. No structure file is read.

    Parameters
    ----------
    ent_name : str
        Structure identifier used as the prefix of every sequence name.
    mutation_pdb : str
        Mutation code laid out as wild-type letter, chain id, residue id,
        mutant letter.
    chains : sequence of two str
        Chain ids of the first and of the second partner; each string is
        iterated one chain id at a time.
    affinity : indexable
        `affinity[0]` is used for the mutant and `affinity[1]` for the wild
        type; both are converted with `float`.
    temp : str or number
        Temperature, converted with `float`.
    score_path : str, optional
        File the score line is appended to.

    Returns
    -------
    tuple of float
        `(dG_w, dG_m, ddG)` with `ddG = dG_m - dG_w`.
    """
    r_w, chain_ID, residue_ID, r_m = mutation_pdb[0], mutation_pdb[1], mutation_pdb[2:-1], mutation_pdb[-1]

    w_name0 = mut_name0 = ent_name+'_'+chains[0]
    w_name1 = mut_name1 = ent_name+'_'+chains[1]

    # A partner gets a mutant-specific name only when it contains the mutated chain.
    if any(chain_ID == each_chain for each_chain in chains[0]):
        mut_name0 = ent_name+'_'+chains[0] + '_' + one_to_three(r_w) + str(residue_ID) + one_to_three(r_m)
    if any(chain_ID == each_chain for each_chain in chains[1]):
        mut_name1 = ent_name+'_'+chains[1] + '_' + one_to_three(r_w) + str(residue_ID) + one_to_three(r_m)

    # 8.314/4184 is presumably the gas constant converted from J to kcal, so
    # the energies would be in kcal/mol -- TODO confirm the intended units.
    rt = (8.314/4184)*(float(temp))
    dG_w = rt * math.log(float(affinity[1]))
    dG_m = rt * math.log(float(affinity[0]))
    ddG = dG_m - dG_w
    print("dG_w, dG_m, ddG",dG_w, dG_m, ddG)

    # Open, append and close in one step so no file handle is left behind per call.
    with open(score_path, "a") as fout:
        print(w_name0, w_name1,mut_name0, mut_name1,dG_w, dG_m, ddG, file=fout)

    # Return the computed scores; the previous `return seq_dict` referred to a
    # name that is neither a parameter nor a local of this function.
    return dG_w, dG_m, ddG

In [84]:
# (8.314/4184)*(float(290))*math.log(5.26E-11)

In [17]:
mult_mutation = 0
seq_dict = {}

# issue: 300 ['1DVF_AB_CD' 'RD100bA'] ['E5.2 Fv' '298']
cnt_single_chain = cnt_both_chain = 0

# Start from an empty score file; get_score() appends one line per record.
# The handle is closed right away instead of staying open for the whole run.
with open("./skempi_v2.singlemut.ddg.score.txt", "w") as f1:
    pass
print(ppi.shape)

for i in range(len(ppi)):
    # Field 0 looks like '<id>_<chains of partner 1>_<chains of partner 2>'.
    ent_name = ppi[i,0][:4]
    chains = ppi[i,0].split('_')[1:]
    assert(len(chains) == 2)
    # Fields 7 and 9 are passed on as (mutant, wild-type) affinity.
    affinity = ppi[i,(7,9)]
    temp = ppi[i,13]
    print(i, ppi[i, :3], temp, affinity)
    mutations_pdb = ppi[i][2].split(',')
    # Only records with exactly one mutation are scored.
    if len(mutations_pdb) == 1:
        get_score(ent_name, mutations_pdb[0], chains, affinity, temp)

# '''
# Only keep the single mutation.

#     else:
# #         print("multi mutation")
#         temp_s = set()
#         for elem in mutations_pdb:
#             temp_s.add(elem[1])
# #         print("multi",temp_s)
#         if len(temp_s) > 1:
#             cnt_both_chain+=1
#         if len(temp_s) == 1:
#             cnt_single_chain+=1
# #         print(mutations_pdb)
#         get_seq_multi(ent_name, mutations_pdb, chains, affinity, temp, seq_dict)
#         mult_mutation+=1
# '''        
# print(mult_mutation, len(ppi) - mult_mutation)
# print(cnt_both_chain, cnt_single_chain)
# for elem in seq_dict:
#     print(elem, seq_dict[elem], file=open("./skempi_v2.singlemut.mut4.seq.txt", "a"))
    


(6794, 29)
0 ['1CSE_E_I' 'LI45G' 'LI38G'] 294 [5.26e-11 1.12e-12]
dG_w, dG_m, ddG -16.075988501732105 -13.827155017938493 2.2488334837936126
1 ['1CSE_E_I' 'LI45S' 'LI38S'] 294 [8.33e-12 1.12e-12]
dG_w, dG_m, ddG -16.075988501732105 -14.903759762487807 1.1722287392442983
2 ['1CSE_E_I' 'LI45P' 'LI38P'] 294 [1.02e-07 1.12e-12]
dG_w, dG_m, ddG -16.075988501732105 -9.404712048380915 6.67127645335119
3 ['1CSE_E_I' 'LI45I' 'LI38I'] 294 [1.72e-10 1.12e-12]
dG_w, dG_m, ddG -16.075988501732105 -13.135000932221619 2.9409875695104866
4 ['1CSE_E_I' 'LI45D' 'LI38D'] 294 [1.92e-09 1.12e-12]
dG_w, dG_m, ddG -16.075988501732105 -11.725554820282436 4.350433681449669
5 ['1CSE_E_I' 'LI45E' 'LI38E'] 294 [6.25e-11 1.12e-12]
dG_w, dG_m, ddG -16.075988501732105 -13.726408516417724 2.3495799853143815
6 ['1ACB_E_I' 'LI45G' 'LI38G'] 294 [4.55e-08 1.49e-12]
dG_w, dG_m, ddG -15.90922852762244 -9.87631810128294 6.0329104263395
7 ['1ACB_E_I' 'LI45S' 'LI38S'] 294 [7.14e-09 1.49e-12]
dG_w, dG_m, ddG -15.90922852762244

In [428]:



# ent_name='1EAW'
# parser = PDBParser()
# structure = parser.get_structure(ent_name, './skempi/pdb_new/pdb'+ ent_name.lower() + '.ent')
# #     print(structure)
# assert(len(structure) == 1)
# model = structure[0]

# for elem in model.get_chains():
#     print(elem)
# w_seq0 = ''

# for each_chain in chains[0]:
# #         print(each_chain,)
#     chain0 = model[each_chain]
#     ppb = PPBuilder()
#     for pp in ppb.build_peptides(chain0):
# #         print('Ch0',pp.get_sequence())
#         w_seq0 += pp.get_sequence()

# print(w_seq0)


# mut_chain = model['A']
# residue = mut_chain[(' ', 60, 'A')]
# print(residue)
# # for elem in mut_chain.get_residues():
# #     print(elem)

In [None]:
# NOTE(review): no top-level assignment to `residue` is visible in the live
# cells of this notebook (it only appears as a local inside get_seq and in
# commented-out code), so this cell presumably raises NameError on a fresh
# kernel -- confirm whether it is a debugging leftover.
residue