In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob

from Bio.Seq import Seq
from Bio.pairwise2 import align
from Bio.Align import substitution_matrices
blosum62 = substitution_matrices.load("BLOSUM62")



In [2]:
def mut_str(wt,mut):
    """Describe how ``mut`` differs from ``wt`` as a comma-separated mutation string.

    The two sequences are globally aligned with ``align.globalds`` using the
    module-level ``blosum62`` matrix (gap open -2, gap extend -0.5), and every
    alignment column whose two characters differ is reported as
    ``<wt char><wt position><mut char>``, e.g. ``L36R``.

    Positions are 1-based and count only non-gap characters of the aligned
    wild type. A column where the wild type has a gap (an insertion in ``mut``)
    therefore carries the position of the preceding wild-type residue (0 if
    there is none) and is written with ``-`` as the wild-type character; a
    deletion is written with ``-`` as the mutant character.

    Parameters
    ----------
    wt : str
        Reference (wild-type) sequence.
    mut : str
        Observed sequence to compare against ``wt``.

    Returns
    -------
    str or None
        The mutation string (``''`` when the aligned sequences are identical),
        or ``None`` when no comparison is possible: either argument is not a
        string (e.g. a NaN from an unmatched merge row), ``mut`` contains a
        stop codon ``*``, or the aligner fails / returns no alignment.
    """
    # Validate both inputs before touching them: `'*' in mut` would itself raise
    # TypeError for a non-string such as a float NaN.
    if not isinstance(wt, str) or not isinstance(mut, str) or '*' in mut:
        return None

    try:
        alignments = align.globalds(wt, mut, blosum62, -2, -0.5, one_alignment_only=True)
    except TypeError:
        # Report the offending pair and give up on this row; continuing would
        # only fail later on an unbound alignment variable.
        print(wt,mut)
        return None

    # No alignment to walk (NOTE(review): expected for empty sequences — confirm).
    if not alignments:
        return None

    wt_algn = alignments[0][0]
    mut_algn = alignments[0][1]

    changes = []
    wt_idx = 0
    for wt_char, mut_char in zip(wt_algn, mut_algn):

        # Only real wild-type residues advance the position counter.
        if wt_char != '-':
            wt_idx += 1

        if wt_char != mut_char:
            changes.append(f'{wt_char}{wt_idx}{mut_char}')

    return ','.join(changes)

In [3]:
# matches = pd.read_csv('<path/to/your>/01_select_hits/00_ngs_processing/all_matches.csv',index_col=0)
# Load the match table and the counts table, printing the row count of each.
matches = pd.read_csv('../00_ngs_processing/all_matches.csv',index_col=0)
print(len(matches))
counts = pd.read_csv('../00_ngs_processing/all_counts.csv',index_col=0)
print(len(counts))

# Inner join: rows of `matches` whose query_name has no entry in `counts` are dropped.
matches = matches.merge(counts,left_on='query_name',right_on='name',how='inner')
print(len(matches))

# Left join: every row is kept; design metrics are attached where match_name is known.
renamed = pd.read_csv('../../00_library/05_filtering/selected_designs_with_metrics_renamed.csv',index_col=0)
matches = matches.merge(renamed,left_on='match_name',right_on='short_name',how='left')
print(len(matches))

# Output column -> token position on the first line of each .dok file, counted
# after splitting on single spaces and discarding empty tokens.
RIF_SCORE_FIELDS = [
    ('just_trp_rif_score', 10),     # score
    ('just_trp_rif_bb-hbond', 14),  # bb-hbond
    ('just_trp_rif_hyd-cont', 21),  # hyd-cont
    ('just_trp_rif_hyd-ddg', 23),   # hyd-ddg
]
MIN_DOK_TOKENS = max(idx for _, idx in RIF_SCORE_FIELDS) + 1

scorefiles = glob('<path/to/your/trp-scoring_rifdock_output>/*.dok')

rif_rows = []
for fname in scorefiles:

    # Design name = file name without directory and without the trailing '.dok'
    # (only the extension is removed, not other '.dok' occurrences in the path).
    basename = fname.split('/')[-1]
    name = basename[:-len('.dok')] if basename.endswith('.dok') else basename

    # Only the first line of the score file is used.
    with open(fname,'r') as f:
        first_line = f.readline()

    tokens = [tok for tok in first_line.split(' ') if tok != '']
    if len(tokens) < MIN_DOK_TOKENS:
        raise ValueError(
            f'{fname}: expected at least {MIN_DOK_TOKENS} space-separated fields '
            f'on the first line, found {len(tokens)}'
        )

    rif_rows.append([name] + [float(tokens[idx]) for _, idx in RIF_SCORE_FIELDS])

rif_score_df = pd.DataFrame(rif_rows, columns=['name'] + [col for col, _ in RIF_SCORE_FIELDS])

# Keep rows where the larger of the two mismatch counts is at most 1.
max_mismatches = matches[['mismatches_alignment','mismatches_query']].max(axis=1)
perfect = matches[max_mismatches <= 1]

# Attach the rif scores by design name.
# NOTE(review): 'name_y' is presumably the `name` column of `renamed`, suffixed by the merges above — confirm.
perfect = perfect.merge(rif_score_df,left_on='name_y',right_on='name',how='left')

# One mutation string per row, comparing the designed binder sequence with the observed AA sequence.
perfect['mut_str'] = [mut_str(wt_seq, obs_seq)
                      for wt_seq, obs_seq in zip(perfect['binder_seq'], perfect['AA'])]

perfect.to_csv('near_perfect_matches_with_rif_scores.csv')

339677
339677
339677
339677


In [4]:
# Preview the first rows of `perfect` (DataFrame.head() shows 5 rows by default).
perfect.head()

Unnamed: 0,query_name,query_len,hit_len,match_name,mismatches_query,mismatches_alignment,algn_len,score,e_val,AA,...,delta_sap_binder,delta_sap_target,length,short_name,name,just_trp_rif_score,just_trp_rif_bb-hbond,just_trp_rif_hyd-cont,just_trp_rif_hyd-ddg,mut_str
0,seq286505,56,56.0,H2Db_02189_std,1.0,1.0,56.0,234.0,1.84039e-29,DREERIKELLEEARHIEDPEEVRRLIEEALHLASERGDMELAIEIL...,...,18.488,10.557,56.0,H2Db_02189_std,HHH_b1_05107_000000173_std_0001_0001,-1.482,0.0,0.0,-0.0,L36R
1,seq286506,56,56.0,H2Db_09766_std,1.0,1.0,56.0,254.0,1.73783e-32,DEREALELLDKALEALAHGNPEEARKLLEKALRLARKTNNKWLEKA...,...,23.668,12.334,56.0,H2Db_09766_std,HHH_b2_05387_000000258_std_0001_0001,1.26,0.0,0.0,-0.0,I47F
2,seq286507,57,57.0,H2Db_05151_std_graft,1.0,1.0,57.0,251.0,5.22199e-32,DEDEEQVRRLLRAAEEYLKECNPEVARLLLFAARQIAERLGDEELR...,...,17.738,9.259,57.0,H2Db_05151_std_graft,m_HHH_b1_07861_000000265_std_0001_27_38_H_._HH...,-0.502,0.0,0.0,-0.0,G21C
3,seq286509,52,52.0,H2Db_05352_std_graft,1.0,1.0,52.0,224.0,5.06115e-28,DAEELLKLAKRLLEEGDPEKARELALFALIAAIFEGDDELLREIRE...,...,22.575,9.433,52.0,H2Db_05352_std_graft,m_HHH_b1_00493_000000142_std_0001_28_35_H_._HH...,-1.787,0.0,0.0,-0.0,R41L
4,seq286512,56,56.0,H2Db_02062_std_graft,1.0,1.0,56.0,247.0,1.58138e-31,TKRALEALLEQLERLLREGDPEEFRHLLLLAKLLSRVLNDEEVLKE...,...,25.474,9.625,56.0,H2Db_02062_std_graft,m_HHH_b1_06629_000000104_std_0001_22_37_H_._HH...,-1.869,0.0,0.0,-0.0,A35S
