In [6]:
import os
import pickle
import pandas as pd
from createMSAFromExisting import *

# Full-length + fragment

In [7]:
def createIndividualMSAsFullLengthFragment(a3m_path,name,protein_range,fragment_start_range,fragment_length):
    '''Creates one MSA per fragment, each containing 1) a large section of the monomer and 2) a short fragment of the monomer

    For every fragment start position in `fragment_start_range` (inclusive on both ends)
    `createMSA` is called once and asked to write an a3m file into the directory
    `{name}_tile{fragment_length}aa`, which is created relative to the current working
    directory if it does not exist yet.

    Args
    ----
    a3m_path : str
        path to an a3m formatted msa file describing the monomeric protein (obtained from colab_mmseqs)
    name : str
        name of the monomeric protein; used for the output directory and file names
    protein_range : list
        an inclusive range defining which positions of the MSA to take from the monomeric protein (sometimes the ends need to be chopped off)
    fragment_start_range : list
        an inclusive range (first_start, last_start) defining the starting residues for fragments of length `fragment_length`
    fragment_length : int
        the number of residues to take when defining a fragment

    Raises
    ------
    ValueError
        if `fragment_length` is smaller than 1
    '''
    if fragment_length < 1:
        raise ValueError(f"fragment_length must be >= 1, got {fragment_length}")

    first_start, last_start = fragment_start_range[0], fragment_start_range[1]

    dir_name = f"{name}_tile{str(fragment_length)}aa"
    try:
        os.makedirs(dir_name, exist_ok=False)
    except FileExistsError:
        # Best effort: keep going and let the new files replace any old ones.
        print('Directory already exists, possibly overwriting existing files')

    # `fragment_start_range` holds START positions, so the last start itself must be
    # included. (Previously `fragment_length` was subtracted from the upper bound, which
    # silently dropped the final `fragment_length` start positions.)
    for fragment_start in range(first_start, last_start + 1):
        fragment_range = (fragment_start,fragment_start+fragment_length-1) # range is inclusive
        a3m_out_path = f"{dir_name}/{name}_{protein_range[0]}-{protein_range[1]}_{name}_{fragment_range[0]}-{fragment_range[1]}.a3m"
        # The -1 is forwarded unchanged; its meaning is defined by createMSA (not visible here).
        createMSA(a3m_path, protein_range, fragment_range, -1, a3m_out_path)

## Get the fragment ranges for which we have inhibitory data

In [25]:
# Load the inhibitory-peptide mapping table.
path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/Savinov_2022_inhib_peptide_mapping.csv'
inhib_df = pd.read_csv(path)

# lptG rows are taken from the unfiltered table, i.e. before the 30 aa filter below.
inhib_df_lptg = inhib_df.loc[lambda df: df['gene'].eq('lptG-coding-EcoliBL21DE3')]
display(inhib_df_lptg)

# From here on `inhib_df` only holds the 30 aa fragments.
inhib_df = inhib_df.loc[lambda df: df['fragment length (aa)'].eq(30)]
inhib_df

Unnamed: 0,gene,fragment ID,fragment start (aa),fragment end (aa),fragment center (aa),fragment length (aa),E = inhibitory effect (enrichment),sem(E),hydrophobicity_KyteDoolittle_classifier,instabIndex_Guruprasad_classifier,charge_Lehninger_classifier,frag_contains_alphaHelix,frag_contains_betaStrand,frag_contains_Turn,frag_centroid_fractional_position_classifier
8004,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_1_42_+,1,14,7.5,14,0.204406,0.090457,hydrophilic,stable,neutral,1.0,0.0,0.0,N-terminal
8005,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_4_45_+,2,15,8.5,14,0.165312,0.051304,hydrophilic,stable,neutral,1.0,0.0,0.0,N-terminal
8006,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_7_48_+,3,16,9.5,14,0.080205,0.013877,hydrophobic,stable,neutral,1.0,0.0,0.0,N-terminal
8007,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_10_51_+,4,17,10.5,14,0.886631,0.007495,hydrophobic,stable,neutral,1.0,0.0,0.0,N-terminal
8008,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_13_54_+,5,18,11.5,14,0.563839,0.006706,hydrophobic,stable,neutral,1.0,0.0,0.0,N-terminal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8346,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_1027_1068_+,343,356,349.5,14,0.431270,0.067149,hydrophobic,unstable,neutral,1.0,0.0,0.0,C-terminal
8347,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_1030_1071_+,344,357,350.5,14,0.302660,0.112773,hydrophobic,unstable,neutral,1.0,0.0,0.0,C-terminal
8348,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_1033_1074_+,345,358,351.5,14,-0.125619,0.101623,hydrophobic,stable,neutral,1.0,0.0,0.0,C-terminal
8349,lptG-coding-EcoliBL21DE3,lptG-coding-EcoliBL21DE3_1036_1077_+,346,359,352.5,14,-0.791674,0.025399,hydrophobic,stable,positive,1.0,0.0,0.0,C-terminal


Unnamed: 0,gene,fragment ID,fragment start (aa),fragment end (aa),fragment center (aa),fragment length (aa),E = inhibitory effect (enrichment),sem(E),hydrophobicity_KyteDoolittle_classifier,instabIndex_Guruprasad_classifier,charge_Lehninger_classifier,frag_contains_alphaHelix,frag_contains_betaStrand,frag_contains_Turn,frag_centroid_fractional_position_classifier
7,EGFP-coding-pEGFP,EGFP-coding-pEGFP_19_108_+,7,36,21.5,30,0.119279,0.139890,hydrophobic,stable,negative,1.0,1.0,0.0,N-terminal
14,EGFP-coding-pEGFP,EGFP-coding-pEGFP_22_111_+,8,37,22.5,30,-0.463168,0.163449,hydrophobic,stable,negative,1.0,1.0,0.0,N-terminal
21,EGFP-coding-pEGFP,EGFP-coding-pEGFP_25_114_+,9,38,23.5,30,0.385676,0.146768,hydrophobic,stable,negative,1.0,1.0,0.0,N-terminal
28,EGFP-coding-pEGFP,EGFP-coding-pEGFP_28_117_+,10,39,24.5,30,-0.616033,0.128368,hydrophilic,stable,negative,1.0,1.0,0.0,N-terminal
35,EGFP-coding-pEGFP,EGFP-coding-pEGFP_31_120_+,11,40,25.5,30,-0.193760,0.129166,hydrophilic,stable,negative,1.0,1.0,0.0,N-terminal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12046,ssb-coding-EcoliBL21DE3,ssb-coding-EcoliBL21DE3_433_522_+,145,174,159.5,30,-0.924184,0.048997,hydrophilic,unstable,negative,0.0,0.0,0.0,C-terminal
12049,ssb-coding-EcoliBL21DE3,ssb-coding-EcoliBL21DE3_436_525_+,146,175,160.5,30,-0.436107,0.160560,hydrophilic,unstable,negative,0.0,0.0,0.0,C-terminal
12052,ssb-coding-EcoliBL21DE3,ssb-coding-EcoliBL21DE3_439_528_+,147,176,161.5,30,0.508135,0.121051,hydrophilic,unstable,negative,0.0,0.0,0.0,C-terminal
12056,ssb-coding-EcoliBL21DE3,ssb-coding-EcoliBL21DE3_442_531_+,148,177,162.5,30,-0.399455,0.222227,hydrophilic,unstable,negative,0.0,0.0,0.0,C-terminal


In [18]:
# Per gene: smallest and largest fragment start, and the largest fragment end.
(
    inhib_df
    .groupby('gene')
    .agg(
        aa_start_min=('fragment start (aa)', 'min'),
        aa_start_max=('fragment start (aa)', 'max'),
        aa_stop=('fragment end (aa)', 'max'),
    )
    .reset_index()
)

Unnamed: 0,gene,aa_start_min,aa_start_max,aa_stop
0,EGFP-coding-pEGFP,7,210,239
1,folA-coding-EcoliBL21DE3,1,130,159
2,ftsZ-coding-EcoliBL21DE3,1,354,383
3,groL-coding-EcoliBL21DE3,1,519,548
4,groS-coding-EcoliBL21DE3,1,68,97
5,gyrA-coding-EcoliBL21DE3,1,843,872
6,ileS-coding-EcoliBL21DE3,1,909,938
7,rpIL-coding-EcoliBL21DE3,1,92,121
8,rpoB-coding-EcoliBL21DE3,1,1313,1342
9,ssb-coding-EcoliBL21DE3,1,149,178


In [23]:
# Same start/stop summary as for `inhib_df`, restricted to the lptG rows.
(
    inhib_df_lptg
    .groupby('gene')
    .agg(
        aa_start_min=('fragment start (aa)', 'min'),
        aa_start_max=('fragment start (aa)', 'max'),
        aa_stop=('fragment end (aa)', 'max'),
    )
    .reset_index()
)

Unnamed: 0,gene,aa_start_min,aa_start_max,aa_stop
0,lptG-coding-EcoliBL21DE3,1,347,360


## eGFP

In [5]:
# eGFP: 30 aa fragments, monomer positions 7-239.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/egfp/egfp_03e6d.a3m'
name = 'egfp'
fragment_length = 30
protein_range, fragment_start_range = (7, 239), (7, 210)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

## folA

In [6]:
# folA: 30 aa fragments, monomer positions 1-159.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/folA/folA_ASversion_e559b.result/folA_ASversion_e559b.a3m'
name = 'folA'
protein_range = (1,159)
# The inhibitory data for folA starts at residue 1 (aa_start_min in the summary table);
# the previous lower bound of 7 matched eGFP, not folA.
fragment_start_range = (1,130)
fragment_length = 30

createIndividualMSAsFullLengthFragment(a3m_path,name,protein_range,fragment_start_range,fragment_length)

## ftsZ

In [7]:
# ftsZ: 30 aa fragments. The monomer section (10-316) is narrower than the span of
# fragment starts (1-354).
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/ftsZ/FtsZ_wholesequence_makemsa_99700.a3m'
name = 'ftsZ'
fragment_length = 30
protein_range, fragment_start_range = (10, 316), (1, 354)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

In [8]:
# ftsZ again, this time with 20 aa fragments.
# NOTE(review): the upper start bound 354 is the value summarised from the 30 aa
# fragments - confirm it is the intended bound for 20 aa tiles.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/ftsZ/FtsZ_wholesequence_makemsa_99700.a3m'
name = 'ftsZ'
fragment_length = 20
protein_range, fragment_start_range = (10, 316), (1, 354)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

## groEL

In [9]:
# groEL: 30 aa fragments, monomer positions 1-548.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/groeL/groeL_4ee36.result/groeL_4ee36.a3m'
name = 'groEL'
fragment_length = 30
protein_range, fragment_start_range = (1, 548), (1, 519)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

## groES

In [11]:
# groES: 30 aa fragments, monomer positions 1-97.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/groeS/groeS_9afac.result/groeS_9afac.a3m'
name = 'groES'
fragment_length = 30
protein_range, fragment_start_range = (1, 97), (1, 68)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

Directory already exists, possibly overwriting existing files


## gyrA

In [14]:
# gyrA: 30 aa fragments, monomer positions 1-872.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/gyrA/gyrA_99c82.result/gyrA_99c82.a3m'
name = 'gyrA'
fragment_length = 30
protein_range, fragment_start_range = (1, 872), (1, 843)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

Directory already exists, possibly overwriting existing files


## ileS

In [15]:
# ileS: 30 aa fragments, monomer positions 1-938.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/ileS/ileS_2bda5.a3m'
# Must be 'ileS': the name selects the output directory and file names, and the
# previous value 'gyrA' sent the ileS MSAs into the gyrA output directory.
name = 'ileS'
protein_range = (1,938)
fragment_start_range = (1,909)
fragment_length = 30

createIndividualMSAsFullLengthFragment(a3m_path,name,protein_range,fragment_start_range,fragment_length)

Directory already exists, possibly overwriting existing files


## lptG

In [28]:
# lptG: 14 aa fragments (unlike the 30 aa used for most other genes here), monomer positions 1-360.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/lptG/lptG_ae86a.result/lptG_ae86a.a3m'
name = 'lptG'
fragment_length = 14
protein_range, fragment_start_range = (1, 360), (1, 347)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

## rplL

In [17]:
# rplL: 30 aa fragments, monomer positions 1-121.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/rplL/rpIL_fullength_7c4cd.a3m'
name = 'rplL'
fragment_length = 30
protein_range, fragment_start_range = (1, 121), (1, 92)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

Directory already exists, possibly overwriting existing files


## rpoB

In [18]:
# rpoB: 30 aa fragments, monomer positions 1-1342.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/rpoB/rpoB_2bda5.a3m'
name = 'rpoB'
fragment_length = 30
protein_range, fragment_start_range = (1, 1342), (1, 1313)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)

Directory already exists, possibly overwriting existing files


## ssb

In [19]:
# ssb: 30 aa fragments, monomer positions 1-178.
a3m_path = '/home/gridsan/sswanson/local_code_mirror/ColabFoldCustomMSA/colab_info/ssb/ssb_8fe71.a3m'
name = 'ssb'
fragment_length = 30
protein_range, fragment_start_range = (1, 178), (1, 149)

createIndividualMSAsFullLengthFragment(
    a3m_path=a3m_path,
    name=name,
    protein_range=protein_range,
    fragment_start_range=fragment_start_range,
    fragment_length=fragment_length,
)