In [1]:
from pyteomics import fasta
import pandas as pd


In [2]:
# Location of the filtered NOD2 FASTA database read by the loading cell.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'

# Flat-string representation of the database; starts empty and is filled in
# while the FASTA entries are read.
prots_str = ''

In [3]:
def get_name(name):
    """Return the protein name with the 'OS=' tag and everything after it removed.

    Names that carry no 'OS=' tag are returned unchanged.
    """
    if 'OS=' not in name:
        return name
    # partition + rstrip drops the separator in front of 'OS=' without assuming
    # it is exactly one character, and stays correct when the tag sits at
    # position 0 (index('OS=') - 1 would be -1 there and cut the last character)
    return name.partition('OS=')[0].rstrip()


prots = []

# serialized records are collected here and joined once after the loop, so
# re-running this cell rebuilds prots_str instead of appending to the old value
records = []

# go through each entry in the fasta and put it in memory
with fasta.read(fasta_file) as entries:
    for i, entry in enumerate(entries):

        description = entry.description

        # take the description without the 'sp' value; keep it a list in both
        # cases so the indexing below picks fields, never single characters
        desc = description.split('|')[1:] if '|' in description else [description]

        # if the id is in the description, take it
        if len(desc) > 1:
            id_ = desc[0]
            name = get_name(desc[1])

        # otherwise make the id just the entry number
        else:
            id_ = i
            name = get_name(desc[0])

        # get the sequence
        seq = entry.sequence

        # make the entry and add it to prots
        prots.append((name, id_, seq))

        # flat-string form of the same record, later searched by get_prots
        records.append(f'|name=${name}$?seq=${seq}$?id=${id_}$|')

prots_str = ''.join(records)

prots_df = pd.DataFrame(prots, columns=['name', 'id', 'sequence'])
prots_df.head()

Unnamed: 0,name,id,sequence
0,LEG1_MOUSE Galectin-1,P16045,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...
1,ERP44_MOUSE Endoplasmic reticulum resident pro...,Q9D1Q6,MNPAVFLSLADLRCSLLLLVTSIFTPITAEIASLDSENIDEILNNA...
2,"ACPM_MOUSE Acyl carrier protein, mitochondrial",Q9CR21,MASRVLCACVRRLPAAFAPLPRLPTLALARPLSTTLCPEGIRRRPG...
3,HPS3_MOUSE Hermansky-Pudlak syndrome 3 protein...,Q91VB4,MVRLYNLHPFGSQQVVPCQWEPEQVCCGGSDALFVAAGCKVEAFAV...
4,PPIA_MOUSE Peptidyl-prolyl cis-trans isomerase A,P17742,MVNPTVFFDITADDEPLGRVSFELFADKVPKTAENFRALSTGEKGF...


In [4]:
import sys
# Print the size sys.getsizeof reports for each representation, one per line.
print('\n'.join(
    f'size of {label}: {sys.getsizeof(obj)}'
    for label, obj in (('df', prots_df), ('str', prots_str))
))

size of df: 164228
size of str: 122546


In [5]:
%%time 
# Boolean mask: True for every row whose sequence contains 'GGG'.
contains_ggg = prots_df['sequence'].apply(lambda sequence: 'GGG' in sequence)

# Keep the matching rows and render their names as text (the cell's output).
matching_rows = prots_df[contains_ggg]
str(matching_rows['name'])

CPU times: user 3.09 ms, sys: 158 µs, total: 3.25 ms
Wall time: 3.24 ms


'8      UBFD1_MOUSE Ubiquitin domain-containing protei...\n41        CIRBP_MOUSE Cold-inducible RNA-binding protein\n45           MAP4_MOUSE Microtubule-associated protein 4\n60     PA1B2_MOUSE Platelet-activating factor acetylh...\n67     HNRPD_MOUSE Heterogeneous nuclear ribonucleopr...\n69     JUPI1_MOUSE Jupiter microtubule associated hom...\n85     HNRH1_MOUSE Heterogeneous nuclear ribonucleopr...\n90                       TBB3_MOUSE Tubulin beta-3 chain\n102         CDHR1_MOUSE Cadherin-related family member 1\n106    PGRC1_MOUSE Membrane-associated progesterone r...\n114      GORS2_MOUSE Golgi reassembly-stacking protein 2\n117    HNRDL_MOUSE Heterogeneous nuclear ribonucleopr...\n121    CHM4B_MOUSE Charged multivesicular body protei...\n147                        MESD_MOUSE LRP chaperone MESD\n159        KV5A7_MOUSE Ig kappa chain V-V region MOPC 41\n162                              CO3_MOUSE Complement C3\n164            NFH_MOUSE Neurofilament heavy polypeptide\n170          

In [6]:
%%time
import re

def get_prots(substr, haystack=None):
    """Return the names of proteins whose sequence contains `substr`.

    The search runs over the flat record string, whose records have the layout
    ``|name=$<name>$?seq=$<sequence>$?id=$<id>$|``.

    Parameters
    ----------
    substr : str
        Literal text to look for. It is escaped before being handed to `re`,
        so characters such as '.' or '+' are matched as themselves.
    haystack : str, optional
        Record string to search. Defaults to the module-level `prots_str`.

    Returns
    -------
    list of str
        One protein name per non-overlapping occurrence found inside a
        sequence field, in order of appearance. A protein with several
        occurrences is therefore listed several times.

    Raises
    ------
    ValueError
        If `substr` is empty.
    """
    if not substr:
        raise ValueError('substr must be a non-empty string')
    if haystack is None:
        haystack = prots_str

    name_tag = '|name=$'
    seq_tag = '$?seq=$'

    def get_prot(match):
        # nearest sequence field that opens before the hit
        seq_tag_pos = haystack.rfind(seq_tag, 0, match.start())
        if seq_tag_pos == -1:
            return None

        # the hit only counts when it lies entirely inside that sequence value;
        # otherwise it landed in a name, an id or the delimiters
        seq_start = seq_tag_pos + len(seq_tag)
        seq_end = haystack.find('$', seq_start)
        if seq_end == -1 or match.end() > seq_end:
            return None

        # the name sits between its own tag and the start of the sequence tag
        name_tag_pos = haystack.rfind(name_tag, 0, seq_tag_pos)
        if name_tag_pos == -1:
            return None
        return haystack[name_tag_pos + len(name_tag):seq_tag_pos]

    hits = map(get_prot, re.finditer(re.escape(substr), haystack))
    return [name for name in hits if name is not None]



# Run the flat-string search for 'GGG' and print the resulting list of names.
ggg_hits = get_prots('GGG')
print(ggg_hits)


['$UBFD1_MOUSE Ubiquitin domain-containing protein UBFD1$', '$CIRBP_MOUSE Cold-inducible RNA-binding protein$', '$CIRBP_MOUSE Cold-inducible RNA-binding protein$', '$MAP4_MOUSE Microtubule-associated protein 4$', '$MAP4_MOUSE Microtubule-associated protein 4$', '$MAP4_MOUSE Microtubule-associated protein 4$', '$MAP4_MOUSE Microtubule-associated protein 4$', '$MAP4_MOUSE Microtubule-associated protein 4$', '$PA1B2_MOUSE Platelet-activating factor acetylhydrolase IB subunit beta$', '$HNRPD_MOUSE Heterogeneous nuclear ribonucleoprotein D0$', '$HNRPD_MOUSE Heterogeneous nuclear ribonucleoprotein D0$', '$JUPI1_MOUSE Jupiter microtubule associated homolog 1$', '$HNRH1_MOUSE Heterogeneous nuclear ribonucleoprotein H$', '$HNRH1_MOUSE Heterogeneous nuclear ribonucleoprotein H$', '$TBB3_MOUSE Tubulin beta-3 chain$', '$CDHR1_MOUSE Cadherin-related family member 1$', '$PGRC1_MOUSE Membrane-associated progesterone receptor component 1$', '$GORS2_MOUSE Golgi reassembly-stacking protein 2$', '$HNRDL_