# Experiment with pandas dataframe usage
1. Put the fasta file into a df and see if building kmers is faster this way than my for loop
2. Use the dataframe the way we used the sqlite database and see how fast queries are

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.sequence.gen_spectra import gen_spectrum, max_mass
from pyteomics import fasta
import pandas as pd

## 1. Make all kmers via loading a fasta into a df

In [2]:
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
entries = []
for entry in fasta.read(fasta_file):
    get_name = lambda name: name[:name.index('OS=')-1] if 'OS=' in name else name

    # take the description without the 'sp' value
    desc = entry.description.split('|')[1:] if '|' in entry.description else entry.description

    # if the id is in the description, take it
    if len(desc) > 1:
        id_ = desc[0]
        name = get_name(desc[1])

    # make the id just the number
    else:
        id_ = i
        name = get_name(desc[0])

    # get the sequence
    seq = entry.sequence

    # make the entry and add it to prots
    entries.append({'name': name, 'id': id_, 'sequence': seq})

In [3]:
proteins = pd.DataFrame(entries)

In [4]:
proteins.head()

Unnamed: 0,name,id,sequence
0,LEG1_MOUSE Galectin-1,P16045,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...
1,ERP44_MOUSE Endoplasmic reticulum resident pro...,Q9D1Q6,MNPAVFLSLADLRCSLLLLVTSIFTPITAEIASLDSENIDEILNNA...
2,"ACPM_MOUSE Acyl carrier protein, mitochondrial",Q9CR21,MASRVLCACVRRLPAAFAPLPRLPTLALARPLSTTLCPEGIRRRPG...
3,HPS3_MOUSE Hermansky-Pudlak syndrome 3 protein...,Q91VB4,MVRLYNLHPFGSQQVVPCQWEPEQVCCGGSDALFVAAGCKVEAFAV...
4,PPIA_MOUSE Peptidyl-prolyl cis-trans isomerase A,P17742,MVNPTVFFDITADDEPLGRVSFELFADKVPKTAENFRALSTGEKGF...


In [5]:
def breakdown(s):
    kmers = []
    for j in range(len(s) - 2):

        # make a kmer sequence. Do the max (to generate the kmer spec once) then 
        # just iterate through it
        kmer_len = 30 if j + 30 <= len(s) \
            else len(s) - j

        for k in range(3, kmer_len):
            kmers.append(s[j:j+k])
        
    return kmers
    
a = proteins['sequence'].apply(breakdown).to_frame()
a.head()

Unnamed: 0,sequence
0,"[MAC, MACG, MACGL, MACGLV, MACGLVA, MACGLVAS, ..."
1,"[MNP, MNPA, MNPAV, MNPAVF, MNPAVFL, MNPAVFLS, ..."
2,"[MAS, MASR, MASRV, MASRVL, MASRVLC, MASRVLCA, ..."
3,"[MVR, MVRL, MVRLY, MVRLYN, MVRLYNL, MVRLYNLH, ..."
4,"[MVN, MVNP, MVNPT, MVNPTV, MVNPTVF, MVNPTVFF, ..."


In [6]:
b = a.explode('sequence').drop_duplicates('sequence')
b

Unnamed: 0,sequence
0,MAC
0,MACG
0,MACGL
0,MACGLV
0,MACGLVA
...,...
278,NGPYM
278,NGPYMR
278,GPYM
278,GPYMR


In [7]:
def spectrify(a):
    f = {}
    f['bs'] = max_mass(a, 'b', 1)
    f['bd'] = max_mass(a, 'b', 2)
    f['ys'] = max_mass(a, 'y', 1)
    f['yd'] = max_mass(a, 'y', 2)
    f['sequence'] = a
    return f

c = pd.DataFrame(list(b['sequence'].apply(spectrify)))

In [8]:
c.head()

0    {'bs': 306.09406043499996, 'bd': 153.550668434...
0    {'bs': 363.11552443499994, 'bd': 182.061400434...
0    {'bs': 476.19958843499995, 'bd': 238.603432434...
0    {'bs': 575.268002435, 'bd': 288.137639435, 'ys...
0    {'bs': 646.305116435, 'bd': 323.65619643499997...
Name: sequence, dtype: object

In [9]:
c.tail()

278    {'bs': 563.228236435, 'bd': 282.117756435, 'ys...
278    {'bs': 719.329347435, 'bd': 360.168311935, 'ys...
278    {'bs': 449.185309435, 'bd': 225.096292935, 'ys...
278    {'bs': 605.2864204350001, 'bd': 303.1468484350...
278    {'bs': 548.2649564349999, 'bd': 274.6361164349...
Name: sequence, dtype: object

## 2. Try fast queries

In [10]:
gen_spectrum('GGG', ion='b', charge=2)['spectrum'][-1]

86.53947243500001

In [11]:
c[c['bd'].between(86.539, 86.54)]

KeyError: 'bd'

## 3. Search for substrings

In [None]:
proteins[proteins['sequence'].str.contains('GGG')]

In [None]:
proteins[proteins['id'] == 'Q78JW9']

In [None]:
'MLKRGRGRPGKRRRRVSIETSTCFRPACVKLGAGAGANLRQLASSRRPLRSWWVLYTIIMAAAGAPDGMEEPGMDTEAEAVATEAPARPLNCVEAEAAVGAAAEDSCDARGNLQPAPAQPPGDPAAQASVSNGEDAGGGVGKELVDLKIIWNKTKHDVKVPLDSTGSELKQKIHSITGLPPAMQKVMYKGLVPEDKTLREIKVTSGAKIMVVGSTINDVLAVNTPKDAAQQDAKAEENKKEPLCRQKQHRKVLDKGKPEDVMPSVKGAQERLPTVPLSGMYNKSGGKVRLTFKLEQDQLWIGTKERTEKLPMGSIKNVVSEPIEGHEDYHMMAFQLGPTEASYYWVYWVPTQYVDAIKDTVLGKWQYF'.index('GGG')


In [None]:
'MLKRGRGRPGKRRRRVSIETSTCFRPACVKLGAGAGANLRQLASSRRPLRSWWVLYTIIMAAAGAPDGMEEPGMDTEAEAVATEAPARPLNCVEAEAAVGAAAEDSCDARGNLQPAPAQPPGDPAAQASVSNGEDAGGGVGKELVDLKIIWNKTKHDVKVPLDSTGSELKQKIHSITGLPPAMQKVMYKGLVPEDKTLREIKVTSGAKIMVVGSTINDVLAVNTPKDAAQQDAKAEENKKEPLCRQKQHRKVLDKGKPEDVMPSVKGAQERLPTVPLSGMYNKSGGKVRLTFKLEQDQLWIGTKERTEKLPMGSIKNVVSEPIEGHEDYHMMAFQLGPTEASYYWVYWVPTQYVDAIKDTVLGKWQYF'[136:]