# Experiment with pandas dataframe usage
1. Put the fasta file into a df and see if building kmers is faster this way than my for loop
2. Use the dataframe the way we used the sqlite database and see how fast queries are

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.sequence.gen_spectra import gen_spectrum, max_mass
from pyteomics import fasta
import pandas as pd
from collections import defaultdict
import swifter
import dask.dataframe as dd

## 1. Make all kmers via loading a fasta into a df

In [2]:
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/NOD2_mouse_database.fasta'
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
entries = []
for entry in fasta.read(fasta_file):
    get_name = lambda name: name[:name.index('OS=')-1] if 'OS=' in name else name

    # take the description without the 'sp' value
    desc = entry.description.split('|')[1:] if '|' in entry.description else entry.description

    # if the id is in the description, take it
    if len(desc) > 1:
        id_ = desc[0]
        name = get_name(desc[1])

    # make the id just the number
    else:
        id_ = i
        name = get_name(desc[0])

    # get the sequence
    seq = entry.sequence

    # make the entry and add it to prots
    entries.append({'name': name, 'id': id_, 'sequence': seq})

In [5]:
proteins = pd.DataFrame(entries, npartitions=2)

TypeError: __init__() got an unexpected keyword argument 'npartitions'

In [4]:
ps = dd.from_pandas(proteins)

ps.head()

ValueError: Exactly one of npartitions and chunksize must be specified.

In [4]:
proteins.head()

Unnamed: 0,name,id,sequence
0,LEG1_MOUSE Galectin-1,P16045,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...
1,ERP44_MOUSE Endoplasmic reticulum resident pro...,Q9D1Q6,MNPAVFLSLADLRCSLLLLVTSIFTPITAEIASLDSENIDEILNNA...
2,"ACPM_MOUSE Acyl carrier protein, mitochondrial",Q9CR21,MASRVLCACVRRLPAAFAPLPRLPTLALARPLSTTLCPEGIRRRPG...
3,HPS3_MOUSE Hermansky-Pudlak syndrome 3 protein...,Q91VB4,MVRLYNLHPFGSQQVVPCQWEPEQVCCGGSDALFVAAGCKVEAFAV...
4,PPIA_MOUSE Peptidyl-prolyl cis-trans isomerase A,P17742,MVNPTVFFDITADDEPLGRVSFELFADKVPKTAENFRALSTGEKGF...


In [5]:
import time
st = time.time()
def breakdown(s):
    seq = s['sequence']
    kmers = []
    for j in range(len(seq) - 2):

        # make a kmer sequence. Do the max (to generate the kmer spec once) then 
        # just iterate through it
        kmer_len = 30 if j + 30 <= len(seq) \
            else len(seq) - j

        kmers += [seq[j:j+k] for k in range(3, kmer_len)]
        
    return kmers
        
    
proteins['kmers'] = ps.apply(breakdown, axis=1)
print(f'Time to complete: {time.time() - st}')
ps.head()

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=279.0, style=ProgressStyle(description…


Time to complete: 0.9276762008666992


Unnamed: 0,name,id,sequence,kmers
0,LEG1_MOUSE Galectin-1,P16045,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...,"[MAC, MACG, MACGL, MACGLV, MACGLVA, MACGLVAS, ..."
1,ERP44_MOUSE Endoplasmic reticulum resident pro...,Q9D1Q6,MNPAVFLSLADLRCSLLLLVTSIFTPITAEIASLDSENIDEILNNA...,"[MNP, MNPA, MNPAV, MNPAVF, MNPAVFL, MNPAVFLS, ..."
2,"ACPM_MOUSE Acyl carrier protein, mitochondrial",Q9CR21,MASRVLCACVRRLPAAFAPLPRLPTLALARPLSTTLCPEGIRRRPG...,"[MAS, MASR, MASRV, MASRVL, MASRVLC, MASRVLCA, ..."
3,HPS3_MOUSE Hermansky-Pudlak syndrome 3 protein...,Q91VB4,MVRLYNLHPFGSQQVVPCQWEPEQVCCGGSDALFVAAGCKVEAFAV...,"[MVR, MVRL, MVRLY, MVRLYN, MVRLYNL, MVRLYNLH, ..."
4,PPIA_MOUSE Peptidyl-prolyl cis-trans isomerase A,P17742,MVNPTVFFDITADDEPLGRVSFELFADKVPKTAENFRALSTGEKGF...,"[MVN, MVNP, MVNPT, MVNPTV, MVNPTVF, MVNPTVFF, ..."


In [6]:

st = time.time()

y = defaultdict(list)
[y[x].append(row.id) for row in a.itertuples() for x in row.sequences]
    
z = pd.DataFrame([(key, value) for key, value in y.items()])
print(f'Time taken: {time.time() - st}')
z.head()

NameError: name 'a' is not defined

In [None]:
def spectrify(a):
    f = {}
    f['bs'] = max_mass(a, 'b', 1)
    f['bd'] = max_mass(a, 'b', 2)
    f['ys'] = max_mass(a, 'y', 1)
    f['yd'] = max_mass(a, 'y', 2)
    f['sequence'] = a
    return f

c = pd.DataFrame(list(b['sequence'].apply(spectrify)))

In [None]:
c.head()

In [None]:
c.tail()

## 2. Try fast queries

In [7]:
gen_spectrum('GGG', ion='b', charge=2)['spectrum'][-1]

86.53947243500001

In [None]:
c[c['bd'].between(86.539, 86.54)]

## 3. Search for substrings

In [8]:
%%time
list(proteins[proteins['sequence'].str.contains('GGG')]['name'])

CPU times: user 3.2 ms, sys: 1.51 ms, total: 4.71 ms
Wall time: 4.16 ms


['UBFD1_MOUSE Ubiquitin domain-containing protein UBFD1',
 'CIRBP_MOUSE Cold-inducible RNA-binding protein',
 'MAP4_MOUSE Microtubule-associated protein 4',
 'PA1B2_MOUSE Platelet-activating factor acetylhydrolase IB subunit beta',
 'HNRPD_MOUSE Heterogeneous nuclear ribonucleoprotein D0',
 'JUPI1_MOUSE Jupiter microtubule associated homolog 1',
 'HNRH1_MOUSE Heterogeneous nuclear ribonucleoprotein H',
 'TBB3_MOUSE Tubulin beta-3 chain',
 'CDHR1_MOUSE Cadherin-related family member 1',
 'PGRC1_MOUSE Membrane-associated progesterone receptor component 1',
 'GORS2_MOUSE Golgi reassembly-stacking protein 2',
 'HNRDL_MOUSE Heterogeneous nuclear ribonucleoprotein D-like',
 'CHM4B_MOUSE Charged multivesicular body protein 4b',
 'MESD_MOUSE LRP chaperone MESD',
 'KV5A7_MOUSE Ig kappa chain V-V region MOPC 41',
 'CO3_MOUSE Complement C3',
 'NFH_MOUSE Neurofilament heavy polypeptide',
 'NSF1C_MOUSE NSFL1 cofactor p47',
 'BIP_MOUSE Endoplasmic reticulum chaperone BiP',
 'KV5A2_MOUSE Ig kappa cha

In [9]:
%%time
list(proteins[proteins['sequence'].map(lambda x: 'GGG' in x)]['name'])

CPU times: user 1.27 ms, sys: 75 µs, total: 1.34 ms
Wall time: 1.3 ms


['UBFD1_MOUSE Ubiquitin domain-containing protein UBFD1',
 'CIRBP_MOUSE Cold-inducible RNA-binding protein',
 'MAP4_MOUSE Microtubule-associated protein 4',
 'PA1B2_MOUSE Platelet-activating factor acetylhydrolase IB subunit beta',
 'HNRPD_MOUSE Heterogeneous nuclear ribonucleoprotein D0',
 'JUPI1_MOUSE Jupiter microtubule associated homolog 1',
 'HNRH1_MOUSE Heterogeneous nuclear ribonucleoprotein H',
 'TBB3_MOUSE Tubulin beta-3 chain',
 'CDHR1_MOUSE Cadherin-related family member 1',
 'PGRC1_MOUSE Membrane-associated progesterone receptor component 1',
 'GORS2_MOUSE Golgi reassembly-stacking protein 2',
 'HNRDL_MOUSE Heterogeneous nuclear ribonucleoprotein D-like',
 'CHM4B_MOUSE Charged multivesicular body protein 4b',
 'MESD_MOUSE LRP chaperone MESD',
 'KV5A7_MOUSE Ig kappa chain V-V region MOPC 41',
 'CO3_MOUSE Complement C3',
 'NFH_MOUSE Neurofilament heavy polypeptide',
 'NSF1C_MOUSE NSFL1 cofactor p47',
 'BIP_MOUSE Endoplasmic reticulum chaperone BiP',
 'KV5A2_MOUSE Ig kappa cha

In [10]:
%%time
list(proteins[proteins['sequence'].apply(lambda x: 'GGG' in x)]['name'])

CPU times: user 2.08 ms, sys: 617 µs, total: 2.7 ms
Wall time: 2.26 ms


['UBFD1_MOUSE Ubiquitin domain-containing protein UBFD1',
 'CIRBP_MOUSE Cold-inducible RNA-binding protein',
 'MAP4_MOUSE Microtubule-associated protein 4',
 'PA1B2_MOUSE Platelet-activating factor acetylhydrolase IB subunit beta',
 'HNRPD_MOUSE Heterogeneous nuclear ribonucleoprotein D0',
 'JUPI1_MOUSE Jupiter microtubule associated homolog 1',
 'HNRH1_MOUSE Heterogeneous nuclear ribonucleoprotein H',
 'TBB3_MOUSE Tubulin beta-3 chain',
 'CDHR1_MOUSE Cadherin-related family member 1',
 'PGRC1_MOUSE Membrane-associated progesterone receptor component 1',
 'GORS2_MOUSE Golgi reassembly-stacking protein 2',
 'HNRDL_MOUSE Heterogeneous nuclear ribonucleoprotein D-like',
 'CHM4B_MOUSE Charged multivesicular body protein 4b',
 'MESD_MOUSE LRP chaperone MESD',
 'KV5A7_MOUSE Ig kappa chain V-V region MOPC 41',
 'CO3_MOUSE Complement C3',
 'NFH_MOUSE Neurofilament heavy polypeptide',
 'NSF1C_MOUSE NSFL1 cofactor p47',
 'BIP_MOUSE Endoplasmic reticulum chaperone BiP',
 'KV5A2_MOUSE Ig kappa cha