# Experiment with pandas dataframe usage
1. Put the fasta file into a df and see if building kmers is faster this way than my for loop
2. Use the dataframe the way we used the sqlite database and see how fast queries are

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.sequence.gen_spectra import gen_spectrum, max_mass
from pyteomics import fasta
import pandas as pd
from collections import defaultdict
import swifter
import dask.dataframe as dd
from more_itertools import flatten

## 1. Make all kmers via loading a fasta into a df

In [2]:
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/NOD2_mouse_database.fasta'
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'
entries = []
for entry in fasta.read(fasta_file):
    get_name = lambda name: name[:name.index('OS=')-1] if 'OS=' in name else name

    # take the description without the 'sp' value
    desc = entry.description.split('|')[1:] if '|' in entry.description else entry.description

    # if the id is in the description, take it
    if len(desc) > 1:
        id_ = desc[0]
        name = get_name(desc[1])

    # make the id just the number
    else:
        id_ = i
        name = get_name(desc[0])

    # get the sequence
    seq = entry.sequence

    # make the entry and add it to prots
    entries.append({'name': name, 'id': id_, 'sequence': seq})

In [3]:
proteins = pd.DataFrame(entries)

In [4]:
%%time
def breakdown(s):
    kmers = []
    for j in range(len(s) - 2):

        # make a kmer sequence. Do the max (to generate the kmer spec once) then 
        # just iterate through it
        kmer_len = 30 if j + 30 <= len(s) \
            else len(s) - j

        kmers += [s[j:j+k] for k in range(3, kmer_len)]
        
    return kmers
        
    
kmers = pd.DataFrame(list(set(flatten(proteins['sequence'].swifter.apply(breakdown)))), columns =['sequence'])

kmers.head()

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=279.0, style=ProgressStyle(description…


CPU times: user 1.96 s, sys: 186 ms, total: 2.14 s
Wall time: 2.21 s


Unnamed: 0,sequence
0,DWCRFSQMLHPIFEEASDVIKEE
1,VALY
2,SGSLTPQPSPQLPTPKTLGGPVQSS
3,SDGLNLVAEKVVILVTDAND
4,LTDENFESRV


In [5]:
%%time
def spectrify(a):
    f = {}
    f['bs'] = max_mass(a, 'b', 1)
    f['bd'] = max_mass(a, 'b', 2)
    f['ys'] = max_mass(a, 'y', 1)
    f['yd'] = max_mass(a, 'y', 2)
    f['sequence'] = a
    return f

mass_list = list(kmers['sequence'].swifter.apply(spectrify))
del kmers
mass_dict = {}
for ml in mass_list:
    if ml['bd'] not in mass_dict:
        mass_dict[ml['bd']] = {'bd': ml['bd'], 'yd': ml['yd'], 'bs': ml['bs'], 'ys': ml['ys'], 'sequences': []}
    mass_dict[ml['bd']]['sequences'].append(ml['sequence'])
    
mass_sequences = pd.DataFrame([value for _, value in mass_dict.items()])

mass_sequences.astype({'bs': 'float32', 'bd': 'float32', 'ys': 'float32', 'yd': 'float32'})

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=2495259.0, style=ProgressStyle(descrip…


CPU times: user 37.7 s, sys: 1.48 s, total: 39.1 s
Wall time: 40.6 s


Unnamed: 0,bd,yd,bs,ys,sequences
0,1396.146362,1405.151611,2791.285400,2809.295898,[DWCRFSQMLHPIFEEASDVIKEE]
1,224.133728,233.139008,447.260193,465.270752,"[VALY, AVLY, GLLY, VAYI, LGYL, LGLY, VIAY, GIL..."
2,1222.145020,1231.150269,2443.282715,2461.293213,[SGSLTPQPSPQLPTPKTLGGPVQSS]
3,1034.052002,1043.057373,2067.096924,2085.107422,[SDGLNLVAEKVVILVTDAND]
4,596.285645,605.290955,1191.564087,1209.574585,"[LTDENFESRV, FESSSLGQQLN, ADRFSEEEVK, DPSTRSVF..."
...,...,...,...,...,...
1860549,1465.234253,1474.239502,2929.461182,2947.471680,[SKLNYKPPPQKSLKELQEMDKDDES]
1860550,718.851501,727.856750,1436.695679,1454.706299,[WGKVHYSIYGTGS]
1860551,1318.257080,1327.262329,2635.506836,2653.517578,[KFLEKLPEATGLSPLSVEPKTQKL]
1860552,1042.555420,1051.560669,2084.103516,2102.114258,[ATGLFKHQLSGNSPAGTLFR]


In [6]:
mass_sequences[mass_sequences['bs'] == 304.096168435]

Unnamed: 0,bd,yd,bs,ys,sequences
196194,152.551722,161.557005,304.096168,322.106733,"[CEA, ACE, AEC, DMG, DGM, EAC, CAE, GDM, MDG, ..."


## 2. Try fast queries

In [7]:
gen_spectrum('GGG', ion='b', charge=2)['spectrum'][-1]

86.53947243500001

In [8]:
%%time
kmers[kmers['bd'].between(86.539, 86.54)]

NameError: name 'kmers' is not defined

## 3. Search for substrings

In [9]:
%%time
list(proteins[proteins['sequence'].apply(lambda x: 'GGG' in x)]['name'])

CPU times: user 1.2 ms, sys: 73 µs, total: 1.27 ms
Wall time: 1.22 ms


['UBFD1_MOUSE Ubiquitin domain-containing protein UBFD1',
 'CIRBP_MOUSE Cold-inducible RNA-binding protein',
 'MAP4_MOUSE Microtubule-associated protein 4',
 'PA1B2_MOUSE Platelet-activating factor acetylhydrolase IB subunit beta',
 'HNRPD_MOUSE Heterogeneous nuclear ribonucleoprotein D0',
 'JUPI1_MOUSE Jupiter microtubule associated homolog 1',
 'HNRH1_MOUSE Heterogeneous nuclear ribonucleoprotein H',
 'TBB3_MOUSE Tubulin beta-3 chain',
 'CDHR1_MOUSE Cadherin-related family member 1',
 'PGRC1_MOUSE Membrane-associated progesterone receptor component 1',
 'GORS2_MOUSE Golgi reassembly-stacking protein 2',
 'HNRDL_MOUSE Heterogeneous nuclear ribonucleoprotein D-like',
 'CHM4B_MOUSE Charged multivesicular body protein 4b',
 'MESD_MOUSE LRP chaperone MESD',
 'KV5A7_MOUSE Ig kappa chain V-V region MOPC 41',
 'CO3_MOUSE Complement C3',
 'NFH_MOUSE Neurofilament heavy polypeptide',
 'NSF1C_MOUSE NSFL1 cofactor p47',
 'BIP_MOUSE Endoplasmic reticulum chaperone BiP',
 'KV5A2_MOUSE Ig kappa cha