In [18]:
# Digestion rules keyed by digest name. Each rule set has a 'start' list and an
# 'end' list; every rule names a residue ('amino_acid') and the side of that
# residue on which the cut falls ('cut_position': 'left' or 'right').
# NOTE(review): 'D' is the only start residue listed for trypsin, so peptides
# are only opened at D — confirm this is intended.
digests = {
    'trypsin': {
        'start': [
            {'amino_acid': 'D', 'cut_position': 'left'},
        ],
        'end': [
            {'amino_acid': 'R', 'cut_position': 'right'},
            {'amino_acid': 'K', 'cut_position': 'right'},
        ],
    },
}

In [None]:
def digest(db: Database, digest_type: str, missed_cleavages: int) -> Database:
    '''
    Digest each protein in the database. If digest_type is not a known
    digest, the original database is returned unchanged.
    NOTE:
    The entries in the database after digestion have names of the form
    <protein_name>_<start_position>_<end_position>
    where the positions are the zero-based, end-exclusive slice bounds of the
    peptide within the parent sequence.

    Inputs:
        db:                 (Database) the input source
        digest_type:        (str) the digestion to perform (a key of digests)
        missed_cleavages:   (int) the number of missed cleavages allowed
    Outputs:
        (Database) copy of db whose proteins map peptide names to peptide sequences
    '''
    if digest_type not in digests:
        return db

    digest_rules = digests[digest_type]
    starts = {s['amino_acid']: s['cut_position'] for s in digest_rules['start']}
    ends = {s['amino_acid']: s['cut_position'] for s in digest_rules['end']}

    # NOTE(review): values stored here are plain sequence strings, whereas the
    # incoming db.proteins values expose a .sequence attribute — confirm that
    # downstream code expects strings after digestion.
    new_prots = {}

    for p_name, entry in db.proteins.items():
        sequence = entry.sequence
        seq_len = len(sequence)

        for pos, aa in enumerate(sequence):
            if aa not in starts:
                continue

            # get the starting position for this cut based on rule
            s = pos if starts[aa] == 'left' else pos + 1

            allowed_misses = missed_cleavages

            # walk forward collecting end sites until the missed-cleavage
            # budget is spent
            for j in range(pos, seq_len):

                # if we're out of missed cleavages, stop extending
                if allowed_misses < 0:
                    break

                if j == seq_len - 1:
                    # reached the end of the protein: the peptide runs to the
                    # last residue regardless of the end rules
                    e = seq_len
                elif sequence[j] in ends:
                    # each end site consumes one allowed miss
                    allowed_misses -= 1

                    # determine if we cut at j or j+1 based on the rule
                    e = j if ends[sequence[j]] == 'left' else j + 1
                else:
                    continue

                # skip degenerate cuts where the end does not fall after the start
                if e > s:
                    new_prots[f'{p_name}_{s}_{e}'] = sequence[s:e]

    # _replace builds a new object instead of mutating db, so its result must
    # be returned (assumes Database is a namedtuple — TODO confirm)
    return db._replace(proteins=new_prots)

In [19]:
# Sample protein sequence (146 residues) used by the cells below.
p = (
    'MATPEASGSGEKVEGSEPSVTYYRLEEVAKRNSAEETWMVIHGRVYDITR'
    'FLSEHPGGEEVLLEQAGADATESFEDVGHSPDAREMLKQYYIGDVHPSDL'
    'KPKGDDKDPSKNNSCQSSWAYWFVPIVGAILIGFLYRHFWADSKSS'
)


In [20]:
# Zero-based indices of every K or R residue in the sample sequence.
print([idx for idx, residue in enumerate(p) if residue in ('K', 'R')])

[11, 23, 29, 30, 43, 49, 83, 87, 100, 102, 106, 110, 136, 143]


In [28]:
# Call `tryptic` (not defined in this part of the notebook) on the sample
# sequence, passing 15 as its second argument.
# NOTE(review): the recorded output is the whole sequence as a single list
# element — confirm that is the expected result for these arguments.
tryptic(p, 15)

['MATPEASGSGEKVEGSEPSVTYYRLEEVAKRNSAEETWMVIHGRVYDITRFLSEHPGGEEVLLEQAGADATESFEDVGHSPDAREMLKQYYIGDVHPSDLKPKGDDKDPSKNNSCQSSWAYWFVPIVGAILIGFLYRHFWADSKSS']