# Extra needed code

## Spectra calculations

In [1]:
amino_acids={
    "A":71.037114,
    "R":156.101111,
    "N":114.042927,
    "D":115.026943,
    "C":103.009185,
    "E":129.042593,
    "Q":128.058578,
    "G":57.021464,
    "H":137.058912,
    "I":113.084064,
    "L":113.084064,
    "K":128.094963,
    "M":131.040485,
    "F":147.068414,
    "P":97.052764,
    "S":87.032028,
    "T":101.047679,
    "U":150.95363,
    "W":186.079313,
    "Y":163.06332,
    "V":99.068414
}

'''calc_masses

DESC:
    calculates the masses/spectrum for a sequence
Inputs:
    sequence: str amino acid sequence to change to list of masses
    charge: int charge value to calculate masses for
Outputs:
    list of floats, float       spectrum and the precursor mass 
'''
def calc_masses(sequence, charge):
    masses = []

    length = len(sequence)
    total = 2 * 1.007825035 + 15.99491463 #This is the mass of water. Adding the mass of water to the sum of all the residue masses gives the mass of the peptide.
    for i in range(length):
        total +=  amino_acids[sequence[i]]

    pre_mz = (total+charge*1.0072764)/charge   

    if charge == 1:
        #b+
        total = 1.007825035 - 0.0005486 #for the H to turn the residue NH on the N-terminus into NH2
        for i in range (0, length):
            total += amino_acids[sequence[i]]
            masses.append(total)
            #Since z (the charge) is equal to one, the total here is the m/z

        #y+
        total = 3 * 1.007825035 + 15.99491463 - 0.0005486 #for the OH to turn the residue CO on the C-terminus into COOH + 1 proton to make NH into NH2 and 1 proton make positively charged
        for i in range (0,length):
            total += amino_acids[sequence[length-i-1]]
            masses.append(total)

    elif charge == 2:
        #b++
        total = 2 * 1.007825035 - 2 * 0.0005486 #adding one more proton this time to make it doubly charged
        for i in range (0, length):
            total += amino_acids[sequence[i]]
            masses.append(total/2)

        #y++
        total = 4 * 1.007825035 + 15.99491463 - 2 * 0.0005486 #another proton to make doubly charged
        for i in range (0, length):
            total += amino_acids[sequence[length-i-1]]
            masses.append(total/2)
        #The masses you get exactly match Spectrum Mill. To get this, I had to make sure to use the mass of H+ and the mass of H when appropriate.

    return masses, pre_mz

## Scoring

In [29]:
'''cmp_string_string

DESC:
    compare the two spectras from two strings
    uses simple additive scoring
PARAMS:
    seq: string sequence of amino acids
    ref_seq: string sequence of amino acids
RETURNS:
    float score from comparison
'''
def cmp_string_string(seq, ref_seq):
    spec1, spec2 = [], []
    m11, _ = calc_masses(seq, 1)
    m12, _ = calc_masses(seq, 2)
    m21, _ = calc_masses(ref_seq, 1)
    m22, _ = calc_masses(ref_seq, 2)
    spec1 = m11 + m12 
    spec2 = m21 + m22 
    return cmp_spectra_spectra(spec1, spec2)

def cmp_spectra_spectra(spec: list, reference: list) -> float:
    '''
    CREATED FEB 26 2020
    Score two spectra against eachother. Simple additive scoring with bonuses for streaks
    Divides by the length of the reference to make it length biased for the reference

    Inputs:
        spec:       list of floats (from mass spectra)
        reference:  list of floats (calculated from protein sequence)
    Outputs:
        score:      float score 
    '''
    if len(spec) == 0 or len(reference) == 0:
        return
    streak = 0
    last = False
    score = 0
    max_streak = 0
    for mass in spec:
        if last == True:
            streak += 1
            max_streak = max([streak, max_streak])

        if mass in reference:
            score += 1
            last = True

        else:
            streak = 0
            last = False
    
    score += max_streak
    score /= (len(reference) / 2)
    return score 

# Testing
## Data

In [38]:
insulin = 'MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN'
peptides = ['AFVNQ', 'AFVNQHLCGS', 'AFVNQHLCGSHLVEA', 'AFVNQHLCGSHLVEALYLVC'] # lengths of 5, 10, 15, 20 
pep_starts = [insulin.index(p) for p in peptides]
ks = [4, 6, 8, 10, 12, 14, 16, 18, 20]
kmers = {}
for k in ks:
    gen_mers = lambda prot, l: [prot[i:i+l] for i in range(len(prot) - l + 1)]
    kmers['k={}'.format(k)] = gen_mers(insulin, k)

## Score kmers

In [39]:
scores = {}
for pep in peptides:
    scores[pep] = {}
    for k in ks: 
        kkey = 'k={}'.format(k)
        score_mers = lambda pep, mer: [cmp_string_string(m, pep) for m in mer]
        scores[pep][kkey] = score_mers(pep, kmers[kkey])

## test alignment algorithm

In [40]:
def make_alignment(estimated_start, pep_scores): # in the actual program, the start prediction comes from the aggregation, but i dont feel like doing that so we'll mock it
    # pep scores is a dict {k=3: [], k=4: [], ...}
    # go through all of these and find when the scores stop growing
    max_score_k = 0
    max_score = -10
    get_k = lambda sk: int(sk[sk.index('=')+1:])
    for ke in pep_scores:
        # check to see that the starting position is in the length. If its not, we know the peptide is shorter than that k
        if estimated_start > len(pep_scores[ke]):
            continue
        k = get_k(ke)
        max_score, max_score_k = (max_score, max_score_k) if max_score >= pep_scores[ke][estimated_start] else (pep_scores[ke][estimated_start], k)
    return max_score_k

## run test alignment with data

In [41]:
idx = 0
for pep, pscores in scores.items():
    print('estimating alignment for {}'.format(pep))
    print('scores at the start position:\n{}'.format(' '.join(['{} score:{} |'.format(str(ke), s[pep_starts[idx]]) for ke, s in pscores.items()])))
    print(make_alignment(pep_starts[idx], pscores))
    idx += 1
    

estimating alignment for AFVNQ
scores at the start position:
k=4 score:1.2 | k=6 score:1.5 | k=8 score:1.5 | k=10 score:1.5 | k=12 score:1.5 | k=14 score:1.5 | k=16 score:1.5 | k=18 score:1.5 | k=20 score:1.5 |
6
estimating alignment for AFVNQHLCGS
scores at the start position:
k=4 score:0.6 | k=6 score:0.9 | k=8 score:1.2 | k=10 score:3.95 | k=12 score:1.6 | k=14 score:1.5 | k=16 score:1.5 | k=18 score:1.5 | k=20 score:1.5 |
10
estimating alignment for AFVNQHLCGSHLVEA
scores at the start position:
k=4 score:0.4 | k=6 score:0.6 | k=8 score:0.8 | k=10 score:1.0 | k=12 score:1.2 | k=14 score:1.4333333333333333 | k=16 score:1.5666666666666667 | k=18 score:1.5 | k=20 score:1.5 |
16
estimating alignment for AFVNQHLCGSHLVEALYLVC
scores at the start position:
k=4 score:0.3 | k=6 score:0.45 | k=8 score:0.675 | k=10 score:0.75 | k=12 score:0.9 | k=14 score:1.05 | k=16 score:1.2 | k=18 score:1.35 | k=20 score:3.975 |
20
