# BA4I Implement ConvolutionCyclopeptideSequencing
See the last cells of this notebook for the BA4I solution.

# Generating Theoretical Spectrum Problem

In [1]:
# Build massTable (amino-acid symbol -> integer mass) from a file holding one
# whitespace-separated "<symbol> <mass>" pair per line.
with open('integer_mass_table.txt', 'r') as f:
    massTable = {}
    for line in f:
        symbol, mass_text = line.split()
        massTable[symbol] = int(mass_text)

def CyclicSpectrum(peptide, massTable):
    """Return the sorted theoretical spectrum of a cyclic peptide.

    peptide   -- sequence of amino-acid symbols, each a key of massTable
    massTable -- dict mapping amino-acid symbol to integer mass

    The result contains 0, the mass of every contiguous subpeptide of the
    string and, for subpeptides that touch neither end of the string, the
    mass of the complementary fragment that wraps around the end.
    """
    n = len(peptide)

    # cumulative[k] is the total mass of the first k residues.
    cumulative = [0]
    for residue in peptide:
        cumulative.append(cumulative[-1] + massTable[residue])
    whole = cumulative[n]

    masses = [0]
    for start in range(n):
        for stop in range(start + 1, n + 1):
            fragment = cumulative[stop] - cumulative[start]
            masses.append(fragment)
            # An interior fragment also yields its wrap-around complement.
            if 0 < start and stop < n:
                masses.append(whole - fragment)
    return sorted(masses)

In [2]:
def CyclicSpectrumMass(mass):
    """Return the sorted theoretical spectrum of a cyclic peptide of masses.

    mass -- sequence (list or tuple) of integer residue masses

    Same construction as CyclicSpectrum, but the peptide is given directly
    as its residue masses instead of symbols plus a mass table.
    """
    size = len(mass)

    # prefix[k] holds the summed mass of the first k residues.
    prefix = [0]
    running = 0
    for residue_mass in mass:
        running += residue_mass
        prefix.append(running)
    total = running

    peaks = [0]
    for lo in range(size):
        interior_start = lo > 0
        for hi in range(lo + 1, size + 1):
            piece = prefix[hi] - prefix[lo]
            peaks.append(piece)
            # Fragments touching neither end have a wrap-around complement.
            if interior_start and hi < size:
                peaks.append(total - piece)
    peaks.sort()
    return peaks

In [3]:
def LinearSpectrum(peptide, massTable):
    """Return the sorted theoretical spectrum of a linear peptide.

    peptide   -- sequence of amino-acid symbols, each a key of massTable
    massTable -- dict mapping amino-acid symbol to integer mass

    The result contains 0 plus the mass of every contiguous subpeptide.
    """
    length = len(peptide)

    # prefix[k] is the mass of the first k residues.
    prefix = [0]
    for symbol in peptide:
        prefix.append(prefix[-1] + massTable[symbol])

    fragments = [
        prefix[end] - prefix[begin]
        for begin in range(length)
        for end in range(begin + 1, length + 1)
    ]
    return sorted([0] + fragments)

In [4]:
def LinearSpectrumMass(mass):
    """Return the sorted theoretical spectrum of a linear peptide of masses.

    mass -- sequence (list or tuple) of integer residue masses

    The result contains 0 plus the summed mass of every contiguous run of
    residues.
    """
    count = len(mass)

    # partial[k] is the summed mass of the first k residues.
    partial = [0]
    for residue_mass in mass:
        partial.append(partial[-1] + residue_mass)

    peaks = [0]
    for left in range(count):
        base = partial[left]
        peaks.extend(partial[right] - base for right in range(left + 1, count + 1))
    peaks.sort()
    return peaks

# Counting Peptides with Given Mass Problem

In [5]:
def DPpeptide_with_mass(m, massTable):
    """Count the linear peptides whose total integer mass equals m.

    m         -- target mass (integer)
    massTable -- dict mapping amino-acid symbol to integer mass

    Residues that share a mass (such as I/L and K/Q in the standard table)
    are counted once, because only the distinct mass values are used.
    Peptides are ordered sequences, so different orderings of the same
    residues are counted separately.  At least one residue is required,
    hence the result for m <= 0 is 0.

    The mass table is only read; it is never modified.
    """
    if m <= 0:
        return 0

    # Distinct masses only, so equal-mass residues are not double counted
    # and the caller's dictionary does not have to be edited.
    masses = sorted(set(massTable.values()))

    # count[t] = number of peptides of total mass t.
    count = [0] * (m + 1)
    for total in range(1, m + 1):
        ways = 0
        for residue in masses:
            if residue > total:
                break  # masses are sorted; nothing larger can fit
            # A single residue of exactly this mass is one peptide;
            # otherwise extend every peptide of the remaining mass.
            ways += 1 if residue == total else count[total - residue]
        count[total] = ways
    return count[m]

In [6]:
import copy
# Hand the counter a shallow copy so the shared massTable is left untouched
# by the call; the cell's value is the number of peptides of mass 1499.
massTablecopy = copy.copy(massTable)
DPpeptide_with_mass(1499, massTablecopy)

6888812571632460

# CyclopeptideSequencing

In [6]:
def consistSpec(candidate, spectrum):
    """Tell whether a candidate spectrum is consistent with a spectrum.

    candidate -- iterable of masses (theoretical spectrum of a peptide)
    spectrum  -- iterable of masses (experimental spectrum)

    Returns True when every mass of ``candidate`` occurs in ``spectrum`` at
    least as many times as it occurs in ``candidate``; a mass needed twice
    must therefore be present twice.  An empty candidate is consistent
    with any spectrum.
    """
    from collections import Counter

    available = Counter(spectrum)
    required = Counter(candidate)
    # Counter returns 0 for masses that are absent from the spectrum.
    return all(available[mass] >= times for mass, times in required.items())

def cycloPeptideSequencing(spectrum, massTable):
    """Find every cyclic peptide whose theoretical spectrum equals spectrum.

    spectrum  -- list of integer masses (an ideal cyclic spectrum)
    massTable -- dict mapping amino-acid symbol to integer mass

    Returns a set of peptide strings.  Branch and bound: each round appends
    one residue to every surviving candidate, then discards candidates
    whose linear spectrum is not consistent with ``spectrum``.  Candidates
    reaching the parent mass (the largest mass of the spectrum) are kept as
    results only when their cyclic spectrum matches exactly.

    The search starts from the empty peptide, so single-residue peptides
    are checked as well.  The spectrum is compared in sorted order and is
    not modified; an empty spectrum yields an empty result.
    """
    if not spectrum:
        return set()

    spectrum = sorted(spectrum)  # local sorted copy; the caller's list is untouched
    parentMass = spectrum[-1]
    allaa = list(massTable)

    candidatePep = {''}
    finalPep = set()
    while candidatePep:
        # Branch: extend each surviving peptide by every residue.
        expanded = {pep + nxt for pep in candidatePep for nxt in allaa}

        # Bound: keep only peptides that may still grow into a solution.
        candidatePep = set()
        for pep in expanded:
            pepMass = sum(massTable[k] for k in pep)
            if pepMass == parentMass:
                # Full-mass peptides are never extended further.
                if CyclicSpectrum(pep, massTable) == spectrum:
                    finalPep.add(pep)
            elif consistSpec(LinearSpectrum(pep, massTable), spectrum):
                candidatePep.add(pep)
    return finalPep

In [7]:
def cycloPeptideSequencingMass(spectrum, mass):
    """Find every cyclic peptide, as a tuple of masses, matching spectrum.

    spectrum -- sorted list of integer masses; its last entry is used as
                the parent mass
    mass     -- iterable of residue masses allowed in a peptide (duplicates
                are ignored)

    Returns a set of tuples of residue masses whose cyclic spectrum equals
    ``spectrum``.
    """
    residues = set(mass)
    frontier = [()]  # start from the empty peptide
    matches = set()
    while len(frontier) > 0:
        # Branch: append every allowed residue to every surviving peptide.
        extended = {stem + (residue,) for stem in frontier for residue in residues}

        # Bound: full-mass peptides are settled now, the rest must stay
        # consistent with the spectrum to survive.
        survivors = set()
        for pep in extended:
            linear = LinearSpectrumMass(pep)
            cyclic = CyclicSpectrumMass(pep)
            if sum(pep) == spectrum[-1]:
                if cyclic == spectrum:
                    matches.add(pep)
            elif consistSpec(linear, spectrum):
                survivors.add(pep)
        frontier = survivors
    return matches

In [96]:
# Run the mass-based cyclopeptide sequencing on the sample spectrum and print
# each solution as dash-separated masses.
with open('cycloPeptideSequencingTest1.txt', 'r') as f:
    spectrum = [int(token) for token in f.readline().split()]
mass = list(massTable.values())
seq = list(cycloPeptideSequencingMass(spectrum, mass))
output = ['-'.join(map(str, masses)) for masses in seq]
print(' '.join(output))

128-186-113 113-128-186 113-186-128 186-128-113 128-113-186 186-113-128


In [54]:
# Run the symbol-based cyclopeptide sequencing on the sample spectrum.  The
# peptides are printed as dash-separated masses; a set is used so peptides
# that differ only in equal-mass residues are printed once.
with open('cycloPeptideSequencingTest1.txt', 'r') as f:
    spectrum = [int(token) for token in f.readline().split()]
seq = list(cycloPeptideSequencing(spectrum, massTable))
output = {'-'.join(str(massTable[aa]) for aa in peptide_str) for peptide_str in seq}
print(' '.join(output))

186-113-128 128-186-113 113-128-186 186-128-113 113-186-128 128-113-186


# Cyclopeptide Scoring Problem

In [8]:
import copy
def pepScoreCirc(peptide, spectrum, massTable):
    """Score a cyclic peptide against an experimental spectrum.

    peptide   -- sequence of amino-acid symbols
    spectrum  -- list of integer masses
    massTable -- dict mapping amino-acid symbol to integer mass

    Returns the number of masses of the peptide's theoretical cyclic
    spectrum that can be paired with a mass of ``spectrum``, each
    experimental mass being used at most once.
    """
    # Matches are consumed from a copy, so the caller's list is unchanged.
    unmatched = copy.copy(spectrum)
    hits = 0
    for peak in CyclicSpectrum(peptide, massTable):
        if peak not in unmatched:
            continue
        unmatched.remove(peak)
        hits += 1
    return hits

In [9]:
def pepScoreLinear(peptide, spectrum, massTable):
    """Score a linear peptide against an experimental spectrum.

    peptide   -- sequence of amino-acid symbols
    spectrum  -- list of integer masses
    massTable -- dict mapping amino-acid symbol to integer mass

    Returns how many masses of the peptide's theoretical linear spectrum
    can be paired with a distinct entry of ``spectrum``.
    """
    theoretical = LinearSpectrum(peptide, massTable)
    pool = copy.copy(spectrum)  # working copy; entries are removed once matched
    matched = 0
    for expected in theoretical:
        if expected in pool:
            pool.remove(expected)
            matched = matched + 1
    return matched

In [10]:
def pepScoreLinearMass(peptide, spectrum):
    """Score a linear peptide, given as residue masses, against a spectrum.

    peptide  -- sequence of integer residue masses
    spectrum -- list of integer masses

    Returns the number of masses in the peptide's theoretical linear
    spectrum that can be matched one-to-one with entries of ``spectrum``.
    """
    remaining = copy.copy(spectrum)  # matched entries are removed from this copy
    total = 0
    for value in LinearSpectrumMass(peptide):
        present = value in remaining
        if not present:
            continue
        remaining.remove(value)
        total += 1
    return total

In [38]:
# Sample input: line 1 is the peptide, line 2 the space-separated spectrum.
# Prints the cyclic score of the peptide.
with open('pepScoreProblemTest1.txt', 'r') as f:
    peptide = f.readline().rstrip()
    spectrum = [int(token) for token in f.readline().split()]
print(pepScoreCirc(peptide, spectrum, massTable))

11


In [39]:
# Same sample input as the cyclic scoring cell, scored as a linear peptide.
with open('pepScoreProblemTest1.txt', 'r') as f:
    peptide = f.readline().rstrip()
    spectrum = [int(token) for token in f.readline().split()]
print(pepScoreLinear(peptide, spectrum, massTable))

8


# Implement the Trim function in LeaderboardCyclopeptideSequencing

In [11]:
def trim(peptides, spectrum, N, massTable):
    """Keep the N highest-scoring peptides of a leaderboard, ties included.

    peptides  -- iterable of peptide strings
    spectrum  -- list of integer masses the peptides are scored against
    N         -- number of places to keep
    massTable -- dict mapping amino-acid symbol to integer mass

    Peptides are ranked by their linear score.  When there are at most N
    distinct peptides the input is returned unchanged; otherwise a list is
    returned holding every peptide that scores at least as well as the
    N-th best one.  For N <= 0 nothing is kept and an empty list is
    returned.
    """
    if N <= 0:
        # Without this guard [N - 1] would index from the end of the
        # ranking and keep peptides instead of dropping all of them.
        return []

    leaderboard = {
        peptide: pepScoreLinear(peptide, spectrum, massTable)
        for peptide in peptides
    }
    if len(leaderboard) <= N:
        return peptides

    # Score of the N-th ranked peptide; everything tying with it survives.
    threshold = sorted(leaderboard.values(), reverse=True)[N - 1]
    return [peptide for peptide, score in leaderboard.items() if score >= threshold]

In [12]:
def trimMass(peptides, spectrum, N):
    """Keep the N highest-scoring mass-tuple peptides, ties included.

    peptides -- iterable of peptides, each a hashable sequence of integer
                residue masses (e.g. a tuple)
    spectrum -- list of integer masses the peptides are scored against
    N        -- number of places to keep

    Peptides are ranked by their linear score.  When there are at most N
    distinct peptides the input is returned unchanged; otherwise a list is
    returned holding every peptide that scores at least as well as the
    N-th best one.  For N <= 0 nothing is kept and an empty list is
    returned.
    """
    if N <= 0:
        # Without this guard [N - 1] would index from the end of the
        # ranking and keep peptides instead of dropping all of them.
        return []

    leaderboard = {
        peptide: pepScoreLinearMass(peptide, spectrum)
        for peptide in peptides
    }
    if len(leaderboard) <= N:
        return peptides

    # Score of the N-th ranked peptide; everything tying with it survives.
    threshold = sorted(leaderboard.values(), reverse=True)[N - 1]
    return [peptide for peptide, score in leaderboard.items() if score >= threshold]

In [146]:
# Sample input: line 1 the leaderboard peptides, line 2 the spectrum,
# line 3 the number of places N.  Prints the trimmed leaderboard.
with open('trimTest1.txt', 'r') as f:
    peptides = f.readline().split()
    spectrum = [int(token) for token in f.readline().split()]
    N = int(f.readline())
print(trim(peptides, spectrum, N, massTable))

['LAST', 'ALST']


# LeaderboardCyclopeptideSequencing

In [39]:
def LeaderboardCyclopeptideSequencing(spectrum, N, massTable):
    """Heuristically find a cyclic peptide that best explains a spectrum.

    spectrum  -- list of integer masses (may contain noise)
    N         -- leaderboard size handed to trim()
    massTable -- dict mapping one-character symbol to integer mass

    Each round extends every leaderboard peptide by one residue, drops the
    peptides heavier than the parent mass (the largest mass in the
    spectrum), scores the peptides that hit the parent mass exactly with
    the cyclic score, and trims the leaderboard to the top N (with ties)
    by linear score.

    Returns the first peptide found with the highest cyclic score, or ''
    when the spectrum is empty or no peptide of parent mass scores above 0.
    The search starts from the empty peptide, so single-residue peptides
    are scored and trimmed like any other length.
    """
    if not spectrum:
        return ''

    # Parent mass is the largest peak; max() avoids relying on sort order.
    parentMass = max(spectrum)
    allaa = list(massTable)

    candidatePep = {''}
    finalPep = ''
    maxScore = 0
    while candidatePep:
        # Branch: append every residue to every leaderboard peptide.
        expanded = {pep + nxt for pep in candidatePep for nxt in allaa}

        kept = set()
        for pep in expanded:
            pepMass = sum(massTable[k] for k in pep)
            if pepMass > parentMass:
                continue  # too heavy: can never match the spectrum
            if pepMass == parentMass:
                scoreCirc = pepScoreCirc(pep, spectrum, massTable)
                if scoreCirc > maxScore:
                    finalPep, maxScore = pep, scoreCirc
            kept.add(pep)

        # Bound: only the best N (with ties) survive to the next round.
        candidatePep = trim(kept, spectrum, N, massTable)
    return finalPep

In [41]:
# Sample input: line 1 the leaderboard size N, line 2 the spectrum.
# Prints the leader peptide as dash-separated masses.
with open('LeaderboardCyclopeptideSequencingTest1.txt', 'r') as f:
    N = int(f.readline())
    spectrum = [int(token) for token in f.readline().split()]
pep = LeaderboardCyclopeptideSequencing(spectrum, N, massTable)
print('-'.join(str(massTable[aa]) for aa in pep))

147-71-129-113


# Spectral Convolution Problem

In [42]:
def spec_convolution(spectrum):
    """Compute the convolution of a spectrum.

    spectrum -- list of integer masses; NOTE it is sorted in place

    The convolution is the collection of all positive differences between
    two masses of the spectrum.

    Returns a pair (table, res):
      table -- dict mapping each difference to its multiplicity
      res   -- list of the differences, each repeated by its multiplicity,
               ordered by decreasing multiplicity (ties keep the order in
               which the differences were first met)
    """
    spectrum.sort()

    counts = {}
    for hi in range(1, len(spectrum)):
        larger = spectrum[hi]
        # Walk the smaller masses from the nearest one downwards.
        for lo in range(hi - 1, -1, -1):
            smaller = spectrum[lo]
            if larger > smaller:
                gap = larger - smaller
                counts[gap] = counts.get(gap, 0) + 1

    ranked = sorted(counts, key=counts.get, reverse=True)
    flattened = []
    for gap in ranked:
        flattened.extend([gap] * counts[gap])
    return counts, flattened

In [43]:
# Print the convolution of the sample spectrum (most frequent differences
# first), followed by the multiplicity table.
with open('spectralConvolutionProblemTest1.txt', 'r') as f:
    spectrum = [int(token) for token in f.readline().split()]
table, out = spec_convolution(spectrum)
print(' '.join(map(str, out)))
print(table)

137 137 186 186 49 323
{137: 2, 49: 1, 186: 2, 323: 1}


# ConvolutionCyclopeptideSequencing

In [44]:
import copy


def convolution_sequencing(spectrum, M, N, massTable):
    """Sequence a cyclic peptide using an alphabet taken from the convolution.

    spectrum  -- list of integer masses
    M         -- number of most frequent convolution masses to keep
                 (ties with the M-th one are kept too)
    N         -- leaderboard size for LeaderboardCyclopeptideSequencing
    massTable -- dict mapping one-character amino-acid symbol to mass

    Steps:
      1. take the spectral convolution and keep the differences between
         57 and 200 inclusive;
      2. keep the M most frequent of those masses, with ties;
      3. build a reduced mass table: the entries of massTable whose mass
         was kept, plus a one-character placeholder symbol for every kept
         mass that massTable does not contain;
      4. run the leaderboard search with that table.

    Returns (simple_table, masses) where simple_table is the reduced table
    and masses is the leader peptide as a '-'-joined string of masses.
    Intermediate results are printed as diagnostics.
    """
    table, _ = spec_convolution(spectrum)
    newtable = {k: n for k, n in table.items() if 57 <= k <= 200}

    allValues = sorted(newtable.values(), reverse=True)
    print(allValues)
    if M <= 0:
        val_to_keep = []
    elif M >= len(allValues):
        # Fewer candidates than requested: keep every mass (the masses,
        # not their multiplicities).
        val_to_keep = list(newtable)
    else:
        # Multiplicity of the M-th most frequent mass; ties are kept.
        thres = allValues[M - 1]
        val_to_keep = [k for k in newtable if newtable[k] >= thres]
    print(val_to_keep)

    kept_masses = set(val_to_keep)
    simple_table = {k: v for k, v in massTable.items() if v in kept_masses}

    def placeholder_symbols():
        # Lower-case letters first, then further single characters so the
        # supply cannot run out (peptides are built one character per
        # residue, so every symbol must be exactly one character long).
        for code in range(ord('a'), ord('z') + 1):
            yield chr(code)
        code = 0x100
        while True:
            yield chr(code)
            code += 1

    # Never reuse a symbol that massTable already defines.
    free_symbols = (s for s in placeholder_symbols() if s not in massTable)
    named_masses = set(simple_table.values())
    for v in val_to_keep:
        if v not in named_masses:
            simple_table[next(free_symbols)] = v
            named_masses.add(v)
    print(simple_table)

    pep = LeaderboardCyclopeptideSequencing(spectrum, N, simple_table)
    print(pep)
    return (simple_table, '-'.join(str(simple_table[k]) for k in pep))

In [45]:
# Sample input: line 1 is M, line 2 is N, line 3 the spectrum.  The spectrum
# is sorted before use; simple_table and pep stay available for later cells.
with open('ConvolutionCyclopeptideSequencingTest1.txt', 'r') as f:
    M = int(f.readline())
    N = int(f.readline())
    spectrum = sorted(int(token) for token in f.readline().split())
print(spectrum)
simple_table, pep = convolution_sequencing(spectrum, M, N, massTable)
print(pep)

[57, 57, 71, 99, 129, 137, 170, 186, 194, 208, 228, 265, 285, 299, 307, 323, 356, 364, 394, 422, 493]
[7, 7, 7, 7, 6, 5, 5, 5, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1]
[58, 72, 66, 80, 71, 99, 113, 57, 87, 115, 129, 65, 95, 123, 137, 79, 109, 151, 91, 157, 171, 128, 136, 166, 194, 148, 156, 186, 162, 170, 200, 178]
{'G': 57, 'A': 71, 'S': 87, 'V': 99, 'I': 113, 'L': 113, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'H': 137, 'R': 156, 'W': 186, 'a': 58, 'b': 72, 'c': 66, 'd': 80, 'e': 65, 'f': 95, 'g': 123, 'h': 79, 'i': 109, 'j': 151, 'k': 91, 'l': 157, 'm': 171, 'n': 136, 'o': 166, 'p': 194, 'q': 148, 'r': 162, 's': 170, 't': 200, 'u': 178}
GEVAdG
57-129-99-71-80-57


In [35]:
# Cyclic score of the peptide printed by the sample run, computed against the
# `spectrum` and `simple_table` left behind by that cell.  The commented
# dictionary below appears to be a pasted snapshot of that simple_table.
#simple_table = {'G': 57, 'A': 71, 'S': 87, 'V': 99, 'I': 113, 'L': 113, 'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'H': 137, 'R': 156, 'W': 186, 'a': 58, 'b': 72, 'c': 66, 'd': 80, 'e': 65, 'f': 95, 'g': 123, 'h': 79, 'i': 109, 'j': 151, 'k': 91, 'l': 157, 'm': 171, 'n': 136, 'o': 166, 'p': 194, 'q': 148, 'r': 162, 's': 170, 't': 200, 'u': 178}
pepScoreCirc('GEVAdG', spectrum, simple_table)

21

In [46]:
# BA4I dataset: line 1 is M, line 2 is N, line 3 the spectrum.  The spectrum
# is sorted before use; testtable and pep stay available for later cells.
with open('rosalind_ba4i.txt', 'r') as f:
    M = int(f.readline())
    N = int(f.readline())
    spectrum = sorted(int(token) for token in f.readline().split())
print(spectrum)
testtable, pep = convolution_sequencing(spectrum, M, N, massTable)
print(pep)

[0, 97, 97, 97, 101, 113, 113, 128, 128, 129, 156, 163, 186, 198, 225, 225, 226, 226, 229, 253, 257, 269, 276, 283, 326, 326, 349, 354, 354, 354, 366, 381, 382, 384, 389, 423, 446, 451, 455, 462, 479, 482, 494, 510, 512, 545, 547, 552, 559, 575, 579, 583, 607, 607, 609, 623, 642, 660, 672, 675, 680, 680, 720, 731, 735, 736, 738, 770, 772, 773, 777, 788, 828, 828, 833, 836, 848, 866, 885, 899, 901, 901, 925, 929, 933, 949, 956, 961, 963, 996, 998, 1014, 1026, 1029, 1046, 1053, 1057, 1062, 1085, 1119, 1124, 1126, 1127, 1142, 1154, 1154, 1154, 1159, 1182, 1182, 1225, 1232, 1239, 1251, 1255, 1279, 1282, 1282, 1283, 1283, 1310, 1322, 1345, 1352, 1379, 1380, 1380, 1395, 1395, 1407, 1411, 1411, 1411, 1508]
[78, 72, 54, 52, 52, 50, 46, 40, 38, 32, 32, 28, 26, 24, 24, 24, 24, 22, 22, 22, 20, 20, 20, 18, 18, 18, 18, 18, 18, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 14, 14, 14, 14, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 

In [47]:
# Cyclic score of a candidate peptide against the BA4I `spectrum`, using the
# `testtable` returned by the dataset cell.
pepScoreCirc('EPKTPWYLLRPQ', spectrum, testtable)

134