Create a dictionary for all unique building blocks

PN = Asp/Gln, PU = Asp/Glu, NN = Asn/Gln, NU = Asn/Glu

First number represents O-acetylation, second number represents N-deacetylation

e.g. PN00: Asp/Gln, no O-acetylation, no N-deacetylation

In [0]:
# Mass of every unique building block, keyed by a four-character code:
#   chars 1-2: PN = Asp/Gln, PU = Asp/Glu, NN = Asn/Gln, NU = Asn/Glu
#   char 3:    O-acetylation flag
#   char 4:    N-deacetylation flag
# Insertion order matters: later cells iterate this dict in order.
blocks = {
  # Asp/Gln
  'PN00': 1009.4452, 'PN10': 1051.4557, 'PN01': 967.4346, 'PN11': 1009.4452,
  # Asp/Glu
  'PU00': 1010.4292, 'PU10': 1052.4397, 'PU01': 968.4186, 'PU11': 1010.4292,
  # Asn/Gln
  'NN00': 1008.4611, 'NN10': 1050.4717, 'NN01': 966.4506, 'NN11': 1008.4611,
  # Asn/Glu
  'NU00': 1009.4452, 'NU10': 1051.4557, 'NU01': 967.4346, 'NU11': 1009.4452,
}

# Masses of the small molecules / residues used as corrections in later cells
water   = 18.0106
ala     = 89.0477
lactate = 90.0317
asp     = 133.0375
asn     = 132.0535

# Threshold for MS value comparison
thres = 0.05
# Number of decimal places kept when a mass is computed
round_pos = 4

# Sanity check: number of unique building blocks
print(len(blocks))

Create a dictionary for monomers

In [0]:
# A monomer is a single building block; its mass is the block mass
# rounded to round_pos decimal places.
monomers = {name: round(mass, round_pos) for name, mass in blocks.items()}

# Preview at most the first 10 entries, then report the total count
for shown, (name, mass) in enumerate(monomers.items()):
  if shown >= 10:
    break
  print(name, ': ', mass)
print(len(monomers))

Create a dictionary for dimers

Use '-' as separation between individual blocks

In [0]:
# Every ordered pair (monomer, block): the key joins the two codes with '-'
# and the mass is the rounded sum of both masses.
dimers = {
  first + '-' + second: round(first_mass + second_mass, round_pos)
  for first, first_mass in monomers.items()
  for second, second_mass in blocks.items()
}

# Preview at most the first 10 entries, then report the total count
for shown, (name, mass) in enumerate(dimers.items()):
  if shown >= 10:
    break
  print(name, ': ', mass)
print(len(dimers))

Create a dictionary for trimers

Use '-' as separation between individual blocks

In [0]:
# Extend every dimer by one more block: the key appends '-<block code>'
# and the mass is the rounded sum of the dimer mass and the block mass.
trimers = {
  pair + '-' + third: round(pair_mass + third_mass, round_pos)
  for pair, pair_mass in dimers.items()
  for third, third_mass in blocks.items()
}

# Preview at most the first 10 entries, then report the total count
for shown, (name, mass) in enumerate(trimers.items()):
  if shown >= 10:
    break
  print(name, ': ', mass)
print(len(trimers))

Combine monomers, dimers, and trimers into a single dictionary, "Raw Final A" (`raw_final_A`)

In [0]:
# Merge the three dictionaries in order: monomers, then dimers, then trimers
raw_final_A = {**monomers, **dimers, **trimers}

# Preview at most the first 10 entries, then report the total count
for shown, (name, mass) in enumerate(raw_final_A.items()):
  if shown >= 10:
    break
  print(name, ': ', mass)
print(len(raw_final_A))

Account for removing (R) bridge for the first unit

'R' = Removed, 'N' = Not removed

Store the result in the dictionary "Raw Final B" (`raw_final_B`)

In [0]:
raw_final_B = {}
for name, mass in raw_final_A.items():
  # The first unit's code decides the bridge type:
  # a leading 'P' means Asp, anything else is treated as Asn
  first_unit = name.split('-')[0]
  bridge = asp if first_unit[0] == 'P' else asn
  # '-N': bridge kept, mass unchanged
  raw_final_B[name + '-N'] = mass
  # '-R': bridge removed -> subtract the bridge mass and add one water
  raw_final_B[name + '-R'] = round(mass - bridge + water, round_pos)

# Preview at most the first 10 entries, then report the total count
for shown, (name, mass) in enumerate(raw_final_B.items()):
  if shown >= 10:
    break
  print(name, ': ', mass)
print(len(raw_final_B))


Account for the amino acid variation for the last unit

'3' = 3 alanines, '4' = 4 alanines, '5' = 5 alanines

Store the result in list "Raw Final C"

In [0]:
raw_final_C = {}
for name, mass in raw_final_B.items():
  # '3': one alanine fewer -> subtract alanine, add water
  raw_final_C[name + '3'] = round(mass - ala + water, round_pos)
  # '4': unchanged
  raw_final_C[name + '4'] = mass
  # '5': one alanine more -> add alanine, subtract water
  raw_final_C[name + '5'] = round(mass + ala - water, round_pos)

# Preview at most the first 10 entries, then report the total count
for shown, (name, mass) in enumerate(raw_final_C.items()):
  if shown >= 10:
    break
  print(name, ': ', mass)

print(len(raw_final_C))

Account for the cyclization of aspartic acids

For each cyclization event, one molecule of water is lost

'C0' = No cyclization, 'C1' = 1 cyclization event, etc

Store the result in list "Raw Final D"

In [0]:
def asp_bridge(c_string):
  """Return the number of Asp bridges available for cyclization.

  c_string is a characteristic string whose '-'-separated tokens are the
  unit codes followed by one trailing suffix token (for example
  'PN00-PU10-R4'). The trailing token is never counted as a unit.

  The count is the number of units whose code starts with 'P'. If the
  first unit starts with 'P' and the trailing token starts with 'R'
  (bridge removed), one site is no longer available and the count is
  reduced by 1.
  """
  tokens = c_string.split('-')
  # All tokens except the trailing suffix are units
  units = tokens[:-1]
  sites = sum(1 for unit in units if unit[0] == 'P')
  # A removed bridge on a 'P' first unit takes away one cyclization site
  if tokens[0][0] == 'P' and tokens[-1][0] == 'R':
    sites -= 1
  return sites


In [0]:
raw_final_D = {}
for name, mass in raw_final_C.items():
  # Upper bound on cyclization events for this entry
  max_events = asp_bridge(name)
  # 'C0': no cyclization, mass unchanged
  raw_final_D[name + 'C0'] = mass
  # 'C1'..'C<max>': each cyclization event loses one water
  for events in range(1, max_events + 1):
    raw_final_D[name + 'C' + str(events)] = round(mass - events*water, round_pos)

# Preview at most the first 20 entries, then report the total count
for shown, (name, mass) in enumerate(raw_final_D.items()):
  if shown >= 20:
    break
  print(name, ': ', mass)

print(len(raw_final_D))

Output the final list as a CSV file

In [0]:
import csv

output_file_name = 'RefList_v3.csv'
final = raw_final_D
with open(output_file_name, 'w') as out:
  # NOTE(review): the header has a space after each comma while the data
  # rows do not; kept as-is so the produced file is unchanged.
  out.write("Index, Property, Molecular weight\n")
  # One row per entry, numbered from 1 in dictionary order
  for row_no, (name, mass) in enumerate(final.items(), start=1):
    out.write("%d,%s,%s\n"%(row_no, name, mass))

Download the output file from Google Colab workspace

In [0]:
# Shell command: list the current working directory in the cell output
!ls
# File helper from the google.colab package (this cell targets Google Colab)
from google.colab import files
# Pass the CSV file name set in the output cell to Colab's download helper
files.download(output_file_name)