In [6]:
import sys
import xml.etree.ElementTree as ET
import pickle

In [7]:
# constants
# NOTE(review): SOURCE_NAMESPACE, SOURCE_POSITION_ENUM and
# SOURCE_MODIFICATION_CLASS are not referenced by the code visible in this
# file; presumably they name identifiers for a downstream consumer -- confirm
# before removing.
SOURCE_NAMESPACE = 'Unimod'
SOURCE_POSITION_ENUM = 'Position'
# Canonical position values stored in ModificationSpecificity.position.
SOURCE_POSITION_ENUM_ANY = 'ANYWHERE'
SOURCE_POSITION_ENUM_ANY_N = 'ANY_N'
SOURCE_POSITION_ENUM_ANY_C = 'ANY_C'
SOURCE_POSITION_ENUM_PROTEIN_N = 'PROTEIN_N'
SOURCE_POSITION_ENUM_PROTEIN_C = 'PROTEIN_C'
SOURCE_MODIFICATION_CLASS = 'Modification'

def main(xml_path='unimod.xml', output_path='unimod_modification.pickle'):
    """Parse a Unimod XML file and pickle a mono-mass -> names lookup table.

    Every ``mod`` element under the ``modifications`` element is parsed and
    validated.  The modifications are then grouped into a plain ``dict`` whose
    keys are monoisotopic delta masses (``float``) and whose values are lists
    of the ASCII-only full names sharing exactly that mass, in document
    order.  The dict is written to *output_path* with ``pickle``.

    Args:
        xml_path: Path of the Unimod XML file to read.
        output_path: Path of the pickle file to write.

    Raises:
        ValueError: if the ``modifications`` element is missing, if it holds
            no ``mod`` elements, or if any modification fails validation.
    """
    root = ET.parse(xml_path).getroot()
    mods = root.find(_UNIMOD_NS + 'modifications')
    # Identity test on purpose: the truth value of an Element depends on its
    # children, so only ``is None`` reliably means "tag not found".
    if mods is None:
        raise ValueError('modifications tag not found')

    parsed_mods = [_parse_mod(mod) for mod in mods.findall(_UNIMOD_NS + 'mod')]
    if not parsed_mods:
        raise ValueError('no modifications found')

    # Key: mono mass (delta mass); value: list of modification full names.
    # Grouping relies on exact float equality of the parsed masses.
    mod_dict = {}
    for mod_info in parsed_mods:
        mod_dict.setdefault(mod_info.mono_mass, []).append(mod_info.full_name)

    # Save the lookup table as a pickle file.
    with open(output_path, 'wb') as handle:
        pickle.dump(mod_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


# XML namespace prefix (Clark notation) shared by every Unimod tag read here.
_UNIMOD_NS = '{http://www.unimod.org/xmlns/schema/unimod_2}'
# Position strings accepted on a ``specificity`` element.
_VALID_POSITIONS = ('Anywhere', 'Any N-term', 'Any C-term',
                    'Protein N-term', 'Protein C-term')


def _parse_float_attr(elem, attr, label, id_str):
    """Return attribute *attr* of *elem* as a float or raise ValueError."""
    raw = elem.get(attr)
    try:
        return float(raw)
    except (TypeError, ValueError):
        # TypeError covers a missing attribute (None), ValueError a bad string.
        raise ValueError('invalid %s "%s" in unimod id %s' % (label, raw, id_str))


def _parse_specificity(elem, id_str):
    """Validate one ``specificity`` element and build its ModificationSpecificity."""
    site = elem.get('site')
    if site == 'N-term':
        site = 'n'
    elif site == 'C-term':
        site = 'c'
    elif site is None or not (len(site) == 1 and site.isalpha() and site.isupper()):
        # Anything else must be a single upper-case letter; a missing
        # attribute is reported the same way instead of crashing on len(None).
        raise ValueError('invalid site "%s" in unimod id %s' % (site, id_str))
    position = elem.get('position')
    if position not in _VALID_POSITIONS:
        raise ValueError('invalid position "%s" in unimod id %s' % (position, id_str))
    return ModificationSpecificity(site, position)


def _parse_mod(mod):
    """Convert one ``mod`` element into a verified Modification."""
    parsed_mod = Modification()
    id_str = mod.get('record_id')
    try:
        parsed_mod.unimod_id = int(id_str)
    except (TypeError, ValueError):
        raise ValueError('invalid mod id "%s"' % id_str)
    parsed_mod.title = mod.get('title')

    full_name = mod.get('full_name')
    if full_name is not None:
        # strip out non-ascii characters from name
        full_name = ''.join(c for c in full_name if ord(c) < 128)
    # A missing full name stays None so that verify() reports it.
    parsed_mod.full_name = full_name

    for child in mod:
        if child.tag == _UNIMOD_NS + 'specificity':
            parsed_mod.specificities.append(_parse_specificity(child, id_str))
        elif child.tag == _UNIMOD_NS + 'delta':
            # Only one delta element is allowed per modification.
            if parsed_mod.mono_mass is not None:
                raise ValueError('multiple mono masses for unimod id %s' % id_str)
            if parsed_mod.avg_mass is not None:
                raise ValueError('multiple avg masses for unimod id %s' % id_str)
            parsed_mod.mono_mass = _parse_float_attr(child, 'mono_mass', 'mono mass', id_str)
            parsed_mod.avg_mass = _parse_float_attr(child, 'avge_mass', 'avg mass', id_str)

    parsed_mod.verify()
    return parsed_mod


In [8]:
class Modification:
    """Mutable record for one parsed modification.

    Every scalar field starts as ``None`` and ``specificities`` starts as an
    empty list; the parser fills them in and then calls ``verify()``.
    """

    def __init__(self):
        self.unimod_id = None
        self.title = None
        self.full_name = None
        self.mono_mass = None
        self.avg_mass = None
        # Fresh list per instance so records never share specificities.
        self.specificities = []

    def __repr__(self):
        return '%s(unimod_id=%r, title=%r, mono_mass=%r, avg_mass=%r)' % (
            self.__class__.__name__, self.unimod_id, self.title,
            self.mono_mass, self.avg_mass)

    def verify(self):
        """Check that every field has been populated.

        Fields are checked in a fixed order and the first missing one is
        reported.

        Raises:
            ValueError: if any scalar field is still ``None`` or if no
                specificity has been added.
        """
        required = (
            (self.unimod_id, 'unimod id'),
            (self.title, 'title'),
            (self.full_name, 'full name'),
            (self.mono_mass, 'mono mass'),
            (self.avg_mass, 'avg mass'),
        )
        for value, label in required:
            # Identity test: a legitimate falsy value such as 0.0 must pass.
            if value is None:
                raise ValueError('parsed modification is missing %s' % label)
        if not self.specificities:
            raise ValueError('parsed modification has no specificities')

class ModificationSpecificity:
    """A site together with the canonical position at which it applies."""

    def __init__(self, site, position):
        """Store *site* unchanged and canonicalise *position*.

        *position* may be given either as the XML label (e.g. ``'Anywhere'``)
        or as the matching SOURCE_POSITION_ENUM_* value; in both cases the
        SOURCE_POSITION_ENUM_* value is what gets stored.

        Raises:
            ValueError: if *position* matches neither spelling of any entry.
        """
        self.site = site
        # (XML label, canonical value) pairs, tried in order.
        label_to_canonical = (
            ('Anywhere', SOURCE_POSITION_ENUM_ANY),
            ('Any N-term', SOURCE_POSITION_ENUM_ANY_N),
            ('Any C-term', SOURCE_POSITION_ENUM_ANY_C),
            ('Protein N-term', SOURCE_POSITION_ENUM_PROTEIN_N),
            ('Protein C-term', SOURCE_POSITION_ENUM_PROTEIN_C),
        )
        for xml_label, canonical in label_to_canonical:
            if position in (xml_label, canonical):
                self.position = canonical
                return
        raise ValueError('invalid specificity position: %s' % position)

# Run the conversion only when this file is executed directly (not imported).
if __name__ == '__main__':
    main()

{42.010565: ['Acetylation', 'Ser->Glu substitution'], -0.984016: ['Amidation', 'Asp->Asn substitution', 'Glu->Gln substitution'], 226.077598: ['Biotinylation'], 57.021464: ['Iodoacetamide derivative', 'Ala->Gln substitution', 'Gly->Asn substitution', 'Addition of Glycine'], 43.005814: ['Carbamylation', 'Ala->Asn substitution'], 58.005479: ['Iodoacetic acid derivative', 'Ala->Glu substitution', 'Gly->Asp substitution'], 0.984016: ['Deamidation', 'Asn->Asp substitution', 'Gln->Glu substitution'], 486.251206: ['Gygi ICAT(TM) d0'], 494.30142: ['Gygi ICAT(TM) d8'], -29.992806: ['Homoserine', 'Met->Thr substitution'], -48.003371: ['Homoserine lactone', 'Prompt loss of side chain from oxidised Met'], 450.275205: ['Applied Biosystems original ICAT(TM) d8'], 442.224991: ['Applied Biosystems original ICAT(TM) d0'], 99.068414: ['N-isopropylcarboxamidomethyl'], 414.193691: ['Biotinyl-iodoacetamidyl-3,6-dioxaoctanediamine'], 79.966331: ['Phosphorylation'], 108.975121: ['S-methyl amino phosphinate']