In [None]:
pip install requests
pip install lxml
pip install pandas
pip install openpyxl
pip install cobra

# for windows only, download the file from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pyeda firstly
# pip install pyeda-0.28.0-cp37-cp37m-win_amd64.whl  
pip install modelseedpy


# build model from KEGG

## initialize new model

In [2]:
import cobra

org_name ='mbry'
org_full_name = 'Methylocystis_bryophila'
new_model = cobra.Model(org_full_name)
cobra.io.write_sbml_model(new_model,filename=org_name+'_model\\'+org_name+'_blank.xml')
new_model


0,1
Name,Methylocystis_bryophila
Memory address,161f35e44c0
Number of metabolites,0
Number of reactions,0
Number of genes,0
Number of groups,0
Objective expression,0
Compartments,


## add reactions from KEGG

### define function

In [3]:
def get_stoichiometry(KEGG_reaction):
    # get stoichiometry
    reaction_stoich = {}
    reactants, products = KEGG_reaction.split('<=>')
    pattern = r"(\([\d\.]+\)\s)?([A-Z]+\d*\[0\])"
    reactant_matches = re.findall(pattern, reactants)
    for count, reactant in reactant_matches:
        if count:
            c = re.findall(r'[(](.*?)[)]',count)[0]
            reaction_stoich[reactant] = -1*float(c)
        else:
            reaction_stoich[reactant] = -1

    product_matches = re.findall(pattern, products)
    for count, product in product_matches:
        if count:
            c = re.findall(r'[(](.*?)[)]',count)[0]
            reaction_stoich[product] = float(c)
        else:
            reaction_stoich[product] = 1
    return reaction_stoich



def add_kegg_reaction(
    model,
    reaction_id,
    KEGG_REACTIONS_DB,
    KEGG_COMPOUNDS_DB,
    compartment = 'c0',
    direction="forward",
):  

    KEGG_reaction = KEGG_REACTIONS_DB['EQUATION'][reaction_id]
    

    reaction_stoich = get_stoichiometry(KEGG_reaction)

    cobra_reaction = cobra.Reaction(reaction_id+'_'+ compartment)
    if KEGG_REACTIONS_DB['NAMES'][reaction_id]==KEGG_REACTIONS_DB['NAMES'][reaction_id]:
        cobra_reaction.name = KEGG_REACTIONS_DB['NAMES'][reaction_id].split("|", 1)[0]
    else:
        cobra_reaction.name = reaction_id

    metabolites_to_add = {}
    for metabolite, stoich in reaction_stoich.items():
        id = metabolite[0:6]
        if KEGG_COMPOUNDS_DB['Name'][id]==KEGG_COMPOUNDS_DB['Name'][id]:
            name = KEGG_COMPOUNDS_DB['Name'][id].split("|", 1)[0]
        else:
            name = id

        if metabolite[6:9] == '[0]':
            compartment_string = 'c0'
        else:
            compartment_string = 'e0'

        formula = KEGG_COMPOUNDS_DB['formula'][id]
        charge = KEGG_COMPOUNDS_DB['charge'][id]

        if formula==formula:
            if charge==charge:
                metabolites_to_add[
                    cobra.Metabolite(
                        id+'_'+compartment_string, name=name+'_'+compartment_string, compartment=compartment_string,
                        formula=formula, charge= int(charge)
                    )
                ] = stoich
            else:
                metabolites_to_add[
                    cobra.Metabolite(
                        id+'_'+compartment_string, name=name+'_'+compartment_string, compartment=compartment_string,
                        formula=formula
                    )
                ] = stoich
        else:
            if charge==charge:
                metabolites_to_add[
                    cobra.Metabolite(
                        id+'_'+compartment_string, name=name+'_'+compartment_string, compartment=compartment_string,
                        charge= int(charge)
                    )
                ] = stoich
            else:
                metabolites_to_add[
                    cobra.Metabolite(
                        id+'_'+compartment_string, name=name+'_'+compartment_string, compartment=compartment_string
                    )
                ] = stoich
            


    cobra_reaction.add_metabolites(metabolites_to_add)
    cobra_reaction.reaction
    if direction == "reversible":
        cobra_reaction.lower_bound = -1000
    elif direction == "backward":
        cobra_reaction.lower_bound = -1000
        cobra_reaction.upper_bound = 0

    model.add_reactions([cobra_reaction])


### add reaction

In [4]:
import cobra  
import pandas as pd
import re

#org_name = 'mbry'
new_model = cobra.io.read_sbml_model(org_name+'_model\\'+org_name+'_blank.xml')

KEGG_REACTIONS_DB = pd.read_table('KEGG_REACTIONS_DB.txt',index_col=0)
KEGG_COMPOUNDS_DB = pd.read_table('KEGG_COMPOUNDS_DB.txt',index_col=0)

df_reaction = pd.read_excel(org_name+'_model\\'+org_name+'_clean_kegg_reaction.xlsx', index_col=1)  

unadded_reaction_id_list = []

i,j = 0,0
for reaction_id in df_reaction.index:
    reaction_type = df_reaction['reaction_type'][reaction_id]
    if reaction_type == 'reversible':
        direction = 'reversible'
    elif reaction_type == 'irreversible':
        direction = 'forward'
    try:
        add_kegg_reaction(new_model,reaction_id,KEGG_REACTIONS_DB,KEGG_COMPOUNDS_DB,compartment = 'c0',direction=direction)
        print(reaction_id+' added')
        i+=1
    except:
        print(reaction_id+' added wrong !!!')
        unadded_reaction_id_list.append(reaction_id)
        j+=1
        continue

print('\n %d / %d reactions added successfully \n ====================='%(i,i+j))

# add genes
i=1
for reaction in new_model.reactions:
    rid = [rea for rea in str(reaction.id).split('_') if rea.startswith('R')]
    if rid:
        gene_list_str = '( '+df_reaction['entry_name'][rid[0]].replace(' ', ' or ').replace('mbry:', '')+' )'
        reaction.gene_reaction_rule = gene_list_str
    if i%100 == 0:
        print('%d / %d reactions genes added succesfully'%(i,len(new_model.reactions)))
    i+=1
print('%d / %d reactions genes added succesfully'%(len(new_model.reactions),len(new_model.reactions)))


cobra.io.write_sbml_model(new_model,filename=org_name+'_model\\'+org_name+'_KEGG_c0.xml')

No metabolites in model
No reactions in model
No objective coefficients in model. Unclear what should be optimized


R01070 added
R00710 added
R00746 added
R00754 added
R00014 added
R03270 added
R02569 added
R00200 added
R00658 added
R01518 added
R01061 added
R01015 added
R04779 added
R02740 added
R00959 added
R03321 added
R01600 added
R02739 added
R01786 added
R07618 added
R01512 added
R09127 added
R00341 added
R00235 added
R00711 added
R02073 added
R00206 added
R00199 added
R00621 added
R03316 added
R02570 added
R00405 added
R00268 added
R01899 added
R01082 added
R01900 added
R01325 added
R00351 added
R00342 added
R02164 added
R01049 added
R01641 added
R01056 added
R01529 added
R01830 added
R01528 added
R02035 added
R02736 added
R01827 added
R01057 added
R08572 added
R01621 added
R10221 added
R00289 added
R00286 added
R02630 added
R00867 added
R01819 added
R01818 added
R00888 added
R00885 added
R02568 added
R03397 added
R03236 added
R00291 added
R01105 added
R01092 added
R00955 added
R02957 added
R01385 added
R07765 added
R07764 added
R07762 added
R07763 added
R04968 added
R04726 added
R04963 added

In [14]:
for me in new_model.metabolites:
    if me.formula:
        print(me.id+'\t'+me.name+'\t'+me.formula)
    else:
        print(me.id+'\t'+me.name+'\t')

C05378_c0	beta-D-Fructose 1,6-bisphosphate_c0	C6H10O12P2
C00111_c0	Glycerone phosphate_c0	C3H5O6P
C00118_c0	D-Glyceraldehyde 3-phosphate_c0	C3H5O6P
C00084_c0	Acetaldehyde_c0	C2H4O
C00003_c0	NAD+_c0	C21H26N7O14P2
C00001_c0	H2O_c0	H2O
C00033_c0	Acetate_c0	C2H3O2
C00004_c0	NADH_c0	C21H27N7O14P2
C00080_c0	H+_c0	H
C00469_c0	Ethanol_c0	C2H6O
C00006_c0	NADP+_c0	C21H25N7O17P3
C00005_c0	NADPH_c0	C21H26N7O17P3
C00022_c0	Pyruvate_c0	C3H3O3
C00068_c0	Thiamin diphosphate_c0	C12H17N4O7P2S
C05125_c0	2-(alpha-Hydroxyethyl)thiamine diphosphate_c0	C14H21N4O8P2S
C00011_c0	CO2_c0	CO2
C15972_c0	Enzyme N6-(lipoyl)lysine_c0	C8H14NORS2
C16255_c0	[Dihydrolipoyllysine-residue acetyltransferase] S-acetyldihydrolipoyllysine_c0	C10H18NO2RS2
C00024_c0	Acetyl-CoA_c0	C23H34N7O17P3S
C15973_c0	Enzyme N6-(dihydrolipoyl)lysine_c0	C8H16NORS2
C00010_c0	CoA_c0	C21H32N7O16P3S
C00002_c0	ATP_c0	C10H13N5O13P3
C00008_c0	ADP_c0	C10H13N5O10P2
C00074_c0	Phosphoenolpyruvate_c0	C3H2O6P
C00631_c0	2-Phospho-D-glycerate_c0	C3H4O7P
C0019

## check balance

In [6]:
def check_balance(reaction,H_metabolite=None):
    try:
        feedback = reaction.check_mass_balance()
        if feedback :
            print(reaction.id + ': ' + str(feedback))
            print('      '+ reaction.reaction)
            if H_metabolite:
                if 'H' in feedback and feedback.get('H') == feedback.get('charge'):
                    H_to_add={H_metabolite:-1*feedback.get('H')}
                    reaction.add_metabolites(H_to_add)
                    print('      H is added')
                    return(check_balance(reaction,H_metabolite))
            return int(0)
        else:
            #print(reaction.id + ': banlance')
            #print('      '+ reaction.reaction)
            return int(1)
    except:
        print(reaction.id + ': error')
        print('      '+ reaction.reaction)
        return int(0)



import cobra
import pandas as pd

#org_name = 'mbry'
new_model = cobra.io.read_sbml_model(org_name+'_model\\'+org_name+'_KEGG_c0.xml')
H_metabolite = new_model.metabolites.get_by_id('C00080_c0')

i=0
for reaction in new_model.reactions:
    i=i+check_balance(reaction,H_metabolite)
print('After add H+, %d / %d reactions are balanced'%(i,len(new_model.reactions)))

cobra.io.write_sbml_model(new_model,filename=org_name+'_model\\'+org_name+'_KEGG_c0_balanced.xml')

No objective coefficients in model. Unclear what should be optimized


R00710_c0: {'charge': -1.0, 'H': -1.0}
      C00001_c0 + C00003_c0 + C00084_c0 <=> C00004_c0 + C00033_c0 + C00080_c0
      H is added
R00014_c0: {'charge': 1.0, 'H': 1.0}
      C00022_c0 + C00068_c0 --> C00011_c0 + C05125_c0
      H is added
R00200_c0: {'charge': -1.0, 'H': -1.0}
      C00002_c0 + C00022_c0 --> C00008_c0 + C00074_c0
      H is added
R04779_c0: {'charge': -1.0, 'H': -1.0}
      C00002_c0 + C05345_c0 --> C00008_c0 + C05378_c0
      H is added
R01600_c0: {'charge': -1.0, 'H': -1.0}
      C00002_c0 + C00221_c0 --> C00008_c0 + C01172_c0
      H is added
R01786_c0: {'charge': -1.0, 'H': -1.0}
      C00002_c0 + C00267_c0 --> C00008_c0 + C00668_c0
      H is added
R09127_c0: {'charge': 2.0}
      C00469_c0 + 2.0 C18233_c0 <=> 2.0 C00080_c0 + C00084_c0 + 2.0 C18234_c0
R00235_c0: {'charge': -1.0, 'H': -1.0}
      C00002_c0 + C00010_c0 + C00033_c0 --> C00013_c0 + C00020_c0 + C00024_c0
      H is added
R00711_c0: {'charge': -1.0, 'H': -1.0}
      C00001_c0 + C00006_c0 + C00084_c0 

### print unbalanced reactions

In [7]:
for reaction in new_model.reactions:
    if check_balance(reaction)==0:
        print('           https://www.kegg.jp/entry/'+reaction.id.replace('_c0',''))
        for meta, sto in reaction.metabolites.items():
            print('           '+meta.id+'    '+ str(meta.formula)+'    '+str(meta.charge))


        #new_model.metabolites.get_by_id('C00042_c0').charge



R09127_c0: {'charge': 2.0}
      C00469_c0 + 2.0 C18233_c0 <=> 2.0 C00080_c0 + C00084_c0 + 2.0 C18234_c0
           https://www.kegg.jp/entry/R09127
           C18233_c0    None    0
           C00469_c0    C2H6O    0
           C18234_c0    None    0
           C00084_c0    C2H4O    0
           C00080_c0    H    1
R02164_c0: {'H': -2.0}
      C00042_c0 + C15602_c0 <=> C00122_c0 + C15603_c0
           https://www.kegg.jp/entry/R02164
           C15602_c0    None    0
           C00042_c0    C4H4O4    -2
           C15603_c0    None    0
           C00122_c0    C4H2O4    -2
R01105_c0: {'H': 30.0, 'O': 16.0, 'C': 18.0, 'R': 2.0}
      C00001_c0 --> C00124_c0 + C05796_c0
           https://www.kegg.jp/entry/R01105
           C00001_c0    H2O    0
           C05796_c0    C12H20O11R2    0
           C00124_c0    C6H12O6    0
R07765_c0: {'charge': 2.0, 'C': -11.0, 'H': -21.0, 'N': -2.0, 'O': -7.0, 'P': -1.0, 'R': -1.0}
      C00004_c0 + C00080_c0 + C16221_c0 <=> C00003_c0 + C04088_c0
      