<a href="https://colab.research.google.com/github/theSamurai1997/Supplementary_Information_GSMs/blob/main/Karan_GEM_converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install cobra

Collecting cobra
  Downloading cobra-0.29.0-py2.py3-none-any.whl.metadata (9.3 kB)
Collecting appdirs~=1.4 (from cobra)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting depinfo~=2.2 (from cobra)
  Downloading depinfo-2.2.0-py3-none-any.whl.metadata (3.8 kB)
Collecting diskcache~=5.0 (from cobra)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting httpx~=0.24 (from cobra)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting optlang~=1.8 (from cobra)
  Downloading optlang-1.8.2-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting python-libsbml~=5.19 (from cobra)
  Downloading python_libsbml-5.20.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (532 bytes)
Collecting ruamel.yaml~=0.16 (from cobra)
  Downloading ruamel.yaml-0.18.6-py3-none-any.whl.metadata (23 kB)
Collecting swiglpk (from cobra)
  Downloading swiglpk-5.0.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)


In [None]:
import cobra
import pandas as pd
from cobra.io import load_model
from cobra.io import load_json_model, save_json_model, load_matlab_model, save_matlab_model, read_sbml_model, write_sbml_model
from cobra import Model, Reaction, Metabolite, Gene

# Data cleaning

## Metabolite handling

In [None]:
df_metabolite = pd.read_csv('bigg_models_metabolites.txt', delimiter='\t')

In [None]:
# Remove rows with NaN in the specific column 'database_links'
df_metabolite_cleaned = df_metabolite.dropna(subset=['database_links'])
df_metabolite_cleaned.shape

(10114, 6)

In [None]:
# Filter the dataframe with only the component that include SEED annotation
filtered_df_metabolite = df_metabolite_cleaned[df_metabolite_cleaned['database_links'].str.contains('SEED Compound', case=False, na=False)]
filtered_df_metabolite.shape


(5986, 6)

In [None]:
# This check if there is a pattern for seed compound and extract the compound and store in a new columns containing a list of the compounds
pattern = r'http://identifiers.org/seed.compound/(\w{8})'
filtered_df_metabolite.loc[:, 'seed_compound'] = filtered_df_metabolite['database_links'].str.extractall(pattern)[0].groupby(level=0).apply(list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_metabolite.loc[:, 'seed_compound'] = filtered_df_metabolite['database_links'].str.extractall(pattern)[0].groupby(level=0).apply(list)


In [None]:
filtered_df_metabolite.seed_compound

Unnamed: 0,seed_compound
3,[cpd21754]
4,[cpd01155]
5,"[cpd02566, cpd29666]"
6,[cpd02494]
7,"[cpd02582, cpd15332]"
...,...
15552,[cpd00549]
15553,[cpd15972]
15564,"[cpd00012, cpd27828]"
15567,[cpd00097]


## Reaction handling

In [None]:
df_reaction = pd.read_csv('bigg_models_reactions.txt', delimiter='\t')
df_reaction.shape

(28301, 6)

In [None]:
# Remove rows with NaN in the specific column 'database_links'
df_reaction_cleaned = df_reaction.dropna(subset=['database_links'])
df_reaction_cleaned.shape

(15094, 6)

In [None]:
# Filter the dataframe with only the reaction that include SEED annotation
filtered_df_reaction = df_reaction_cleaned[df_reaction_cleaned['database_links'].str.contains('SEED Reaction', case=False, na=False)]
filtered_df_reaction.shape

(5845, 6)

In [None]:
# This check if there is a pattern for seed reaction and extract the reaction and store in a new columns containing a list of the compounds
pattern = r'http://identifiers.org/seed.reaction/(\w{8})'
filtered_df_reaction.loc[:, 'seed_reaction'] = filtered_df_reaction['database_links'].str.extractall(pattern)[0].groupby(level=0).apply(list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_reaction.loc[:, 'seed_reaction'] = filtered_df_reaction['database_links'].str.extractall(pattern)[0].groupby(level=0).apply(list)


In [None]:
filtered_df_reaction

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,seed_reaction
5,EX_15dap_e,"1,5-Diaminopentane exchange",15dap_e <->,iSF_1195; iBWG_1329; iECD_1391; iECB_1328; iAP...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_15dap_LPAREN_e_RPAREN_; EX_15dap_e,[rxn08305]
6,EX_23dappa_e,"2,3-diaminopropionate exchange",23dappa_e <->,iEC042_1314; iECBD_1354; iECABU_c1320; iBWG_13...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_23dappa_LPAREN_e_RPAREN_; EX_23dappa_e,[rxn07930]
7,EX_26dap__M_e,"Meso-2,6-Diaminoheptanedioate exchange",26dap__M_e <->,iYS1720; iUMNK88_1353; iWFL_1372; iYL1228; iUT...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_26dap_DASH_M_LPAREN_e_RPAREN_; EX_26dap_M_L...,[rxn07931]
8,EX_34dhpac_e,"3,4-Dihydroxyphenylacetaldehyde exchange",34dhpac_e <->,iECED1_1282; iECO103_1326; iECO111_1330; iECOK...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_34dhpac_LPAREN_e_RPAREN_; EX_34dhpac_e,[rxn07972]
9,EX_3amp_e,3'-AMP exchange,3amp_e <->,iY75_1357; iWFL_1372; iYO844; iZ_1308; iUTI89_...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_3amp_LPAREN_e_RPAREN_; EX_3amp_e; EX_3amp_e_,[rxn07973]
...,...,...,...,...,...,...,...
21578,r2416,Mitochondrial Carrier (MC) TCDB:2.A.29.19.1,h_c + his__L_m <-> h_m + his__L_c,Recon3D; iCHOv1_DG44; iCHOv1,MetaNetX (MNX) Equation: http://identifiers.or...,r2416,[rxn13314]
21582,r2465,Cation Diffusion Facilitator (CDF) TCDB:2.A.4.2.3,HC02172_c <-> HC02172_e,iCHOv1; iCHOv1_DG44,RHEA: http://identifiers.org/rhea/29351; RHEA:...,r2465,"[rxn09390, rxn09391]"
21583,r2472,Major Facilitator(MFS) TCDB:2.A.1.4.7,glyc3p_c + pi_m <-> glyc3p_m + pi_c,Recon3D; iCHOv1_DG44; iCHOv1,RHEA: http://identifiers.org/rhea/29015; RHEA:...,r2472,"[rxn08642, rxn10165]"
21597,r2532,Major Facilitator(MFS) TCDB:2.A.1.44.1,asn__L_e <-> asn__L_c,iAM_Pv461; Recon3D; iAM_Pk459; iAM_Pc455; iAM_...,MetaNetX (MNX) Equation: http://identifiers.or...,ASNt5r; r2532,"[rxn05220, rxn08162]"


# Converting Model

In [None]:
%model = read_sbml_model('iKK848.xml')

In [None]:
model = read_sbml_model('iKK1425.xml')

In [None]:
copy_model = model.copy()

In [None]:
for metabolite in copy_model.metabolites:
    # Iterate over each row in the filtered DataFrame
    for index, row in filtered_df_metabolite.iterrows():
        # Access the list of compounds in the 'seed_compound' column
        compound_list = row['seed_compound']
       # Check if the current compound is in the metabolite's id
        for compound in compound_list:
            try:
                if compound in metabolite.id:
                    # Extract the bigg_id for the current row
                    bigg_id = row['bigg_id']

                    # Print and update the metabolite's id
                    print(f"Original Metabolite ID: {metabolite.id}")
                    metabolite.id = bigg_id
                    print(f"Updated Metabolite ID: {metabolite.id}")
            except ValueError:
                # pass if the model already contains a metabolite with the bigg id
                pass



Original Metabolite ID: cpd00001[c0]
Updated Metabolite ID: h2o_c
Original Metabolite ID: cpd00001[e0]
Original Metabolite ID: cpd00001[e0]
Updated Metabolite ID: h2o_p
Original Metabolite ID: cpd00002[c0]
Updated Metabolite ID: atp_g
Original Metabolite ID: cpd00003[c0]
Updated Metabolite ID: nad_c
Original Metabolite ID: cpd00003[e0]
Original Metabolite ID: cpd00003[e0]
Updated Metabolite ID: nad_r
Original Metabolite ID: cpd00004[c0]
Updated Metabolite ID: nadh_s
Original Metabolite ID: cpd00005[c0]
Updated Metabolite ID: nadph_n
Original Metabolite ID: cpd00006[c0]
Updated Metabolite ID: nadp_m
Original Metabolite ID: cpd00007[c0]
Updated Metabolite ID: o2_m
Original Metabolite ID: cpd00007[e0]
Original Metabolite ID: cpd00007[e0]
Updated Metabolite ID: o2_h
Original Metabolite ID: cpd00008[c0]
Updated Metabolite ID: adp_c
Original Metabolite ID: cpd00009[c0]
Updated Metabolite ID: pi_g
Original Metabolite ID: cpd00009[e0]
Original Metabolite ID: cpd00009[e0]
Updated Metabolite ID:

In [None]:
sbml_filename = "iKK848_bigg_id_updated.sbml"
cobra.io.write_sbml_model(copy_model, sbml_filename)

In [None]:
for reaction in copy_model.reactions:
    # Iterate over each row in the filtered DataFrame
    for index, row in filtered_df_reaction.iterrows():
        # Access the list of compounds in the 'seed_compound' column
        reaction_list = row['seed_reaction']
       # Check if the current compound is in the metabolite's id
        for seed_reaction in reaction_list:
            try:
                if seed_reaction in reaction.id:
                    # Extract the bigg_id for the current row
                    bigg_id = row['bigg_id']

                    # Print and update the metabolite's id
                    print(f"Original Metabolite ID: {reaction.id}")
                    reaction.id = bigg_id
                    print(f"Updated Metabolite ID: {reaction.id}")
            except ValueError:
                # pass if the model already contains a metabolite with the bigg id
                pass

Original Metabolite ID: rxn00001_c0
Updated Metabolite ID: PPAm
Original Metabolite ID: rxn00006_c0
Updated Metabolite ID: CAT
Original Metabolite ID: rxn00010_c0
Updated Metabolite ID: GLXCL
Original Metabolite ID: rxn00011_c0
Updated Metabolite ID: PDHam1hi
Original Metabolite ID: rxn00019_c0
Updated Metabolite ID: NPPDO
Original Metabolite ID: rxn00022_c0
Updated Metabolite ID: MALTly
Original Metabolite ID: rxn00029_c0
Updated Metabolite ID: PPBNGS
Original Metabolite ID: rxn00048_c0
Updated Metabolite ID: RBFSb
Original Metabolite ID: rxn00060_c0
Updated Metabolite ID: HMBS
Original Metabolite ID: rxn00062_c0
Updated Metabolite ID: ATPM
Original Metabolite ID: rxn00069_c0
Updated Metabolite ID: GLUSx
Original Metabolite ID: rxn00076_c0
Updated Metabolite ID: NADDPp
Original Metabolite ID: rxn00077_c0
Updated Metabolite ID: NADK
Original Metabolite ID: rxn00083_c0
Updated Metabolite ID: NADTRHD
Original Metabolite ID: rxn00085_c0
Updated Metabolite ID: GLUSy
Original Metabolite ID:

In [None]:
copy_model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
h2o_p,EX_cpd00001_e0,38.78,0,0.00%
pi_n,EX_cpd00009_e0,1.35,0,0.00%
glc__D_g,EX_cpd00027_e0,5.0,6,15.28%
mn2_e,EX_cpd00030_e0,0.007341,0,0.00%
zn2_c,EX_cpd00034_e0,0.007341,0,0.00%
so4_n,EX_cpd00048_e0,0.007341,0,0.00%
ser__L_g,EX_cpd00054_e0,0.1,3,0.15%
cu2_u,EX_cpd00058_e0,0.007341,0,0.00%
ca2_m,EX_cpd00063_e0,0.007341,0,0.00%
phe__L_p,EX_cpd00066_e0,0.1,9,0.46%

Metabolite,Reaction,Flux,C-Number,C-Flux
4crsol_e,DM_cpd01042_c0,-0.005198,7,0.02%
co2_v,EX_cpd00011_e0,-54.83,1,35.11%
ac_e,EX_cpd00029_e0,-50.64,2,64.85%
h_p,EX_cpd00067_e0,-33.71,0,0.00%
5mtr_p,EX_cpd01981_e0,-0.005198,6,0.02%
cpd11416_c0,EX_cpd11416_c0,-1.006,0,0.00%
h2_p,EX_cpd11640_e0,-91.63,0,0.00%


In [None]:
search_word = 'NADtru'

for reaction in copy_model.reactions:
    if search_word in reaction.id:
        print(f"Reaction ID: {reaction.id}")
        print(f"Name: {reaction.name}")
        print(f"Equation: {reaction.reaction}")
        print(f"Lower Bound: {reaction.lower_bound}")
        print(f"Upper Bound: {reaction.upper_bound}")
        print(reaction.annotation)
        print(f"Genes: {[gene.id for gene in reaction.genes]}")

Reaction ID: NADtru
Name: NAD(+) transport (Peroxisomal membrane)
Equation: nad_r <=> nad_c
Lower Bound: -1000.0
Upper Bound: 1000.0
{'sbo': ['SBO:0000167', 'SBO:0000185'], 'seed.reaction': ['rxn13361'], 'bigg.reaction': ['NADth', 'NADtm', 'NADtn', 'NADtru', 'NADts', 'NADtx'], 'biocyc': 'META:RXN-5802', 'metanetx.reaction': 'MNXR101900'}
Genes: []
