In [1]:
import json
import cobra
import time
import os
import pandas as pd
import numpy as np
import re
from goatools import obo_parser
from goatools.anno.genetogo_reader import Gene2GoReader

import sys
sys.path.append('../script')
from GO_Kcat_analysis import *

# Input and Output Data

In [2]:
# ---- Model -----------------------------------------------------------------
# Path of the model JSON; the model name is the file name minus its extension.
model_file = '../data/iML1515R/iML1515R.json'
model_basename = os.path.basename(model_file)
model_name = os.path.splitext(model_basename)[0]

# ---- Organism / taxonomy of the model's species ----------------------------
# Scientific name; replace with the actual organism. Other names listed by the
# author: Bacillus subtilis, Thermus thermophilus, Corynebacterium glutamicum.
org_name = 'Escherichia coli'
# Selection mode; alternatives noted by the author: 'org_name', 'total'.
org_type = 'org_id'
# Taxonomy id, kept as a string.
org_id = '83333'

# ---- Input files -----------------------------------------------------------
# GO-kcat relationship table derived from UniProt.
GO_kcat_file = '../data/GO/GO_kcat_tree_total.csv'

# Gene Ontology in .obo format; fetch the current release with:
#   wget http://current.geneontology.org/ontology/go-basic.obo
obo_path = '../data/GO/go-basic.obo'

# NCBI gene2go annotation file; fetch with:
#   wget https://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz
file_gene2go = '../data/GO/gene2go'

# ---- Output ----------------------------------------------------------------
# Results go into a per-model directory, created here if it does not exist.
output_dir = f"../analysis/GOATOOLS/{model_name}"
os.makedirs(output_dir, exist_ok=True)
go_kcat_json_name = f'go_term_mean_kcat_{org_type}_dict.json'
go_kcat_out_file = os.path.join(output_dir, go_kcat_json_name)

Load the GO term data and calculate the average kcat value of each GO term for a specific organism.


In [3]:
# Wall-clock timestamp taken at the start of the cell; the elapsed time is
# printed on the last line.
start_time = time.time()

# Read model
# The model is loaded from `model_file` and passed through two helpers that are
# not defined in this notebook (presumably provided by the star import of
# GO_Kcat_analysis).
# NOTE(review): the return value of convert_to_irreversible() is discarded, so
# it presumably modifies `model` in place - confirm in the helper's source.
model = cobra.io.json.load_json_model(model_file)
convert_to_irreversible(model)
norm_model = isoenzyme_split(model)

# GETTING ASSOCIATIONS FOR ONE SPECIES
# In NCBI, data for Escherichia coli MG1655(taxid 511145) can be obtained, but the taxid 83333 is not available
# Both taxids are requested from the gene2go file; the output recorded below
# this cell reports that annotations were read for a single taxid.
objanno_ecoli = Gene2GoReader(file_gene2go, taxids=[511145,83333]) 
# Gene-to-GO associations as returned by goatools' get_ns2assc()
# (presumably keyed by GO namespace - see the goatools documentation).
ns2assc_ecoli = objanno_ecoli.get_ns2assc()

# GO term information
# Parses the ontology file at `obo_path` into a GODag object.
go = obo_parser.GODag(obo_path)


# Compute the per-GO-term kcat dictionary with the project helper, using the
# organism settings (`org_type`, `org_name`, `org_id`) from the configuration cell.
go_term_mean_kcat_dict = get_go_term_mean_kcat_by_org_v3_1(norm_model, ns2assc_ecoli, org_type, org_name, org_id, GO_kcat_file, go)

# Write the result to `go_kcat_out_file` as JSON indented by 4 spaces.
with open(go_kcat_out_file, 'w') as json_file:
    json.dump(go_term_mean_kcat_dict, json_file, indent=4)

# Report how long the whole cell took.
end_time = time.time()
print("Total time:", end_time - start_time, "seconds")

HMS:0:00:56.695533  44,166 annotations,  3,838 genes,  4,130 GOs, 1 taxids READ: ../data/GO/gene2go 
../data/GO/go-basic.obo: fmt(1.2) rel(2025-03-16) 43,544 Terms
{'GO:0035999', 'GO:0019264', 'GO:0006545', 'GO:0006564', 'GO:0006565', 'GO:0006730', 'GO:0008652', 'GO:0006546', 'GO:0046653'}
{'GO:0006779', 'GO:0006970', 'GO:0009236', 'GO:0008152', 'GO:0032259', 'GO:0019354'}
{'GO:0006783', 'GO:0006779', 'GO:0006782', 'GO:0033194'}
{'GO:0006749', 'GO:0034599', 'GO:0098869', 'GO:0045454'}
{'GO:0044205', 'GO:0006222', 'GO:0009220', 'GO:0006221', 'GO:0006207'}
{'GO:0046296'}
{'GO:0046296'}
{'GO:0046296'}
{'GO:0006096', 'GO:0016310', 'GO:0006002', 'GO:0030388', 'GO:0008152', 'GO:0061621', 'GO:0006007', 'GO:0051289', 'GO:0046835'}
{'GO:0006568', 'GO:0008652', 'GO:0009073', 'GO:0000162'}
{'GO:0009061', 'GO:0019563', 'GO:0006072', 'GO:0046168'}
{'GO:0009082', 'GO:1901566', 'GO:0009099', 'GO:0006532', 'GO:0008652', 'GO:0009081', 'GO:0046394', 'GO:0009097', 'GO:0009098'}
{'GO:0019632', 'GO:0009073

# Obtain the Reaction-gene-GO relationship

In [4]:
# Post-process the saved GO-term kcat dictionaries.
#
# For every (go_type, clc_type, organism mode) combination this cell:
#   1. reads  <output_dir>/go_term_mean_kcat_<mode>_dict.json
#   2. passes the loaded data through process_data(data, go_type, clc_type)
#   3. writes <output_dir>/go_term_mean_kcat_<mode>_dict_process_<go_type>_<clc_type>.json
#
# NOTE(review): cell In [3] only writes the input file for the single `org_type`
# set in the configuration cell, so the inputs for the other modes in
# `org_types` must already exist on disk from earlier runs.
go_types = ['Total']
clc_types = ['median']  # alternatives left by the author: 'max', 'mean'
org_types = ['org_id', 'org_name', 'total']

for go_type in go_types:
    for clc_type in clc_types:
        # Loop-local names are used on purpose: iterating with `org_type` and
        # reassigning `go_kcat_out_file` would overwrite the configuration values
        # from In [2], so a later re-run of In [3] would silently use
        # org_type == 'total' and write to an extensionless path.
        for loop_org_type in org_types:
            go_kcat_base_path = os.path.join(output_dir, f'go_term_mean_kcat_{loop_org_type}_dict')
            go_kcat_process_outfile = f'{go_kcat_base_path}_process_{go_type}_{clc_type}.json'
            go_term_mean_kcat_dict_use = load_json(f'{go_kcat_base_path}.json')
            go_term_mean_kcat_process_dict = process_data(go_term_mean_kcat_dict_use, go_type, clc_type)
            # Save the processed dictionary as JSON indented by 4 spaces.
            with open(go_kcat_process_outfile, 'w') as json_file:
                json.dump(go_term_mean_kcat_process_dict, json_file, indent=4)