# Import Packages 

In [1]:
#import package
from cobra import Model, Reaction, Metabolite
import cobra
import pandas as pd 
from cobra.flux_analysis import (
    single_gene_deletion, single_reaction_deletion, double_gene_deletion,
    double_reaction_deletion)

In [2]:
import os
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import json
import scipy 
import re


import requests 
from bs4 import BeautifulSoup
import html.parser

***
# Locate Data

SBML File location: C:\Users\zafri\Documents\Senior_Design\Data\Models\Download_bigg_web 

In [3]:
cwd=os.getcwd()
cwd
os.chdir(os.path.join(cwd , 'Data/Models/Download_bigg_web'))


In [4]:
%pwd

'C:\\Users\\zafri\\Documents\\Senior_Design\\Data\\Models\\Download_bigg_web'

In [5]:
def data_locate(rel_path_to_input,a):
    cwd = os.getcwd()
    stem=cwd
    for i in range(a):
        stem=os.path.abspath(os.path.join(stem, '..'))
    data_loc=os.path.join(stem , rel_path_to_input)
    os.chdir(data_loc)
    sub_files = os.listdir(data_loc)
    print ("Current Directory: ", cwd)
    print ("Data location: " , data_loc)
    print (sub_files)
    return data_loc, sub_files

***
***
***
# LOAD SBML MODEL 

In [6]:
RECON3d = cobra.io.read_sbml_model("Recon3D.xml")
RECON3d

0,1
Name,Recon3D
Memory address,0x019070b51a90
Number of metabolites,5835
Number of reactions,10600
Objective expression,-1.0*BIOMASS_maintenance_reverse_5b3f9 + 1.0*BIOMASS_maintenance
Compartments,"cytosol, lysosome, mitochondria, endoplasmic reticulum, extracellular space, peroxisome/glyoxysome, nucleus, golgi apparatus, default"


In [7]:
print (RECON3d.reactions.G3PD.annotation.get('ec-code'))

['1.1.5.3', '1.1.99.5']


## Check the number of reactions with EC in annotation
* obtain a realistic number of reactions that we are mapping against 

In [None]:
EC_List=[]
RXN_List=[]
for reaction in RECON3d.reactions:
    RXN_List.append(reaction.id)
    EC_List.append(reaction.annotation.get("ec-code"))

In [None]:
df1=pd.DataFrame({"reaction": RXN_List, "ec" : EC_List})

In [None]:
t,f=df1["ec"].isnull().value_counts()
print (df1["ec"].isnull().value_counts())
print ("No EC: ", t)
print ("With EC: ", f)
print ("sanity check: total equals # with and without ec : ", t+f==df1.shape[0] )


# Load reactions from Dan Matlab Model

* work previously done: 
    * Dan had previous condensed model of central metabolism in humans
            * Total of 384 Reactions
            * *Shortened metabolite list was appended as well* 
         * Mapped Matlab Model ID to SBML model id based upon reaction string
             * Problem: Some did not map --> Now we have less than 384 
             
             
## LOAD REACTION EC DICTIONARY (CSV FORM)

In [None]:
data_loc, sub_files=data_locate("Dan Matlab\code\datafiles_zaf", 1)

In [None]:
RXN_EC_DICT=pd.read_csv("USE_THIS_Dan_matlab_Model_wstrings - SBML ANNOT DICTIONARY.csv")

In [None]:
RXN_EC_DICT.shape

In [None]:
RXN_EC_DICT

In [None]:
#print (RXN_EC_DICT["Reaction"].isnull().value_counts())
f, t=RXN_EC_DICT["Reaction"].isnull().value_counts()
print ("No Mapped Reaction: ", t)
print ("With Mapped Reaction: ", f)
print ("sanity check: total equals # with and without rxn : ", t+f==RXN_EC_DICT.shape[0] )



In [None]:
#drop last four rows (sum of na rows etc)

RXN_EC_DICT=RXN_EC_DICT.drop([380,381,382,383], axis=0)

In [None]:
#print (RXN_EC_DICT["Reaction"].isnull().value_counts())
f, t=RXN_EC_DICT["Reaction"].isnull().value_counts()
print ("No Mapped Reaction: ", t)
print ("With Mapped Reaction: ", f)
print ("sanity check: total equals # with and without ec : ", t+f==RXN_EC_DICT.shape[0] )
print ("Anticipated DF shape : ", RXN_EC_DICT.shape[0]-t )


In [None]:
#drop if no reaction (means we were unable to find a recon3d reaction match for that reaction from Matlab model)
print ("shape before: ", RXN_EC_DICT.shape)
RXN_EC_DICT=RXN_EC_DICT.dropna(subset=['Reaction'])
print ("shape after: ", RXN_EC_DICT.shape)


In [None]:
RXN_EC_DICT.shape
RXN_EC_DICT.head()

In [None]:
RXN_EC_DICT=RXN_EC_DICT.drop(['Unnamed: 4','Unnamed: 5'], axis=1)


In [None]:
print (RXN_EC_DICT.shape)
RXN_EC_DICT

# SBML ids to list (365 rxns)

## double check all sbml ids mapped are in recon3d and can be mapped

In [None]:
sbml_rxn_short_list=RXN_EC_DICT["ID in MODEL SBML"].values.tolist()
sbml_rxn_short_list

In [None]:

reactions_found=[]
for reaction in RECON3d.reactions:
    for rxn in sbml_rxn_short_list:
        if reaction.id ==str(rxn):
            reactions_found.append(reaction.id)

In [None]:
if len(reactions_found)==len(sbml_rxn_short_list):
    print ("all reactions matched" )

print (len(reactions_found))
print (len(sbml_rxn_short_list))

## all reactions from matlab were successfully mapped to RECON3d 

* total of 366 reactions

## Reaction ids (for condensed model) mapped to --> model --> Pull info from Recon3d Model


In [None]:
#Pulls following information
#RXN_ID=[] bigg id
#ANNOT=[]
#ANNOT_KEYS=[]
#ANNOT_VALUES=[]
#RXN_STRING=[]
#ANNOT_EC=[]
#ANNOT_BIGG=[]
#ANNOT_METANET=[]


def Pull_Model_info_rxn(rxn_list_query, model):
    RXN_ID=[]
    ANNOT=[]
    ANNOT_KEYS=[]
    ANNOT_VALUES=[]
    RXN_STRING=[]
    ANNOT_EC=[]
    ANNOT_BIGG=[]
    ANNOT_METANET=[]

    c=0
    for rxn_id in RECON3d.reactions:
        if str(rxn_id.id) in rxn_id_short:
            RXN_ID.append(rxn_id.id)
            RXN_STRING.append(rxn_id.reaction)
            ANNOT.append(rxn_id.annotation)
            ANNOT_KEYS.append(rxn_id.annotation.keys())
            ANNOT_EC.append(rxn_id.annotation.get('ec-code'))
            ANNOT_BIGG.append(rxn_id.annotation.get('bigg.reaction'))
            ANNOT_METANET.append(rxn_id.annotation.get('metanetx.reaction'))

        
            ANNOT_VALUES.append(rxn_id.annotation.values())
        
            
            c+=1
            print (rxn_id)
    print (c)
        
    

In [None]:
RXN_ID=[]
ANNOT=[]
ANNOT_KEYS=[]
ANNOT_VALUES=[]
RXN_STRING=[]
ANNOT_EC=[]
ANNOT_BIGG=[]
ANNOT_METANET=[]

c=0
for rxn_id in RECON3d.reactions:
    if str(rxn_id.id) in sbml_rxn_short_list:
        RXN_ID.append(rxn_id.id)
        RXN_STRING.append(rxn_id.reaction)
        ANNOT.append(rxn_id.annotation)
        ANNOT_KEYS.append(rxn_id.annotation.keys())
        ANNOT_EC.append(rxn_id.annotation.get('ec-code'))
        ANNOT_BIGG.append(rxn_id.annotation.get('bigg.reaction'))
        ANNOT_METANET.append(rxn_id.annotation.get('metanetx.reaction'))

        
        ANNOT_VALUES.append(rxn_id.annotation.values())
        
        
        c+=1
        print (rxn_id)
print (c)
        

In [None]:
SBML_All_Rows_df=pd.DataFrame({"rxn_id": RXN_ID, "rxn_string": RXN_STRING, "Full Annotation": ANNOT, 
                  "Annot Keys": ANNOT_KEYS, "Annot Values": ANNOT_VALUES, "ANNOT_EC":ANNOT_EC,
                  "ANNOT_BIGG": ANNOT_BIGG, "ANNOT_METANET": ANNOT_METANET})

In [None]:
SBML_All_Rows_df

* not every reaction in the model is annotated with 'ec-code'
    * Lets check how much EC coverage we have

In [None]:
SBML_All_Rows_df["ANNOT_EC"][364] is None

## EC annotation coverage not great-- losing reactions 
* Metanet annot has good coverage

In [None]:
SBML_All_Rows_df["ANNOT_METANET"].isnull().value_counts()

In [None]:
Metanetx_list=SBML_All_Rows_df["ANNOT_METANET"].tolist()

In [None]:
print (Metanetx_list[-1])

In [None]:
def lookup_ec_metnetx(METXID):
    url1='https://www.metanetx.org/equa_info/' + str(METXID)
    #print (url1)
    r=requests.get(url1)
    if r.status_code==200:
        soup = BeautifulSoup(r.text, 'html.parser')
        #print(soup.prettify())
        table = soup.find('table', attrs = {"class" : "mnx_table"}) 
        td_tags = table.findAll('td')
        for i in range(len(td_tags)):
            if str(td_tags[i])=="<td>EC number</td>":
                METANET_EC=str(td_tags[i+1])
                


    else:
        print ("/n METID DID NOT WORK /n")
        print (METXID)
    
    return METANET_EC

In [None]:
METANET_EC_RESULTS=[]
for ID in Metanetx_list:
    if ID=="" or ID is None:
        METANET_EC_RESULTS.append("")
    else:
        METANET_EC_RESULTS.append(lookup_ec_metnetx(ID))
    print (len(METANET_EC_RESULTS),(len(METANET_EC_RESULTS)/len(Metanetx_list)))

In [None]:
METANET_EC_RESULTS

In [None]:
#append metanet ECs 
# make final EC column
SBML_All_Rows_df["METANETX_EC"]= METANET_EC_RESULTS
SBML_All_Rows_df["EC_Comp"]=SBML_All_Rows_df["ANNOT_EC"]

In [None]:
SBML_All_Rows_df.isnull()

In [None]:
for index in range(len(SBML_All_Rows_df)):
    if SBML_All_Rows_df["EC_Comp"][index] is None:
        if SBML_All_Rows_df["METANETX_EC"][index]=="<td>NA</td>":
            SBML_All_Rows_df["EC_Comp"][index]="MISSING EC STILL"
        else:
            SBML_All_Rows_df["EC_Comp"][index]=SBML_All_Rows_df["METANETX_EC"][index]

In [None]:
SBML_All_Rows_df["EC_Comp"]

In [None]:
#Count the still empty EC
c=0
for i in SBML_All_Rows_df["EC_Comp"]:
    if i =="MISSING EC STILL":
        c+=1
    elif i =="":
        c+=1

#sanity check
print ("number of reactions with no ec mapping: ", c, str(c/len(SBML_All_Rows_df["EC_Comp"]))+"%")
d=(len(SBML_All_Rows_df["EC_Comp"])-c)
print ("number of reactions with ec mapping: ",d, str(d/len(SBML_All_Rows_df["EC_Comp"]))+"%")
print (c+d)

In [None]:
#drop the rows without EC
df_EC=SBML_All_Rows_df[SBML_All_Rows_df.EC_Comp != 'MISSING EC STILL']
df_EC=df_EC[df_EC.EC_Comp != '']


In [None]:
df_EC

In [None]:
df_EC.shape

In [None]:
writer = pd.ExcelWriter('RXN_W_EC_Dict_FINAL', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
df_EC.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

In [None]:
df_EC


In [None]:
len(df_EC.ANNOT_EC[3])
type(df_EC.ANNOT_EC[1])

In [None]:
# Save DF_EC so we dont need to run the scrape everytime
writer = pd.ExcelWriter('df_ec.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
df_EC.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

# Note: THERE IS A BREAK IN THE WORKFLOW HERE: 


### CONTENTS: 

* Split EC lists into individual list of ECs
    * Initial workflow to drop and select scraped data
    * use unique list of EC from df_EC to drop rows of kinetic data 
        * **abandoned due to wanting of model reaction string in feature matrix to create metabolite and structural features more easily in the future ** 
        * can also add flux and data from model if needed --> but dan has provided most of these already from previous work
    


***





### Split EC lists into individual list of ECs 


### EC from SBML model has type list and str 

#### Create a list of just EC from model 

In [None]:
individual_ec_list=[]
for i in df_EC.ANNOT_EC:
    if type(i)==list:
        for n in i:
            individual_ec_list.append(n)          
    if type(i)==str:
        individual_ec_list.append(n)          

In [None]:
individual_ec_list

In [None]:
len(individual_ec_list)

In [None]:
EC_only=pd.DataFrame({"EC list": individual_ec_list})

In [None]:
writer = pd.ExcelWriter('EC_ONLY_260_FINAL', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
EC_only.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

In [None]:
%pwd

# dropping rows from scraped data 

## data had been pre-appended together 

### had it broken into old scraped rows and new scraped rows




# Protocol 

<br>

## remove all rows with empty ec 
## remove all rows where EC does not match Individual_EC_List (261 ec #)

# output: saved as Kinetic_dropped




In [None]:
Kinetic_df=pd.read_excel("APPENDED_RECON_SCRAPE_DATA_ALL.xlsx")

In [None]:
Kinetic_df.head()


In [None]:
Kinetic_df.shape

In [None]:
Kinetic_df.columns

In [None]:
len(Kinetic_df["EC Number"].unique())

In [None]:
#lets get a rough look at how many ec match / how much preprocessing is needed
c_drop=0
c_keep=0
kinetic_EC_unique=Kinetic_df["EC Number"].unique()
c_total=len(kinetic_EC_unique)
for ec in kinetic_EC_unique:
    if ec in individual_ec_list:
        c_keep+=1
        print (ec)
    else:
        c_drop+=1
print ("Total # Unique EC", c_total)
print ("Keep : ", c_keep ,"     ", (c_keep/c_total) , "%")
print ( "Drop : ", c_drop,"     ", (c_drop/c_total) , "%")
        

# Drop Rows without EC 


** around 700 data points in scrape did not have EC**

In [None]:
print (Kinetic_df.shape)
Kinetic_df=Kinetic_df.dropna(subset=['EC Number'])

In [None]:
Kinetic_df.shape

In [None]:
def strip_space(string):
    
    string = string.strip()
    
    return string

In [None]:
Kinetic_df["EC Number"]= Kinetic_df["EC Number"].apply(strip_space)


# Create a list of EC 

### ANNOT

In [None]:
for i in individual_ec_list:
    i=i.strip()

In [None]:
#actually drop rows 
Row_Status=[]
c_keep=0
c_drop=0
for ec in Kinetic_df["EC Number"]:
    if ec in individual_ec_list:
        Row_Status.append("Keep")
        c_keep+=1
    else:
        c_drop+=1
        Row_Status.append("Drop")
print ("Keep : ", c_keep )
print ( "Drop : ", c_drop)
        
        

In [None]:
#sanity check
len(Row_Status)==len(Kinetic_df["EC Number"])

In [None]:
#Append row status to dataframe
Kinetic_df["Row_Status"]=Row_Status

In [None]:
Kinetic_df

In [None]:
Kinetic_df_dropped = Kinetic_df[Kinetic_df.Row_Status != 'Drop']

In [None]:
Kinetic_df_dropped.head()

In [None]:
#final shape
Kinetic_df_dropped.shape

In [None]:
print (Kinetic_df_dropped["EC Number"].unique())
len(Kinetic_df_dropped["EC Number"].unique())

# START SECOND WORKFLOW HERE: 

## go backwards --> map reaction string, annot, to kinetic data

# note to self 

* df_EC --> model info
* Kinetic_df_dropped --> Kinetic data rows where the EC matched the individual EC pulled from df_EC["ANNOT_EC"] 
    * (df_EC had EC in list and string)
    * created a list of individual EC numbers ( like if had done df_EC["ANNOT_EC"].unique())
    

In [8]:
# Run if starting from here ---> Keep going else
data_locate("Dan Matlab\code\datafiles_zaf", 1)

Current Directory:  C:\Users\zafri\Documents\Senior_Design\Data\Models\Download_bigg_web
Data location:  C:\Users\zafri\Documents\Senior_Design\Data\Models\Dan Matlab\code\datafiles_zaf
['190RXN_EC_Dict_FINAL', 'APPENDED_RECON_SCRAPE_DATA_ALL.xlsx', 'Dan_matlab_Model_wstrings.xlsx', 'df_ec', 'df_ec.xlsx', 'EC_ONLY_260_FINAL', 'EC_ONLY_260_FINAL.xlsx', 'Features Sheet - Metabolite Features_zaf.csv', 'file.txt', 'KINETIC_DATASET_FINAL(EC_matched)', 'KINETIC_DATASET_FINAL(EC_matched).xlsx', 'Kinetic__matched(beforemanualcuration.xlsx', 'Kinetic__matched_CURATECopy.xlsx', 'network_info.txt', 'RECONcuration_done_FROM_HONG.xlsx', 'RECON_newrxn_done_FROM_HONG.xlsx', 'RXN_EC_Dict_260_FINAL', 'RXN_EC_Dict_260_FINAL.xlsx', 'RXN_W_EC_Dict_FINAL', 'USE_THIS_Dan_matlab_Model_wstrings - SBML ANNOT DICTIONARY.csv']


('C:\\Users\\zafri\\Documents\\Senior_Design\\Data\\Models\\Dan Matlab\\code\\datafiles_zaf',
 ['190RXN_EC_Dict_FINAL',
  'APPENDED_RECON_SCRAPE_DATA_ALL.xlsx',
  'Dan_matlab_Model_wstrings.xlsx',
  'df_ec',
  'df_ec.xlsx',
  'EC_ONLY_260_FINAL',
  'EC_ONLY_260_FINAL.xlsx',
  'Features Sheet - Metabolite Features_zaf.csv',
  'file.txt',
  'KINETIC_DATASET_FINAL(EC_matched)',
  'KINETIC_DATASET_FINAL(EC_matched).xlsx',
  'Kinetic__matched(beforemanualcuration.xlsx',
  'Kinetic__matched_CURATECopy.xlsx',
  'network_info.txt',
  'RECONcuration_done_FROM_HONG.xlsx',
  'RECON_newrxn_done_FROM_HONG.xlsx',
  'RXN_EC_Dict_260_FINAL',
  'RXN_EC_Dict_260_FINAL.xlsx',
  'RXN_W_EC_Dict_FINAL',
  'USE_THIS_Dan_matlab_Model_wstrings - SBML ANNOT DICTIONARY.csv'])

In [9]:
# load df_EC from Excel
df_EC=pd.read_excel("df_EC.xlsx")
Kinetic_df=pd.read_excel("APPENDED_RECON_SCRAPE_DATA_ALL.xlsx")

In [10]:
df_EC

Unnamed: 0,ANNOT_BIGG,ANNOT_EC,ANNOT_METANET,Annot Keys,Annot Values,Full Annotation,rxn_id,rxn_string,METANETX_EC,EC_Comp
1,2OXOADOXm,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']",MNXR94818,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '2OXOADOXm', 'META...","{'SBO': 'SBO:0000375', 'bigg.reaction': '2OXOA...",2OXOADOXm,2oxoadp_m + coa_m + nad_m --> co2_m + glutcoa_...,<td>1.2.1.52<br>1.2.4.2<br>1.8.1.4<br>2.3.1.61...,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']"
3,34HPPOR,1.13.11.27,MNXR94843,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '34HPPOR', 'META:4...","{'SBO': 'SBO:0000375', 'bigg.reaction': '34HPP...",34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,<td>1.13.11.27</td>,1.13.11.27
4,3DSPHR,1.1.1.102,MNXR94866,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '3DSPHR', 'META:3-...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3DSPH...",3DSPHR,3dsphgn_c + h_c + nadph_c --> nadp_c + sphgn_c,<td>1.1.1.102</td>,1.1.1.102
5,3HAO,1.13.11.6,MNXR94889,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HAO', '1.13.11.6...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HAO'...",3HAO,3hanthrn_c + o2_c --> cmusa_c + h_c,<td>1.13.11.6</td>,1.13.11.6
6,3HBCOAHLm,3.1.2.4,MNXR94891,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HBCOAHLm', '3.1....","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HBCO...",3HBCOAHLm,3hibutcoa_m + h2o_m --> 3hmp_m + coa_m + h_m,<td>3.1.2.4</td>,3.1.2.4
10,AASAD3m,1.2.1.31,MNXR95158,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AASAD3m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AASAD...",AASAD3m,L2aadp6sa_m + h2o_m + nad_m --> L2aadp_m + 2.0...,<td>1.2.1.31</td>,1.2.1.31
11,AATA,2.6.1.39,MNXR95160,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AATA', 'META:2-AM...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AATA'...",AATA,2oxoadp_c + glu__L_c <=> L2aadp_c + akg_c,<td>2.6.1.39</td>,2.6.1.39
12,ACACT10m,"['2.3.1.16', '2.3.1.9']",MNXR95195,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT10m', 'META:...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT10m,2maacoa_m + coa_m <=> accoa_m + ppcoa_m,<td>2.3.1.16<br>2.3.1.9</br></td>,"['2.3.1.16', '2.3.1.9']"
13,ACACT1m,2.3.1.9,MNXR95194,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT1m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT1m,2.0 accoa_m --> aacoa_m + coa_m,<td>2.3.1.9</td>,2.3.1.9
15,ACITL,"['2.3.3.8', '4.1.3.6']",MNXR95268,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACITL', 'META:ATP...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACITL...",ACITL,atp_c + cit_c + coa_c --> accoa_c + adp_c + oa...,<td>2.3.3.8<br>4.1.3.6</br></td>,"['2.3.3.8', '4.1.3.6']"


In [11]:
Kinetic_df

Unnamed: 0,Entry ID,Recommended Enzyme Name,Reaction ID,Enzymatic Activity,EC Number,Gene,Organism,Strain/Tissue,Reaction,Cofactor,...,pH,Temp (°C),Buffer/Media,Uniprot,References,Data Point,Notes,BIGG ID,BIGG NAME,KEEPing
28,29,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,7.5,25.0,,,RF4,,isozyme ADH1B2,,,X
29,30,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,7.5,25.0,,,RF4,,isozyme ADH1B1,,,X
36,37,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF7,,,,,X
41,42,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,,,X
42,43,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH2 at 21-,,,X
43,44,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,,,X
44,45,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH3 at 21-,,,X
58,59,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH3 at 21-,,,X
59,60,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,,,X
60,61,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,,,X


* Now what we need to do 

    
    * Function goal:
    for every ec in kinetic data["ec"]
        for every ec in df
            if df list
                if ec in list 
                    append row#locationindf
            if df string
                if  ec==str
                    append row#locationindf
                    
                    
   * Kinetic DF has non consecutive and repeating indexes right now
       * RESET

In [12]:
Kinetic_df.index

Int64Index([ 28,  29,  36,  41,  42,  43,  44,  58,  59,  60,
            ...
            117, 118, 119, 120, 121, 122, 123, 124, 120, 121],
           dtype='int64', length=8762)

In [13]:
Kinetic_df.index=range(len(Kinetic_df.index))
df_EC.index=range(len(df_EC.index))

In [14]:
Kinetic_df=Kinetic_df.rename(columns={"EC Number": "EC_Number"})
Kinetic_df.columns = Kinetic_df.columns.str.strip()
df_EC.columns = df_EC.columns.str.strip()


In [15]:
### Lets try a data frame approach--> dictionary
# want loc in df_ec for every input from kinetic

def dataframe_find_loc(df1,df1_col, df2, df2_col): 
    dictionary={}
    loc_list=[]
    for row_val1 in df1[df1_col]:
        df1_row_=str(row_val1)
        lookupstring=re.compile(df1_row_)
        loc=[]
        for row_2 in range(len(df2[df2_col])):
            lookin_string=str(df2[df2_col][row_2])
            match=lookupstring.search(lookin_string)
            if match!=None and [row_2, df2_col] not in loc:
                loc.append([row_2, df2_col])
        
        dictionary[df1_row_]=loc
        #rxns.append(df1_row_)
        #loc_list.append(loc)
                
    #all_matches.append(match_rxn)
    return dictionary
            
            
            
            

In [16]:
dictionary =dataframe_find_loc(Kinetic_df,"EC_Number", df_EC, "EC_Comp")

In [17]:
# LOTS OF sanity check

print (len(dictionary.keys()))
key_list=[dictionary.keys]

405


In [18]:
print (len(dictionary.keys()))
key_list=[dictionary.keys]

405


In [19]:
len(Kinetic_df["EC_Number"].unique())

405

In [20]:
dictionary["1.1.1.1"]

[[2, 'EC_Comp'], [64, 'EC_Comp']]

In [21]:
dictionary.items()

dict_items([('1.1.1.1', [[2, 'EC_Comp'], [64, 'EC_Comp']]), ('1.1.1.2', [[2, 'EC_Comp'], [70, 'EC_Comp'], [79, 'EC_Comp'], [142, 'EC_Comp'], [180, 'EC_Comp'], [181, 'EC_Comp']]), ('1.1.1.8', [[64, 'EC_Comp'], [120, 'EC_Comp']]), ('1.1.1.10', [[2, 'EC_Comp']]), ('1.1.1.14', []), ('1.1.1.19', []), ('1.1.1.21', []), ('1.1.1.22', []), ('1.1.1.27', [[180, 'EC_Comp']]), ('1.1.1.31', [[66, 'EC_Comp']]), ('1.1.1.34', [[189, 'EC_Comp']]), ('1.1.1.35', [[63, 'EC_Comp'], [64, 'EC_Comp']]), ('1.1.1.37', [[79, 'EC_Comp'], [181, 'EC_Comp']]), ('1.1.1.38', []), ('1.1.1.39', []), ('1.1.1.40', [[80, 'EC_Comp'], [81, 'EC_Comp']]), ('1.1.1.41', [[70, 'EC_Comp']]), ('1.1.1.42', [[71, 'EC_Comp']]), ('1.1.1.44', []), ('1.1.1.45', []), ('1.1.1.49', [[133, 'EC_Comp']]), ('1.1.1.50', []), ('1.1.1.51', []), ('1.1.1.62', []), ('1.1.1.64', []), ('1.1.1.71', []), ('1.1.1.79', []), ('1.1.1.95', [[103, 'EC_Comp']]), ('nan', []), ('1.1.1.102', [[2, 'EC_Comp']]), ('1.1.1.105', []), ('1.1.1.145', []), ('1.1.1.146', [])

In [22]:
count=0
for ec, loc in dictionary.items():
    if loc==[]:
        count+=1
    print (ec, loc)

1.1.1.1 [[2, 'EC_Comp'], [64, 'EC_Comp']]
1.1.1.2 [[2, 'EC_Comp'], [70, 'EC_Comp'], [79, 'EC_Comp'], [142, 'EC_Comp'], [180, 'EC_Comp'], [181, 'EC_Comp']]
1.1.1.8 [[64, 'EC_Comp'], [120, 'EC_Comp']]
1.1.1.10 [[2, 'EC_Comp']]
1.1.1.14 []
1.1.1.19 []
1.1.1.21 []
1.1.1.22 []
1.1.1.27 [[180, 'EC_Comp']]
1.1.1.31 [[66, 'EC_Comp']]
1.1.1.34 [[189, 'EC_Comp']]
1.1.1.35 [[63, 'EC_Comp'], [64, 'EC_Comp']]
1.1.1.37 [[79, 'EC_Comp'], [181, 'EC_Comp']]
1.1.1.38 []
1.1.1.39 []
1.1.1.40 [[80, 'EC_Comp'], [81, 'EC_Comp']]
1.1.1.41 [[70, 'EC_Comp']]
1.1.1.42 [[71, 'EC_Comp']]
1.1.1.44 []
1.1.1.45 []
1.1.1.49 [[133, 'EC_Comp']]
1.1.1.50 []
1.1.1.51 []
1.1.1.62 []
1.1.1.64 []
1.1.1.71 []
1.1.1.79 []
1.1.1.95 [[103, 'EC_Comp']]
nan []
1.1.1.102 [[2, 'EC_Comp']]
1.1.1.105 []
1.1.1.145 []
1.1.1.146 []
1.1.1.149 []
1.1.1.153 []
1.1.1.184 []
1.1.1.189 []
1.1.1.197 []
1.1.1.205 [[142, 'EC_Comp']]
1.1.1.209 []
1.1.1.213 []
1.1.1.239 []
1.1.1.300 []
1.1.1.315 []
1.1.3.15 []
1.2.1.3 [[5, 'EC_Comp'], [116, 'EC_Co

In [23]:
print ("no matching reaction in model: ", count)
print ("EC to bigg reaction found :", 403-count)

no matching reaction in model:  265
EC to bigg reaction found : 138


In [24]:
# add dictionary results to kinetic df
#take a closer look at kinetic df before appending
Kinetic_df

Unnamed: 0,Entry ID,Recommended Enzyme Name,Reaction ID,Enzymatic Activity,EC_Number,Gene,Organism,Strain/Tissue,Reaction,Cofactor,...,pH,Temp (°C),Buffer/Media,Uniprot,References,Data Point,Notes,BIGG ID,BIGG NAME,KEEPing
0,29,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,7.5,25.0,,,RF4,,isozyme ADH1B2,,,X
1,30,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,7.5,25.0,,,RF4,,isozyme ADH1B1,,,X
2,37,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF7,,,,,X
3,42,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,,,X
4,43,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH2 at 21-,,,X
5,44,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,,,X
6,45,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH3 at 21-,,,X
7,59,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH3 at 21-,,,X
8,60,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,,,X
9,61,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,,,X


# CLEANUP
### A) <br>
* Check how many rows had their metabolite mapped to a bigg id
    * check null in bigg ID column --> (This was the part that was curated by hong manually) 

### B) 

* **Things to Consider**
    * IF kinetic EC is missing--> row will be dropped (for right now- TBD) 
    *
    
    
        * **Do not drop rows with empty parameter (km/kcat) column --> Mentioned earlier by hong

In [25]:
#A) 
Kinetic_df["BIGG ID"].isnull().value_counts()

False    5225
True     3537
Name: BIGG ID, dtype: int64

In [26]:
Kinetic_df.EC_Number.isnull().value_counts()

False    8033
True      729
Name: EC_Number, dtype: int64

In [27]:
a= Kinetic_df['BIGG ID'] [0]
Kinetic_df['BIGG ID']=Kinetic_df['BIGG ID'].astype(str)
Kinetic_df['EC_Number']=Kinetic_df['EC_Number'].astype(str)

In [28]:
Kinetic_df

Unnamed: 0,Entry ID,Recommended Enzyme Name,Reaction ID,Enzymatic Activity,EC_Number,Gene,Organism,Strain/Tissue,Reaction,Cofactor,...,pH,Temp (°C),Buffer/Media,Uniprot,References,Data Point,Notes,BIGG ID,BIGG NAME,KEEPing
0,29,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,7.5,25.0,,,RF4,,isozyme ADH1B2,,,X
1,30,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,7.5,25.0,,,RF4,,isozyme ADH1B1,,,X
2,37,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF7,,,,,X
3,42,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,,,X
4,43,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH2 at 21-,,,X
5,44,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,,,X
6,45,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH3 at 21-,,,X
7,59,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH3 at 21-,,,X
8,60,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,,,X
9,61,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,,,X


In [29]:
Kinetic_No_Match=Kinetic_df[(Kinetic_df['BIGG ID'] =='nan') & (Kinetic_df['EC_Number'] =="nan")]

In [30]:
Kinetic_No_Match

Unnamed: 0,Entry ID,Recommended Enzyme Name,Reaction ID,Enzymatic Activity,EC_Number,Gene,Organism,Strain/Tissue,Reaction,Cofactor,...,pH,Temp (°C),Buffer/Media,Uniprot,References,Data Point,Notes,BIGG ID,BIGG NAME,KEEPing
1183,4,D-3-phosphoglycerate dehydrogenase,3-PGDH,,,PHGDH,Homo sapiens,Human,3-phospho-D-glycerate + NAD+ = 3-phosphonooxyp...,,...,,,,O43175,RF1,,,,,X
1184,5,D-3-phosphoglycerate dehydrogenase,3-PGDH,,,PHGDH,Homo sapiens,Human,3-phospho-D-glycerate + NAD+ = 3-phosphonooxyp...,,...,,,,O43175,RF1,,,,,X
4890,214,Adenylate kinase isoenzyme 6UniRule annotation...,AK6,,,AK6,Homo sapiens,Human,ATP + AMP = 2 ADP,,...,,,,Q9Y3D8,RF11,,,,,X
4891,215,Adenylate kinase isoenzyme 6UniRule annotation...,AK6,,,AK6,Homo sapiens,Human,ATP + AMP = 2 ADP,,...,,,,Q9Y3D8,RF11,,,,,X
5841,107,"High affinity cGMP-specific 3',5'-cyclic phosp...",,,,PDE9A,Homo sapiens,Human,"Guanosine 3',5'-cyclic phosphate + H2O = guano...","Zn2+, Mg2+",...,,,,O76083,RF16,,,,,X
7660,699,Aflatoxin B1 aldehyde reductase member 2,AFB1 aldehyde reductase 1AFB1-AR 1,,,AKR7A2,Homo sapiens,Human,4-hydroxybutanoate + NADP+ = succinate semiald...,,...,,,,,O43488,RF8,,,,
7661,700,Aflatoxin B1 aldehyde reductase member 2,AFB1 aldehyde reductase 1AFB1-AR 1,,,AKR7A2,Homo sapiens,Human,4-hydroxybutanoate + NADP+ = succinate semiald...,,...,,,,,O43488,RF8,,,,
7662,701,Aflatoxin B1 aldehyde reductase member 2,AFB1 aldehyde reductase 1AFB1-AR 1,,,AKR7A2,Homo sapiens,Human,4-hydroxybutanoate + NADP+ = succinate semiald...,,...,,,,,O43488,RF8,,,,
7663,702,Aflatoxin B1 aldehyde reductase member 2,AFB1 aldehyde reductase 1AFB1-AR 1,,,AKR7A2,Homo sapiens,Human,4-hydroxybutanoate + NADP+ = succinate semiald...,,...,,,,,O43488,RF8,,,,
7668,1008,Aldo-keto reductase family 1 member B10,ARL-1,,,AKR1B10,Homo sapiens,Human,,,...,,,,,O60218,RF23,,,,


In [31]:
type(a)

float

In [32]:
Kinetic__Match=Kinetic_df[(Kinetic_df['BIGG ID'] !='nan') & (Kinetic_df['EC_Number'] !="nan")]

In [33]:
Kinetic__Match


Unnamed: 0,Entry ID,Recommended Enzyme Name,Reaction ID,Enzymatic Activity,EC_Number,Gene,Organism,Strain/Tissue,Reaction,Cofactor,...,pH,Temp (°C),Buffer/Media,Uniprot,References,Data Point,Notes,BIGG ID,BIGG NAME,KEEPing
16,74,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzyme beta1beta1,acald,acetaldehyde,
17,75,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzyme gamma2gamma2; isoenzymes beta2beta2,acald,acetaldehyde,
18,76,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzymes beta2beta2,acald,acetaldehyde,
19,77,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzyme gamma1gamma1,acald,acetaldehyde,
20,78,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,acald,acetaldehyde,
21,79,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzyme beta3beta3,acald,acetaldehyde,
22,80,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzyme alphaalpha,acald,acetaldehyde,
23,81,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,acald,acetaldehyde,
24,82,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,23.0,,"P00326, P08319",RF2,,isozyme ADH2 at 21-,acald,acetaldehyde,
34,99,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,,RF1,,isoenzyme beta1beta1,etoh,ethanol,


# Trial 1 --> See how much has mapped so far

add location of reaction info from model 

Add bigg id rxn for each row

In [34]:
Kinetic__Match.index=range(len(Kinetic__Match.index))

In [35]:
dictionary.values()

dict_values([[[2, 'EC_Comp'], [64, 'EC_Comp']], [[2, 'EC_Comp'], [70, 'EC_Comp'], [79, 'EC_Comp'], [142, 'EC_Comp'], [180, 'EC_Comp'], [181, 'EC_Comp']], [[64, 'EC_Comp'], [120, 'EC_Comp']], [[2, 'EC_Comp']], [], [], [], [], [[180, 'EC_Comp']], [[66, 'EC_Comp']], [[189, 'EC_Comp']], [[63, 'EC_Comp'], [64, 'EC_Comp']], [[79, 'EC_Comp'], [181, 'EC_Comp']], [], [], [[80, 'EC_Comp'], [81, 'EC_Comp']], [[70, 'EC_Comp']], [[71, 'EC_Comp']], [], [], [[133, 'EC_Comp']], [], [], [], [], [], [], [[103, 'EC_Comp']], [], [[2, 'EC_Comp']], [], [], [], [], [], [], [], [], [[142, 'EC_Comp']], [], [], [], [], [], [], [[5, 'EC_Comp'], [116, 'EC_Comp']], [[0, 'EC_Comp'], [18, 'EC_Comp'], [102, 'EC_Comp'], [135, 'EC_Comp']], [[98, 'EC_Comp']], [[135, 'EC_Comp']], [], [], [[93, 'EC_Comp'], [94, 'EC_Comp'], [95, 'EC_Comp']], [[5, 'EC_Comp']], [], [], [[98, 'EC_Comp']], [], [[102, 'EC_Comp']], [[0, 'EC_Comp'], [18, 'EC_Comp']], [[34, 'EC_Comp']], [], [[34, 'EC_Comp']], [], [], [], [], [], [[11, 'EC_Comp']],

In [36]:
found=0
not_found=0
for i in range(len(Kinetic__Match["EC_Number"])):
    kinetic_ec=Kinetic__Match["EC_Number"][i]
    for key, value in dictionary.items():
        if kinetic_ec == key and str(value) !="[]":
        #print (kinetic_ec)
            found+=1
    else: 
        not_found+=1
        
        
print (found, not_found)
if not_found==0:
    print ("AWESOME, all mapped kinetic data")

2643 5225


In [37]:
#Lets append the data to Kinetic DF 
#Trial 1: get dictonary to map to Kinetic DF 
#Trial 2: get loc from kinetic df to map reaction info 
DF_EC_LOC_LIST=[]
for i in range(len(Kinetic__Match["EC_Number"])):
    kinetic_ec=Kinetic__Match["EC_Number"][i]
    for key, value in dictionary.items():
        if kinetic_ec == key:
            DF_EC_LOC_LIST.append(value)
            
    

In [38]:
#Sanity check
len(DF_EC_LOC_LIST)==len(Kinetic__Match["EC_Number"])

True

In [39]:
#appending row to df
Kinetic__Match["df_EC_Loc"]=DF_EC_LOC_LIST

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [40]:
Kinetic__Match.columns

Index(['Entry ID', 'Recommended Enzyme Name', 'Reaction ID',
       'Enzymatic Activity', 'EC_Number', 'Gene', 'Organism', 'Strain/Tissue',
       'Reaction', 'Cofactor', 'Cofactor Concentration', 'Substrate',
       'Value Type', 'Value', 'Deviation', 'Units', 'Structure', 'Regulation',
       'Regulation Mechanism', 'pH', 'Temp (°C)', 'Buffer/Media', 'Uniprot',
       'References', 'Data Point', 'Notes', 'BIGG ID', 'BIGG NAME', 'KEEPing',
       'df_EC_Loc'],
      dtype='object')

In [41]:
Kinetic__Match[["EC_Number",'BIGG ID', 'BIGG NAME', 'KEEPing',
       'df_EC_Loc', "Reaction" ]]

Unnamed: 0,EC_Number,BIGG ID,BIGG NAME,KEEPing,df_EC_Loc,Reaction
0,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
1,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
2,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
3,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
4,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
5,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
6,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
7,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
8,1.1.1.1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...
9,1.1.1.1,etoh,ethanol,,"[[2, EC_Comp], [64, EC_Comp]]",a primary alcohol + NAD+ <=> an aldehyde + NAD...


In [42]:
#Trial 1 successful 
#trial 2 add df_EC info to kinetic rows 

# rename reaction in Kinetic to be specific 


In [43]:
Kinetic__Match=Kinetic__Match.rename(columns={"Reaction": "Reaction_Kinetic_Scrape"})

In [44]:
Kinetic__Match

Unnamed: 0,Entry ID,Recommended Enzyme Name,Reaction ID,Enzymatic Activity,EC_Number,Gene,Organism,Strain/Tissue,Reaction_Kinetic_Scrape,Cofactor,...,Temp (°C),Buffer/Media,Uniprot,References,Data Point,Notes,BIGG ID,BIGG NAME,KEEPing,df_EC_Loc
0,74,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzyme beta1beta1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
1,75,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzyme gamma2gamma2; isoenzymes beta2beta2,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
2,76,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzymes beta2beta2,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
3,77,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzyme gamma1gamma1,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
4,78,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,23.0,,"P00326, P08319",RF2,,isozyme ADH1C at 21-,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
5,79,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzyme beta3beta3,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
6,80,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzyme alphaalpha,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
7,81,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,23.0,,"P00326, P08319",RF2,,isozyme ADH4 at 21-,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
8,82,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,23.0,,"P00326, P08319",RF2,,isozyme ADH2 at 21-,acald,acetaldehyde,,"[[2, EC_Comp], [64, EC_Comp]]"
9,99,alcohol dehydrogenase,,,1.1.1.1,,Homo sapiens,,a primary alcohol + NAD+ <=> an aldehyde + NAD...,,...,,,,RF1,,isoenzyme beta1beta1,etoh,ethanol,,"[[2, EC_Comp], [64, EC_Comp]]"


In [45]:
#Lets look at df_EC
df_EC

Unnamed: 0,ANNOT_BIGG,ANNOT_EC,ANNOT_METANET,Annot Keys,Annot Values,Full Annotation,rxn_id,rxn_string,METANETX_EC,EC_Comp
0,2OXOADOXm,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']",MNXR94818,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '2OXOADOXm', 'META...","{'SBO': 'SBO:0000375', 'bigg.reaction': '2OXOA...",2OXOADOXm,2oxoadp_m + coa_m + nad_m --> co2_m + glutcoa_...,<td>1.2.1.52<br>1.2.4.2<br>1.8.1.4<br>2.3.1.61...,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']"
1,34HPPOR,1.13.11.27,MNXR94843,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '34HPPOR', 'META:4...","{'SBO': 'SBO:0000375', 'bigg.reaction': '34HPP...",34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,<td>1.13.11.27</td>,1.13.11.27
2,3DSPHR,1.1.1.102,MNXR94866,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '3DSPHR', 'META:3-...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3DSPH...",3DSPHR,3dsphgn_c + h_c + nadph_c --> nadp_c + sphgn_c,<td>1.1.1.102</td>,1.1.1.102
3,3HAO,1.13.11.6,MNXR94889,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HAO', '1.13.11.6...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HAO'...",3HAO,3hanthrn_c + o2_c --> cmusa_c + h_c,<td>1.13.11.6</td>,1.13.11.6
4,3HBCOAHLm,3.1.2.4,MNXR94891,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HBCOAHLm', '3.1....","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HBCO...",3HBCOAHLm,3hibutcoa_m + h2o_m --> 3hmp_m + coa_m + h_m,<td>3.1.2.4</td>,3.1.2.4
5,AASAD3m,1.2.1.31,MNXR95158,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AASAD3m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AASAD...",AASAD3m,L2aadp6sa_m + h2o_m + nad_m --> L2aadp_m + 2.0...,<td>1.2.1.31</td>,1.2.1.31
6,AATA,2.6.1.39,MNXR95160,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AATA', 'META:2-AM...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AATA'...",AATA,2oxoadp_c + glu__L_c <=> L2aadp_c + akg_c,<td>2.6.1.39</td>,2.6.1.39
7,ACACT10m,"['2.3.1.16', '2.3.1.9']",MNXR95195,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT10m', 'META:...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT10m,2maacoa_m + coa_m <=> accoa_m + ppcoa_m,<td>2.3.1.16<br>2.3.1.9</br></td>,"['2.3.1.16', '2.3.1.9']"
8,ACACT1m,2.3.1.9,MNXR95194,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT1m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT1m,2.0 accoa_m --> aacoa_m + coa_m,<td>2.3.1.9</td>,2.3.1.9
9,ACITL,"['2.3.3.8', '4.1.3.6']",MNXR95268,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACITL', 'META:ATP...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACITL...",ACITL,atp_c + cit_c + coa_c --> accoa_c + adp_c + oa...,<td>2.3.3.8<br>4.1.3.6</br></td>,"['2.3.3.8', '4.1.3.6']"


In [46]:
#check some stuff
df_EC["rxn_id"].tolist()==df_EC["ANNOT_BIGG"].tolist()



True

In [47]:
writer = pd.ExcelWriter('Kinetic__matched(beforemanualcuration.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
Kinetic__Match.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

## Curation Instructions
At this point each kinetic data point needs to be correctly identified to a BIGGID 

Curation process: 
for every row in kinetic_df 
look at Kinetic_df['df_EC_loc'] 
    * Go to that [row#, colname]
    identify the right bigg id based upon metabolite in kinetic data and reaction string
priority:
1. reaction string
1. enzyme name
1. metabolites

ADD BIGG ID of reaction that matches in df_EC

### lets look at how much curation

In [48]:
empty=0
curate=0
for i in Kinetic__Match["df_EC_Loc"]:
    if str(i) =="[]":
        empty+=1
    else:
        curate+=1
    

In [49]:
print (curate, empty)

2643 2582


In [50]:
# ADD ALL DF_EC INFO TO KINETIC DATA
# CAN BE USED TO ADD MORE DATA FROM MODEL TO DATA 
df_EC


Unnamed: 0,ANNOT_BIGG,ANNOT_EC,ANNOT_METANET,Annot Keys,Annot Values,Full Annotation,rxn_id,rxn_string,METANETX_EC,EC_Comp
0,2OXOADOXm,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']",MNXR94818,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '2OXOADOXm', 'META...","{'SBO': 'SBO:0000375', 'bigg.reaction': '2OXOA...",2OXOADOXm,2oxoadp_m + coa_m + nad_m --> co2_m + glutcoa_...,<td>1.2.1.52<br>1.2.4.2<br>1.8.1.4<br>2.3.1.61...,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']"
1,34HPPOR,1.13.11.27,MNXR94843,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '34HPPOR', 'META:4...","{'SBO': 'SBO:0000375', 'bigg.reaction': '34HPP...",34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,<td>1.13.11.27</td>,1.13.11.27
2,3DSPHR,1.1.1.102,MNXR94866,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '3DSPHR', 'META:3-...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3DSPH...",3DSPHR,3dsphgn_c + h_c + nadph_c --> nadp_c + sphgn_c,<td>1.1.1.102</td>,1.1.1.102
3,3HAO,1.13.11.6,MNXR94889,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HAO', '1.13.11.6...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HAO'...",3HAO,3hanthrn_c + o2_c --> cmusa_c + h_c,<td>1.13.11.6</td>,1.13.11.6
4,3HBCOAHLm,3.1.2.4,MNXR94891,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HBCOAHLm', '3.1....","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HBCO...",3HBCOAHLm,3hibutcoa_m + h2o_m --> 3hmp_m + coa_m + h_m,<td>3.1.2.4</td>,3.1.2.4
5,AASAD3m,1.2.1.31,MNXR95158,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AASAD3m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AASAD...",AASAD3m,L2aadp6sa_m + h2o_m + nad_m --> L2aadp_m + 2.0...,<td>1.2.1.31</td>,1.2.1.31
6,AATA,2.6.1.39,MNXR95160,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AATA', 'META:2-AM...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AATA'...",AATA,2oxoadp_c + glu__L_c <=> L2aadp_c + akg_c,<td>2.6.1.39</td>,2.6.1.39
7,ACACT10m,"['2.3.1.16', '2.3.1.9']",MNXR95195,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT10m', 'META:...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT10m,2maacoa_m + coa_m <=> accoa_m + ppcoa_m,<td>2.3.1.16<br>2.3.1.9</br></td>,"['2.3.1.16', '2.3.1.9']"
8,ACACT1m,2.3.1.9,MNXR95194,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT1m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT1m,2.0 accoa_m --> aacoa_m + coa_m,<td>2.3.1.9</td>,2.3.1.9
9,ACITL,"['2.3.3.8', '4.1.3.6']",MNXR95268,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACITL', 'META:ATP...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACITL...",ACITL,atp_c + cit_c + coa_c --> accoa_c + adp_c + oa...,<td>2.3.3.8<br>4.1.3.6</br></td>,"['2.3.3.8', '4.1.3.6']"


In [51]:
# add reaction name to df_EC
reaction_name_list=[]
metabolite_dicts=[]
reactants_list=[]
products_list=[]
for i in RECON3d.reactions:
    if i.id in df_EC["ANNOT_BIGG"].tolist():
        reaction_name_list.append(i.name)
        metabolite_dicts.append(i.metabolites)
        reactants_list.append(i.reactants)
        products_list.append(i.products)
    

In [52]:
df_EC["model_rxn_name"]=reaction_name_list
df_EC["metabolite_dict"]=metabolite_dicts
df_EC["reactants"]=reactants_list
df_EC["products"]=products_list

In [53]:
df_EC

Unnamed: 0,ANNOT_BIGG,ANNOT_EC,ANNOT_METANET,Annot Keys,Annot Values,Full Annotation,rxn_id,rxn_string,METANETX_EC,EC_Comp,model_rxn_name,metabolite_dict,reactants,products
0,2OXOADOXm,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']",MNXR94818,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '2OXOADOXm', 'META...","{'SBO': 'SBO:0000375', 'bigg.reaction': '2OXOA...",2OXOADOXm,2oxoadp_m + coa_m + nad_m --> co2_m + glutcoa_...,<td>1.2.1.52<br>1.2.4.2<br>1.8.1.4<br>2.3.1.61...,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']",2-Oxoadipate:lipoamde 2-oxidoreductase(decarbo...,"{2oxoadp_m: -1.0, coa_m: -1.0, nad_m: -1.0, co...","[2oxoadp_m, coa_m, nad_m]","[co2_m, glutcoa_m, nadh_m]"
1,34HPPOR,1.13.11.27,MNXR94843,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '34HPPOR', 'META:4...","{'SBO': 'SBO:0000375', 'bigg.reaction': '34HPP...",34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,<td>1.13.11.27</td>,1.13.11.27,4 Hydroxyphenylpyruvateoxygen oxidoreductase,"{34hpp_c: -1.0, o2_c: -1.0, co2_c: 1.0, hgenti...","[34hpp_c, o2_c]","[co2_c, hgentis_c]"
2,3DSPHR,1.1.1.102,MNXR94866,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '3DSPHR', 'META:3-...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3DSPH...",3DSPHR,3dsphgn_c + h_c + nadph_c --> nadp_c + sphgn_c,<td>1.1.1.102</td>,1.1.1.102,3 Dehydrosphinganine reductase,"{3dsphgn_c: -1.0, h_c: -1.0, nadph_c: -1.0, na...","[3dsphgn_c, h_c, nadph_c]","[nadp_c, sphgn_c]"
3,3HAO,1.13.11.6,MNXR94889,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HAO', '1.13.11.6...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HAO'...",3HAO,3hanthrn_c + o2_c --> cmusa_c + h_c,<td>1.13.11.6</td>,1.13.11.6,3 hydroxyanthranilate 3 4 dioxygenase,"{3hanthrn_c: -1.0, o2_c: -1.0, cmusa_c: 1.0, h...","[3hanthrn_c, o2_c]","[cmusa_c, h_c]"
4,3HBCOAHLm,3.1.2.4,MNXR94891,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HBCOAHLm', '3.1....","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HBCO...",3HBCOAHLm,3hibutcoa_m + h2o_m --> 3hmp_m + coa_m + h_m,<td>3.1.2.4</td>,3.1.2.4,"3-hydroxyisobutyryl-CoA hydrolase, mitochondrial","{3hibutcoa_m: -1.0, h2o_m: -1.0, 3hmp_m: 1.0, ...","[3hibutcoa_m, h2o_m]","[3hmp_m, coa_m, h_m]"
5,AASAD3m,1.2.1.31,MNXR95158,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AASAD3m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AASAD...",AASAD3m,L2aadp6sa_m + h2o_m + nad_m --> L2aadp_m + 2.0...,<td>1.2.1.31</td>,1.2.1.31,L-aminoadipate-semialdehyde dehydrogenase (NAD...,"{L2aadp6sa_m: -1.0, h2o_m: -1.0, nad_m: -1.0, ...","[L2aadp6sa_m, h2o_m, nad_m]","[L2aadp_m, h_m, nadh_m]"
6,AATA,2.6.1.39,MNXR95160,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AATA', 'META:2-AM...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AATA'...",AATA,2oxoadp_c + glu__L_c <=> L2aadp_c + akg_c,<td>2.6.1.39</td>,2.6.1.39,2 aminoadipate transaminase,"{2oxoadp_c: -1.0, glu__L_c: -1.0, L2aadp_c: 1....","[2oxoadp_c, glu__L_c]","[L2aadp_c, akg_c]"
7,ACACT10m,"['2.3.1.16', '2.3.1.9']",MNXR95195,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT10m', 'META:...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT10m,2maacoa_m + coa_m <=> accoa_m + ppcoa_m,<td>2.3.1.16<br>2.3.1.9</br></td>,"['2.3.1.16', '2.3.1.9']",Acetyl-CoA C-acyltransferase,"{2maacoa_m: -1.0, coa_m: -1.0, accoa_m: 1.0, p...","[2maacoa_m, coa_m]","[accoa_m, ppcoa_m]"
8,ACACT1m,2.3.1.9,MNXR95194,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT1m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT...",ACACT1m,2.0 accoa_m --> aacoa_m + coa_m,<td>2.3.1.9</td>,2.3.1.9,Acetyl CoA C acetyltransferase mitochondrial,"{accoa_m: -2.0, aacoa_m: 1.0, coa_m: 1.0}",[accoa_m],"[aacoa_m, coa_m]"
9,ACITL,"['2.3.3.8', '4.1.3.6']",MNXR95268,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACITL', 'META:ATP...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACITL...",ACITL,atp_c + cit_c + coa_c --> accoa_c + adp_c + oa...,<td>2.3.3.8<br>4.1.3.6</br></td>,"['2.3.3.8', '4.1.3.6']",ATP-Citrate lyase,"{atp_c: -1.0, cit_c: -1.0, coa_c: -1.0, accoa_...","[atp_c, cit_c, coa_c]","[accoa_c, adp_c, oaa_c, pi_c]"


In [57]:
#reorder columns for optimal curation
print (df_EC.columns)
column_order=['ANNOT_BIGG', 'ANNOT_EC','EC_Comp' ,'model_rxn_name', 'rxn_id', 'rxn_string','metabolite_dict',
              'reactants', 'products','ANNOT_METANET', 'Annot Keys', 'Annot Values',
              'Full Annotation']
df_EC=df_EC[column_order]

Index(['ANNOT_BIGG', 'ANNOT_EC', 'ANNOT_METANET', 'Annot Keys', 'Annot Values',
       'Full Annotation', 'rxn_id', 'rxn_string', 'METANETX_EC', 'EC_Comp',
       'model_rxn_name', 'metabolite_dict', 'reactants', 'products'],
      dtype='object')


In [58]:
df_EC

Unnamed: 0,ANNOT_BIGG,ANNOT_EC,EC_Comp,model_rxn_name,rxn_id,rxn_string,metabolite_dict,reactants,products,ANNOT_METANET,Annot Keys,Annot Values,Full Annotation
0,2OXOADOXm,"['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']","['1.2.1.52', '1.2.4.2', '1.8.1.4', '2.3.1.61']",2-Oxoadipate:lipoamde 2-oxidoreductase(decarbo...,2OXOADOXm,2oxoadp_m + coa_m + nad_m --> co2_m + glutcoa_...,"{2oxoadp_m: -1.0, coa_m: -1.0, nad_m: -1.0, co...","[2oxoadp_m, coa_m, nad_m]","[co2_m, glutcoa_m, nadh_m]",MNXR94818,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '2OXOADOXm', 'META...","{'SBO': 'SBO:0000375', 'bigg.reaction': '2OXOA..."
1,34HPPOR,1.13.11.27,1.13.11.27,4 Hydroxyphenylpyruvateoxygen oxidoreductase,34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,"{34hpp_c: -1.0, o2_c: -1.0, co2_c: 1.0, hgenti...","[34hpp_c, o2_c]","[co2_c, hgentis_c]",MNXR94843,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '34HPPOR', 'META:4...","{'SBO': 'SBO:0000375', 'bigg.reaction': '34HPP..."
2,3DSPHR,1.1.1.102,1.1.1.102,3 Dehydrosphinganine reductase,3DSPHR,3dsphgn_c + h_c + nadph_c --> nadp_c + sphgn_c,"{3dsphgn_c: -1.0, h_c: -1.0, nadph_c: -1.0, na...","[3dsphgn_c, h_c, nadph_c]","[nadp_c, sphgn_c]",MNXR94866,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', '3DSPHR', 'META:3-...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3DSPH..."
3,3HAO,1.13.11.6,1.13.11.6,3 hydroxyanthranilate 3 4 dioxygenase,3HAO,3hanthrn_c + o2_c --> cmusa_c + h_c,"{3hanthrn_c: -1.0, o2_c: -1.0, cmusa_c: 1.0, h...","[3hanthrn_c, o2_c]","[cmusa_c, h_c]",MNXR94889,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HAO', '1.13.11.6...","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HAO'..."
4,3HBCOAHLm,3.1.2.4,3.1.2.4,"3-hydroxyisobutyryl-CoA hydrolase, mitochondrial",3HBCOAHLm,3hibutcoa_m + h2o_m --> 3hmp_m + coa_m + h_m,"{3hibutcoa_m: -1.0, h2o_m: -1.0, 3hmp_m: 1.0, ...","[3hibutcoa_m, h2o_m]","[3hmp_m, coa_m, h_m]",MNXR94891,"dict_keys(['SBO', 'bigg.reaction', 'ec-code', ...","dict_values(['SBO:0000375', '3HBCOAHLm', '3.1....","{'SBO': 'SBO:0000375', 'bigg.reaction': '3HBCO..."
5,AASAD3m,1.2.1.31,1.2.1.31,L-aminoadipate-semialdehyde dehydrogenase (NAD...,AASAD3m,L2aadp6sa_m + h2o_m + nad_m --> L2aadp_m + 2.0...,"{L2aadp6sa_m: -1.0, h2o_m: -1.0, nad_m: -1.0, ...","[L2aadp6sa_m, h2o_m, nad_m]","[L2aadp_m, h_m, nadh_m]",MNXR95158,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AASAD3m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AASAD..."
6,AATA,2.6.1.39,2.6.1.39,2 aminoadipate transaminase,AATA,2oxoadp_c + glu__L_c <=> L2aadp_c + akg_c,"{2oxoadp_c: -1.0, glu__L_c: -1.0, L2aadp_c: 1....","[2oxoadp_c, glu__L_c]","[L2aadp_c, akg_c]",MNXR95160,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'AATA', 'META:2-AM...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'AATA'..."
7,ACACT10m,"['2.3.1.16', '2.3.1.9']","['2.3.1.16', '2.3.1.9']",Acetyl-CoA C-acyltransferase,ACACT10m,2maacoa_m + coa_m <=> accoa_m + ppcoa_m,"{2maacoa_m: -1.0, coa_m: -1.0, accoa_m: 1.0, p...","[2maacoa_m, coa_m]","[accoa_m, ppcoa_m]",MNXR95195,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT10m', 'META:...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT..."
8,ACACT1m,2.3.1.9,2.3.1.9,Acetyl CoA C acetyltransferase mitochondrial,ACACT1m,2.0 accoa_m --> aacoa_m + coa_m,"{accoa_m: -2.0, aacoa_m: 1.0, coa_m: 1.0}",[accoa_m],"[aacoa_m, coa_m]",MNXR95194,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACACT1m', 'META:A...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACACT..."
9,ACITL,"['2.3.3.8', '4.1.3.6']","['2.3.3.8', '4.1.3.6']",ATP-Citrate lyase,ACITL,atp_c + cit_c + coa_c --> accoa_c + adp_c + oa...,"{atp_c: -1.0, cit_c: -1.0, coa_c: -1.0, accoa_...","[atp_c, cit_c, coa_c]","[accoa_c, adp_c, oaa_c, pi_c]",MNXR95268,"dict_keys(['SBO', 'bigg.reaction', 'biocyc', '...","dict_values(['SBO:0000375', 'ACITL', 'META:ATP...","{'SBO': 'SBO:0000375', 'bigg.reaction': 'ACITL..."


In [59]:
# UPDATE DF_EC so we dont need to run the scrape everytime
writer = pd.ExcelWriter('df_ec.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
df_EC.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

# Overview: 

## EC found in kinetic_df were found in df_EC where the location 

In [None]:
#str.contains too strict of a criteria, not matching to lists

df1=df_EC[df_EC["EC_Comp"].str.contains("2.6.1.5", na=False)]

In [None]:
output=Kinetic_df.EC_Number.apply(match_col)



In [None]:
for index_kin in range(len(Kinetic_df)):
    ec_kin=Kinetic_df["EC Number"][index_kin]
    for index_ec in range(len(df_EC)):
        ec_df=df_EC["EC_Comp"][index_ec]


In [None]:
len(Kinetic_df)

In [None]:
len(Kinetic_df["EC Number"])

In [None]:
Kinetic_df_dropped["EC Number"]=Kinetic_df_dropped["EC Number"].apply(_strip_lower)


In [None]:

c1=0
c2=0
n_matches=[]
match_list=[]
RXN_EC=[]
ec_kinetic_list=[]

n_list=[]

for ec_kinetic in Kinetic_df_dropped["EC Number"]:
    ec_kinetic_list.append(ec_kinetic)
    n=0
    
    #print (ec_kinetic)
    for i in range(len(df_EC["ANNOT_EC"])):
        
        
        if type(df_EC["ANNOT_EC"][i])==list:
            for j in df_EC["ANNOT_EC"][i]:
                if ec_kinetic==j: 
                    n_matches.append(j)
                    c1+=1
                    n+=1
                    
        if type(df_EC["ANNOT_EC"][i])==str:
            if ec_kinetic==df_EC["ANNOT_EC"][i]:
                n_matches.append(df_EC["ANNOT_EC"][i])
                c2+=1
                n+=1
    match_list.append(n_matches)
    print (n)
                

print ("Match found in list", c1)
print ("Match found not list", c2)
print (c1+c2, len(Kinetic_df_dropped))

In [None]:
print (len(ec_kinetic_list))
print (len(match_list))

In [None]:
df_NEW=pd.DataFrame({"EC_Kinetic": ec_kinetic_list, "EC_scrape": match_list})
df_NEW

In [None]:
EC_scrape=[]
EC_model=[]
RXN_ID=[]
RXN_STRING=[]
ANNOT_BIGG=[]

c=0
for EC_kinetic in Kinetic_df_dropped["EC Number"]: 
    for i in range(len(df_EC["ANNOT_EC"])):
        Annot_ec=df_EC["ANNOT_EC"][i]
        if type(Annot_ec)==list:
            if EC_kinetic in Annot_ec: 
                c+=1
                EC_scrape.append(EC_kinetic)
                EC_model.append(Annot_ec)
                RXN_ID.append(df_EC["rxn_id"][i])
                RXN_STRING.append(df_EC["rxn_string"][i])
                ANNOT_BIGG.append(df_EC["ANNOT_BIGG"][i])
        elif type(Annot_ec)==str: 
            if str(EC_kinetic) ==str(Annot_ec):
                c+=1
                EC_scrape.append(EC_kinetic)
                EC_model.append(Annot_ec)
                RXN_ID.append(df_EC["rxn_id"][i])
                RXN_STRING.append(df_EC["rxn_string"][i])
                ANNOT_BIGG.append(df_EC["ANNOT_BIGG"][i])
                
            
print (c)

        

In [None]:
df=pd.DataFrame({"Scrape_EC": EC_scrape, "Model_ECs": EC_model, "Rxn_id": RXN_ID,
                 "Rxn_string": RXN_STRING , "Bigg_Annot": ANNOT_BIGG})

In [None]:
df_NEW=pd.DataFrame({"EC_Kinetic": EC_kinetic, "EC_scrape": EC_model, "RXN_ID": RXN_ID,
                     "RXN_string": RXN_STRING, "ANNOT_BIGG": ANNOT_BIGG})
df_NEW

In [None]:
writer = pd.ExcelWriter('KINETIC_DATASET_FINAL(EC_matched)', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
Kinetic_df_dropped.to_excel(writer, sheet_name='Sheet1')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

# Appendix

In [None]:
# OLD FUNCTION


def match_col(df_EC,):
    output_EC_all_matches=[]
    for i in range(len(df_EC)):
        match_list=[]
        if type(df_EC["ANNOT_EC"][i])==list:
            for j in range(len(df_EC["ANNOT_EC"][i])):
                if ec_kinetic== df_EC["ANNOT_EC"][i][j]:
                    print ("LIST: ", ec_kinetic,  df_EC["ANNOT_EC"][i][j])
                    match_list.append([i,j])
            
        if type(df_EC["ANNOT_EC"][i])==str:
            if ec_kinetic==df_EC["ANNOT_EC"][i]:
                print ("STR: ", ec_kinetic,  df_EC["ANNOT_EC"][i])
                match_list.append([i])
    output_EC_all_matches.append(match_list)
            
    return output_EC_all_matches
###    

In [None]:
#str.contains too strict of a criteria, not matching to lists

df1=df_EC[df_EC["EC_Comp"].str.contains("2.6.1.5", na=False)]

In [None]:
# OLD FUNCTION


def match_col(df_EC,):
    output_EC_all_matches=[]
    for i in range(len(df_EC)):
        match_list=[]
        if type(df_EC["ANNOT_EC"][i])==list:
            for j in range(len(df_EC["ANNOT_EC"][i])):
                if ec_kinetic== df_EC["ANNOT_EC"][i][j]:
                    print ("LIST: ", ec_kinetic,  df_EC["ANNOT_EC"][i][j])
                    match_list.append([i,j])
            
        if type(df_EC["ANNOT_EC"][i])==str:
            if ec_kinetic==df_EC["ANNOT_EC"][i]:
                print ("STR: ", ec_kinetic,  df_EC["ANNOT_EC"][i])
                match_list.append([i])
    output_EC_all_matches.append(match_list)
            
    return output_EC_all_matches
###    