In [1]:
import pandas as pd
import requests

In [10]:
# define function to get kegg data linking EC to reactions
def get_ec_rn_map(url='http://rest.kegg.jp/link/rn/ec', timeout=60):
    """Download the KEGG table linking EC numbers to reactions.

    The response is parsed as plain text with one tab-separated pair per
    line, each field carrying a ``<database>:`` prefix that is stripped
    (presumably of the form ``ec:1.1.1.1<TAB>rn:R00623`` -- confirm against
    the KEGG REST documentation).

    Parameters
    ----------
    url : str, optional
        Endpoint to query. Defaults to the EC -> reaction link table.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per link, with string columns ``ec`` and ``rn`` holding the
        identifiers without their database prefix.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of trying to parse an HTML error page as a table.
    r.raise_for_status()

    # Skip blank lines explicitly rather than dropping the last row blindly:
    # the final record is kept whether or not the body ends with a newline.
    pairs = [line.split('\t') for line in r.text.splitlines() if line.strip()]
    g = pd.DataFrame(pairs, columns=['ec', 'rn'])

    # Strip the database prefix (the part before the ':') from both columns.
    g['ec'] = g['ec'].apply(lambda x: x.split(':')[1])
    g['rn'] = g['rn'].apply(lambda x: x.split(':')[1])
    return g

# Read in ecod2ec (Liam's parsed file): two space-separated columns, a rule
# label and a '|'-joined list of EC numbers.
# (An earlier version of this input was named 'ecod2ec.txt'.)
df = pd.read_csv('ecod2ec_3Feb2021.txt', sep=' ', header=None)
df.columns = ['rule', 'ec']

# Collapse the data to skinny (long) format: one row per (rule, EC number).
# explode() does this in one pass instead of building a small DataFrame per
# input row and concatenating them all.
df = (
    df.assign(ec=df['ec'].str.split('|'))
    .explode('ec')
    .reset_index(drop=True)
)

# Download the KEGG EC -> reaction mapping.
ecrn = get_ec_rn_map()

# Subset the data: an EC number is only a valid rule when it is defined down
# to its third level, i.e. its second-to-last field is not the '-'
# placeholder. Rows whose rule is 'PDBChainNotFound' (PDB chain not observed)
# are removed as well.
third_level_missing = df['ec'].map(lambda ec: ec.split('.')[-2] == '-')
chain_not_found = df['rule'].isin(['PDBChainNotFound'])

# Define the three-level EC prefix used as the join key in both tables.
# assign() returns new frames, so no column is ever written onto a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning).
dff = df[~(third_level_missing | chain_not_found)].assign(
    ec_3l=lambda d: d['ec'].str.split('.').str[:3].str.join('.')
)
ecrn = ecrn.assign(ec_3l=ecrn['ec'].str.split('.').str[:3].str.join('.'))


In [11]:
# Link every rule to its KEGG reactions through the shared three-level EC
# prefix, keeping each (rule, reaction) pair only once.
rules = (
    dff.set_index('ec_3l')[['rule']]
    .join(ecrn.set_index('ec_3l')[['rn']])  # index-on-index join over ec_3l
    .dropna()  # drop rows with a missing value, e.g. prefixes without a reaction
    .drop_duplicates()  # compares the rule/rn columns only, not the index
)

In [13]:
# Save the rule -> reaction table; the index is written as the first column.
rules.to_csv('ecod2rn.ec3.07Feb2021.csv', index=True)