# genomic traits analysis for the strains used in the experiment

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
sns.set(style="white", context='poster')

In [2]:
import os
dpath = 'trait_gene_annotations'
os.listdir(dpath)

['10cc_experimental_traits.xlsx',
 '10cc_signalP.csv',
 '42003_2022_3184_MOESM4_ESM_select_genomes.xlsx',
 'brenda_2024_1.json.tar.gz',
 'brenda_results_extracellluar enzymes.xlsx',
 'genetic_traits_table.tsv',
 'genetic_traits_table_filtered.tsv',
 'genomes_annotation_long_format.7z',
 'kegg_pathways.csv',
 'MASTER_table.tsv',
 'N related  enzymes.xlsx',
 'ROS related  enzymes.xlsx',
 'signalp',
 'strain_list.csv']

In [3]:
strains = pd.read_csv(os.path.join(dpath, 'strain_list.csv'))

In [4]:
strains

Unnamed: 0,Species,GTDB taxonomy,NCBI taxonID,Accession number,Filename,Filename_Elena
0,Ruegeria pomeroyi DSS-3,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,246200.0,GCA_000011965.2,GCA_000011965,246200.7.fna
1,Pseudoalteromonas haloplanktis TAC125,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,326442.0,GCA_000026085.1,GCA_000026085,326442.8.fna
2,Alteromonas macleodii HOT1A3,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,529120.0,2687454166,2687453488,28108.53.fna
3,Marinobacter adhaerens HP15,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,225937.0,650377991,650377991,225937.3.fna
4,Phaeobacter gallaeciensis DSM 26640,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,1423144.0,GCA_000511385.1,GCA_000511385,1423144.3.fna
5,Sulfitobacter pseudonitzschiae strain SMR1,d__Bacteria;p__Proteobacteria;c__Alphaproteoba...,1402135.0,CeMEB_private,Sulfitobacter_pseudonitzschiae_SMR1,1402135.12.fna
6,Roseovrius HOT5_C3,d__Bacteria;p__Pseudomonadota;c__Alphaproteoba...,,,C03,C03.fna
7,Marinovum HOT5_F3,d__Bacteria;p__Pseudomonadota;c__Alphaproteoba...,,,F03,F03.fna


In [5]:
strains['filename'] = strains.Filename_Elena.str.replace('.fna', '', regex=False)

In [6]:
strain_files = strains.Filename

In [7]:
traits_df = pd.read_csv(os.path.join(dpath, 'MASTER_table.tsv'), sep='\t', dtype={'filename': str})

In [8]:
traits_df

Unnamed: 0,filename,locus_tag,type,start,end,strand,gene,product,db_xref,eC_number,KO,KEGG_KM,KEGG_manual,antiSMASH,BioV_transp,blast_phytohormones,blast_vibrioferrin,blast_DMSP,dbCAN_CAZy
0,1004786.3,GGNLDHOK_00001,CDS,696,2360,+,dnaA_1,Chromosomal replication initiator protein DnaA,,,K02313,,,,,,,,
1,1004786.3,GGNLDHOK_00002,CDS,2393,3493,+,dnaN,Beta sliding clamp,COG:COG0592,,K02338,,,,,,,,
2,1004786.3,GGNLDHOK_00003,CDS,3569,4693,+,recF,DNA replication and repair protein RecF,COG:COG1195,,K03629,,,,,,,,
3,1004786.3,GGNLDHOK_00004,CDS,4702,7122,+,gyrB,DNA gyrase subunit B,COG:COG0187,5.6.2.2,K02470,,,,,,,,
4,1004786.3,GGNLDHOK_00005,CDS,7333,7776,-,,hypothetical protein,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59513,GCF_000012345.1_ASM1234v1_genomic,BLEGILBE_01380,CDS,1302965,1304932,-,mrdA,"Peptidoglycan D,D-transpeptidase MrdA",COG:COG0768,3.4.16.4,K05515,,,,,,,,
59514,GCF_000012345.1_ASM1234v1_genomic,BLEGILBE_01381,CDS,1304922,1305443,-,,hypothetical protein,,,K03571,,,,,,,,
59515,GCF_000012345.1_ASM1234v1_genomic,BLEGILBE_01382,CDS,1305440,1306291,-,,hypothetical protein,,,K03570,,,,,,,,
59516,GCF_000012345.1_ASM1234v1_genomic,BLEGILBE_01383,CDS,1306297,1307334,-,mreB,Cell shape-determining protein MreB,,,K03569,,,,,,,,


In [9]:
traits_df.loc[traits_df.filename.isin(['GCA_000026085'])].filename.unique()

array([], dtype=object)

In [10]:
traits_df.loc[~traits_df.filename.isin(strains.filename)].filename.unique()

array(['1004786.3', '1300254.3', '1347342.6', '314275.5', '717774.3',
       'GCF_000012345.1_ASM1234v1_genomic'], dtype=object)

In [11]:
strains.loc[~strains.filename.isin(traits_df.filename)]

Unnamed: 0,Species,GTDB taxonomy,NCBI taxonID,Accession number,Filename,Filename_Elena,filename


In [12]:
df = traits_df.loc[traits_df.filename.isin(strains.filename)]

In [13]:
df = pd.merge(df, strains[['filename', 'Species']], on='filename')

In [14]:
for i in df.columns:
    print(i)
    print(df[i].unique())

filename
['1402135.12' '1423144.3' '225937.3' '246200.7' '28108.53' '326442.8'
 'C03' 'F03']
locus_tag
['ALGBIFEJ_00001' 'ALGBIFEJ_00002' 'ALGBIFEJ_00003' ... 'FLOFJFJI_05679'
 'FLOFJFJI_05680' 'FLOFJFJI_05681']
type
['CDS' 'tRNA' 'tmRNA' 'rRNA']
start
[    7   405   803 ... 97416 97879 98387]
end
[  408   806  1381 ... 97826 98265 98743]
strand
['+' '-']
gene
[nan 'fadE' 'fadJ_1' ... 'xynR_4' 'dntB' 'mltF_8']
product
['hypothetical protein' 'Acyl-coenzyme A dehydrogenase'
 'Acetyl-CoA acetyltransferase' ... 'putative oxidoreductase/MSMEI_2347'
 'Atrazine chlorohydrolase' '4-methyl-5-nitrocatechol 5-monooxygenase']
db_xref
[nan 'COG:COG1960' 'COG:COG0183' ... 'COG:COG2706' 'COG:COG4957'
 'COG:COG2376']
eC_number
[nan '1.3.8.7' '2.3.1.9' ... '4.1.2.29' '3.8.1.8' '1.14.13.210']
KO
[nan 'K06445' 'K00626' ... 'K06221' 'K07255' 'K07256']
KEGG_KM
[nan 'M00087_beta-Oxidation'
 'M00957_Lysine_degradation,_bacteria,_L-lysine_=>_glutarate_=>_succinate/acetyl-CoA'
 'M00373_Ethylmalonyl_pathway'
 

In [15]:
df.groupby('filename').size()

filename
1402135.12    5190
1423144.3     4566
225937.3      4541
246200.7      4575
28108.53      4366
326442.8      3647
C03           4423
F03           5810
dtype: int64

In [16]:
import tarfile
import json

tar_file_name = "brenda_2024_1.json.tar.gz"
data_file_name = "brenda_2024_1.json"
with tarfile.open(os.path.join(dpath,tar_file_name), "r:gz") as tar:
    with tar.extractfile(data_file_name) as f:
        j = json.loads(f.read())


In [17]:
j['data']['1.1.1.350'].keys()

dict_keys(['id', 'protein', 'recommended_name', 'systematic_name', 'synonyms', 'reaction', 'natural_substrates_products', 'substrates_products', 'turnover_number', 'km_value', 'cofactor', 'subunits', 'protein_variants', 'cloned', 'crystallization', 'purification', 'reference', 'kcat_km_value'])

In [18]:
cols=['id', 'protein', 'recommended_name', 'systematic_name', 'synonyms']
for i in cols:
    print(i)
    print(j['data']['1.1.1.350'][i])


id
1.1.1.350
protein
{'1': {'id': '1', 'organism': 'Escherichia coli', 'references': ['1'], 'comment': ''}, '2': {'id': '2', 'organism': 'Escherichia coli', 'source': 'uniprot', 'accessions': ['P77555'], 'references': ['2'], 'comment': ''}}
recommended_name
ureidoglycolate dehydrogenase (NAD+)
systematic_name
(S)-ureidoglycolate:NAD+ oxidoreductase
synonyms
[{'value': 'ureidoglycolate dehydrogenase', 'proteins': ['1'], 'references': ['1'], 'comment': ''}, {'proteins': ['1'], 'references': ['1'], 'comment': 'gene name', 'value': 'AllD'}, {'value': 'AllD', 'proteins': ['2'], 'references': ['2'], 'comment': ''}, {'value': '(S)-ureidoglycolate dehydrogenase', 'proteins': ['2'], 'references': ['2'], 'comment': ''}]


In [19]:
def get_synonyms(x):
    if 'synonyms' in x:
        names =  [x.get('recommended_name', np.NaN), x.get('systematic_name', np.NaN),]
        
        return ';'.join({i['value'] for i in x['synonyms'] if i['value'] not in names})
    return np.NaN
get_synonyms (j['data']['1.1.1.350'])



'(S)-ureidoglycolate dehydrogenase;AllD;ureidoglycolate dehydrogenase'

In [20]:
brenda_db = pd.DataFrame([{
    'eC_number': i.get('id', np.NaN),
    'recommended_name' : i.get('recommended_name', np.NaN),
    'systematic_name' : i.get('systematic_name', np.NaN),
    'synonyms' : get_synonyms(i), 
} for i in j['data'].values()])

In [21]:
brenda_db

Unnamed: 0,eC_number,recommended_name,systematic_name,synonyms
0,spontaneous,spontaneous reaction,spontaneous non-enzymatic reaction,
1,1.1.1.1,alcohol dehydrogenase,alcohol:NAD+ oxidoreductase,ADH3;glutathione-dependent formaldehyde dehydr...
2,1.1.1.10,L-xylulose reductase,xylitol:NADP+ 4-oxidoreductase (L-xylulose-for...,dicarbonyl/L-xylulose reductase;XR;xylitol deh...
3,1.1.1.100,3-oxoacyl-[acyl-carrier-protein] reductase,(3R)-3-hydroxyacyl-[acyl-carrier protein]:NADP...,OAR1;XCC0416;3-oxoacyl-[ACP]reductase;MSMEG_67...
4,1.1.1.101,acylglycerone-phosphate reductase,1-palmitoylglycerol-3-phosphate:NADP+ oxidored...,"reductase, palmitoyl dihydroxyacetone phosphat..."
...,...,...,...,...
7870,7.6.2.5,ABC-type heme transporter,"ATP phosphohydrolase (ABC-type, heme-exporting)",heme-transporting ATPase;HmuU;HmuTUV;EC 3.6.3....
7871,7.6.2.6,ABC-type guanine transporter,"ATP phosphohydrolase (ABC-type, guanine-import...",guanine-transporting ATPase;EC 3.6.3.37
7872,7.6.2.7,ABC-type taurine transporter,"ATP phosphohydrolase (ABC-type, taurine-import...",taurine-transporting ATPase;EC 3.6.3.36
7873,7.6.2.8,ABC-type vitamin B12 transporter,"ATP phosphohydrolase (ABC-type, vitamin B12-im...",cobalamin-specific ECF-type ABC transporter;Bt...


In [22]:
df = pd.merge(df, brenda_db, on='eC_number', how='left')

In [23]:
df

Unnamed: 0,filename,locus_tag,type,start,end,strand,gene,product,db_xref,eC_number,...,antiSMASH,BioV_transp,blast_phytohormones,blast_vibrioferrin,blast_DMSP,dbCAN_CAZy,Species,recommended_name,systematic_name,synonyms
0,1402135.12,ALGBIFEJ_00001,CDS,7,408,+,,hypothetical protein,,,...,,,,,,,Sulfitobacter pseudonitzschiae strain SMR1,,,
1,1402135.12,ALGBIFEJ_00002,CDS,405,806,+,,hypothetical protein,,,...,,,,,,,Sulfitobacter pseudonitzschiae strain SMR1,,,
2,1402135.12,ALGBIFEJ_00003,CDS,803,1381,+,,hypothetical protein,,,...,,,,,,,Sulfitobacter pseudonitzschiae strain SMR1,,,
3,1402135.12,ALGBIFEJ_00004,CDS,1378,2424,+,,hypothetical protein,,,...,,,,,,,Sulfitobacter pseudonitzschiae strain SMR1,,,
4,1402135.12,ALGBIFEJ_00005,CDS,2501,4750,+,fadE,Acyl-coenzyme A dehydrogenase,COG:COG1960,1.3.8.7,...,,,,,,,Sulfitobacter pseudonitzschiae strain SMR1,medium-chain acyl-CoA dehydrogenase,medium-chain acyl-CoA:electron-transfer flavop...,LCAD;EC 1.3.2.2;acyl dehydrogenase;ACAD-9;EC 1...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37113,F03,FLOFJFJI_05677,CDS,96160,96888,-,,hypothetical protein,,,...,,,,,,,Marinovum HOT5_F3,,,
37114,F03,FLOFJFJI_05678,CDS,97013,97315,-,,hypothetical protein,,,...,,,,,,,Marinovum HOT5_F3,,,
37115,F03,FLOFJFJI_05679,CDS,97416,97826,-,,hypothetical protein,,,...,,,,,,,Marinovum HOT5_F3,,,
37116,F03,FLOFJFJI_05680,CDS,97879,98265,-,,hypothetical protein,,,...,,,,,,,Marinovum HOT5_F3,,,


In [29]:
df.loc[~df.eC_number.isna() & df.recommended_name.isna(), ['eC_number', 'KO']].value_counts(dropna=False)

eC_number  KO    
1.-.-.-    NaN       84
2.7.-.-    NaN       51
3.4.-.-    NaN       39
3.5.1.-    NaN       38
1.3.99.-   NaN       38
                     ..
3.4.13.-   K01270     1
1.18.1.-   K26139     1
3.4.14.-   K01354     1
3.4.21.-   K03503     1
1.-.-.-    K00004     1
Name: count, Length: 474, dtype: int64

In [24]:
df.isna().sum()

filename                   0
locus_tag                  0
type                       0
start                      0
end                        0
strand                     0
gene                   18216
product                    0
db_xref                23277
eC_number              24839
KO                     16720
KEGG_KM                33069
KEGG_manual            37102
antiSMASH              35867
BioV_transp            34195
blast_phytohormones    35913
blast_vibrioferrin     36915
blast_DMSP             37003
dbCAN_CAZy             37063
Species                    0
recommended_name       26976
systematic_name        27540
synonyms               27211
dtype: int64

In [30]:
kegg_df = pd.read_csv(os.path.join(dpath,'kegg_pathways.csv'))
kegg_df

Unnamed: 0.1,Unnamed: 0,main,sub,path,ecpath,path_id
0,0,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K00844 HK; hexokinase [EC:2.7.1.1],K00844
1,1,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K12407 GCK; glucokinase [EC:2.7.1.2],K12407
2,2,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K00845 glk; glucokinase [EC:2.7.1.2],K00845
3,3,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],K25026 glk; glucokinase [EC:2.7.1.2],K25026
4,4,09100 Metabolism,09101 Carbohydrate metabolism,00010 Glycolysis / Gluconeogenesis [PATH:ko00010],"K01810 GPI, pgi; glucose-6-phosphate isomeras...",K01810
...,...,...,...,...,...,...
58187,58188,09190 Not Included in Pathway or Brite,09194 Poorly characterized,99997 Function unknown,K07220 K07220; uncharacterized protein,K07220
58188,58189,09190 Not Included in Pathway or Brite,09194 Poorly characterized,99997 Function unknown,K07276 K07276; uncharacterized protein,K07276
58189,58190,09190 Not Included in Pathway or Brite,09194 Poorly characterized,99997 Function unknown,K07338 K07338; uncharacterized protein,K07338
58190,58191,09190 Not Included in Pathway or Brite,09194 Poorly characterized,99997 Function unknown,K07586 ygaC; uncharacterized protein,K07586


In [32]:
kegg_df2 = kegg_df.drop_duplicates(subset=['ecpath','path_id'])

In [35]:
kegg_df2[['ecpath','path_id']].value_counts()
kegg_df2[['path_id',]].value_counts()

path_id
K00001     1
K17447     1
K17445     1
K17444     1
K17443     1
          ..
K08860     1
K08859     1
K08858     1
K08857     1
K86952     1
Name: count, Length: 25226, dtype: int64

In [36]:
df2 = pd.merge(df, kegg_df2, left_on='KO', right_on='path_id', how='left')

In [37]:
df2

Unnamed: 0.1,filename,locus_tag,type,start,end,strand,gene,product,db_xref,eC_number,...,Species,recommended_name,systematic_name,synonyms,Unnamed: 0,main,sub,path,ecpath,path_id
0,1402135.12,ALGBIFEJ_00001,CDS,7,408,+,,hypothetical protein,,,...,Sulfitobacter pseudonitzschiae strain SMR1,,,,,,,,,
1,1402135.12,ALGBIFEJ_00002,CDS,405,806,+,,hypothetical protein,,,...,Sulfitobacter pseudonitzschiae strain SMR1,,,,,,,,,
2,1402135.12,ALGBIFEJ_00003,CDS,803,1381,+,,hypothetical protein,,,...,Sulfitobacter pseudonitzschiae strain SMR1,,,,,,,,,
3,1402135.12,ALGBIFEJ_00004,CDS,1378,2424,+,,hypothetical protein,,,...,Sulfitobacter pseudonitzschiae strain SMR1,,,,,,,,,
4,1402135.12,ALGBIFEJ_00005,CDS,2501,4750,+,fadE,Acyl-coenzyme A dehydrogenase,COG:COG1960,1.3.8.7,...,Sulfitobacter pseudonitzschiae strain SMR1,medium-chain acyl-CoA dehydrogenase,medium-chain acyl-CoA:electron-transfer flavop...,LCAD;EC 1.3.2.2;acyl dehydrogenase;ACAD-9;EC 1...,2324.0,09100 Metabolism,09103 Lipid metabolism,00071 Fatty acid degradation [PATH:ko00071],K06445 fadE; acyl-CoA dehydrogenase [EC:1.3.9...,K06445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37113,F03,FLOFJFJI_05677,CDS,96160,96888,-,,hypothetical protein,,,...,Marinovum HOT5_F3,,,,,,,,,
37114,F03,FLOFJFJI_05678,CDS,97013,97315,-,,hypothetical protein,,,...,Marinovum HOT5_F3,,,,,,,,,
37115,F03,FLOFJFJI_05679,CDS,97416,97826,-,,hypothetical protein,,,...,Marinovum HOT5_F3,,,,,,,,,
37116,F03,FLOFJFJI_05680,CDS,97879,98265,-,,hypothetical protein,,,...,Marinovum HOT5_F3,,,,,,,,,


In [38]:
signal_df = pd.read_csv(os.path.join(dpath, '10cc_signalP.csv'))

In [41]:
signal_df.rename(columns={'product': 'signal_product', 'Prediction': 'SignalP'}, inplace=True)

In [42]:
df3 = pd.merge(df2, signal_df, on=['filename', 'locus_tag'], how='left')


In [43]:
df3

Unnamed: 0.1,filename,locus_tag,type,start,end,strand,gene,product,db_xref,eC_number,...,systematic_name,synonyms,Unnamed: 0,main,sub,path,ecpath,path_id,signal_product,SignalP
0,1402135.12,ALGBIFEJ_00001,CDS,7,408,+,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
1,1402135.12,ALGBIFEJ_00002,CDS,405,806,+,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
2,1402135.12,ALGBIFEJ_00003,CDS,803,1381,+,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
3,1402135.12,ALGBIFEJ_00004,CDS,1378,2424,+,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
4,1402135.12,ALGBIFEJ_00005,CDS,2501,4750,+,fadE,Acyl-coenzyme A dehydrogenase,COG:COG1960,1.3.8.7,...,medium-chain acyl-CoA:electron-transfer flavop...,LCAD;EC 1.3.2.2;acyl dehydrogenase;ACAD-9;EC 1...,2324.0,09100 Metabolism,09103 Lipid metabolism,00071 Fatty acid degradation [PATH:ko00071],K06445 fadE; acyl-CoA dehydrogenase [EC:1.3.9...,K06445,Acyl-coenzyme A dehydrogenase,OTHER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37113,F03,FLOFJFJI_05677,CDS,96160,96888,-,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
37114,F03,FLOFJFJI_05678,CDS,97013,97315,-,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
37115,F03,FLOFJFJI_05679,CDS,97416,97826,-,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER
37116,F03,FLOFJFJI_05680,CDS,97879,98265,-,,hypothetical protein,,,...,,,,,,,,,hypothetical protein,OTHER


In [45]:
df3.rename(columns={'eC_number': 'EC_number'}, inplace=True)

In [46]:
df3.to_csv('10cc_ecpath_master.csv', index=False)