# Redundant and unfit columns removed before analysis

In [1]:
#import libraries
import pandas as pd
from IPython.display import display

In [2]:
#import self-made functions
%run '/Users/mariekececilia/Documents/master_thesis_code/methods.ipynb'

## Load data

In [3]:
[gexp_g, flux_g] = load_gerosa()
[gexp_i, flux_i] = load_ishii()

#split Ishii into growth rate samples and knock-out samples, both containing the baseline
[gexp_i_gr, gexp_i_ko] = split_ishii(gexp_i)
[flux_i_gr, flux_i_ko] = split_ishii(flux_i)

## Duplicated genes

These genes are removed before analysing the Gerosa transcriptomic data because they were duplicates of others: 

In [4]:
#only relevant for gerosa
gexp_g_clean, gexp_g_groups = clean_gexp_g(gexp_g)
print('Gene kept: List of duplicates of that gene (removed)')
for (key, value) in sorted(gexp_g_groups.items()):print (key, ':', sorted(value))
print('\nNew Gerosa shape after removing duplicated genes:', (gexp_g_clean.shape))

Gene kept: List of duplicates of that gene (removed)
b0016 : ['b0582', 'b2394']
b0021 : ['b0264', 'b0274', 'b0988', 'b1893', 'b3445', 'b4576']
b0022 : ['b0265', 'b0275', 'b1894', 'b3444', 'b4294', 'b4516']
b0256 : ['b1404', 'b4284']
b0259 : ['b0552', 'b0656', 'b1331', 'b1370', 'b1994', 'b2030', 'b2192', 'b2982', 'b3218', 'b3505']
b0298 : ['b0373', 'b0540', 'b1027', 'b2088']
b0299 : ['b0372', 'b0541', 'b1026', 'b2089']
b0360 : ['b1403', 'b1997', 'b2861', 'b3044', 'b4272']
b0361 : ['b1402', 'b1578', 'b1996', 'b2860', 'b3045', 'b4273']

New Gerosa shape after removing duplicated genes: (8, 4105)


In [5]:
#Gene expression of the genes with duplicates:
gexp_g[sorted(gexp_g_groups.keys())]

Unnamed: 0,b0016,b0021,b0022,b0256,b0259,b0298,b0299,b0360,b0361
Acetate,1.314951,1.125086,1.63832,0.680694,1.278149,0.594961,1.238808,1.255381,1.542338
Fructose,1.275108,0.999305,1.289501,0.841097,1.758984,0.631862,1.177093,0.927827,1.606869
Galactose,0.488257,1.000691,0.937356,0.820167,0.965585,1.160709,1.145474,1.228643,2.643263
Glucose,1.109477,0.940218,0.899922,1.388808,0.605762,1.197898,1.138902,0.931399,0.778949
Glycerol,0.740964,1.005713,0.979062,1.136949,0.831388,1.120681,0.935446,0.959269,0.901328
Gluconate,0.98057,1.03454,1.005674,1.069269,0.657964,0.993425,1.151898,1.000238,0.629261
Pyruvate,1.480496,1.375829,1.889096,0.464329,1.473283,0.802093,0.932664,1.059628,1.269821
Succinate,0.952453,0.821619,0.686124,1.070448,1.20376,0.972243,0.837042,0.912354,0.666947


No duplicated genes were found in the Ishii data and the same holds for the subsets:

In [6]:
print(gexp_i_gr[gexp_i_gr.duplicated(keep = False)].shape)
print(gexp_i_ko[gexp_i_ko.duplicated(keep = False)].shape)

(0, 79)
(0, 79)


## Duplicated (redundant) reactions

In [7]:
#identify reactions with equal or mirrored measurments
g_equal_dict = get_equal_or_mirrored_groups(flux_g)

#might differ between ishii subsets
i_gr_equal_dict = get_equal_or_mirrored_groups(flux_i_gr)
i_ko_equal_dict = get_equal_or_mirrored_groups(flux_i_ko)

#print result
print('Gerosa groups (reaction : its duplicates)')
for (key, value) in sorted(g_equal_dict.items()):print (key, ':', sorted(value))
print('\nIshii GR groups (reaction : its duplicates)')
for (key, value) in sorted(i_gr_equal_dict.items()):print (key, ':', sorted(value))
print('\nIshii KO groups (reaction : its duplicates)')
for (key, value) in sorted(i_ko_equal_dict.items()):print (key, ':', sorted(value))

Gerosa groups (reaction : its duplicates)
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_EDA : ['R_EDD']
R_ENO : ['R_PGM']
R_G6PDH2r : ['R_PGL']
R_GAPD : ['R_PGK']
R_ICL : ['R_MALS']
R_TALA : ['R_TKT1']

Ishii GR groups (reaction : its duplicates)
R_ACALD : ['R_ALCD2x']
R_ACKr : ['R_PTAr']
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_ENO : ['R_PGM']
R_EX_glc_e : ['R_GLCptspp']
R_EX_lac_D_e : ['R_EX_succ_e', 'R_LDH_D']
R_FBA : ['R_PFK', 'R_TPI']
R_FUM : ['R_SUCDi']
R_G6PDH2r : ['R_GND', 'R_PGL']
R_GAPD : ['R_PGK']
R_ICL : ['R_MALS']
R_TALA : ['R_TKT1']

Ishii KO groups (reaction : its duplicates)
R_ACALD : ['R_ALCD2x']
R_ACKr : ['R_EX_ac_e', 'R_EX_lac_D_e', 'R_EX_pyr_e', 'R_LDH_D', 'R_PTAr']
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_ENO : ['R_PGM']
R_EX_for_e : ['R_EX_succ_e']
R_EX_glc_e : ['R_GLCptspp']
R_FBA : ['R_PFK', 'R_TPI']
R_FUM : ['R_SUCDi']
R_GAPD : ['R_PGK']
R_GND : ['R_PGL']
R_ICL : ['R_MALS']
R_TALA : ['R_TKT1']


--> Here we see that 
- In both Ishii subsets, R_GLCptspp have the same (or mirrored) measurments as R_EX_glc_e, which makes sense since it transports glucose over the periplasm. It is thus also considered an external reaction here, and will be removed with the exchange reactions. 
- In both Ishii subsets, R_LDH_D have the same measurments as R_EX_lac_D_e. R_LDH_D is not a transporter, but it is connected to the export of lactate because it produces lactate from puryvate. The inspection showed that both reactions are in fact 0 for all measurments, so R_LDH_D will be removed due to constant values etiher way. 

### Disagreement of duplicates

In [8]:
#Intersection of internal, non-constant reactions with enough non-null entries
intersection_gr = get_intersection(remove_ex_constant_zero(flux_g), remove_ex_constant_zero(flux_i_gr))
intersection_ko = get_intersection(remove_ex_constant_zero(flux_g), remove_ex_constant_zero(flux_i_ko))

#identify duplicates among the reactions in the intersection
#..of gerosa and ishii gr
g_gr_equal_dict_intersection = get_equal_or_mirrored_groups(flux_g[intersection_gr])
i_gr_equal_dict_intersection = get_equal_or_mirrored_groups(flux_i_gr[intersection_gr])
#..of gerosa and ishii ko
g_ko_equal_dict_intersection = get_equal_or_mirrored_groups(flux_g[intersection_ko])
i_ko_equal_dict_intersection = get_equal_or_mirrored_groups(flux_i_ko[intersection_ko])

#print result
print('------------------')
print('Gerosa vs Ishii GR')
print('------------------')
print('Gerosa groups:')
for (key, value) in sorted(g_gr_equal_dict_intersection.items()):print (key, ':', sorted(value))
print('\nIshii GR groups:')
for (key, value) in sorted(i_gr_equal_dict_intersection.items()):print (key, ':', sorted(value))

print('\n------------------')
print('Gerosa vs Ishii KO')
print('------------------')
print('Gerosa groups:')
for (key, value) in sorted(g_ko_equal_dict_intersection.items()):print (key, ':', sorted(value))
print('\nIshii KO groups:')
for (key, value) in sorted(i_ko_equal_dict_intersection.items()):print (key, ':', sorted(value))

------------------
Gerosa vs Ishii GR
------------------
Gerosa groups:
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_ENO : ['R_PGM']
R_G6PDH2r : ['R_PGL']
R_GAPD : ['R_PGK']
R_TALA : ['R_TKT1']

Ishii GR groups:
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_ENO : ['R_PGM']
R_FBA : ['R_TPI']
R_FUM : ['R_SUCDi']
R_G6PDH2r : ['R_GND', 'R_PGL']
R_GAPD : ['R_PGK']
R_TALA : ['R_TKT1']

------------------
Gerosa vs Ishii KO
------------------
Gerosa groups:
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_ENO : ['R_PGM']
R_G6PDH2r : ['R_PGL']
R_GAPD : ['R_PGK']
R_ICL : ['R_MALS']
R_TALA : ['R_TKT1']

Ishii KO groups:
R_ACONTa : ['R_ACONTb', 'R_CS']
R_AKGDH : ['R_SUCOAS']
R_ENO : ['R_PGM']
R_FBA : ['R_TPI']
R_FUM : ['R_SUCDi']
R_GAPD : ['R_PGK']
R_GND : ['R_PGL']
R_ICL : ['R_MALS']
R_TALA : ['R_TKT1']


Manual inspection (see more below) shows that they agree on the groups in the intersection, except: 
- G6PDH2r, PGL and GND
    - Gerosa group PGL with G6PDH2r (and not GND), GND differs from the others in most samples
    - Ishii GR group PGL with G6PDH2r and GND
    - Ishii KO group PGL with GND (and not G6PDH2r), but G6PDH2r is equal for all observations but pgi, rpiA, tktA
- FUM and SUCDi
    - Ishii (both subsets) : equal for all observations
    - Gerosa: equal for all observations but Succinate
- FBA and TPI
    - Ishii (both subsets) : equal for all observations
    - Gerosa: FBA and TPI are equal for all observations but Glycerol

#### Duplicates in both datasets

In [9]:
#from manual inspection:
common_duplicates_gr = ['R_ACONTb', 'R_CS', 'R_SUCOAS', 'R_PGM',
                    'R_PGL', #common in ishii GR and gerosa only
                    'R_PGK', 
                    'R_TKT1']

common_duplicates_ko = ['R_ACONTb', 'R_CS', 'R_SUCOAS', 'R_PGM',
                    'R_PGK', 
                    'R_MALS', #not in ishii GR intersection (too many null-entries)
                    'R_TKT1']

#### Conflicting duplicates

In [10]:
print('Difference of R_PGL and GND in Gerosa:')
diff = (flux_g['R_PGL']-flux_g['R_GND']).abs()
display(diff[diff!=0])

print('\nDifference of PGL and G6PDH2r in Ishii:')
diff = ((flux_i['R_PGL']-flux_i['R_G6PDH2r']).abs())
display(diff[diff!=0])

print('\nDifference of FBA and TPI in Gerosa:')
diff = ((flux_g['R_FBA']-flux_g['R_TPI']).abs())
display(diff[diff!=0])

print('\nDifference of FUM and SUCDi in Gerosa:')
diff = ((flux_g['R_FUM']-flux_g['R_SUCDi']).abs())
display(diff[diff!=0])

Difference of R_PGL and GND in Gerosa:


Acetate      8.215938e-01
Fructose     1.000000e-09
Galactose    1.729721e-01
Glucose      1.086991e+00
Glycerol     1.458035e-02
Gluconate    1.464963e+00
Pyruvate     3.000000e-09
Succinate    6.554642e-01
dtype: float64


Difference of PGL and G6PDH2r in Ishii:


pgi     1.0101
rpiA    0.6042
tktA    0.5837
dtype: float64


Difference of FBA and TPI in Gerosa:


Glycerol    10.136
dtype: float64


Difference of FUM and SUCDi in Gerosa:


Succinate    1.139998
dtype: float64

## Unfit reactions removed

### External reactions

In [11]:
#get list of exchange reactions (with _EX_ in name)        
g_ex = get_ex(flux_g)
i_ex = get_ex(flux_i)

#summary
print('Gerosa:', sorted(g_ex))
print('Ishii (both subsets):', sorted(i_ex) + ['R_GLCptspp'])

Gerosa: ['R_EX_ac_e', 'R_EX_fru_e', 'R_EX_fum_e', 'R_EX_gal_e', 'R_EX_glc_e', 'R_EX_glcn_e', 'R_EX_glyc_e', 'R_EX_lac_D_e', 'R_EX_pyr_e', 'R_EX_succ_e']
Ishii (both subsets): ['R_EX_ac_e', 'R_EX_co2_e', 'R_EX_etoh_e', 'R_EX_for_e', 'R_EX_glc_e', 'R_EX_lac_D_e', 'R_EX_o2_e', 'R_EX_pyr_e', 'R_EX_succ_e', 'R_GLCptspp']


### Constant reactions

In [12]:
#identify constant reactions
g_constant = flux_g.nunique()[flux_g.nunique() == 1].index.tolist()

#might be different for the different ishii subsets
i_gr_constant = flux_i_gr.nunique()[flux_i_gr.nunique() == 1].index.tolist()
i_ko_constant = flux_i_ko.nunique()[flux_i_ko.nunique() == 1].index.tolist()

print('Gerosa:', sorted(g_constant))
print('Ishii GR:', sorted(i_gr_constant))
print('Ishii KO:', sorted(i_gr_constant))

Gerosa: []
Ishii GR: ['R_EX_lac_D_e', 'R_EX_succ_e', 'R_LDH_D']
Ishii KO: ['R_EX_lac_D_e', 'R_EX_succ_e', 'R_LDH_D']


### Reactions that are mostly zero

In [13]:
#identify reactions where more than half the entries are zero
g_mostly_zero = get_mostly_zero(flux_g)

#might be different for the different ishii subsets
i_gr_mostly_zero = get_mostly_zero(flux_i_gr)
i_ko_mostly_zero = get_mostly_zero(flux_i_ko)

print('Gerosa:', sorted(g_mostly_zero))
print('Ishii GR:', sorted(i_gr_mostly_zero))
print('Ishii KO:', sorted(i_ko_mostly_zero))

Gerosa: ['R_EX_fru_e', 'R_EX_gal_e', 'R_EX_glc_e', 'R_EX_glcn_e', 'R_EX_glyc_e', 'R_EX_lac_D_e', 'R_EX_pyr_e', 'R_EX_succ_e']
Ishii GR: ['R_ACKr', 'R_EX_etoh_e', 'R_EX_for_e', 'R_EX_lac_D_e', 'R_EX_pyr_e', 'R_EX_succ_e', 'R_ICL', 'R_LDH_D', 'R_MALS', 'R_ME2', 'R_PTAr']
Ishii KO: ['R_ACALD', 'R_ACKr', 'R_ALCD2x', 'R_EX_ac_e', 'R_EX_etoh_e', 'R_EX_for_e', 'R_EX_lac_D_e', 'R_EX_pyr_e', 'R_EX_succ_e', 'R_LDH_D', 'R_ME2', 'R_PTAr']


## Reactions used in analysis of individual datasets

Overview of which reactions were used in the exploratory analysis. Duplicates were not included, but are listed here to show which other reactions would give the same results. Since Gerosa and Ishii do not agree on all duplications in the intersection, only reactions who are duplicated in both datasets were removed when comparing the correlations of gene-reaction pairs (see next section).

In [14]:
flux_g_cleaned, flux_g_groups = clean_gerosa(flux_g)
flux_i_gr_cleaned, flux_i_gr_groups = clean_ishii(flux_i_gr)
flux_i_ko_cleaned, flux_i_ko_groups = clean_ishii(flux_i_ko)

print('Gerosa (%s reactions):' % len(flux_g_cleaned.columns))
for gene in sorted(flux_g_cleaned.columns):
    if gene in flux_g_groups:
        print(gene, "– duplicate of" ,sorted(flux_g_groups[gene]))
    else:
        print(gene)

print('\nIshii GR (%s reactions):' % len(flux_i_gr_cleaned.columns))
for gene in sorted(flux_i_gr_cleaned.columns):
    if gene in flux_i_gr_groups:
        print(gene, "– duplicate of" ,sorted(flux_i_gr_groups[gene]))
    else:
        print(gene)
        
print('\nIshii KO (%s reactions):' % len(flux_i_ko_cleaned.columns))
for gene in sorted(flux_i_ko_cleaned.columns):
    if gene in flux_i_ko_groups:
        print(gene, "– duplicate of" ,sorted(flux_i_ko_groups[gene]))
    else:
        print(gene)

Gerosa (21 reactions):
R_ACONTa – duplicate of ['R_ACONTb', 'R_CS']
R_AKGDH – duplicate of ['R_SUCOAS']
R_EDA – duplicate of ['R_EDD']
R_ENO – duplicate of ['R_PGM']
R_FBA
R_FUM
R_G6PDH2r – duplicate of ['R_PGL']
R_GAPD – duplicate of ['R_PGK']
R_GND
R_ICDHyr
R_ICL – duplicate of ['R_MALS']
R_PDH
R_PGI
R_PPC
R_PPCK
R_RPE
R_RPI
R_SUCDi
R_TALA – duplicate of ['R_TKT1']
R_TKT2
R_TPI

Ishii GR (18 reactions):
R_ACALD – duplicate of ['R_ALCD2x']
R_ACONTa – duplicate of ['R_ACONTb', 'R_CS']
R_AKGDH – duplicate of ['R_SUCOAS']
R_ENO – duplicate of ['R_PGM']
R_FBA – duplicate of ['R_PFK', 'R_TPI']
R_FUM – duplicate of ['R_SUCDi']
R_G6PDH2r – duplicate of ['R_GND', 'R_PGL']
R_GAPD – duplicate of ['R_PGK']
R_ICDHyr
R_MDH
R_PDH
R_PGI
R_PPC
R_PYK
R_RPE
R_RPI
R_TALA – duplicate of ['R_TKT1']
R_TKT2

Ishii KO (19 reactions):
R_ACONTa – duplicate of ['R_ACONTb', 'R_CS']
R_AKGDH – duplicate of ['R_SUCOAS']
R_ENO – duplicate of ['R_PGM']
R_FBA – duplicate of ['R_PFK', 'R_TPI']
R_FUM – duplicate of ['R_

## Reactions used in analysis of intersection of datasets

Since Gerosa and Ishii do not agree on all duplications in the intersection, only reactions who are duplicated in both datasets were removed when comparing the correlations of gene-reaction pairs. Some of the reactions listed below are thus redundant when considering only one of the datasets (the reaction gives the same correlation for all possible gene-reaction pairs, but only in one of the datasets). 

In [15]:
intersection_gr = (get_intersection(remove_ex_constant_zero(flux_g),
                                 remove_ex_constant_zero(flux_i_gr)))

intersection_ko = (get_intersection(remove_ex_constant_zero(flux_g),
                                 remove_ex_constant_zero(flux_i_ko)))

print('Gerosa vs Ishii GR:')
for reaction in (sorted(flux_g[intersection_gr].drop(common_duplicates_gr, axis = 1).columns.tolist())): print(reaction)

print('\nGerosa vs Ishii KO:')
for reaction in (sorted(flux_g[intersection_ko].drop(common_duplicates_gr, axis = 1).columns.tolist())): print(reaction)

Gerosa vs Ishii GR:
R_ACONTa
R_AKGDH
R_ENO
R_FBA
R_FUM
R_G6PDH2r
R_GAPD
R_GND
R_ICDHyr
R_PDH
R_PGI
R_PPC
R_RPE
R_RPI
R_SUCDi
R_TALA
R_TKT2
R_TPI

Gerosa vs Ishii KO:
R_ACONTa
R_AKGDH
R_ENO
R_FBA
R_FUM
R_G6PDH2r
R_GAPD
R_GND
R_ICDHyr
R_ICL
R_MALS
R_PDH
R_PGI
R_PPC
R_RPE
R_RPI
R_SUCDi
R_TALA
R_TKT2
R_TPI
