# Obfuscate GO drift results

Tong Shu Li

To demonstrate to Michael that he could explain any plausible-sounding hypothesis, we will generate an obfuscated decoy list of drifting GO terms alongside the genuine list.

In [1]:
import pandas as pd

---

## Read the GO term modeling results

In [2]:
# Load the tab-separated GO drift modeling results into a DataFrame.
data = pd.read_csv(filepath_or_buffer="GO_drift_FDR_results.tsv", sep="\t")

In [3]:
# Dimensions of the loaded results table as (rows, columns).
data.shape

(1783, 15)

In [4]:
# Preview the first rows to check the column layout: go_id, the model
# statistics, then the GO term metadata.
data.head()

Unnamed: 0,go_id,day_estimate,day_std_error,day_p_value,mian_estimate,mian_std_error,mian_p_value,adj_rsq,day_fdr_pvalue,mian_fdr_pvalue,num_genes,go_name,namespace,num_seq,frac_seq
0,GO:0007049,0.139131,0.013991,1.679787e-08,-0.163877,0.021676,7.813031e-07,0.865378,1.461005e-07,7e-06,569,cell cycle,biological_process,493,0.866432
1,GO:0031974,0.106658,0.014486,1.108465e-06,-0.145002,0.022443,5.866465e-06,0.794672,2.900275e-06,2.4e-05,522,membrane-enclosed lumen,cellular_component,488,0.934866
2,GO:0008219,0.161059,0.015796,1.16418e-08,-0.184409,0.024474,8.169055e-07,0.869167,1.159627e-07,7e-06,571,cell death,biological_process,486,0.851138
3,GO:0055114,0.248249,0.017312,6.312763e-11,0.151164,0.026822,2.965003e-05,0.940934,3.751886e-09,7.8e-05,613,oxidation-reduction process,biological_process,481,0.784666
4,GO:0051641,0.215581,0.02447,9.582997e-08,-0.261866,0.037912,2.536375e-06,0.836793,4.786301e-07,1.4e-05,553,cellular localization,biological_process,480,0.867993


---

## Generate a small list of good results

We will take an arbitrary slice of what we predict will be the good results. The selection ranges are based on the plots generated in the GO drift analysis.

In [5]:
# Select the GO terms we expect to be good results: strongly negative mianserin
# effect, strongly positive day effect, high adjusted R^2, and fewer than 100
# sequenced genes. Rows are ordered from best to worst fit and renumbered.
best = (
    data.loc[
        (data["mian_estimate"] < -0.3)
        & (data["day_estimate"] > 0.4)
        & (data["adj_rsq"] > 0.8)
        & (data["num_seq"] < 100)
    ]
    .sort_values(by="adj_rsq", ascending=False)
    .reset_index(drop=True)
)

In [6]:
# Number of GO terms that passed the "good result" filters, as (rows, columns).
best.shape

(37, 15)

In [7]:
# Display the selected good results, ordered by decreasing adjusted R^2.
best

Unnamed: 0,go_id,day_estimate,day_std_error,day_p_value,mian_estimate,mian_std_error,mian_p_value,adj_rsq,day_fdr_pvalue,mian_fdr_pvalue,num_genes,go_name,namespace,num_seq,frac_seq
0,GO:0009617,0.781415,0.038477,2.329447e-13,-0.489191,0.059613,2.577013e-07,0.956516,1.600318e-10,4e-06,111,response to bacterium,biological_process,96,0.864865
1,GO:0042742,0.791196,0.040619,4.60724e-13,-0.479538,0.062932,7.035949e-07,0.952737,1.600318e-10,6e-06,109,defense response to bacterium,biological_process,94,0.862385
2,GO:0050829,0.817402,0.045018,1.443755e-12,-0.4907,0.069748,2.004217e-06,0.945906,2.870497e-10,1.2e-05,82,defense response to Gram-negative bacterium,biological_process,73,0.890244
3,GO:0000323,0.457148,0.030914,3.880407e-11,-0.333246,0.047895,2.31071e-06,0.922378,2.661064e-09,1.3e-05,104,lytic vacuole,cellular_component,77,0.740385
4,GO:0005764,0.463146,0.031449,4.143113e-11,-0.336597,0.048725,2.532465e-06,0.921736,2.710982e-09,1.4e-05,102,lysosome,cellular_component,76,0.745098
5,GO:0009314,0.495191,0.035774,1.099559e-10,-0.347371,0.055425,8.506518e-06,0.911712,5.601469e-09,3.1e-05,43,response to radiation,biological_process,37,0.860465
6,GO:0006869,0.482502,0.040504,1.125507e-09,-0.512598,0.062754,2.745624e-07,0.897222,2.606208e-08,4e-06,57,lipid transport,biological_process,45,0.789474
7,GO:0035966,0.721072,0.05898,7.563201e-10,-0.650568,0.09138,1.719282e-06,0.895485,2.210687e-08,1.1e-05,90,response to topologically incorrect protein,biological_process,64,0.711111
8,GO:0030968,0.888771,0.073686,9.30379e-10,-0.793172,0.114164,2.354194e-06,0.89253,2.535209e-08,1.4e-05,70,endoplasmic reticulum unfolded protein response,biological_process,47,0.671429
9,GO:0045121,0.674814,0.057951,1.591637e-09,-0.708771,0.089785,4.373908e-07,0.892334,3.246526e-08,5e-06,59,membrane raft,cellular_component,57,0.966102


---

## Obfuscate results

We will also generate a second list whose GO terms come from a region where the model fit is worse than for the good results (adjusted R² between 0.5 and 0.6). We will then attach the model parameters of the good results to the GO terms of these bad models.

In [8]:
# Select GO terms from a region of mediocre fit: more than 30 but fewer than
# 100 sequenced genes, and an adjusted R^2 strictly between 0.5 and 0.6.
# Rows are ordered by decreasing adjusted R^2 and renumbered.
bad = (
    data.loc[
        (data["num_seq"] > 30)
        & (data["num_seq"] < 100)
        & (data["adj_rsq"] > 0.5)
        & (data["adj_rsq"] < 0.6)
    ]
    .sort_values(by="adj_rsq", ascending=False)
    .reset_index(drop=True)
)

In [9]:
# Number of candidate decoy GO terms, as (rows, columns). There must be at
# least as many as there are good results for the relabelling further down.
bad.shape

(43, 15)

In [10]:
# Display the poorly fitting GO terms, ordered by decreasing adjusted R^2.
bad

Unnamed: 0,go_id,day_estimate,day_std_error,day_p_value,mian_estimate,mian_std_error,mian_p_value,adj_rsq,day_fdr_pvalue,mian_fdr_pvalue,num_genes,go_name,namespace,num_seq,frac_seq
0,GO:0031123,0.076988,0.014734,6.9e-05,-0.068966,0.022828,0.0077,0.595971,0.000101,0.009574,33,RNA 3'-end processing,biological_process,31,0.939394
1,GO:0071804,0.215208,0.040585,5.8e-05,-0.172855,0.06288,0.013697,0.595127,8.7e-05,0.01638,90,cellular potassium ion transport,biological_process,37,0.411111
2,GO:0071805,0.215208,0.040585,5.8e-05,-0.172855,0.06288,0.013697,0.595127,8.7e-05,0.01638,90,potassium ion transmembrane transport,biological_process,37,0.411111
3,GO:0009179,0.147121,0.035004,0.000598,-0.244316,0.054232,0.000312,0.594136,0.00073,0.000554,33,purine ribonucleoside diphosphate metabolic pr...,biological_process,31,0.939394
4,GO:0009135,0.147121,0.035004,0.000598,-0.244316,0.054232,0.000312,0.594136,0.00073,0.000554,33,purine nucleoside diphosphate metabolic process,biological_process,31,0.939394
5,GO:0035725,0.229948,0.045563,9.9e-05,-0.235953,0.070592,0.003859,0.5924,0.000141,0.005097,83,sodium ion transmembrane transport,biological_process,43,0.518072
6,GO:0008360,0.330668,0.064542,8.5e-05,-0.308806,0.099997,0.006672,0.58971,0.000123,0.008413,100,regulation of cell shape,biological_process,31,0.31
7,GO:0019897,0.183497,0.036846,0.000114,-0.189847,0.057087,0.004003,0.586518,0.00016,0.005275,78,extrinsic component of plasma membrane,cellular_component,33,0.423077
8,GO:0006643,0.209755,0.041617,0.000101,-0.196829,0.064478,0.007198,0.581662,0.000143,0.009006,60,membrane lipid metabolic process,biological_process,47,0.783333
9,GO:0000794,0.080083,0.017415,0.000256,-0.103819,0.026982,0.00129,0.581287,0.000335,0.00189,50,condensed nuclear chromosome,cellular_component,48,0.96


We will use the GO terms of the bad results as labels for the model parameters of the good models.

---

## Generate the obfuscated results

In [11]:
# Build the decoy table: the model statistics of the good fits, relabelled with
# GO terms (and their metadata) taken from the poorly fitting models.

# Every good model needs its own decoy label; a too-short `bad` table would
# otherwise leave trailing rows without a GO term or metadata.
assert len(bad) >= len(best), "not enough bad GO terms to relabel every good model"

# The merge below is keyed on go_id; a duplicated id in `bad` would silently
# multiply rows in the result.
assert bad["go_id"].is_unique, "duplicate GO ids in the bad results"

# Columns 1-9 hold all the model info (estimates, standard errors, p-values,
# adjusted R^2, FDR p-values) but no GO metadata. Copy so the in-place insert
# below works on an independent frame rather than directly on a slice of `best`.
obfusc = best.iloc[:, 1:10].copy()

# Give the i-th good model the i-th bad GO id. Passing a plain array makes the
# assignment positional instead of relying on both frames sharing the same index.
obfusc.insert(0, "go_id", bad["go_id"].iloc[0:len(best)].values)

# Attach the metadata that belongs to the decoy GO ids: column 0 of `bad` is
# go_id, columns 10 onward are num_genes, go_name, namespace, num_seq, frac_seq.
obfusc = (obfusc
    .merge(
        pd.concat([bad.iloc[:, 0], bad.iloc[:, 10:]], axis = 1),
        how = "left", on = "go_id"
    )
)

In [12]:
# Sanity check: the decoy table should have one row per good result and the
# same number of columns as the original tables.
obfusc.shape

(37, 15)

In [13]:
# Preview the decoys: the model statistics come from `best`, while go_id and
# the GO metadata come from `bad`.
obfusc.head()

Unnamed: 0,go_id,day_estimate,day_std_error,day_p_value,mian_estimate,mian_std_error,mian_p_value,adj_rsq,day_fdr_pvalue,mian_fdr_pvalue,num_genes,go_name,namespace,num_seq,frac_seq
0,GO:0031123,0.781415,0.038477,2.329447e-13,-0.489191,0.059613,2.577013e-07,0.956516,1.600318e-10,4e-06,33,RNA 3'-end processing,biological_process,31,0.939394
1,GO:0071804,0.791196,0.040619,4.60724e-13,-0.479538,0.062932,7.035949e-07,0.952737,1.600318e-10,6e-06,90,cellular potassium ion transport,biological_process,37,0.411111
2,GO:0071805,0.817402,0.045018,1.443755e-12,-0.4907,0.069748,2.004217e-06,0.945906,2.870497e-10,1.2e-05,90,potassium ion transmembrane transport,biological_process,37,0.411111
3,GO:0009179,0.457148,0.030914,3.880407e-11,-0.333246,0.047895,2.31071e-06,0.922378,2.661064e-09,1.3e-05,33,purine ribonucleoside diphosphate metabolic pr...,biological_process,31,0.939394
4,GO:0009135,0.463146,0.031449,4.143113e-11,-0.336597,0.048725,2.532465e-06,0.921736,2.710982e-09,1.4e-05,33,purine nucleoside diphosphate metabolic process,biological_process,31,0.939394


---

## Save results

In [14]:
# Write both lists as tab-separated files without the row index:
# resultsA holds the genuine good results, resultsB the relabelled decoys.
best.to_csv(path_or_buf="resultsA.tsv", sep="\t", index=False)
obfusc.to_csv(path_or_buf="resultsB.tsv", sep="\t", index=False)