In [50]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multitest import multipletests
from pprint import pprint

In [51]:
# Identifier of the ASO knock-down experiment inspected throughout this notebook.
ASO_ID = "ASO_G0229852_03"

# Per-lncRNA gene-count table; keep the first row whose lncRNAId contains ASO_ID.
# Indexing with [0] raises IndexError when no row matches.
aso_df = pd.read_csv("../data/all_marks/H3K27ac/our_fantom_fantom_aso_genes_association.tsv", sep="\t")
aso_dict = aso_df[aso_df["lncRNAId"].str.contains(ASO_ID)].to_dict(orient="records")[0]

print("\nFANTOM:")
print(f"{aso_dict['fantom_m']} downregulated genes")
print(f"{aso_dict['fantom_p']} upregulated genes")


print("\nHIMORNA:")
print(f"{aso_dict['our_p']} genes associated with positively correlated regions")
# The `_m` suffix is the "minus" counterpart of `_p` (compare fantom_m / fantom_p above),
# so this count belongs to the negatively correlated regions.
print(f"{aso_dict['our_m']} genes associated with negatively correlated regions")



FANTOM:
596 downregulated genes
432 upregulated genes

HIMORNA:
884 genes associated with positively correlated regions
502 genes associated with positively correlated regions


In [52]:
def makeExactFisherTestFromFiles(confusion_matrix_fn, genes_association_fn):
    """Run Fisher's exact test on every confusion matrix stored in a TSV file.

    Each row of the confusion-matrix file holds five 2x2 tables: an unprefixed
    one (columns ``TP``, ``FP``, ``FN``, ``TN``) and four prefixed ones
    (``mm_``, ``mp_``, ``pm_``, ``pp_``).  For every table the two-sided Fisher
    exact p-value is computed, and each family of p-values is then corrected
    separately with the Benjamini-Hochberg procedure (``fdr_bh``).

    Parameters
    ----------
    confusion_matrix_fn : str or path-like
        Tab-separated file with the ``*TP`` / ``*FP`` / ``*FN`` / ``*TN`` columns.
    genes_association_fn : str or path-like
        Tab-separated file providing the ``lncRNAId`` and ``lncRNAName`` columns.
        Its rows are combined with the confusion-matrix rows by row index, so
        both files are expected to list the same lncRNAs in the same order.

    Returns
    -------
    pandas.DataFrame
        Columns ``lncRNAId``, ``lncRNAName``, the five raw p-value columns
        (``pvalue``, ``mm_pvalue``, ...) followed by the five adjusted ones
        (``adj_pvalue``, ``mm_adj_pvalue``, ...).  Only rows in which at least
        one of these ten values is below 0.05 are kept.
    """
    confusion_df = pd.read_csv(confusion_matrix_fn, sep="\t")
    association_df = pd.read_csv(genes_association_fn, sep="\t")

    # "" addresses the unprefixed table; the others are the column prefixes
    # used in the confusion-matrix file.
    prefixes = ["", "mm_", "mp_", "pm_", "pp_"]

    raw_columns = {}
    adjusted_columns = {}
    for prefix in prefixes:
        cells = confusion_df[[prefix + "TP", prefix + "FP", prefix + "FN", prefix + "TN"]]
        raw_pvalues = [
            stats.fisher_exact([[tp, fp], [fn, tn]])[1]
            for tp, fp, fn, tn in cells.itertuples(index=False, name=None)
        ]
        raw_columns[prefix + "pvalue"] = raw_pvalues
        # multipletests returns (reject, corrected p-values, ...); keep the corrected values.
        adjusted_columns[prefix + "adj_pvalue"] = multipletests(
            raw_pvalues, alpha=0.05, method='fdr_bh'
        )[1]

    # Raw columns first, adjusted columns second.
    pvalue_df = pd.DataFrame({**raw_columns, **adjusted_columns})
    pvalue_columns = list(pvalue_df.columns)

    df_pv = pd.concat([association_df[['lncRNAId', 'lncRNAName']], pvalue_df], axis=1)

    # Keep a row when any p-value (raw or adjusted) is below 0.05.  Columns are
    # selected by name: a positional slice starting at 3 would silently skip the
    # overall `pvalue` column, because only two id columns precede the p-values.
    is_significant = (df_pv[pvalue_columns] < 0.05).any(axis=1)
    df_pv = df_pv[is_significant]
    # df_pv.to_csv("../all_marks/" + target + "/" + our_fantom_file_name + annotation_prefix + "_genes_association_pvalues.tsv", sep="\t", index=None)

    return df_pv

In [56]:
# Input tables for the H3K27ac mark (FANTOM ASO annotation).
confusion_matrix_fn = "../data/all_marks/H3K27ac/our_fantom_fantom_aso_genes_confusion_matrix.tsv"
genes_association_fn = "../data/all_marks/H3K27ac/our_fantom_fantom_aso_genes_association.tsv"

# Fisher p-values for every lncRNA that passed the 0.05 filter.
pvalue_df = makeExactFisherTestFromFiles(
    confusion_matrix_fn=confusion_matrix_fn,
    genes_association_fn=genes_association_fn,
)

# First record whose lncRNAId contains the ASO of interest.
pvalue_dict = pvalue_df.loc[pvalue_df["lncRNAId"].str.contains(ASO_ID)].to_dict(orient="records")[0]
pprint(pvalue_dict)

{'adj_pvalue': 0.0004948049067588278,
 'lncRNAId': 'ENSG00000229852_ASO_G0229852_03',
 'lncRNAName': 'RP11-398K22.12',
 'mm_adj_pvalue': 0.0709841230466894,
 'mm_pvalue': 0.026747640568317743,
 'mp_adj_pvalue': 0.29987348893216165,
 'mp_pvalue': 0.08257385927117494,
 'pm_adj_pvalue': 6.38947886994757e-05,
 'pm_pvalue': 1.574219431726213e-05,
 'pp_adj_pvalue': 0.8172112452778817,
 'pp_pvalue': 0.18949825977458126,
 'pvalue': 0.00016493496891960925}


In [59]:
# Load the id table and the confusion-matrix table for the same mark.
association_df = pd.read_csv(genes_association_fn, sep="\t")
confuion_df = pd.read_csv(confusion_matrix_fn, sep="\t")

# Put the lncRNA identifiers next to the confusion-matrix counts (aligned on the row index).
tmp_df = pd.concat(
    [association_df[["lncRNAId", "lncRNAName"]], confuion_df],
    axis=1,
)

# Show the raw counts for the ASO of interest.
confuion_dict = tmp_df.loc[tmp_df["lncRNAId"].str.contains(ASO_ID)].to_dict(orient="records")[0]
pprint(confuion_dict)


{'FN': 942,
 'FP': 1290,
 'TN': 22383,
 'TP': 86,
 'Unnamed: 0': 31,
 'lncRNAId': 'ENSG00000229852_ASO_G0229852_03',
 'lncRNAName': 'RP11-398K22.12',
 'mm_FN': 576,
 'mm_FP': 482,
 'mm_TN': 23623,
 'mm_TP': 20,
 'mp_FN': 418,
 'mp_FP': 488,
 'mp_TN': 23781,
 'mp_TP': 14,
 'pm_FN': 553,
 'pm_FP': 841,
 'pm_TN': 23264,
 'pm_TP': 43,
 'pp_FN': 422,
 'pp_FP': 874,
 'pp_TN': 23395,
 'pp_TP': 10}


## Интерпретация
В мешке 24701 шар (все гены), из них 596 красных (FANTOM downregulated genes). Мы вытаскиваем из мешка 884 шара (предсказания HIMORNA), и среди них 43 оказываются красными (это TP). Какова вероятность такого события?

Для этого надо посчитать другие характеристики:
* all_balls = все шары в мешке = 24701
* all_red = все красные шары в мешке = 596
* all_white = все белые шары в мешке = all_balls - all_red = 24105
* drawn_all = общее кол-во вынутых шаров = 884
* drawn_red = кол-во красных шаров среди вынутых  = 43
* drawn_white = кол-во белых шаров среди вынутых = drawn_all - drawn_red = 841
* TP = кол-во вынутых красных шаров = drawn_red = 43
* FP = кол-во белых шаров среди вытащенных = drawn_white = 841
* FN = кол-во красных шаров, которые остались в мешке = all_red - drawn_red = 596 - 43 = 553
* TN = кол-во белых шаров, которые остались в мешке = all_white - drawn_white = 24105 - 841 = 23264

In [72]:
# makeConfusionMatrix(target[0], annotation_prefix="fantom_aso", common_genes_set_length=24701)

# Urn model: what is in the bag and what was drawn from it.
all_balls = 24701
all_red = 596
drawn_all = 884
drawn_red = 43

# White counts follow from the totals.
all_white = all_balls - all_red
drawn_white = drawn_all - drawn_red

# Confusion-matrix cells expressed through the urn quantities.
TP, FP = drawn_red, drawn_white
FN, TN = all_red - drawn_red, all_white - drawn_white
[TP, FP, FN, TN, all_white, drawn_white]

[43, 841, 553, 23264, 24105, 841]

In [71]:
# Confusion-matrix cells derived from the urn model for the `pm` table.
TP, FP, FN, TN = 43, 841, 553, 23264

# Two-sided Fisher exact test on the 2x2 table; only the p-value is kept.
_, pv = stats.fisher_exact(
    [
        [TP, FP],
        [FN, TN],
    ]
)
pv

1.574219431726213e-05