In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# re-imported before each cell runs and edits to local source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import matplotlib as mpl
# Figure export settings applied to every plot in this notebook:
# pdf.fonttype 42 selects TrueType font embedding for PDF output (see the
# matplotlib rcParams documentation), and all figure text is set in Arial.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
# Widen the notebook's `.container` element to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
from access_biology_data import annotation, gwas_studies, meta, phenotype_collections, properties, relations
from access_literature_data import medline
from access_mixed_data import genealacart
from access_science_shared import standardizer, utils

In [None]:
import sys
# Add the sibling `src` directory to the import path. The path is relative, so
# it is resolved against the notebook's working directory.
sys.path.append('./../src/')

# Project-local modules — presumably found through the path added above; TODO confirm.
import nar170604f_occurences as nar_attention
import nar170830f_predictions as forec
import resci_inout as rinout
import resci_tools as ret

# Integrator module: supplies the reference genes, the publication table and the
# annotation-category loaders called in the cells below.
import nar180310_mega_integrator as mega

In [None]:
# NCBI taxonomy identifier of the organism under study (9606 = Homo sapiens).
# NOTE(review): not referenced by any later cell visible in this notebook.
taxon_id = 9606

In [None]:
# Reference gene set as provided by the integrator module.
# NOTE(review): `ref_genes` is not used by any later cell visible in this notebook.
ref_genes = mega.get_ref_genes()

In [None]:
# Per-gene publication table from the integrator module. Later cells read its
# 'attention' column and join it to other tables on 'gene_ncbi'.
papers = mega.get_publications()

In [None]:
# Empty container for summary statistics.
# NOTE(review): nothing is written to it in the cells visible here.
summary_statistics = {}

In [None]:
# log2 ratio of each gene's attention to the mean attention over all genes.
papers['papers_enrichment'] = np.log2(
    papers['attention'].div(papers['attention'].mean())
)

In [None]:
# NOTE(review): repeats the `taxon_id = 9606` assignment made in an earlier cell; redundant.
taxon_id=9606

In [None]:

# def get_ref_genes():
#     ref_genes = standardizer.reference_genes(
#         taxon_id=9606,
#         ref_code='orp')
#     return ref_genes


# def get_publications():
#     ref_genes = get_ref_genes()
#     ref_gene2pubmed = medline.gene2pubmed(
#         taxon_id=9606,
#         paper_kind='research',
#         ref_genes=ref_genes)

#     papers = nar_attention.count_papers_and_attention(
#         ref_genes,
#         ref_gene2pubmed)

#     return papers

In [None]:
# Attention decile of every gene: the percentile rank (a fraction in (0, 1]) is
# scaled to 10 and rounded up, giving bins 1 (least attention) to 10 (most).
papers['rank'] = np.ceil(10 * papers['attention'].rank(pct=True))

In [None]:
# Table returned by a private helper of the integrator module; presumably the
# NCBI <-> Ensembl gene identifier mapping, as the helper name suggests — TODO confirm.
gi = mega._get_gene_ncbi_2_ensembl()
# Its 'gene_ncbi' entries are used below as an inclusion filter for genes.
# NOTE(review): the variable name implies a one-to-one NCBI/Ensembl match; verify in `mega`.
unambiguous_between_ncbi_and_ensembl = gi['gene_ncbi']

In [None]:
%%time

cl = dict()
dd = dict()
ge = dict()


categs = {
    'gwas_frequent': mega.frequent_gwas,
    'gwas_any': mega.any_gwas,
#     'gtx': mega.gtx,
    'lof': mega.LoF,
#     'orphan_disease': mega.orphan_disease,
#     'unified_disease': mega.unified_disease,
#     'human_phenotype': mega.human_phenotype,
#     'omim_disease': mega.omim_disease,
#     'rare_go': mega.rare_go,
#     'rnai': mega.rnai_phenotypes,
#     'detection_in_tissues': mega.detection_in_tissues,
#     'detection_in_cells': mega.detection_in_cells,
#     'presence_of_homologs': mega.presence_of_homologs,
}

for k, i in categs.items():
    print(k)
    a, b, c = i()
    cl[k] = a
    dd[k] = b
    ge[k] = c

In [None]:
# Columns of the concatenated annotation table that are analysed and plotted
# below. Commented-out entries are alternatives that are currently switched off.
cat_to_plot = [
#     'presence_Saccharomyces cerevisiae S288c',
#     'rnai_frequent',
#     'always_expressed',
#     'has_rare_go',
    'extremly_LoF_intolerant',
    'any_prominent_gwas',
    'any_gwas',
#     'has_omim_disease',
#     'has_unified_disease',
#     'has_human_phenotype',
#     'has_orphan_disease'
#     'gtx_fraction'
]

In [None]:
# Put all loaded category tables side by side (outer join on their indices),
# name the shared index 'gene_ncbi', and keep only the columns chosen for plotting.
su = (
    pd.concat(list(cl.values()), axis=1, join='outer')
    .rename_axis('gene_ncbi')
    .loc[:, cat_to_plot]
)

In [None]:
# Preview the publication table, now including the columns added above.
papers.head()

In [None]:
# Preview the selected annotation columns.
su.head(5)

In [None]:
# Divide every annotation column by the gene's attention; rows are matched to
# `papers` through the index.
mm = su.div(papers['attention'], axis='index')

In [None]:
# Preview the attention-scaled annotation values.
mm.head()

In [None]:

# Point plot of the mean 'presence' of each annotation per attention decile.
# NOTE(review): the values come from `mm` (annotation flag divided by attention),
# so 'presence' is an attention-scaled value, not a raw flag — confirm this is intended.

# Loop-invariant, so flattened once instead of on every iteration.
papers_flat = papers.reset_index()

agg = []
for t in cat_to_plot:
    # Join explicitly on the gene id rather than on whichever columns the two
    # frames happen to share.
    m = pd.merge(papers_flat, mm[[t]].reset_index(), on='gene_ncbi', how='left')
    # Genes without a value count as 0. Filling with a number (not False) and
    # casting keeps the column float instead of an object column mixing floats
    # and bools; the plotted means are unchanged because False == 0.
    m[t] = m[t].fillna(0).astype(float)
    m = m[['gene_ncbi', 'rank', t]].rename(columns={t: 'presence'})
    m['annotation'] = t
    agg.append(m)

# ignore_index gives the stacked frame a unique index instead of repeating the
# row labels of every per-annotation frame.
df = pd.concat(agg, ignore_index=True)
# Keep genes listed in the NCBI/Ensembl table and in the third output of the 'lof' loader.
df = df[df['gene_ncbi'].isin(unambiguous_between_ncbi_and_ensembl)]
df = df[df['gene_ncbi'].isin(ge['lof'])]

fig, ax = plt.subplots(figsize=(5, 4))
sns.pointplot(x='rank', y='presence', data=df, hue='annotation', ax=ax)
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

ret.export_image('180319_enrichment_for_important_biology/presence_by_attention.pdf')



In [None]:
# Inner join of the publication table with the selected annotation columns;
# with no keys given, pandas joins on the columns both frames share.
master = papers.reset_index().merge(su.reset_index())

In [None]:
# Keep only the columns that enter the fold-enrichment analysis, plus the gene id.
master = master.loc[:, [
    'attention',
    'rank',
    'extremly_LoF_intolerant',
    'any_prominent_gwas',
    'any_gwas',
    'gene_ncbi',
]]

In [None]:
# master = master.set_index(['gene_ncbi', 'rank'])

In [None]:
# Drop genes whose NCBI id is not listed in the `gi` table.
master = master.loc[master['gene_ncbi'].isin(gi['gene_ncbi'])]

In [None]:
# Replace missing values with False, e.g. for genes absent from one of the
# outer-joined annotation tables.
# NOTE(review): the fill applies to every column, including 'attention' and 'rank'.
master = master.fillna(False)

In [None]:
# Move the gene id and decile into the index so only value columns remain.
# Not idempotent: re-running this cell fails once the two columns are already in the index.
master = master.set_index(['gene_ncbi', 'rank'])

In [None]:
# Preview the indexed table.
master.head(2)

In [None]:
# Express every column relative to its own mean over all genes, then force a
# float dtype on the result.
norm_master = master.div(master.mean(), axis='columns').astype(float)

In [None]:
# m = norm_master.astype(float).apply(lambda x: np.log2(x))

In [None]:
# Flatten the index back into columns and discard the gene id, leaving 'rank'
# plus the normalised value columns.
h = norm_master.reset_index().drop(columns='gene_ncbi')

In [None]:
# Per-decile mean of the mean-normalised values, on a log2 scale: the log2
# fold enrichment of each decile relative to the average over all genes.
hh = (
    h.groupby('rank')
    .agg(lambda col: col.mean())
    .apply(np.log2)
)

In [None]:
# Turn the 'rank' index into an ordinary column; reset_index already returns a
# new frame, so `hh` itself is left untouched.
d = hh.reset_index()

In [None]:
# Reshape the per-decile table from wide (one column per measure) to long form
# with columns rank / value / category, as needed for seaborn's `hue`.
# A single melt keeps the name `dd` bound to the dict of integrator outputs
# filled in an earlier cell, and gives `u` a unique index.
u = d.melt(
    id_vars='rank',
    value_vars=['attention', 'extremly_LoF_intolerant', 'any_prominent_gwas', 'any_gwas'],
    var_name='category',
    value_name='value',
)[['rank', 'value', 'category']]

In [None]:
# Point plot of log2 fold enrichment per attention decile, one line per measure.
ax = sns.pointplot(data=u, x='rank', y='value', hue='category')
# Place the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ret.export_image('180319_enrichment_for_important_biology/fold_enrichment.pdf')


In [None]:
# agg=[]
# for j in np.arange(100):
#     hs = h.sample(frac=1, replace=True)
#     hh = hs.groupby('rank').agg(lambda x: np.mean(x)).apply(lambda x: np.log2(x))
#     agg.append(hh)

In [None]:
# d = pd.concat(agg).reset_index()

In [None]:
# agg = []
# for c in ['attention', 'extremly_LoF_intolerant', 'any_prominent_gwas', 'any_gwas']:
#     dd = d[['rank', c]].rename(columns={c: 'value'})
#     dd.loc[:, 'category'] = c
#     agg.append(dd)

# u = pd.concat(agg)

In [None]:
# sns.pointplot(x='rank', y='value',data=u, hue='category');

