In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import matplotlib as mpl
# Font type 42 makes the PDF backend embed TrueType fonts, so text in
# exported figures stays editable text rather than Type 3 outlines.
mpl.rcParams['pdf.fonttype'] = 42
# Default font family for all figure text.
mpl.rcParams['font.family'] = 'Arial'

In [None]:
# Inject CSS that forces elements of class `container` to 80% width.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
from access_biology_data import annotation, gwas_studies, meta, phenotype_collections, properties, relations
from access_literature_data import medline
from access_mixed_data import genealacart
from access_science_shared import standardizer, utils

In [None]:
import sys
sys.path.append('./../src/')

import nar170604f_occurences as nar_attention
import nar170830f_predictions as forec
import resci_inout as rinout
import resci_tools as ret

import nar180310_mega_integrator as mega

In [None]:
# Export switches: the plotting cells below only call `ret.export_image`
# when the matching flag is True.
# Guards the exports of the two bar plots.
save_images = False
# Guards the exports of the later pie and cumulative plots.
save_late_images = False

In [None]:
# Taxon analysed in this notebook (presumably an NCBI Taxonomy id, where
# 9606 is Homo sapiens -- TODO confirm against the data access layer).
taxon_id = 9606

In [None]:
# Reference genes as provided by the `mega` integrator module.
ref_genes = mega.get_ref_genes()

In [None]:
# Per-gene publication table from the `mega` integrator module; later cells
# read its `attention` column and use its index as gene identifiers.
papers = mega.get_publications()

# Collect interesting summary statistics

In [None]:
# Empty container meant to collect summary statistics.
summary_statistics = {}

In [None]:
# Same value as assigned to `taxon_id` earlier in the notebook.
taxon_id = 9606

In [None]:

def get_ref_genes(taxon_id=9606):
    """
    Return the reference genes of one taxon.

    Thin wrapper around ``standardizer.reference_genes`` with the reference
    code fixed to ``'orp'``.

    Input:
        taxon_id    taxon identifier handed to the standardizer; defaults
                    to 9606, the value that used to be hard-coded, so
                    calls without arguments behave as before

    Output:
        ref_genes   whatever ``standardizer.reference_genes`` returns for
                    that taxon
    """
    return standardizer.reference_genes(
        taxon_id=taxon_id,
        ref_code='orp')


def get_publications():
    """
    Build the per-gene publication summary for taxon 9606.

    Looks up the reference genes, fetches their gene-to-PubMed links
    restricted to papers of kind ``'research'``, and passes both to
    ``nar_attention.count_papers_and_attention``.

    Output:
        papers      result of ``nar_attention.count_papers_and_attention``
    """
    genes = get_ref_genes()
    gene2pubmed = medline.gene2pubmed(
        taxon_id=9606, paper_kind='research', ref_genes=genes)
    return nar_attention.count_papers_and_attention(genes, gene2pubmed)

In [None]:
%%time

# Registry: short category key -> loader function of the `mega` module.
# Every loader is called without arguments and returns three objects.
categs = {
    'gwas': mega.frequent_gwas,
    'gtx': mega.gtx,
    'lof': mega.LoF,
    'duf': mega.DUF,
    'extreme_swissprot': mega.extreme_swissprot,
    'orphan_disease': mega.orphan_disease,
    'rare_go': mega.rare_go,
    'signal_peptide': mega.signal_peptide,
    'rnai': mega.rnai_phenotypes,
    'rare_compounds': mega.rare_compounds,
    'bioplex_fame': mega.fame_in_bioplex,
    'challenged_proteins': mega.challenged_proteins,
    'detection_in_tissues': mega.detection_in_tissues,
    'detection_in_cells': mega.detection_in_cells,
    'westernblot_in_biogrid': mega.biogrid_western_blot,
    'presence_of_homologs': mega.presence_of_homologs,
    'fame_of_homologs': mega.fame_of_homologs,
    'pi_transition': mega.pi_transition,
    'supporting_nih_institutes': mega.supporting_nih_institutes,
    'fame_rank': mega.fame_rank,
    'any_gwas': mega.any_gwas,
}

# One dict per element of the loaders' return triple, keyed by category.
# `cl` holds the tables that are later concatenated column-wise; the
# contents of `dd` and `ge` are not used in this notebook.
cl, dd, ge = {}, {}, {}

for category, loader in categs.items():
    print(category)  # show which category is currently being loaded
    cl[category], dd[category], ge[category] = loader()

In [None]:
# def export(file_base):
#     p = '180310_mega_integrator/{}'.format(file_base)

#     ret.export_raster_image(p + '.png', dpi=300, insert_date_time=False)
#     ret.export_image(p + '.pdf', insert_date_time=False)


In [None]:
# Put all per-category tables side by side. The outer join keeps every index
# label that occurs in any table; cells without data become NaN.
su = pd.concat(list(cl.values()), axis=1, join='outer')
su = su.rename_axis('gene_ncbi')



In [None]:
# Detection breadth. For each suffix used below (1, 10, 100 -- presumably
# detection thresholds, TODO confirm) derive boolean flags from the
# `fraction_of_tissues_*` / `fraction_of_cells_*` columns:
#   in_most_*: fraction above 0.5;  in_any_*: fraction above 0.
# The flag without `tissues` / `cells` in its name is the OR of the two.
for flag, cutoff, suffixes in (
        ('in_most', 0.5, ('1', '10')),
        ('in_any', 0, ('1', '10', '100'))):
    for suffix in suffixes:
        in_tissues = su['fraction_of_tissues_' + suffix] > cutoff
        in_cells = su['fraction_of_cells_' + suffix] > cutoff
        su[flag + '_tissues_' + suffix] = in_tissues
        su[flag + '_cells_' + suffix] = in_cells
        su[flag + '_' + suffix] = in_tissues | in_cells

# Combine the per-organism `presence_*` columns into broader groups.
su['yeast'] = (
    su['presence_Schizosaccharomyces pombe 972h-']
    | su['presence_Saccharomyces cerevisiae S288c'])
su['worm_fly'] = (
    su['presence_Drosophila melanogaster']
    | su['presence_Caenorhabditis elegans'])
su['c_anxient'] = su['yeast'] | su['worm_fly']

su['c_accessibility'] = (
    su['signal_peptide'] | su['in_most_tissues_1'] | su['in_most_cells_1'])

# Two cut-offs on the same `gtx_fold` column.
su['strong_change_in_gtx'] = su['gtx_fold'].ge(1)
su['change_in_gtx'] = su['gtx_fold'].ge(0)

su['c_experimental_coverage'] = (
    su['rnai_frequent'] | su['change_in_gtx'] | su['biogrid_western_blot'])

su['easy_experimentation'] = (
    su['c_accessibility'] | su['c_experimental_coverage'])

# Comparing with False (rather than negating) also maps missing values to
# False.
su['bp2_mean_studied'] = su['bp2_mean_unstudied'].eq(False)

# Composite `c_*` criteria: each is the OR of its member flags.
su['c_genetic_importance'] = (
    su['extremly_LoF_intolerant'] | su['any_prominent_gwas'])
su['c_close_to_interesting'] = (
    su['extreme_swissprot'] | su['has_duf'] | su['bp2_mean_studied'])
su['c_societal_support'] = (
    su['will_be_pi'] | su['recently_supporting_institutes'])

In [None]:
# [print(x) for x in su.columns];

In [None]:
# Flag genes whose `gtx_fold` percentile rank exceeds 0.8. With
# na_option='top' missing values receive the lowest ranks.
gtx_fold_percentile = su['gtx_fold'].rank(pct=True, na_option='top')
su['frequent_fold'] = gtx_fold_percentile > 0.8

In [None]:
# c = [
#     'any_prominent_gwas',
#     'extremly_LoF_intolerant',
#     'any_gwas',
# #     'in_any_1',
#     'in_most_1',
#     'in_any_10',
# #     'in_most_10',    
#     'strong_change_in_gtx',
#     'change_in_gtx',
#     'presence_Saccharomyces cerevisiae S288c',
#     'presence_Schizosaccharomyces pombe 972h-',
#     'presence_Caenorhabditis elegans',
#     'presence_Drosophila melanogaster',
#     'rnai_frequent',
#     'biogrid_western_blot',
#     'bp2_mean_studied',
# ]



# Individual indicator columns of `su` that are compared between studied and
# unstudied genes. Their order is the order of the `origin` values in `df`.
co = [
    'rnai_frequent',
    'any_prominent_gwas',
    'extremly_LoF_intolerant',
    'any_gwas',
    'in_most_1',
    'in_any_10',
    'in_any_100',
    'presence_Saccharomyces cerevisiae S288c',
    'presence_Schizosaccharomyces pombe 972h-',
    'presence_Caenorhabditis elegans',
    'presence_Drosophila melanogaster',
    'extreme_swissprot',
    'frequent_fold',
    'biogrid_western_blot',
    'bp2_mean_studied',
]

# High-level categories: (new column name, member columns). A gene satisfies
# a category if any of its member columns is True. Kept as a list of pairs so
# that the column order is explicit.
high_level_groups = [
    ('something_lof', [
        'rnai_frequent',
        'any_prominent_gwas',
        'extremly_LoF_intolerant',
        'any_gwas',
    ]),
    ('something_expression', [
        'in_most_1',
        'in_any_10',
        'in_any_100',
    ]),
    ('something_invertebrate', [
        'presence_Saccharomyces cerevisiae S288c',
        'presence_Schizosaccharomyces pombe 972h-',
        'presence_Caenorhabditis elegans',
        'presence_Drosophila melanogaster',
    ]),
    ('something_biophysics', ['extreme_swissprot']),
    ('something_gtx', ['frequent_fold']),
    ('something_interaction', [
        'biogrid_western_blot',
        'bp2_mean_studied',
    ]),
]


def _presence_by_origin(table, genes, columns, groups, hue):
    """
    Build the long-format presence records for one set of genes.

    Input:
        table       DataFrame indexed by gene, holding the indicator columns
        genes       list of gene identifiers (row labels) to include
        columns     indicator columns to take from `table`
        groups      list of (name, member columns); every entry adds a
                    column `name` that is True if any member is True
        hue         label written to the `hue` column of every record

    Output:
        mini        wide table: `columns`, then 'something' (any of
                    `columns`), then one column per group
        frames      list with one DataFrame per column of `mini`, each with
                    the gene index as a column plus 'presence', 'origin'
                    (the column name) and 'hue'
    """
    # Rows are selected with `.reindex` rather than `.loc[genes, ...]`:
    # genes absent from `table` become all-NaN rows (turned into False
    # below) instead of raising the KeyError that `.loc` raises for missing
    # labels in pandas >= 1.0. Unknown column names still raise.
    mini = table.loc[:, columns].reindex(genes).fillna(False)

    # Must be computed before the group columns are added, so that it only
    # reflects the individual indicators.
    mini['something'] = mini.any(axis=1)
    for name, members in groups:
        mini[name] = mini[members].any(axis=1)

    frames = [
        mini[[origin]]
        .reset_index()
        .rename(columns={origin: 'presence'})
        .assign(origin=origin, hue=hue)
        for origin in mini.columns
    ]
    return mini, frames


# Same computation for both gene sets: genes with an `attention` of at least
# 1 are labelled 'studied', genes below 1 'unstudied'.
agg = []
for hue, is_selected in (
        ('studied', papers['attention'] >= 1),
        ('unstudied', papers['attention'] < 1)):
    genes_of_interest = list(papers[is_selected].index)
    mini, frames = _presence_by_origin(
        su, genes_of_interest, co, high_level_groups, hue)
    agg.extend(frames)

# Long table with one row per (gene, origin, hue).
df = pd.concat(agg)

# Grouped bar plot over all indicators and categories: one bar per `origin`
# and `hue`, showing seaborn's default aggregate of `presence`.
plt.figure(figsize=(20, 10))
sns.barplot(
    data=df,
    x='origin',
    y='presence',
    hue='hue',
    hue_order=['unstudied', 'studied'],
)

if save_images:
    ret.export_image('180320_explorability/fraction_either_category_all_expanded.pdf')

In [None]:
# `origin` values shown in the condensed plot: the six high-level categories
# followed by the overall 'something' column.
sub = [
    'something_lof',
    'something_expression',
    'something_invertebrate',
    'something_biophysics',
    'something_gtx',
    'something_interaction',
    'something',
]

In [None]:
# Same grouped bar plot, restricted to the rows whose `origin` is in `sub`.
high_level_rows = df[df['origin'].isin(sub)]

plt.figure(figsize=(20, 10))
sns.barplot(
    data=high_level_rows,
    x='origin',
    y='presence',
    hue='hue',
    hue_order=['unstudied', 'studied'],
)

if save_images:
    ret.export_image('180320_explorability/fraction_high_levels.pdf')

In [None]:
# From here on `sub` holds only the six high-level categories (without the
# overall 'something').
sub = [
    'something_lof',
    'something_expression',
    'something_invertebrate',
    'something_biophysics',
    'something_gtx',
    'something_interaction',
]

# Row mask: unstudied genes, high-level categories only.
is_unstudied = df['hue'] == 'unstudied'
is_high_level = df['origin'].isin(sub)
f = is_unstudied & is_high_level

In [None]:
# Rows of `df` selected by the boolean mask `f`.
dff = df.loc[f]

In [None]:
# Per gene, add up the `presence` flags of the selected rows, i.e. count how
# many of the categories in `dff` apply to the gene.
# `.sum()` replaces `.agg(sum)`: handing the builtin `sum` to `agg` is
# deprecated in recent pandas and emits a FutureWarning; the grouped result
# is the same.
h = dff[['gene_ncbi', 'presence']].groupby('gene_ncbi').sum()

In [None]:
# Number of genes for each per-gene category count, ordered by that count.
genes_per_count = h['presence'].value_counts()
hh = genes_per_count.sort_index()

In [None]:
# Turn the values of `hh` into fractions of their total.
hh = hh.div(hh.sum())

In [None]:
# Display `hh`: fraction of the genes in `h` per number of applicable categories.
hh

In [None]:
# Genes whose summed `presence` is 6, i.e. all six categories in `sub` apply.
h[h['presence']==6]

In [None]:
# Cumulative fraction: share of genes with at most the given number of categories.
hh.cumsum()

In [None]:
# Complement of each entry of `hh`: share of genes whose category count
# differs from the given value.
1-hh

In [None]:
# Display `hh` again (fractions per category count).
hh

In [None]:
# Number of genes summarised in `h`. This cell used to contain a bare `n`,
# a name that is never defined in this notebook, so it raised NameError and
# stopped a fresh "Restart & Run All".
len(h)

In [None]:
# Switch on the export of the figures produced below.
save_late_images = True

In [None]:
import matplotlib.pyplot as plt




# Pie chart of `hh`: one wedge per category count (index of `hh`), sized by
# the fraction of genes with that count.
labels = [str(x) for x in hh.index]
sizes = [100*x for x in hh.values]
patches, texts = plt.pie(sizes)
plt.legend(patches, labels, loc="best")
plt.axis('equal')  # equal axis scaling, so the pie is drawn as a circle
plt.tight_layout()

if save_late_images:
    # The pie chart gets its own file name. It used to be exported as
    # 'fraction_either_category_all_expanded.pdf', the name already used for
    # the expanded bar plot, so the two figures could not be told apart.
    ret.export_image('180320_explorability/fraction_number_of_categories_pie.pdf')

In [None]:
# Genes for which all six categories apply (summed `presence` equals 6).
h[h['presence']==6]

In [None]:
# Share of genes with more than the given number of applicable categories.
1-hh.cumsum()

In [None]:
# Display `hh` once more (fractions per category count).
hh

In [None]:
# Bar chart: for every category count, the share of genes with more than
# that many applicable categories.
# NOTE(review): the x label 'dummy' looks like a placeholder -- confirm
# whether it is replaced after export.
ax = (1 - hh.cumsum()).plot(kind='bar')
ax.set_xlabel('dummy')
ax.set_ylabel('Fraction')

if save_late_images:
    ret.export_image('180320_explorability/fraction_cumulative.pdf')

In [None]:
# Values behind the bar chart: share of genes exceeding each category count.
1-hh.cumsum()

In [None]:
# Fraction of the genes in `h` to which at least three categories apply.
h[h['presence']>=3].shape[0]/len(h)