In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import matplotlib as mpl
# PDF font type 42 (TrueType) and Arial as the default font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
# Widen the notebook page container to 80% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
from access_biology_data import annotation, gwas_studies, meta, phenotype_collections, properties, relations
from access_literature_data import medline
from access_mixed_data import genealacart
from access_science_shared import standardizer, utils

In [None]:
import sys
sys.path.append('./../src/')

import nar170604f_occurences as nar_attention
import nar170830f_predictions as forec
import resci_inout as rinout
import resci_tools as ret

import nar180310_mega_integrator as mega

In [None]:
# NCBI taxonomy identifier of the organism analysed here (9606 = Homo sapiens).
taxon_id = 9606

In [None]:
# Image-export switch; NOTE(review): not referenced by any cell visible in this notebook.
save_images=True

In [None]:
def export_table(file_base, df):
    """Export ``df`` as ``<file_base>.xlsx`` into the supplemental-tables folder.

    Parameters
    ----------
    file_base : str
        File name without extension; ``.xlsx`` is appended.
    df : pandas.DataFrame
        Table to export. It is passed on with ``save_index=True`` and
        ``insert_date_time=False``.
    """
    relative_path = '180326_make_supplemental_tables/{}.xlsx'.format(file_base)
    out_path = rinout.get_internal_path(relative_path)
    # Presumably creates the parent directory when missing -- confirm in resci_inout.
    rinout.ensure_presence_of_directory(out_path)
    ret.export_full_frame(
        out_path, df, insert_date_time=False, save_index=True)

In [None]:
# def export(file_base):
#     p = '180318_detail_on_clusters/{}'.format(file_base)
    
#     ppn = p + '.png'
#     rinout.ensure_presence_of_directory(ppn)
# #     ret.export_raster_image(ppn, dpi=300, insert_date_time=False)
    
#     ppd = p + '.pdf'
#     rinout.ensure_presence_of_directory(ppn)
# #     ret.export_image(ppd, insert_date_time=False)

In [None]:
# Reference gene identifiers; used below to list omitted genes and to filter the curated table.
ref_genes = mega.get_ref_genes()

In [None]:
# Publication statistics; the columns 'attention', 'attention_2015' and 'papers' are used below.
papers = mega.get_publications()

In [None]:
# Layout coordinates (columns 'x' and 'y' are used below), requested with a 45-degree rotation.
tsne_frame = mega.load_layout(rotation_degrees=45)

In [None]:
# log2 ratio of each row's attention to the mean attention over all rows.
mean_attention = papers['attention'].mean()
papers['enrichment_attention'] = np.log2(papers['attention'] / mean_attention)

In [None]:
# Same log2 enrichment, computed on the 'attention_2015' column.
mean_attention_2015 = papers['attention_2015'].mean()
papers['enrichment_attention_2015'] = np.log2(
    papers['attention_2015'] / mean_attention_2015)

In [None]:
# Group annotation from the integrator module; NOTE(review): `la` is not used by any cell visible here.
la = mega.load_group_annotation()

In [None]:
# Gene table (NCBI ID, symbol, cross-references) for the organism selected via
# `taxon_id`, instead of repeating the literal 9606 here.
gene_info = meta.gene_info(taxon_id, usecols=['gene_ncbi', 'symbol_ncbi', 'dbXrefs'])

In [None]:
# Rows whose cross-reference string mentions an Ensembl identifier.
# na=False keeps the mask strictly boolean when 'dbXrefs' is missing, so it
# can be used for .loc indexing without raising on NaN.
f = gene_info['dbXrefs'].str.contains('Ensembl:', na=False)

In [None]:
# For the flagged rows, take the first 'ENSG...' identifier from the cross-references.
ensembl_ids = gene_info.loc[f, 'dbXrefs'].str.extract(
    'Ensembl:(ENSG[0-9]*)', expand=False)
gene_info.loc[f, 'gene_ensembl'] = ensembl_ids

In [None]:
# Keep only the identifier columns; the raw 'dbXrefs' string is dropped here.
gene_info = gene_info[['gene_ncbi', 'symbol_ncbi', 'gene_ensembl']]

# Model

In [None]:
# Pooled table holding the observed target and the model predictions per gene.
# (An earlier variant read the target separately from
#  '170923f_predict_human_fame/170923_human_BioExp_log_papers/input/target.csv.gz'.)
p = rinout.get_internal_path(
    '170923f_predict_human_fame/170923_human_BioExp_log_papers/zgbrh_p90_e300/pooled_target_and_prediciton.csv.gz'
)
pooled = pd.read_csv(p).set_index('gene_ncbi')

# Separate the observed target from the remaining (prediction) columns.
df_target = pooled[['target']]
df_prediction = pooled.drop('target', axis=1)

# One predicted value per gene: the row-wise median over all prediction columns.
predicted = df_prediction.median(axis=1).to_frame('predicted')
df_model = pd.merge(
    df_target.reset_index(),
    predicted.reset_index(),
).set_index('gene_ncbi')

export_table('BioExp_publications_target_predictions', df_model)

# List of features

In [None]:
# Path of the feature table in the 'input' folder of the same prediction run.
p = rinout.get_internal_path(
    '170923f_predict_human_fame/170923_human_BioExp_log_papers/input/features.csv.gz'
)

In [None]:
# Table of features: boolean-like strings are mapped to 1/0, rows indexed by gene.
df = (
    pd.read_csv(p)
    .replace({'TRUE': 1, 'FALSE': 0, 'False': 0, 'True': 1})
    .set_index('gene_ncbi')
)

In [None]:
# Second name for the feature table before z-scoring (same object as `df` at this point, not a copy).
# NOTE(review): not referenced again in the cells visible here.
table_non_zscored_features = df

In [None]:
def normfun(x):
    """Z-score ``x`` using NaN-ignoring mean and standard deviation.

    The standard deviation is ``np.nanstd`` with its default ``ddof=0``.
    NaN entries of ``x`` remain NaN in the result.
    """
    return (x - np.nanmean(x)) / np.nanstd(x)

# Cast every feature to float, then z-score each column independently.
df = df.astype(float)
table_zscored = df.apply(normfun, axis=0)

export_table('z_scored_features', table_zscored)

In [None]:
# Reference genes that have no row in the feature table.
not_featured = set(ref_genes).difference(set(df.index))
is_omitted = gene_info['gene_ncbi'].isin(not_featured)
table_omitted_genes = gene_info[is_omitted].set_index('gene_ncbi')

export_table('table_omitted_genes', table_omitted_genes)

# High-level information on genes

In [None]:
%%time

# Loader functions of the integrator module, keyed by a short category name.
categs = {
    'gwas': mega.frequent_gwas,
    'gtx': mega.gtx,
    'lof': mega.LoF,
    'duf': mega.DUF,
    'extreme_swissprot': mega.extreme_swissprot,
    'orphan_disease': mega.orphan_disease,
    'rare_go': mega.rare_go,
    'signal_peptide': mega.signal_peptide,
    'rnai': mega.rnai_phenotypes,
    'rare_compounds': mega.rare_compounds,
    'bioplex_fame': mega.fame_in_bioplex,
    'challenged_proteins': mega.challenged_proteins,
    'detection_in_tissues': mega.detection_in_tissues,
    'detection_in_cells': mega.detection_in_cells,
    'westernblot_in_biogrid': mega.biogrid_western_blot,
    'presence_of_homologs': mega.presence_of_homologs,
    'fame_of_homologs': mega.fame_of_homologs,
    'pi_transition': mega.pi_transition,
    'supporting_nih_institutes': mega.supporting_nih_institutes,
    'fame_rank': mega.fame_rank,
}

# Every loader returns three objects. The first (kept in `cl`) is concatenated
# column-wise further below; the roles of `dd` and `ge` are not visible in this
# notebook -- see nar180310_mega_integrator.
cl, dd, ge = {}, {}, {}
for name, loader in categs.items():
    print(name)  # progress indicator
    cl[name], dd[name], ge[name] = loader()

In [None]:
def _any_of(frame, *columns):
    """Combine the named columns of ``frame`` with ``|``, left to right."""
    flags = frame[columns[0]]
    for column in columns[1:]:
        flags = flags | frame[column]
    return flags


# Place all per-category tables side by side (outer join on the gene index).
su = pd.concat(cl.values(), axis=1, join='outer')
su = su.rename_axis('gene_ncbi')

# Genetic evidence.
su['c_genetic_importance'] = _any_of(
    su, 'extremly_LoF_intolerant', 'any_prominent_gwas')

# Accessibility: signal peptide, or flagged for more than half of the tissues / cells.
su['in_most_tissues_1'] = su['fraction_of_tissues_1'].gt(0.5)
su['in_most_cells_1'] = su['fraction_of_cells_1'].gt(0.5)
su['c_accessibility'] = _any_of(
    su, 'signal_peptide', 'in_most_tissues_1', 'in_most_cells_1')

# Experimental coverage.
# NOTE(review): 'change_in_gtx' is True for every non-negative 'gtx_fold' --
# confirm this is the intended definition of a change.
su['change_in_gtx'] = su['gtx_fold'].ge(0)
su['c_experimental_coverage'] = _any_of(
    su, 'rnai_frequent', 'change_in_gtx', 'biogrid_western_blot')
su['easy_experimentation'] = _any_of(
    su, 'c_accessibility', 'c_experimental_coverage')

# Homolog present in yeast, or in fly / worm.
su['yeast'] = _any_of(
    su,
    'presence_Schizosaccharomyces pombe 972h-',
    'presence_Saccharomyces cerevisiae S288c')
su['worm_fly'] = _any_of(
    su, 'presence_Drosophila melanogaster', 'presence_Caenorhabditis elegans')
su['c_anxient'] = _any_of(su, 'yeast', 'worm_fly')

# Map coordinates of the Fig. 1 map (overview)

In [None]:
# Layout coordinates, annotated with paper counts and gene identifiers, ordered by gene ID.
# Both merges are left joins on the columns the two frames share.
fig_one_map = (
    tsne_frame.reset_index()
    .merge(papers[['papers']].reset_index(), how='left')
    .merge(gene_info, how='left')
    .sort_values('gene_ncbi')
)

In [None]:
# Index by gene ID for export.
# Not idempotent: re-running this cell fails once 'gene_ncbi' has become the index.
fig_one_map = fig_one_map.set_index('gene_ncbi')

export_table('map_coordiantes_tsne', fig_one_map)

# Curated feature sets

In [None]:
# The three composite flags that make up the curated table.
curated_columns = [
    'c_genetic_importance',
    'easy_experimentation',
    'c_anxient',
]
su_mini = su[curated_columns]

In [None]:
# Replace the internal working names with the column names used in the exported tables.
su_mini = su_mini.rename(columns={
    'c_genetic_importance': 'strong_genetic_evidence',
    'easy_experimentation': 'experimentally_well_accessible',
    'c_anxient': 'invertebrate_model_organism'
})

In [None]:
# Restrict the curated flags to the reference gene set.
su_mini = su_mini[su_mini.index.isin(ref_genes)]

In [None]:
# Curated flags with gene identifiers attached; the right join keeps every row of su_mini.
# NOTE(review): `ex_high_level` is not referenced again in the cells visible here.
ex_high_level = pd.merge(gene_info,su_mini.reset_index(), how='right').set_index('gene_ncbi')


In [None]:
# Genes for which all three curated flags evaluate to True.
meets_all_criteria = su_mini.all(axis=1)
genes_meeting_all = su_mini[meets_all_criteria].index
master = gene_info[gene_info['gene_ncbi'].isin(genes_meeting_all)]

In [None]:
# Raise 10 to every value of the model table.
# Presumably undoes a log10 transform of the paper counts (the data folder is
# named '..._log_papers') -- confirm.
d = 10 ** df_model

In [None]:
# log2 of predicted over observed value.
d['ratio'] = np.log2(d['predicted'] / d['target'])

In [None]:
# Give the ratio column a self-explanatory name for export.
d = d.rename(columns={'ratio': 'log2_predicted_over_observed'})

In [None]:
# Genes that satisfy all curated criteria and whose log2 predicted/observed ratio is positive.
is_understudied = d['log2_predicted_over_observed'] > 0
understudied = d.loc[is_understudied, ['log2_predicted_over_observed']].reset_index()
favorable_too_little_studied = (
    master
    .merge(understudied, how='inner')
    .set_index('gene_ncbi')
)

export_table('favorable_too_little_studied', favorable_too_little_studied)

In [None]:
# Curated flags joined to gene identifiers (inner join on shared columns); to export.
strategic_table = (
    gene_info
    .merge(su_mini.reset_index())
    .set_index('gene_ncbi')
)

export_table('strategic_table', strategic_table)

# Pairwise distance

In [None]:
# Boolean mask over the layout rows: True where the gene is in the favorable set.
# NOTE: rebinds `f`, which earlier held the Ensembl mask over gene_info.
f = tsne_frame.index.isin(favorable_too_little_studied.index)

In [None]:
# Layout rows of all other genes; copied so that columns added later do not touch tsne_frame.
coordinates_not_favorable = tsne_frame.loc[~f, :].copy()

In [None]:
# Layout rows of the favorable genes (independent copy of the tsne_frame slice).
coordinates_favorable = tsne_frame.loc[f, :].copy()

In [None]:
import numpy as np
from scipy.spatial.distance import cdist

In [None]:
# (x, y) tuples of the favorable genes, in the row order of coordinates_favorable.
others = list(zip(coordinates_favorable['x'], coordinates_favorable['y']))

In [None]:
%%time
# For every non-favorable gene, find the favorable gene with the smallest
# Euclidean distance in layout space.
#
# This is one vectorised cdist call instead of a per-row Python loop that wrote
# each result through `.loc`. The loop was slow, stored the gene IDs as floats
# (the new column started out as NaN), and never created the 'closest' column
# when there were no non-favorable rows.
favorable_xy = coordinates_favorable[['x', 'y']].values.astype(float)
not_favorable_xy = coordinates_not_favorable[['x', 'y']].values.astype(float)

if len(favorable_xy) == 0:
    raise ValueError(
        'No favorable genes available; cannot determine the closest one.')

# Distance matrix of shape (n_not_favorable, n_favorable), held in memory at once.
# argmin returns the first minimum of each row, i.e. the same tie-breaking as
# the former per-row argmin.
nearest = cdist(not_favorable_xy, favorable_xy).argmin(axis=1)
coordinates_not_favorable['closest'] = coordinates_favorable.index.values[nearest]

In [None]:
# Attach gene identifiers to the non-favorable genes (inner join on shared columns).
# NOTE: rebinds `d`, which held the predicted/observed table in earlier cells;
# re-running those earlier cells after this one will operate on a different frame.
d = pd.merge(gene_info, coordinates_not_favorable.reset_index()).set_index('gene_ncbi')

In [None]:
# Look up the identifiers of each gene's closest favorable gene.
# Overlapping columns coming from the right-hand table get the suffix '_closest'.
closest_lookup = gene_info.reset_index()
d = d.reset_index().merge(
    closest_lookup,
    how='left',
    left_on='closest',
    right_on='gene_ncbi',
    suffixes=('', '_closest'),
)

In [None]:
# Show the merged column names (interactive check before selecting the export columns).
d.columns

In [None]:
# Keep each gene's identifiers followed by those of its closest favorable gene.
export_columns = [
    'gene_ncbi', 'symbol_ncbi', 'gene_ensembl',
    'gene_ncbi_closest', 'symbol_ncbi_closest', 'gene_ensembl_closest',
]
d = d[export_columns]

In [None]:
# Index by the gene's own NCBI ID and export the nearest-favorable-gene table.
closest_promising_genes = d.set_index('gene_ncbi')
export_table('closest_promising_genes', closest_promising_genes)