This script analyses the clusters of the tSNE layout. Note that, in contrast to the initial submission, where an external program was used, the clusters now no longer miss some of the genes they contain.

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import matplotlib as mpl
# Font type 42 makes the PDF backend embed text as TrueType (Type 42) fonts
# (see the matplotlib rcParams reference).
mpl.rcParams['pdf.fonttype'] = 42
# Use Arial for all figure text.
mpl.rcParams['font.family'] = 'Arial'

In [None]:
from IPython.core.display import display, HTML
# Inject CSS that sets the width of the notebook's ".container" element to 80%.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
from access_biology_data import annotation, gwas_studies, meta, phenotype_collections, properties, relations
from access_literature_data import medline
from access_mixed_data import genealacart
from access_science_shared import standardizer, utils

In [None]:
import sys
sys.path.append('./../src/')

import nar170604f_occurences as nar_attention
import nar170830f_predictions as forec
import resci_inout as rinout
import resci_tools as ret

import nar180310_mega_integrator as mega

In [None]:
# Taxon analysed in this notebook (presumably an NCBI taxonomy ID; 9606 is Homo sapiens).
taxon_id = 9606

In [None]:
# Switch read by the plotting cells that check it before writing figures to disk.
save_images=True

In [None]:
def export(file_base):
    """Write a figure as both PNG and PDF below ``180318_detail_on_clusters/``.

    Parameters
    ----------
    file_base : str
        File name without extension; ``.png`` and ``.pdf`` are appended.

    Notes
    -----
    The actual writing is delegated to ``ret.export_raster_image`` (PNG,
    300 dpi) and ``ret.export_image`` (PDF), both without a date/time
    insert. Presumably these helpers save the current matplotlib figure
    and resolve the relative path against a project output folder -
    confirm in ``resci_tools``.
    """
    p = '180318_detail_on_clusters/{}'.format(file_base)

    ppn = p + '.png'
    rinout.ensure_presence_of_directory(ppn)
    ret.export_raster_image(ppn, dpi=300, insert_date_time=False)

    ppd = p + '.pdf'
    # Check the directory of the PDF path itself (the original re-checked
    # the PNG path here).
    rinout.ensure_presence_of_directory(ppd)
    ret.export_image(ppd, insert_date_time=False)

In [None]:
# Reference genes as provided by the mega integrator module.
ref_genes = mega.get_ref_genes()

In [None]:
# Publication table; later cells read its 'attention' and 'attention_2015' columns.
papers = mega.get_publications()

In [None]:
# Layout table loaded with a rotation of 45 degrees; its 'x' and 'y' columns are plotted below.
tsne_frame = mega.load_layout(rotation_degrees=45)

In [None]:
# log2 of each row's attention relative to the mean attention over all rows.
papers['enrichment_attention'] = papers['attention'].pipe(
    lambda s: np.log2(s / s.mean()))

In [None]:
# Same log2-ratio-to-mean transformation, applied to the 'attention_2015' column.
papers['enrichment_attention_2015'] = papers['attention_2015'].pipe(
    lambda s: np.log2(s / s.mean()))

In [None]:
# Group annotation; later cells read its 'list_code' and 'gene_ncbi' columns.
la = mega.load_group_annotation()

In [None]:
# Left-join the group annotation onto the layout so that every layout row is kept;
# rows without a match get NaN in the annotation columns. No join key is given,
# so pandas merges on the columns the two frames share.
viz = pd.merge(tsne_frame.reset_index(), la, how='left')

In [None]:
# Whole layout coloured by group; missing values (e.g. the group code of
# unannotated rows) are replaced by the placeholder -1. Small markers.
sns.lmplot(
    data=viz.fillna(-1),
    x='x',
    y='y',
    hue='list_code',
    palette='Set1',
    fit_reg=False,
    scatter_kws={'s': 1},
)

In [None]:
# Only rows without any missing value (i.e. annotated rows), 'Set1' colours.
sns.lmplot(
    data=viz.dropna(),
    x='x',
    y='y',
    hue='list_code',
    palette='Set1',
    fit_reg=False,
)

In [None]:
# Same plot of the annotated rows, drawn with the 'Paired' palette.
sns.lmplot(
    data=viz.dropna(),
    x='x',
    y='y',
    hue='list_code',
    palette='Paired',
    fit_reg=False,
)

In [None]:
# Feature matrix from the mega integrator; the heatmap cell below selects its rows
# by matching the index against 'gene_ncbi' values.
clustered_features = mega.get_clustered_zscored_features()

In [None]:
# Draw one feature heatmap per annotated group (positive list codes only).
for u in sorted(la['list_code'].unique()):

    # Written as "not u > 0" so that NaN codes are skipped exactly like
    # non-positive ones.
    if not u > 0:
        continue

    print(u)

    group_genes = la.loc[la['list_code'] == u, 'gene_ncbi'].values
    in_group = clustered_features.index.isin(group_genes)

    sns.heatmap(
        data=clustered_features.loc[in_group, :],
        vmin=-3,
        vmax=3,
        cmap='PiYG')

    # Respect the notebook-wide switch, like every other exporting cell.
    if save_images:
        export('subcluster_{}'.format(u))

    plt.show()

In [None]:
%%time

# One loader per feature category. Every loader is called without arguments
# and returns three objects, which are collected per category below.
categs = {
    'gwas': mega.frequent_gwas,
    'gtx': mega.gtx,
    'lof': mega.LoF,
    'duf': mega.DUF,
    'extreme_swissprot': mega.extreme_swissprot,
    'orphan_disease': mega.orphan_disease,
    'rare_go': mega.rare_go,
    'signal_peptide': mega.signal_peptide,
    'rnai': mega.rnai_phenotypes,
    'rare_compounds': mega.rare_compounds,
    'bioplex_fame': mega.fame_in_bioplex,
    'challenged_proteins': mega.challenged_proteins,
    'detection_in_tissues': mega.detection_in_tissues,
    'detection_in_cells': mega.detection_in_cells,
    'westernblot_in_biogrid': mega.biogrid_western_blot,
    'presence_of_homologs': mega.presence_of_homologs,
    'fame_of_homologs': mega.fame_of_homologs,
    'pi_transition': mega.pi_transition,
    'supporting_nih_institutes': mega.supporting_nih_institutes,
    'fame_rank': mega.fame_rank,
}

# First, second and third return value of every loader, keyed by category name.
cl = {}
dd = {}
ge = {}

for name, loader in categs.items():
    print(name)  # progress indicator
    cl[name], dd[name], ge[name] = loader()

In [None]:
# Column-wise outer join of the first return value of every category loader;
# the index of the combined table is named 'gene_ncbi'.
su = pd.concat(cl.values(), axis=1, join='outer').rename_axis('gene_ncbi')

In [None]:
# Derived summary flags. The "|" combinations below presumably operate on
# boolean columns - TODO confirm dtypes of the loader outputs.

# True where the fraction exceeds one half (NaN compares as False).
su['in_most_tissues_1'] = su['fraction_of_tissues_1'] > 0.5
su['in_most_cells_1'] = su['fraction_of_cells_1'] > 0.5

# Homolog presence, pooled per organism group.
su['yeast'] = su['presence_Schizosaccharomyces pombe 972h-'] | su['presence_Saccharomyces cerevisiae S288c']
su['worm_fly'] = su['presence_Drosophila melanogaster'] | su['presence_Caenorhabditis elegans']
# NOTE(review): 'c_anxient' is presumably a misspelling of "ancient"; the key is
# read by later cells, so it is left unchanged here.
su['c_anxient'] = su['yeast'] | su['worm_fly']
su['c_accessibility'] = su['signal_peptide'] | su['in_most_tissues_1'] | su['in_most_cells_1']
# True for every non-negative 'gtx_fold'; missing values compare as False.
# NOTE(review): confirm that ">= 0" is the intended definition of a change.
su['change_in_gtx'] = su['gtx_fold'] >= 0
su['c_experimental_coverage'] = su['rnai_frequent'] | su['change_in_gtx'] | su['biogrid_western_blot']

su['easy_experimentation'] = su['c_accessibility'] | su['c_experimental_coverage']

# True only where 'bp2_mean_unstudied' is explicitly False; missing values yield False.
su['bp2_mean_studied'] = su['bp2_mean_unstudied'] == False

su['c_genetic_importance'] = su['extremly_LoF_intolerant'] | su['any_prominent_gwas']
su['c_close_to_interesting'] = su['extreme_swissprot'] | su['has_duf'] | su['bp2_mean_studied']
su['c_societal_support'] = su['will_be_pi'] | su['recently_supporting_institutes']

In [None]:
# Load the group annotation again (the same call is made earlier in this notebook).
la = mega.load_group_annotation()

In [None]:
# Gene ID / symbol table for the taxon analysed in this notebook; uses the
# notebook-level taxon_id instead of repeating the literal 9606.
gi = meta.gene_info(taxon_id, usecols=['gene_ncbi', 'symbol_ncbi'])

In [None]:
# For every annotated group (positive list code): cluster its genes by a few
# summary columns and export the clustermap in a green and a red colour map.
# ('c_societal_support' and 'will_be_pi' are deliberately not included.)
summary_columns = [
    'c_genetic_importance',
    'easy_experimentation',
    'c_anxient',
    'attention_rank',
]

# (colour map, file-name template) for the two exported variants.
colour_variants = (
    ('YlGn', 'group_{}_green'),
    ('autumn_r', 'group_{}_red'),
)

for code in la['list_code'].unique():

    if code > 0:

        print(code)
        genes_of_interest = la.loc[la['list_code'] == code, 'gene_ncbi'].unique()

        # Missing values become False, then everything is cast to float for plotting.
        d = su.loc[genes_of_interest, summary_columns].fillna(False).astype(float)

        # Label rows by gene symbol instead of gene ID.
        d = (
            d.reset_index()
            .merge(gi)
            .drop('gene_ncbi', axis=1)
            .set_index('symbol_ncbi')
        )

        for cmap_name, name_template in colour_variants:
            sns.clustermap(
                d,
                cmap=cmap_name,
                method='ward',
                col_cluster=False)

            if save_images:
                export(name_template.format(code))
            plt.close()

    plt.show()

# Add visualization of selected clusters

In [None]:
# Gene ID / symbol table (same table as loaded earlier in this notebook); uses
# the notebook-level taxon_id instead of repeating the literal 9606.
gi = meta.gene_info(taxon_id, usecols=['gene_ncbi', 'symbol_ncbi'])

In [None]:
# Peek at the first rows of the layout table.
tsne_frame.head()

In [None]:
# For every annotated group, plot its genes in layout space coloured by
# attention enrichment: once plain with a colour bar, once with gene symbols.

# The merged table is the same for every group, so it is built once up front.
to_plot = pd.merge(tsne_frame.reset_index(), papers.reset_index())
to_plot = pd.merge(to_plot, gi, how='left')

zoom_size = 10  # marker size of the coloured points

for code in la['list_code'].unique():

    # Written as "not code > 0" so that NaN codes are skipped exactly like
    # non-positive ones.
    if not code > 0:
        continue

    genes_of_interest = la[la['list_code'] == code]['gene_ncbi'].unique()
    to_plot['in_subgroup'] = to_plot['gene_ncbi'].isin(genes_of_interest)
    f = to_plot['in_subgroup']
    subgroup = to_plot.loc[f]

    # --- plain version with colour bar ---
    plt.figure(figsize=(8, 6))

    plt.scatter(
        x=subgroup['x'],
        y=subgroup['y'],
        s=zoom_size,
        c=subgroup['enrichment_attention'],
        cmap='coolwarm',
        edgecolors='none',
        vmin=-6,
        vmax=6)

    plt.colorbar()

    if save_images:
        ret.export_image('180318_detail_on_clusters/subgroup_intelligence_{}_plain.pdf'.format(int(code)))

    # --- labelled version ---
    # plt.subplots() already creates the figure; the extra plt.figure() call
    # that preceded it only produced an empty figure and has been removed.
    fig, ax = plt.subplots(figsize=(8, 6))

    # NOTE(review): this grey layer uses the same subgroup rows as the coloured
    # layer on top of it, so it is fully covered. If a background of the whole
    # layout was intended, it would need the unfiltered table - confirm.
    ax.scatter(
        x=subgroup['x'],
        y=subgroup['y'],
        s=1,
        color='lightgrey')

    ax.scatter(
        x=subgroup['x'],
        y=subgroup['y'],
        s=zoom_size,
        c=subgroup['enrichment_attention'],
        cmap='coolwarm',
        edgecolors='none',
        vmin=-6,
        vmax=6)

    # Write the gene symbol next to every point.
    for txt, x_pos, y_pos in zip(
            subgroup['symbol_ncbi'].values,
            subgroup['x'].values,
            subgroup['y'].values):
        ax.annotate(txt, (x_pos, y_pos))

    if save_images:
        ret.export_image('180318_detail_on_clusters/subgroup_intelligence_{}_labelled.pdf'.format(int(code)))



    # plt.colorbar()
    # plt.title(code)

    # export('prominent_gwas_with_label')
    # # fig.colorbar(fig, ax=ax)
    # ax.set_title('Gaussian noise with vertical colorbar')

In [None]:
# Display the assembled summary table.
su

In [None]:
import calculators

In [None]:
# Functional (GO) enrichment of every annotated group against all other rows
# of the group annotation.

# GO annotation for the taxon analysed in this notebook; uses the notebook-level
# taxon_id instead of repeating the literal 9606.
go_annotation = annotation.go(
    taxon_id=taxon_id,
    category=['Function', 'Process', 'Component'],
    any_negating_support=[False],
    temporary_evidence=[False],
    unmapped_evidence=[False])

# Bring the annotation into the column naming passed to the enrichment helper.
labelling = go_annotation[['gene_ncbi', 'GO_ID', 'GO_term']].drop_duplicates().rename(
    columns={'GO_ID': 'annotation_id', 'GO_term': 'annotation_name'})

for u in sorted(la['list_code'].unique()):

    # Written as "not u > 0" so that NaN codes are skipped exactly like
    # non-positive ones.
    if not u > 0:
        continue

    f = la['list_code'] == u

    sign_genes = la.loc[f, 'gene_ncbi']
    # Background: every other row of the annotation, including rows whose
    # list code is not positive.
    non_sign_genes = la.loc[~f, 'gene_ncbi']

    a = calculators.compute_functional_enrichment(sign_genes, non_sign_genes, labelling)
    a = a.sort_values('fdr', ascending=True)

    # Report the five annotations with the lowest FDR.
    print('---------------------')
    print(u)
    print(a[['fdr', 'annotation_name']][:5])

In [None]:

# Clustermap of the summary columns across all genes in su, exported in a
# green and a red colour map.
# ('c_societal_support' and 'will_be_pi' are deliberately not included.)
d = su.loc[:, [
        'c_genetic_importance',
        'easy_experimentation',
        'c_anxient',
        'attention_rank'
    ]].fillna(False).astype(float)

# Label rows by gene symbol instead of gene ID.
d = d.reset_index()
d = pd.merge(d, gi).drop('gene_ncbi', axis=1).set_index('symbol_ncbi')

# The file names are fixed: the former '...'.format(code) calls had no
# placeholder and only made this cell depend on a leftover loop variable.
for cmap_name, file_base in (('YlGn', 'all_green'), ('autumn_r', 'all_red')):
    sns.clustermap(
        d,
        cmap=cmap_name,
        method='ward',
        col_cluster=False)

    if save_images:
        export(file_base)
    plt.close()


plt.show()

In [None]:
# Closer look at one group: summary columns per gene symbol.
code = 9

genes_of_interest = la.loc[la['list_code'] == code, 'gene_ncbi'].unique()

# ('will_be_pi' is deliberately not included.)
columns_of_interest = [
    'c_genetic_importance',
    'easy_experimentation',
    'c_anxient',
    'c_societal_support',
    'attention_rank',
    'any_prominent_gwas',
]

# Missing values become False, then everything is cast to float.
d = su.loc[genes_of_interest, columns_of_interest].fillna(False).astype(float)

# Label rows by gene symbol instead of gene ID.
d = (
    d.reset_index()
    .merge(gi)
    .drop('gene_ncbi', axis=1)
    .set_index('symbol_ncbi')
)

# Column totals (not the last expression of the cell, so a notebook will
# presumably not display them).
d.sum()

# Genes of this group that are flagged in 'any_prominent_gwas'.
d[d['any_prominent_gwas']==True]