In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline

In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['font.family'] = 'Arial'

In [None]:
# `display` belongs to the public `IPython.display` API; importing it from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container to 80% of the browser window.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
from access_biology_data import annotation, gwas_studies, meta, phenotype_collections, properties, relations
from access_literature_data import medline
from access_mixed_data import genealacart
from access_science_shared import standardizer, utils

In [None]:
import sys
sys.path.append('./../src/')

import nar170604f_occurences as nar_attention
import nar170830f_predictions as forec
import resci_inout as rinout
import resci_tools as ret

import nar180310_mega_integrator as mega

In [None]:
taxon_id = 9606

In [None]:
ref_genes = mega.get_ref_genes()

In [None]:
papers = mega.get_publications()

# Collect interesting summary statistics

In [None]:
summary_statistics = dict()

In [None]:
taxon_id=9606

In [None]:

def get_ref_genes(taxon_id=9606):
    """Return the reference genes for one taxon.

    Parameters
    ----------
    taxon_id : int, optional
        Taxon identifier forwarded to ``standardizer.reference_genes``.
        Defaults to 9606, the value that used to be hard-coded here.

    Returns
    -------
    The result of ``standardizer.reference_genes`` called with
    ``ref_code='orp'``.
    """
    return standardizer.reference_genes(
        taxon_id=taxon_id,
        ref_code='orp')


def get_publications(taxon_id=9606):
    """Return paper and attention counts for the reference genes of a taxon.

    Parameters
    ----------
    taxon_id : int, optional
        Taxon identifier used both for the reference genes and for the
        gene-to-PubMed lookup. Defaults to 9606, the value that used to be
        hard-coded here.

    Returns
    -------
    The result of ``nar_attention.count_papers_and_attention`` for the
    reference genes and their ``paper_kind='research'`` gene-to-PubMed table.
    """
    # The same taxon must be used for both lookups so that the gene set and
    # the gene-to-PubMed table refer to the same organism.
    ref_genes = get_ref_genes(taxon_id=taxon_id)
    ref_gene2pubmed = medline.gene2pubmed(
        taxon_id=taxon_id,
        paper_kind='research',
        ref_genes=ref_genes)

    return nar_attention.count_papers_and_attention(
        ref_genes,
        ref_gene2pubmed)

In [None]:
%%time

# Per-category results, keyed by the category label. Every loader returns
# three objects; they are stored, in order, in `cl`, `dd` and `ge`.
cl, dd, ge = dict(), dict(), dict()

categs = {'gwas': mega.frequent_gwas}

for label, loader in categs.items():
    print(label)
    cl[label], dd[label], ge[label] = loader()

In [None]:
def export(file_base):
    """Save the current figure as PNG and PDF under '180319_illustrate_gwas/'.

    Parameters
    ----------
    file_base
        File name without extension; '.png' and '.pdf' are appended.

    The PNG goes through ``ret.export_raster_image`` at 600 dpi and the PDF
    through ``ret.export_image``; both are called with
    ``insert_date_time=False``.
    """
    stem = '180319_illustrate_gwas/{}'.format(file_base)

    ret.export_raster_image(
        stem + '.png',
        dpi=600,
        insert_date_time=False)
    ret.export_image(
        stem + '.pdf',
        insert_date_time=False)


In [None]:
# Column-wise outer join of every frame collected in `cl`; the index of the
# combined frame is named 'gene_ncbi'.
su = pd.concat(cl.values(), axis=1, join='outer').rename_axis('gene_ncbi')

In [None]:
tsne_frame = mega.load_layout()

In [None]:
# NOTE(review): `only_plot_true` is not read anywhere in the visible notebook.
only_plot_true = True
# Column of `su` that flags the genes to highlight in the figures below.
categ = 'any_prominent_gwas'

# NOTE(review): the plotting cells set `zoom_size` to 300 before using it, so
# this value is never used.
zoom_size = 4

# Left join of the layout with the flag column: every layout row is kept and
# rows without a match get NaN for `categ`. No `on=` is given, so pandas joins
# on the columns the two frames have in common.
to_plot = pd.merge(tsne_frame.reset_index(), su[[categ]].reset_index(), how='left')
# Shuffle all rows with a fixed seed (presumably so that the drawing order is
# not tied to the original row order -- TODO confirm).
to_plot = to_plot.sample(frac=1, replace=False, random_state=1)

In [None]:
papers.head()

In [None]:
# log2 ratio of each row's attention to the mean attention over all rows of
# `papers`; written as a new column on `papers` itself.
papers.loc[:, 'enrichment'] = np.log2(papers['attention'] / papers['attention'].mean())

In [None]:
# Attach the enrichment value to the plotting table. The left join keeps every
# row of `to_plot`; rows without a match in `papers` get NaN.
to_plot = pd.merge(
    to_plot,
    papers.reset_index()[['gene_ncbi', 'enrichment']],
    how='left'
)

In [None]:
# Limit enrichment to the range [-6, 6] so that extreme values (including
# +/-inf) do not stretch the colour scale; NaN entries stay NaN.
to_plot['enrichment'] = to_plot['enrichment'].clip(lower=-6, upper=6)

In [None]:
# Add the `symbol_ncbi` column for each gene; it is used further down to label
# the highlighted points.
gi = meta.gene_info(9606, usecols=['gene_ncbi', 'symbol_ncbi'])
to_plot = pd.merge(to_plot, gi, how='left')

In [None]:
zoom_size = 300

plt.figure(figsize=(8, 6))

# Background layer: every row of the layout as a small grey dot.
plt.scatter(
    x=to_plot['x'],
    y=to_plot['y'],
    s=1,
    color='lightgrey')

# Rows whose flag is present and True.
f = to_plot[categ].notnull() & (to_plot[categ] == True)

# Foreground layer: flagged rows as large outlined markers coloured by
# enrichment.
plt.scatter(
    x=to_plot.loc[f, 'x'],
    y=to_plot.loc[f, 'y'],
    s=zoom_size,
    c=to_plot.loc[f, 'enrichment'],
    cmap='coolwarm',
    edgecolors='k')

plt.colorbar()
plt.title(categ)

export('prominent_gwas')

In [None]:
zoom_size = 300

# One figure/axes pair only. A separate `plt.figure(...)` call before
# `plt.subplots(...)` would open a second, empty figure that is never drawn on.
fig, ax = plt.subplots(figsize=(8, 6))

# Background layer: every row of the layout as a small grey dot.
ax.scatter(
    x=to_plot.loc[:, 'x'],
    y=to_plot.loc[:, 'y'],
    s=1,
    color='lightgrey')

# Rows whose flag is present and True.
f = to_plot[categ].notnull() & (to_plot[categ] == True)

# Foreground layer: flagged rows as large outlined markers coloured by
# enrichment.
ax.scatter(
    x=to_plot.loc[f, 'x'],
    y=to_plot.loc[f, 'y'],
    s=zoom_size,
    c=to_plot.loc[f, 'enrichment'],
    cmap='coolwarm',
    edgecolors='k')

# Write the gene symbol next to each flagged point, shifted by one unit along
# x so that the text does not start at the marker centre.
n = to_plot.loc[f, 'symbol_ncbi'].values
z = to_plot.loc[f, 'x'].values
y = to_plot.loc[f, 'y'].values

for txt, x_pos, y_pos in zip(n, z, y):
    ax.annotate(txt, (x_pos + 1, y_pos))

ax.set_title(categ)

export('prominent_gwas_with_label')

In [None]:
to_plot[to_plot['symbol_ncbi']=='HYKK']

In [None]:
su.loc[150962, :]  # PUS10, linked to ulcerative colitis and Crohn's disease

In [None]:
su.loc[146198, :]

In [None]:
# Flagged rows (mask `f` from the plotting cell above) with negative
# enrichment. Both conditions are combined into a single mask over `to_plot`:
# chaining `[...][f]` would apply `f` to an already filtered frame whose index
# no longer matches the mask, which makes pandas warn and reindex it.
to_plot[(to_plot['enrichment'] < 0) & f]

In [None]:
# Build, from the EBI GWAS table, one row per trait holding the gene that was
# reported for that trait in the most studies.
ebi_gwas = gwas_studies.ebi_gwas()

# Flag rows whose MAPPED_GENE contains ';', ',' or '-'. `.str.contains` yields
# NaN for missing values; `== True` turns those into False, so rows without a
# mapped gene are not flagged here.
f = ebi_gwas['MAPPED_GENE'].str.contains('[;,-]') == True
# Keep the unflagged rows, restricted to four columns, three of them renamed.
gwas = ebi_gwas.loc[
    ~f,
    ['MAPPED_GENE', 'DISEASE/TRAIT', 'PVALUE_MLOG', 'pubmed_id']].rename(
    columns={
        'MAPPED_GENE': 'symbol_ambiguous',
        'DISEASE/TRAIT': 'trait',
        'PVALUE_MLOG': 'log_pvalue'
    }
)

# Translate the symbol into `gene_ncbi` with an inner join on the symbol; rows
# whose symbol has no match are dropped. Both symbol columns are then removed.
gwas = pd.merge(
    gwas,
    meta.gene_info(taxon_id=9606, usecols=[
                   'symbol_ncbi', 'gene_ncbi']),
    left_on='symbol_ambiguous',
    right_on='symbol_ncbi',
    how='inner'
).drop('symbol_ambiguous', axis=1).drop('symbol_ncbi', axis=1)

# Restrict to the reference genes provided by `mega`.
gwas = gwas[gwas['gene_ncbi'].isin(mega.get_ref_genes())]

# Sorting by descending log_pvalue before dropping duplicates keeps, for every
# (trait, pubmed_id, gene_ncbi) combination, the row with the largest
# log_pvalue.
gwas = gwas.sort_values('log_pvalue', ascending=False)
gwas = gwas.drop_duplicates(
    ['trait', 'pubmed_id', 'gene_ncbi'],
    keep='first')

# Number of distinct pubmed_id values per trait.
# NOTE(review): not read again in the visible part of the notebook.
studies_per_phenotype = gwas[
    ['pubmed_id', 'trait']].drop_duplicates()[
    'trait'].value_counts()

g = gwas[['trait', 'gene_ncbi', 'log_pvalue']].groupby(['gene_ncbi', 'trait'])

# Per (gene_ncbi, trait): the largest log_pvalue and the number of rows in the
# group. `g.size()` is unnamed and therefore arrives as column 0, which is
# renamed to 'studies'. After the de-duplication above each row of a group has
# a different pubmed_id.
gg = pd.concat([
    g.max(), g.size()], axis=1).reset_index().rename(columns={
    0: 'studies'
})

# Rank by number of studies, ties broken by log_pvalue, both descending.
ggg = gg.sort_values(['studies', 'log_pvalue'], ascending=False)



# First row per trait in that ranking, i.e. one gene per trait.
gggg = ggg.drop_duplicates(subset='trait', keep='first')

In [None]:
su.columns

In [None]:
# Keep only the per-trait rows whose trait name also appears as a column of `su`.
gggg = gggg[gggg['trait'].isin(su.columns)]

In [None]:
gggg

In [None]:
# Same figure as above, restricted to genes that are the top gene of a trait.

zoom_size = 300


# NOTE(review): this overwrites the flag column of `to_plot` in place. From
# here on `to_plot[categ]` is True only for flagged genes that also occur in
# `gggg`; re-running earlier plotting cells afterwards gives different figures.
to_plot[categ] = to_plot[categ] & to_plot['gene_ncbi'].isin(gggg['gene_ncbi'])

plt.figure(figsize=(8,6))



# Background layer: every row of the layout as a small grey dot.
plt.scatter(
    x=to_plot.loc[:, 'x'],
    y=to_plot.loc[:, 'y'],
    s=1,
    color='lightgrey')


# Rows whose (now narrowed) flag is present and True.
f = to_plot[categ].notnull()

f = f & (to_plot[categ]==True)


# Foreground layer: flagged rows as large outlined markers coloured by
# enrichment.
plt.scatter(
        x=to_plot.loc[f, 'x'],
        y=to_plot.loc[f, 'y'],
        s=zoom_size,
        c=to_plot.loc[f, 'enrichment'],
        cmap='coolwarm',
        edgecolors='k')



# This variant is drawn without text labels.

plt.colorbar()
plt.title(categ)




In [None]:
zoom_size = 300

# Create the axes directly at the intended 8x6 size. Passing the size to a
# separate `plt.figure(...)` call only produces an empty figure, while the
# axes from a bare `plt.subplots()` keep the default size.
fig, ax = plt.subplots(figsize=(8, 6))

# Background layer: every row of the layout as a small grey dot.
ax.scatter(
    x=to_plot.loc[:, 'x'],
    y=to_plot.loc[:, 'y'],
    s=1,
    color='lightgrey')

# Rows whose flag is present and True.
f = to_plot[categ].notnull() & (to_plot[categ] == True)

# Foreground layer: flagged rows as large outlined markers coloured by
# enrichment.
ax.scatter(
    x=to_plot.loc[f, 'x'],
    y=to_plot.loc[f, 'y'],
    s=zoom_size,
    c=to_plot.loc[f, 'enrichment'],
    cmap='coolwarm',
    edgecolors='k')

# Write the gene symbol next to each flagged point, shifted by one unit along
# x so that the text does not start at the marker centre.
n = to_plot.loc[f, 'symbol_ncbi'].values
z = to_plot.loc[f, 'x'].values
y = to_plot.loc[f, 'y'].values

for txt, x_pos, y_pos in zip(n, z, y):
    ax.annotate(txt, (x_pos + 1, y_pos))

ax.set_title(categ)





In [None]:
pd.merge(gggg, meta.gene_info(9606, usecols=['gene_ncbi', 'symbol_ncbi']))

In [None]:
gg

In [None]:
# c_difficulty = [
#     'extremly_LoF_intolerant',
#     'rnai_frequent',
#     'biogrid_western_blot',
#     'signal_peptide',
#     'fraction_of_tissues_1',
#     'fraction_of_cells_1',
#     'yeast',
#     'worm_fly',
#     'extreme_swissprot',
#     'any_prominent_gwas',
# #     'has_duf',
#     'bp2_with_studied',
#     'change_in_gtx',
#     'bp2_mean_studied'
# ]

# c_support = [
#     'will_be_pi',
#     'recently_supporting_institutes'
# ]



# sns.clustermap(
#     su.loc[genes_of_interest, c_difficulty].fillna(False).astype(float),
#     cmap='autumn_r',
#     method='ward');

# sns.clustermap(
#     su.loc[genes_of_interest, c_difficulty].fillna(False).astype(float),
#     cmap='autumn_r',
#     method='ward');