In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
# (Re)load the autoreload extension and reload imported modules before each
# cell execution, so edits to imported helper modules take effect without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

import matplotlib as mpl
# Export settings for figures: write PDF text as Type 42 (TrueType) fonts and
# use Arial as the default font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
import os
import sys

import numpy as np
import pandas as pd

from access_science_shared import standardizer
from access_literature_data import medline

sys.path.append('./../src')

import resci_tools as ret

import resci_inout as inout
import nar170604f_occurences as nar_attention

import nar170605f_funding as funding

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import spearmanr

In [None]:
# Switch for figure export: when True, the plotting cells below also call
# ret.export_image(...) after drawing.
save_images = True

# Get stats

In [None]:
# NIH funding tables (projects, budgets, and the publication <-> project links).
nih_project, nih_budget, nih_publication = funding.get_paper_funding_through_nih()

# NCBI taxonomy ID used for all gene / literature lookups (9606 = Homo sapiens).
taxon_id = 9606

# 'orp': official nomenclature, literature, protein-coding
ref_genes = standardizer.reference_genes(taxon_id, 'orp')
ref_gene2pubmed = medline.gene2pubmed(
    taxon_id,
    ['pubmed_id', 'gene_ncbi'],
    paper_kind='research',
)
papers = nar_attention.count_papers_and_attention(ref_genes, ref_gene2pubmed)

# One row per distinct (publication, grant) pair.
grants_on_pubmed = nih_publication.loc[:, ['pubmed_id', 'project_num']].drop_duplicates()

In [None]:
# Get grants per gene: link genes to grants through shared publications and
# keep each (gene, grant) pair only once.
master = pd.merge(
    ref_gene2pubmed,
    grants_on_pubmed[['pubmed_id', 'project_num']]
)[['gene_ncbi', 'project_num']].drop_duplicates()

# Count distinct grants per gene. The index is named explicitly because the
# label that value_counts() gives its index (and hence the column produced by
# reset_index()) differs between pandas versions.
grants_per_gene = (
    master['gene_ncbi']
    .value_counts()
    .rename_axis('gene_ncbi')
    .to_frame('grants_per_gene')
)

# Align to the reference genes; genes without any linked grant get 0.
# reindex() is used instead of .loc[ref_genes, :] because .loc raises a
# KeyError when some of the requested labels are absent from the index, which
# is exactly the case the fillna(0) is meant to handle.
grants_per_gene = (
    grants_per_gene
    .reindex(ref_genes)
    .fillna(0)
    .rename_axis('gene_ncbi')  # keep the index name regardless of ref_genes' type
)

# Overview of the relation between grants and publications

In [None]:
# Occurrence in publications: per-gene paper counts, their log10, and the
# log10 value binned in steps of 0.5.
bins = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4]
fame = (
    papers
    .reset_index()
    .assign(papers=lambda frame: frame['papers'].astype(float))
    .assign(log_papers=lambda frame: np.log10(frame['papers'].values))
    .assign(
        # include_lowest closes the first bin on the left, so log10 == 0
        # (a single paper) is kept.
        log_papers_bin=lambda frame: pd.cut(
            frame['log_papers'].values, bins, include_lowest=True)
    )
)

In [None]:
# Combine publication statistics with grant counts (joined on the columns the
# two frames share).
to_plot = fame.merge(grants_per_gene.reset_index())

# The +1 offset keeps genes with zero grants finite on the log scale.
to_plot['log_grants_1'] = np.log10(to_plot['grants_per_gene'] + 1)

# NOTE(review): `stat_func` is not accepted by recent seaborn releases --
# confirm the seaborn version this notebook is run with.
sns.jointplot(
    data=to_plot,
    x='log_papers',
    y='log_grants_1',
    joint_kws=dict(s=1),
    stat_func=spearmanr,
)

if save_images:
    ret.export_image('171202f_grants_per_gene/papers_vs_grants.pdf')

In [None]:
# Distribution of grants per gene within each bin of log10(papers).
sns.boxplot(
    data=to_plot,
    x='log_papers_bin',
    y='grants_per_gene',
    color='goldenrod',
    notch=True,
)
plt.yscale('log')  # grant counts are shown on a logarithmic axis

if save_images:
    ret.export_image('171202f_grants_per_gene/binned_grants_per_gene.pdf')

In [None]:
import resci_inout as rinout

In [None]:
def get_predictions(comparison):
    """Load the pooled target/prediction table of one comparison.

    Reads the gzipped CSV stored below the internal '171202f_predict_money'
    folder, in the sub-folder ``<comparison>/zgbrh_p90_e300``.

    Parameters
    ----------
    comparison : str
        Name of the comparison sub-folder.

    Returns
    -------
    target : pandas.Series
        The 'target' column, indexed by 'gene_ncbi'.
    predictions : pandas.DataFrame
        All remaining columns of the file, indexed by 'gene_ncbi'.
    """
    base_dir = rinout.get_internal_path('171202f_predict_money')
    csv_path = os.path.join(
        base_dir,
        comparison,
        'zgbrh_p90_e300',
        'pooled_target_and_prediciton.csv.gz',  # file name as stored on disk
    )

    predictions = pd.read_csv(csv_path).set_index('gene_ncbi')
    # pop() removes 'target' from the frame, leaving only the other columns.
    target = predictions.pop('target')

    return target, predictions


def pooling_fun(x):
    """Pool values into a single number: the median, ignoring NaN entries."""
    pooled = np.nanmedian(x)
    return pooled

In [None]:
# Load target and per-column predictions, collapse the predictions of each
# gene with pooling_fun, and keep the genes present in both.
ta, po = get_predictions('171202_human_BioExpYearhomall_log_budget_for_attention')
po = po.apply(pooling_fun, axis=1)
df = pd.concat(
    [ta, po.rename('predicted')],  # label the pooled column 'predicted'
    axis=1,
    join='inner',
)

In [None]:
# Attach the grant counts to the target/prediction table. Both frames are
# joined on the columns they share -- presumably only 'gene_ncbi'; verify.
master = df.reset_index().merge(grants_per_gene.reset_index())

In [None]:
# log10 of the grant count. grants_per_gene is zero-filled for genes without
# grants, and log10(0) is -inf (with a RuntimeWarning), which breaks axis
# limits and histograms downstream. Zero counts are therefore masked to NaN;
# positive counts are unaffected.
master['log_grants_per_gene'] = np.log10(
    master['grants_per_gene'].where(master['grants_per_gene'] > 0)
)

In [None]:
# Target value against log10(grants per gene).
# NOTE(review): `stat_func` is not accepted by recent seaborn releases --
# confirm the seaborn version this notebook is run with.
sns.jointplot(
    data=master,
    x='target',
    y='log_grants_per_gene',
    joint_kws=dict(s=1),
    stat_func=spearmanr,
)

if save_images:
    ret.export_image('171202f_grants_per_gene/target_vs_log_grants.pdf')

In [None]:
# Bin the target in steps of 2 between 2 and 10; include_lowest closes the
# first bin on the left. Values outside the edges end up without a bin (NaN).
bins = list(range(2, 11, 2))
master['target_bin'] = pd.cut(
    master['target'].values,
    bins=bins,
    include_lowest=True,
)

In [None]:
# Distribution of grants per gene within each target bin.
sns.boxplot(
    data=master,
    x='target_bin',
    y='grants_per_gene',
    color='goldenrod',
    notch=True,
)
plt.yscale('log')  # grant counts are shown on a logarithmic axis

if save_images:
    ret.export_image('171202f_grants_per_gene/grants_per_gene.pdf')