In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Automatically reload imported modules before running code, so that edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import spearmanr


from access_economic_data import nih
from access_literature_data import medline, wos
from access_science_shared import standardizer

In [None]:
# Extend the import path with the sibling `src` directory before importing the
# modules below (presumably that is where they live -- TODO confirm).
# NOTE: the path is relative to the current working directory, so the notebook
# has to be started from its own folder.
sys.path.append('./../src/')
import nar170605f_funding as nar_funding
import nar170604f_occurences as nar_attention

import ana170508f_human_citations as ana

In [None]:
import matplotlib.pyplot as plt
import resci_tools as ret

In [None]:
# Taxon analysed throughout the notebook; read as a global by get_funding_info().
# 9606 is the NCBI Taxonomy identifier of Homo sapiens.
taxon_id = 9606

In [None]:
# Keep the default font scale: setting it to 2 seemed to produce misleading
# helper (grid) lines in the figures.
sns.set(font_scale=1)

In [None]:
# Export switches checked by the cells below:
# save_images -> write figures through ret.export_image
# save_table  -> write the plotted data through ret.export_full_frame
save_images = False
save_table = True

# Load datasets

In [None]:
def get_funding_info(earliest_year, latest_year):
    """Assemble per-gene NIH budget, paper-count and attention statistics.

    Combines MedLine records, gene-to-publication links and NIH project /
    budget / publication tables, and aggregates them per gene.

    The taxon is taken from the module-level ``taxon_id``.

    Parameters
    ----------
    earliest_year : int
        First publication year (MedLine) and first fiscal year (NIH project
        core table) to include; the bound is inclusive.
    latest_year : int
        Last publication year / fiscal year to include; inclusive.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``gene_ncbi``. Columns:

        * ``budget_for_attention``, ``attention``, ``papers``,
          ``budget_for_paper`` -- sums over the papers that could be linked
          to an NIH project with a budget,
        * ``budget_by_attention``, ``budget_by_papers`` -- the budget sums
          divided by ``attention`` and ``papers`` respectively,
        * ``full_*`` -- output of ``nar_attention.count_papers_and_attention``
          for all gene-linked papers within the MedLine time span,
        * ``all_nih_*`` -- the same, restricted to papers listed in
          ``nih.publnk()``,
        * ``non_nih_*`` -- ``full_*`` minus ``all_nih_*``.

    Notes
    -----
    Only ``df_prj_core`` is restricted to the requested fiscal years; budgets
    are summed over every row of the budget table for a project.
    NOTE(review): confirm that summing budgets across all fiscal years is
    intended.
    NOTE(review): ``df_nih_papers`` is not de-duplicated after the publication
    year column is dropped; confirm the upstream table has unique
    (project_num, pubmed_id) pairs.
    """

    ### INITIALIZATION ###

    # MedLine
    ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
    # Loaded once; the unfiltered frame is reused further down for the
    # MedLine-wide statistics (it is never modified in place).
    gene2pubmed_unfiltered = medline.gene2pubmed(
        taxon_id, paper_kind='research', ref_genes=ref_genes)
    gene2pubmed = gene2pubmed_unfiltered

    df_m = medline.select_medline_wos_records(
        columns_sql='''
                medline.pubmed_id,
                medline.pubdate_year,
                medline.amount_of_authors,
                medline.j_name_s''',
        years_range='all',
        taxon_id=taxon_id,
        kind='research',
        unambiguous=True)

    df_m = df_m[df_m['amount_of_authors'] > 0]   # exclude consortia paper (-1)
    df_m = df_m[['pubmed_id', 'pubdate_year', 'amount_of_authors', 'j_name_s']]

    df_m = df_m[df_m['pubdate_year'] >= earliest_year]
    df_m = df_m[df_m['pubdate_year'] <= latest_year]

    # All MedLine articles inside the time span, before any synchronization
    # with NIH; used at the end to filter the MedLine-wide gene links.
    _pubmed_articles_in_medline_time_span = set(df_m['pubmed_id'])

    # NIH Exporter
    df_prj_core, df_prj_budget, df_nih_papers = nar_funding.get_paper_funding_through_nih()
    df_nih_papers = df_nih_papers.loc[:, ['project_num', 'pubmed_id']]  # skip publication year

    df_prj_core = df_prj_core[df_prj_core['FY'] >= earliest_year]
    df_prj_core = df_prj_core[df_prj_core['FY'] <= latest_year]

    df_prj_budget = df_prj_budget.loc[:, ['project_num', 'budget']]   # skip fiscal year
    df_prj_budget = df_prj_budget.groupby('project_num').agg('sum')
    df_prj_budget = df_prj_budget.reset_index()

    ### Estimations of costs for non-covered papers ###

    nih_pubmed_ids = set(df_nih_papers['pubmed_id'])
    papers_in_nih = len(nih_pubmed_ids.intersection(_pubmed_articles_in_medline_time_span))
    papers_in_medline = len(_pubmed_articles_in_medline_time_span)
    if papers_in_nih:
        multiplier_nih2medline = papers_in_medline / papers_in_nih
    else:
        # No overlap between NIH and MedLine in this time span: the ratio is
        # undefined; report NaN instead of raising ZeroDivisionError.
        multiplier_nih2medline = float('nan')
    print('Multiplier:', multiplier_nih2medline)

    ### Synchronization ###

    # PubMed: keep only articles present in all three sources
    pubmed_in_all = set.intersection(
        nih_pubmed_ids,
        _pubmed_articles_in_medline_time_span,
        set(gene2pubmed['pubmed_id']))
    print('Amount of MedLine articles:', len(pubmed_in_all))

    gene2pubmed = gene2pubmed[gene2pubmed['pubmed_id'].isin(pubmed_in_all)]
    df_m = df_m[df_m['pubmed_id'].isin(pubmed_in_all)]
    df_nih_papers = df_nih_papers[df_nih_papers['pubmed_id'].isin(pubmed_in_all)]

    # Projects: keep only projects present in all three NIH tables
    project_in_all = set.intersection(
        set(df_prj_core['project_num']),
        set(df_prj_budget['project_num']),
        set(df_nih_papers['project_num']))

    df_prj_core = df_prj_core[df_prj_core['project_num'].isin(project_in_all)]
    df_prj_budget = df_prj_budget[df_prj_budget['project_num'].isin(project_in_all)]
    df_nih_papers = df_nih_papers[df_nih_papers['project_num'].isin(project_in_all)]

    ### Resources per paper per gene

    # amount of publications per project
    papers_per_project = df_nih_papers['project_num'].value_counts()
    # overall budget per project
    budget_per_project = df_prj_budget.set_index('project_num')['budget']
    # budget per paper for each project (project budget split evenly over its papers);
    # the rename covers pandas versions where the aligned index comes back unnamed
    budget_per_paper_per_project = budget_per_project.div(papers_per_project).to_frame(
        'budget_per_paper_per_project').reset_index().rename(columns={'index': 'project_num'})

    # a paper can be linked to several projects: add up its shares
    budget_per_pubmed_id = pd.merge(
        budget_per_paper_per_project, df_nih_papers, on='project_num')[
        ['pubmed_id', 'budget_per_paper_per_project']].groupby('pubmed_id').agg('sum').reset_index()

    # each paper distributes one unit of attention evenly over its genes
    attention_per_paper = (1 / gene2pubmed['pubmed_id'].value_counts()).to_frame('attention_per_gene').reset_index()
    attention_per_paper = attention_per_paper.rename(columns={'index': 'pubmed_id'})

    gene2pubmed_plus = pd.merge(gene2pubmed, budget_per_pubmed_id, on='pubmed_id')
    gene2pubmed_plus = pd.merge(gene2pubmed_plus, attention_per_paper, on='pubmed_id')
    gene2pubmed_plus = gene2pubmed_plus.rename(
        columns={'budget_per_paper_per_project': 'budget_for_paper', 'attention_per_gene': 'attention'})
    gene2pubmed_plus.loc[:, 'papers'] = 1

    gene2pubmed_plus['budget_for_attention'] = gene2pubmed_plus['attention'] * gene2pubmed_plus['budget_for_paper']

    master = gene2pubmed_plus[
        ['gene_ncbi', 'budget_for_attention', 'attention', 'papers', 'budget_for_paper']
    ].groupby('gene_ncbi').agg('sum')

    master['budget_by_attention'] = master['budget_for_attention'] / master['attention']
    master['budget_by_papers'] = master['budget_for_paper'] / master['papers']

    # MedLine-wide statistics (all gene-linked papers within the time span)
    gene2pubmed_full = gene2pubmed_unfiltered[
        gene2pubmed_unfiltered['pubmed_id'].isin(_pubmed_articles_in_medline_time_span)]

    fame_full = nar_attention.count_papers_and_attention(ref_genes, gene2pubmed_full)
    fame_full.columns = ['full_' + x for x in fame_full.columns]
    master = pd.merge(master.reset_index(), fame_full.reset_index())

    # Statistics over every paper that NIH lists, project-linked or not
    nih_publnk = nih.publnk().drop_duplicates()
    gene2pubmed_all_nih = gene2pubmed_full[gene2pubmed_full['pubmed_id'].isin(nih_publnk['pubmed_id'])]

    fame_all_nih = nar_attention.count_papers_and_attention(ref_genes, gene2pubmed_all_nih)
    # remember the un-prefixed names; they drive the non-NIH difference below
    fame_columns = fame_all_nih.columns
    fame_all_nih.columns = ['all_nih_' + x for x in fame_all_nih.columns]
    master = pd.merge(master, fame_all_nih.reset_index())

    # non-NIH = MedLine-wide minus NIH
    for x in fame_columns:
        master.loc[:, 'non_nih_' + x] = master.loc[:, 'full_' + x] - master.loc[:, 'all_nih_' + x]

    master = master.set_index('gene_ncbi')

    return master

In [None]:
# Time span of the analysis; both years are inclusive bounds.
earliest_year = 1985
latest_year = 2015

# Per-gene funding / attention table used by all cells below.
m_full = get_funding_info(earliest_year, latest_year)

In [None]:
# Stretch the notebook container to the full browser width so that wide tables
# and figures are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Pairwise Spearman rank correlations between all per-gene columns.
m_full.corr(method='spearman')

In [None]:
# Clustered heatmap of the Spearman correlation matrix (Ward linkage),
# colour scale fixed to the full correlation range [-1, 1].
g = sns.clustermap(
    m_full.corr(method='spearman'),
    method='ward',
    annot=False,
    vmin=-1,
    vmax=1,
)
# keep the row labels horizontal
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)

if save_images:
    ret.export_image('170721_per_gene_funding/clustergram_budgeting_and_fame.pdf')

Description of the column prefixes:
- full_ --> all of MedLine
- all_nih_ --> every NIH paper; this also includes papers which are only linked to a general institute, but not to a specific project / budget
- non_nih_ --> in full_ but not in all_nih_
- (no prefix) --> NIH publications that can be linked to a project
- budget_for_ --> total budget
- budget_by_ --> yield (budget_for_ divided by the corresponding attention or paper count)

(all within the defined time span)

In [None]:
# log10 of per-gene attention and of the budget attributed to that attention
# (project-linkable NIH publications only).
to_plot = m_full[['attention', 'budget_for_attention']].apply(lambda col: np.log10(col))

# NOTE(review): `stat_func` is only accepted by older seaborn releases --
# confirm the installed seaborn version before re-running this cell.
sns.jointplot(
    x=to_plot['budget_for_attention'],
    y=to_plot['attention'],
    joint_kws={'s': 1},
    stat_func=spearmanr,
)

if save_images:
    ret.export_image('170721_per_gene_funding/budget_vs_nih_attention.pdf')

In [None]:
# Smallest values of both attention columns (shows whether zeros occur before
# taking logarithms).
m_full[['non_nih_attention', 'attention']].min()

In [None]:
# Export the values currently held in `to_plot`.
# This cell depends on `to_plot` as defined by the budget-vs-attention cell;
# run the cells in order.
if save_table:
    ret.export_full_frame(
        '170721_per_gene_funding/budget_vs_nih_attention_data.csv',
        to_plot.loc[:, ['budget_for_attention', 'attention']],
        save_index=True,
    )

In [None]:
# log10 of non-NIH and project-linked NIH attention per gene; exact zeros are
# replaced by 1e-5 first so that they appear at -5 instead of -inf.
to_plot = (
    m_full[['non_nih_attention', 'attention']]
    .replace(0, 0.00001)
    .apply(lambda col: np.log10(col))
)

# NOTE(review): `stat_func` is only accepted by older seaborn releases --
# confirm the installed seaborn version before re-running this cell.
sns.jointplot(
    x=to_plot['non_nih_attention'],
    y=to_plot['attention'],
    joint_kws={'s': 1},
    stat_func=spearmanr,
)

if save_images:
    ret.export_image('170721_per_gene_funding/attention_nih_vs_not_nih_replace0to-5.pdf')