In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from access_economic_data import nih
from access_literature_data import medline, wos
from access_science_shared import standardizer

In [None]:
# Add ../src (relative to the working directory) to the module search path
# before importing the modules below -- presumably they live there; confirm.
sys.path.append('./../src/')
# import nar170605f_funding as nar_funding
import nar170604f_occurences as nar_attention
import ana170508f_human_citations as ana
import resci_tools as ret

In [None]:
# Apply seaborn's default theme with all font sizes scaled by a factor of 2.
sns.set(font_scale=2)

In [None]:
# --- What is analysed -------------------------------------------------------
taxon_id = 9606                    # taxon passed to the data access functions
fame_of_interest = 'attention'     # column taken from count_papers_and_attention
# years_for_citation = 3           # only used by the disabled citation steps

# --- How it is analysed -----------------------------------------------------
span = np.arange(1980, 2015 + 1)   # publication years, 1980 to 2015 inclusive
num_bins = 20                      # number of rank bins; bin 0 is the top bin
# categories with fewer publications in a given year are reported as NaN
minimal_amount_of_publications_per_year_and_category = 100

In [None]:
# Export switches: the figure is only written if save_images is set,
# the data table only if save_table is set.
save_images = False
save_table = True

# Load datasets

In [None]:
# Reference genes of the taxon and their links to research publications.
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
gene2pubmed = medline.gene2pubmed(
    taxon_id,
    paper_kind='research',
    ref_genes=ref_genes)

# MEDLINE research records of the taxon.
# Currently disabled alternatives: medline.select_medline_wos_records(...,
# years_range='all', unambiguous=True) as the record source, and citation
# columns via ana.add_citations / ana.add_yearly_citation_rank.
df_m = medline.select_medline_records(
    columns_sql='''
        medline.pubmed_id,
        medline.pubdate_year,
        medline.amount_of_authors''',
    taxon_id=taxon_id,
    kind='research',
)

# Row filters: exclude consortia paper (-1), and keep only papers that are
# linked to at least one gene in gene2pubmed.
has_authors = df_m['amount_of_authors'] > 0
has_gene_link = df_m['pubmed_id'].isin(gene2pubmed['pubmed_id'])

df_m = df_m.loc[
    has_authors & has_gene_link,
    ['pubmed_id', 'pubdate_year', 'amount_of_authors']
].drop_duplicates()

# Number of gene links per paper, attached as an additional column.
genes_per_paper = gene2pubmed['pubmed_id'].value_counts().to_frame('genes_per_paper')
df_m = pd.merge(
    df_m,
    genes_per_paper,
    how='left',
    left_on='pubmed_id',
    right_index=True)

In [None]:
# Attach the publication year to every gene-publication link.
gene2pubmed_and_year = pd.merge(gene2pubmed, df_m[['pubmed_id', 'pubdate_year']])

# For every year of span: fame measure computed from all links published up
# to and including that year. Each resulting series is named after its year.
agg = [
    nar_attention.count_papers_and_attention(
        ref_genes,
        gene2pubmed_and_year[gene2pubmed_and_year['pubdate_year'] <= y]
    )[fame_of_interest].rename(y)
    for y in span
]

# One column per year.
historic_fame = pd.concat(agg, axis=1)

In [None]:
num_genes = len(ref_genes)

# Rank within each year (rank 1 = highest value), scale the ranks by
# num_bins / (num_genes + 1) and round down, so that bin 0 holds the top ranks.
scaled_rank = historic_fame.rank(ascending=False) / (num_genes + 1) * num_bins
historic_bin = np.floor(scaled_rank)

Note that for biomedical literature published before 2002, only a small subset was actually covered. Thus, it might not be fair to use all bins.

In [None]:
# Define NIH categories: label -> list of three-character codes. Elsewhere in
# this notebook they are matched against the first three characters of the
# project number ('funding_category').
codes = {
    # each code is listed once ('RC2' and 'RC4' used to appear twice)
    'high_impact_or_innovation': ['RC2', 'RC4', 'DP2', 'DP4', 'UC2', 'UC4'],
    'exploratory': ['P20', 'R33', 'R21', 'R61', 'RL2', 'U56', 'UG3', 'UH2', 'UH3'],
    'training': ['F05', 'F30', 'F31', 'F32', 'F33', 'F37', 'F38', 'F99'],
    'R01': ['R01'],
    # pseudo-category for all MEDLINE papers; the key keeps its spelling
    # because the export step renames the column by exactly this name
    'full_medine': ['iml']
#     'career':['K43', 'K76', 'K99', 'R00']    # very small
}

In [None]:
# df_projects, df_prj_budget, df_nih_papers = nar_funding.get_paper_funding_through_nih()

In [None]:
# Harmonise the column names of the NIH publication-link table.
# NOTE(review): two source names map to 'project_num'; this assumes that
# nih.publnk() returns only one of them -- confirm.
nih_column_names = {
    'CORE_PROJECT_NUM': 'project_num',
    'PROJECT_NUMBER': 'project_num',
    'TOTAL_COST': 'budget',
}
df_nih_papers = nih.publnk().rename(columns=nih_column_names)

In [None]:
# Keep the NIH-linked papers that are also present in df_m and attach their
# publication year (pd.merge joins on the columns shared by both frames and
# performs an inner join by default).
nih_papers_plus = pd.merge(
    df_nih_papers,
    df_m[['pubmed_id', 'pubdate_year']])

# Funding category = first three characters of the project number.
# The vectorised .str accessor passes missing project numbers through as NaN,
# whereas slicing them inside a Python-level lambda raises a TypeError.
nih_papers_plus['funding_category'] = nih_papers_plus['project_num'].str[:3]

In [None]:
# Result table: one row per funding category, one column per year.
# Allocated as float (all NaN) so that the fractions stored later live in a
# numeric frame rather than in object-dtype columns.
out = pd.DataFrame(index=list(codes), columns=span, dtype=float)

In [None]:
# Preview the first rows of the merged NIH paper table.
nih_papers_plus.head()

In [None]:
# append fake category with all medline (to allow consistent processing)
# .assign builds a new frame, which avoids writing through .loc into the
# result of drop_duplicates() (a possible SettingWithCopyWarning).
appendix = (
    df_m[['pubmed_id', 'pubdate_year']]
    .drop_duplicates()
    .assign(
        project_num='iml',        # in medline
        funding_category='iml',   # in medline
    )
)
nih_papers_plus = pd.concat([nih_papers_plus, appendix], ignore_index=True)

In [None]:
# For every funding category and year: share of that year's fame measure
# that falls on the index entries (presumably genes) that were in bin 0 at
# the end of the preceding year.
for category, activity_codes in codes.items():
    in_category = nih_papers_plus['funding_category'].isin(activity_codes)

    # start with the second year of span, as the bins of the year before
    # are needed
    for year in span[1:]:
        in_year = nih_papers_plus['pubdate_year'] == year
        pubmed_ids_of_year = nih_papers_plus.loc[
            in_year & in_category, 'pubmed_id'].drop_duplicates().values

        # too few publications: report NaN instead of a fraction
        if len(pubmed_ids_of_year) < minimal_amount_of_publications_per_year_and_category:
            out.loc[category, year] = np.nan
            continue

        fame_of_year = nar_attention.count_papers_and_attention(
            ref_genes,
            gene2pubmed[gene2pubmed['pubmed_id'].isin(pubmed_ids_of_year)]
        )[fame_of_interest]

        was_top_last_year = historic_bin.loc[:, year - 1] == 0
        top_of_last_year = historic_bin.index[was_top_last_year]

        out.loc[category, year] = (
            fame_of_year.loc[top_of_last_year].sum() / fame_of_year.sum())

In [None]:
# Display the result table (rows: funding categories, columns: years).
out

In [None]:
# One line per funding category, years along the x-axis.
ax = out.T.plot()

ax.set(
    xlabel='Year',
    ylabel='Fraction within {}%'.format(100/num_bins),
    ylim=(0, 1),
)

if save_images:
    ret.export_image('170902_grant_category_and_fame_bins/nih_research_on_top_studied.pdf')

Note that the statement about the 1980s would be even stronger if more genes had been researched:

In [None]:
# Number of entries in bin 0, counted separately for every column (year).
(historic_bin == 0).sum()

In [None]:
# Table for export: one row per year, one column per funding category.
# rename_axis returns a new object, so naming the index does not write
# through to an Index object that the transposed frame may share with
# out.columns (assigning to u.index.name would modify it in place).
u = (
    out
    .transpose()
    .rename_axis('year')
    .rename(columns={'full_medine': 'full medline'})
)

if save_table:
    ret.export_full_frame(
        '170902_grant_category_and_fame_bins/nih_research_on_top_studied_data.csv',
        u,
        save_index=True
    )