In [None]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

import matplotlib as mpl
# Embed fonts in PDF output as Type 42 (TrueType) instead of matplotlib's
# default Type 3.
mpl.rcParams['pdf.fonttype'] = 42


import numpy as np
import pandas as pd
import seaborn as sns

from access_literature_data import medline
from access_science_shared import standardizer

import matplotlib.pyplot as plt

In [None]:
import sys
sys.path.append('./../src/')

from calculators import gini

import resci_tools as ret

In [None]:
# Output switches, read as module-level globals by the plotting functions.
save_tables = True     # write the plotted data as '<figure name>_data.csv'
save_figures = False   # export each figure through ret.export_image

In [None]:
# Apply seaborn's default theme with font elements scaled by a factor of 2.
sns.set(font_scale=2)

In [None]:
def make_figure(standardizer_code, outname):
    """Plot how literature attention accumulates over ranked genes.

    For every fifth year between 1980 and 2015, research publications dated
    up to that year are attributed to genes fractionally: each row of the
    filtered gene-to-publication table is weighted by 1 / (number of rows
    sharing its ``pubmed_id``). Per-gene shares are normalised to sum to 1,
    genes are ranked by decreasing share, and the running sum is drawn as
    one curve per year.

    Reads the module-level names ``taxon_id``, ``save_figures`` and
    ``save_tables``; they must be defined before this function is called.

    Parameters
    ----------
    standardizer_code : str
        Forwarded to ``standardizer.reference_genes`` to choose the set of
        reference genes.
    outname : str
        Figure path passed to ``ret.export_image`` (when ``save_figures``).
        The table is exported to ``outname[:-4] + '_data.csv'`` (when
        ``save_tables``), i.e. a four-character extension such as '.pdf'
        is assumed.

    Returns
    -------
    pandas.DataFrame
        One column per year of the span; row ``i`` holds the cumulative
        share of the ``i + 1`` most studied genes. The index is the
        zero-based rank divided by the number of reference genes.
    """
    ref_genes = standardizer.reference_genes(taxon_id, standardizer_code)

    gene2pubmed = medline.gene2pubmed(
        taxon_id, ['pubmed_id', 'gene_ncbi'], paper_kind='research')
    gene2pubmed = gene2pubmed[gene2pubmed['gene_ncbi'].isin(ref_genes)]
    publication_years = medline.select_medline_records(
            columns_sql='''
                medline.pubmed_id,
                medline.pubdate_year''',
            taxon_id=taxon_id,
            kind='research',
            )
    gene2pubmed = pd.merge(gene2pubmed, publication_years)

    # The per-paper weight does not depend on the cut-off year, so compute it
    # and attach it exactly once instead of re-merging on every iteration.
    weights = (1/gene2pubmed['pubmed_id'].value_counts()).to_frame(
        'value_per_paper').reset_index().rename(columns={'index':'pubmed_id'})
    gene2pubmed = pd.merge(gene2pubmed, weights)

    span = np.arange(1980, 2016, 5)   # only plot every fifth year

    cumulative_by_year = {}
    for y in span:
        up_to_year = gene2pubmed.loc[
            gene2pubmed['pubdate_year'] <= y, ['gene_ncbi', 'value_per_paper']]
        if up_to_year.empty:
            # No publication dated up to this year: leave the column empty.
            continue

        share = (up_to_year.groupby('gene_ncbi')['value_per_paper'].sum()
                 / up_to_year['value_per_paper'].sum())
        # reindex (rather than .loc) so that reference genes without any
        # publication yet become 0 instead of raising KeyError.
        share = share.reindex(ref_genes).fillna(0)
        ranked = share.sort_values(ascending=False)
        cumulative_by_year[y] = ranked.cumsum().values

    # Build the frame in one go; years skipped above stay as all-NaN columns.
    fraction_per_year = pd.DataFrame(cumulative_by_year, columns=span)

    # x position is the zero-based rank over the gene count, so the most
    # studied gene sits at 0.
    fraction_per_year.index = fraction_per_year.index / len(fraction_per_year.index)

    print('half:', np.count_nonzero(fraction_per_year.loc[:,2015]<=0.5) / fraction_per_year.shape[0])
    print('90%:', np.count_nonzero(fraction_per_year.loc[:,2015]<=0.9) / fraction_per_year.shape[0])

    ax = fraction_per_year.loc[:,1980:2016].plot(colormap='coolwarm_r')
    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    ax.set_xlabel('Fraction Genes')
    ax.set_ylabel('Fraction MedLine')
    ax.set_ylim(0,1)
    plt.title(taxon_id)

    if save_figures:
        ret.export_image(outname)

    if save_tables:
        # Export a copy so that naming the index does not leak into the
        # frame handed back to the caller.
        out = fraction_per_year.copy()
        out.index.name = 'cum_fraction'
        ret.export_full_frame(outname[:-4] + '_data.csv', out)

    return fraction_per_year

# Make panels

In [None]:
# Taxon analysed by make_figure / make_log_figure, which read this
# module-level name directly instead of taking it as an argument. The output
# file names used in this notebook label this taxon as human.
taxon_id = 9606


In [None]:
# One panel per reference-gene filter: (standardizer code, file-name suffix).
panels = [
    ('orp', 'orp'),        # official nomenclature, research, protein-coding
    ('r', 'r'),            # research
    ('rp', 'rp'),          # research, protein-coding
    ('p', 'p'),            # protein-coding
    ('', 'no_filter'),     # empty standardizer code
]

# Return values are discarded on purpose: make_figure exports each table
# itself when save_tables is set, so nothing is echoed as cell output.
for standardizer_code, suffix in panels:
    outname = '171001_fraction_medline_cumulative/yearly_attention_human_{}.pdf'.format(suffix)
    make_figure(standardizer_code, outname)

In [None]:
def make_log_figure(standardizer_code, outname):
    """Plot the cumulative attention curves on log10-transformed axes.

    For every fifth year between 1980 and 2015, research publications dated
    up to that year are attributed to genes fractionally: each row of the
    filtered gene-to-publication table is weighted by 1 / (number of rows
    sharing its ``pubmed_id``). Per-gene shares are normalised to sum to 1,
    genes are ranked by decreasing share, and the running sum is computed.
    Both the cumulative shares and the gene-fraction index are then passed
    through ``np.log10`` before plotting.

    Reads the module-level names ``taxon_id`` and ``save_figures``; they
    must be defined before this function is called.

    Parameters
    ----------
    standardizer_code : str
        Forwarded to ``standardizer.reference_genes`` to choose the set of
        reference genes.
    outname : str
        Figure path passed to ``ret.export_image`` when ``save_figures`` is
        set.

    Returns
    -------
    None
    """
    ref_genes = standardizer.reference_genes(taxon_id, standardizer_code)

    gene2pubmed = medline.gene2pubmed(
        taxon_id, ['pubmed_id', 'gene_ncbi'], paper_kind='research')
    gene2pubmed = gene2pubmed[gene2pubmed['gene_ncbi'].isin(ref_genes)]
    publication_years = medline.select_medline_records(
            columns_sql='''
                medline.pubmed_id,
                medline.pubdate_year''',
            taxon_id=taxon_id,
            kind='research',
            )
    gene2pubmed = pd.merge(gene2pubmed, publication_years)

    # The per-paper weight does not depend on the cut-off year, so compute it
    # and attach it exactly once instead of re-merging on every iteration.
    weights = (1/gene2pubmed['pubmed_id'].value_counts()).to_frame(
        'value_per_paper').reset_index().rename(columns={'index':'pubmed_id'})
    gene2pubmed = pd.merge(gene2pubmed, weights)

    span = np.arange(1980, 2016, 5)   # only plot every fifth year

    cumulative_by_year = {}
    for y in span:
        up_to_year = gene2pubmed.loc[
            gene2pubmed['pubdate_year'] <= y, ['gene_ncbi', 'value_per_paper']]
        if up_to_year.empty:
            # No publication dated up to this year: leave the column empty.
            continue

        share = (up_to_year.groupby('gene_ncbi')['value_per_paper'].sum()
                 / up_to_year['value_per_paper'].sum())
        # reindex (rather than .loc) so that reference genes without any
        # publication yet become 0 instead of raising KeyError.
        share = share.reindex(ref_genes).fillna(0)
        ranked = share.sort_values(ascending=False)
        cumulative_by_year[y] = ranked.cumsum().values

    # Build the frame in one go; years skipped above stay as all-NaN columns.
    fraction_per_year = pd.DataFrame(cumulative_by_year, columns=span)

    # x position is the zero-based rank over the gene count, so the most
    # studied gene sits at 0.
    fraction_per_year.index = fraction_per_year.index / len(fraction_per_year.index)

    print('half:', np.count_nonzero(fraction_per_year.loc[:,2015]<=0.5) / fraction_per_year.shape[0])
    print('90%:', np.count_nonzero(fraction_per_year.loc[:,2015]<=0.9) / fraction_per_year.shape[0])

    log_fraction = fraction_per_year.apply(lambda x: np.log10(x))
    log_fraction.index = np.log10(log_fraction.index)

    # The first row sits at gene fraction 0, whose log10 is -inf; drop it
    # before plotting. The Axes is kept in its own name so the data frame is
    # not overwritten by the plot handle.
    ax = log_fraction.iloc[1:, :].loc[:,1980:2016].plot(colormap='coolwarm_r')

    plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
    # NOTE(review): both axes show log10 values although the labels read as
    # plain fractions - confirm whether the labels should say so.
    ax.set_xlabel('Fraction Genes')
    ax.set_ylabel('Fraction MedLine')
    plt.title(taxon_id)

    if save_figures:
        ret.export_image(outname)

In [None]:
# outname = '171001_fraction_medline_cumulative/log_yearly_attention_human_orp.pdf'
# make_log_figure('orp', outname) # official nomenclature, research, protein-coding

# outname = '171001_fraction_medline_cumulative/log_yearly_attention_human_r.pdf'
# make_log_figure('r', outname) #  research,

# outname = '171001_fraction_medline_cumulative/log_yearly_attention_human_rp.pdf'
# make_log_figure('rp', outname) # research, protein-coding

# outname = '171001_fraction_medline_cumulative/log_yearly_attention_human_p.pdf'
# make_log_figure('p', outname) # protein-coding

# outname = '171001_fraction_medline_cumulative/log_yearly_attention_human_no_filter.pdf'
# make_log_figure('', outname) 