In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Load the autoreload extension and reload all imported modules before each
# cell execution, so edits to the project-local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Stretch the notebook container to the full browser width so that wide
# tables and figures are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from access_literature_data import medline, wos
from access_science_shared import standardizer

In [None]:
# Extend the module search path with the sibling `src` directory *before*
# importing the modules below (presumably that is where they live — confirm).
sys.path.append('./../src/')
# `nar` provides `count_papers_and_attention`, used in the analysis cells.
import nar170604f_occurences as nar

# NOTE(review): `ana` is not referenced in the cells visible here — confirm
# whether it is still needed.
import ana170508f_human_citations as ana
# `ret` provides the export helpers (`export_image`, `export_full_frame`).
import resci_tools as ret

# Settings

In [None]:
# Taxon whose genes and literature are analysed; passed as `taxon_id` to the
# data-access calls below. 9606 is the NCBI taxonomy ID of Homo sapiens.
taxon_id = 9606

In [None]:
# Export switches: each one gates an `if` block around an export call further
# down in the notebook (figure export and table export, respectively).
save_images = False
save_table = True

# Publication years whose papers are counted towards "current" fame.
# np.arange excludes the upper bound, so this selects 2015 only.
fame_years_of_interest = np.arange(2015, 2016)   # e.g. only last year
# Column used to group genes by year of first appearance:
# 'first_year' (any paper) or 'first_solo_year' (single-gene papers only).
year_category = 'first_year'    #  'first_solo_year'
# Label of the fame measure; inserted into the names of exported files.
fame_of_interest = 'attention'

In [None]:
# Load the reference genes and the gene-to-PubMed links for the selected
# taxon. This is done per taxon because some lesser model organisms do not
# support official nomenclature (and thus might return less confident genes).
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')

gene2pubmed = medline.gene2pubmed(
    taxon_id=taxon_id, paper_kind='research', ref_genes=ref_genes)

In [None]:


# Columns kept from the MEDLINE / Web of Science query result.
columns_to_use = ['pubmed_id', 'wos_id', 'pubdate_year', 'authors']

# Query the records, keep only papers that are linked to at least one gene
# in `gene2pubmed`, restrict to the columns of interest and drop duplicates.
df_m = (
    medline.select_medline_wos_records(
        columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year,
            medline.amount_of_authors AS authors,
            ut2pmid.ut AS wos_id''',
        years_range=None,
        taxon_id=taxon_id,
        kind='research',
        unambiguous=True)
    .loc[
        lambda records: records['pubmed_id'].isin(gene2pubmed['pubmed_id']),
        columns_to_use]
    .drop_duplicates()
)

# Number of gene links per paper, indexed by pubmed_id.
genes_per_paper = gene2pubmed['pubmed_id'].value_counts().to_frame('genes')

# Attach the per-paper gene count to every record.
df_m_specified_taxon = df_m.merge(
    genes_per_paper,
    left_on='pubmed_id',
    right_index=True,
    how='inner')

### Taxon-specific cumulative annual fame of genes

In [None]:
# Year range 1980-2015 (np.arange excludes the upper bound).
# NOTE(review): `span` is not used in the cells visible here — confirm
# whether a later cell still needs it.
span = np.arange(1980, 2016)

# `fame_of_interest` is deliberately NOT re-assigned in this cell: it is
# configured once in the settings cell, and overriding it here would silently
# ignore any change made there.

# Publication year of every research paper of the selected taxon.
df_h = medline.select_medline_records(
    columns_sql='''
        medline.pubmed_id,
        medline.pubdate_year''',
    taxon_id=taxon_id,
    kind='research',
)

# Annotate each gene-paper link with the paper's publication year
# (pd.merge joins on the columns shared by both frames).
gene2pubmed_and_year = pd.merge(gene2pubmed, df_h[['pubmed_id', 'pubdate_year']])


In [None]:
# One row per (paper, gene) link, carrying the paper's publication year and
# the number of genes linked to that paper.
d = pd.merge(
    df_m_specified_taxon[['pubmed_id', 'pubdate_year', 'genes']],
    gene2pubmed[['gene_ncbi', 'pubmed_id']])

# Papers that are linked to exactly one gene.
is_single_gene_paper = d['genes'] == 1


def _earliest_year_per_gene(links, year_column):
    """Return the earliest publication year of every gene.

    Parameters
    ----------
    links : pandas.DataFrame
        Frame with at least the columns 'gene_ncbi' and 'pubdate_year'.
    year_column : str
        Name given to the resulting earliest-year column.

    Returns
    -------
    pandas.DataFrame
        One row per gene with the columns 'gene_ncbi' and `year_column`.
    """
    # Use the native groupby `.min()` rather than `.agg(min)`: passing the
    # Python builtin to `agg` is deprecated in recent pandas versions.
    return (
        links[['gene_ncbi', 'pubdate_year']]
        .groupby('gene_ncbi')
        .min()
        .reset_index()
        .rename(columns={'pubdate_year': year_column}))


# Earliest year over all papers ('first_year') and over single-gene papers
# only ('first_solo_year'). The outer join keeps genes that never appear in
# a single-gene paper; their 'first_solo_year' is missing.
genes_earliest_years = pd.merge(
    _earliest_year_per_gene(d, 'first_year'),
    _earliest_year_per_gene(d[is_single_gene_paper], 'first_solo_year'),
    left_on='gene_ncbi',
    right_on='gene_ncbi',
    how='outer'
)

## Personalized author statistics

# Analyses

In [None]:
# Count papers and attention per reference gene, using only the gene-paper
# links whose publication year is one of `fame_years_of_interest`.
fame_in_interest = nar.count_papers_and_attention(
    ref_genes,
    gene2pubmed_and_year.loc[
        lambda links: links['pubdate_year'].isin(fame_years_of_interest)])

In [None]:
# Attach each gene's earliest publication years to its fame measures.
# The left join keeps every gene present in `fame_in_interest`.
master = fame_in_interest.merge(
    genes_earliest_years,
    left_index=True,
    right_on='gene_ncbi',
    how='left')

In [None]:
# Group genes by the year in which they first appeared in the literature.
g = master[['attention', year_category]].groupby(year_category)

# Per year: total attention and number of genes. The native `.sum()` is used
# instead of `.agg(sum)`, because passing the Python builtin to `agg` is
# deprecated in recent pandas versions.
annual = pd.concat([g.sum(), g.size().to_frame('genes')], axis=1)

# Cumulative fractions over the years, each normalised by its overall total.
annual['cum_attention'] = annual['attention'].cumsum() / annual['attention'].sum()
annual['cum_genes'] = annual['genes'].cumsum() / annual['genes'].sum()

# Turn the year index back into a regular column.
annual = annual.reset_index()

In [None]:
# Cumulative fraction of genes against cumulative fraction of attention,
# one point per year.
sns.jointplot(data=annual, x='cum_genes', y='cum_attention')

# Optionally export the figure; the file name carries the fame measure.
if save_images:
    image_name = '170807_year_and_fraction_current_research/cumulative_genes_vs_{}.pdf'.format(
        fame_of_interest)
    ret.export_image(image_name)

In [None]:
# Optionally export the data behind the cumulative plot.
# The year column is selected through `year_category` rather than a
# hard-coded 'first_year': `annual` only contains the column named by
# `year_category`, so the hard-coded name raises a KeyError as soon as the
# setting is switched to 'first_solo_year'.
if save_table:
    ret.export_full_frame(
        '170807_year_and_fraction_current_research/cumulative_genes_vs_{}_data.csv'.format(
        fame_of_interest),
        annual[[year_category, 'cum_genes', 'cum_attention']],
        save_index=False
    )

In [None]:
# Show only the rows whose year starts a decade (1960, 1970, ..., 2010).
annual.loc[
    lambda table: table[year_category].isin(np.arange(1960, 2020, 10))]

In [None]:
# Every second row of the year-sorted table, starting with the first row.
annual.sort_values(by=year_category).iloc[0::2]

In [None]:
# Every second row of the year-sorted table, starting with the second row.
annual.sort_values(by=year_category).iloc[1::2]

In [None]:
# Median attention per year of first appearance, for the years 1970-2000.
years_shown = np.arange(1970, 2001)
sns.barplot(
    data=master[master[year_category].isin(years_shown)],
    x=year_category,
    y='attention',
    estimator=lambda x: np.median(x),
)