In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules before each cell executes, so edits to the local
# analysis modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import re
import sys

import numpy as np
import pandas as pd
import seaborn as sns

from access_biology_data import meta
from access_literature_data import medline
from access_science_shared import standardizer

sys.path.append('./../src/')
import ana170508f_human_citations as ana
import nar170604f_occurences as nar

import matplotlib.pyplot as plt

import resci_tools as ret


from scipy.stats import spearmanr


import seaborn as sns

In [None]:
import matplotlib as mpl

# Figure-export defaults: write PDF text as Type 42 (TrueType) fonts and use
# Arial as the font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
# Organism under study, given as an NCBI Taxonomy ID (9606 = human).
taxon_id = 9606
# Reference gene set for that taxon. The meaning of the 'rpo' selector is
# defined by `standardizer.reference_genes` (not visible here) -- TODO confirm.
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')

# Alternative configuration kept for reference (559292 = S. cerevisiae S288C):
# taxon_id = 559292
# ref_genes = standardizer.reference_genes(taxon_id, 'rp')

# Output switches: figures are written via `ret.export_image` only when
# `save_images` is True; the plotted data table is written via
# `ret.export_full_frame` only when `save_table` is True.
save_images = False
save_table = True

In [None]:
def make_hex(df_x_y):
    """Draw a hexbin joint plot of a two-column frame, with a colorbar.

    Rows containing NaN are dropped before plotting. The Spearman rank
    correlation of the remaining rows is written onto the joint axes.

    Parameters
    ----------
    df_x_y : pandas.DataFrame
        Frame whose first column goes on the x axis and whose second column
        goes on the y axis.

    Notes
    -----
    The correlation is computed and annotated here instead of through
    seaborn's ``stat_func`` argument, which was deprecated in seaborn 0.9 and
    is not accepted by ``jointplot`` in later releases.
    """
    x_name, y_name = df_x_y.columns[0], df_x_y.columns[1]
    complete = df_x_y.dropna()

    hexplot = sns.jointplot(
        x=x_name,
        y=y_name,
        data=complete,
        kind='hex',
        gridsize=15)

    # Mirrors the "<stat> = <value>; p = <p>" text that stat_func=spearmanr
    # used to place on the joint axes.
    rho, p_value = spearmanr(complete.iloc[:, 0], complete.iloc[:, 1])
    hexplot.ax_joint.annotate(
        'spearmanr = {:.2g}; p = {:.2g}'.format(rho, p_value),
        xy=(0.97, 0.97),
        xycoords='axes fraction',
        ha='right',
        va='top')

    fig = hexplot.ax_joint.figure
    fig.subplots_adjust(left=0.2, right=0.8, top=0.8, bottom=0.2)  # shrink fig so cbar is visible
    cax = fig.add_axes([.85, .25, .05, .4])  # x, y, width, height
    # Hand the hexbin collection to the colorbar explicitly rather than
    # relying on pyplot's implicit "current image" state.
    fig.colorbar(hexplot.ax_joint.collections[0], cax=cax)

In [None]:
# Gene <-> PubMed links for the configured taxon, requested with
# paper_kind='research' and the reference gene set.
gene2pubmed = medline.gene2pubmed(
    taxon_id,
    paper_kind='research',
    ref_genes=ref_genes,
)

In [None]:
# MEDLINE / Web of Science records for the configured taxon.
# Note: a filter on `authors > 0` (to exclude consortia papers, coded -1) is
# currently not applied; the query below does not select an `authors` column.
df_m = medline.select_medline_wos_records(
    columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year,
            ut2pmid.ut AS wos_id''',
    taxon_id=taxon_id,
    kind='research',
    unambiguous=True)

# Number of gene links per paper, indexed by pubmed_id.
genes_per_paper = gene2pubmed['pubmed_id'].value_counts().to_frame('genes')

# Keep only papers that appear in gene2pubmed, de-duplicate, and attach the
# per-paper gene count as the `genes` column.
df_m = (
    df_m
    .loc[
        df_m['pubmed_id'].isin(gene2pubmed['pubmed_id']),
        ['pubmed_id', 'wos_id', 'pubdate_year'],
    ]
    .drop_duplicates()
    .merge(genes_per_paper, left_on='pubmed_id', right_index=True, how='inner')
)

In [None]:
# Distinct (publication year, gene) pairs over all papers in df_m.
df_pm = (
    df_m[['pubmed_id', 'pubdate_year']]
    .merge(gene2pubmed[['pubmed_id', 'gene_ncbi']])
    .loc[:, ['pubdate_year', 'gene_ncbi']]
    .drop_duplicates()
)

# Earliest publication year per gene, indexed by gene_ncbi.
# `.min()` replaces `.agg(min)`: passing the Python builtin to `agg` is
# deprecated since pandas 2.1, while the groupby reduction is unaffected.
first_publication_year = (
    df_pm
    .groupby('gene_ncbi')
    .min()
    .rename(columns={'pubdate_year': 'first_year'})
)

In [None]:
# Papers that are linked to exactly one gene.
is_single_gene_paper = df_m['genes'] == 1

# Distinct (publication year, gene) pairs over those single-gene papers.
df_pm = (
    df_m.loc[is_single_gene_paper, ['pubmed_id', 'pubdate_year']]
    .merge(gene2pubmed[['pubmed_id', 'gene_ncbi']])
    .loc[:, ['pubdate_year', 'gene_ncbi']]
    .drop_duplicates()
)

# Earliest year in which each gene appeared in a single-gene paper.
# `.min()` replaces `.agg(min)`: passing the Python builtin to `agg` is
# deprecated since pandas 2.1, while the groupby reduction is unaffected.
first_single_publication_year = (
    df_pm
    .groupby('gene_ncbi')
    .min()
    .rename(columns={'pubdate_year': 'first_single_year'})
)

In [None]:
# Observation windows (np.arange excludes the stop value): first-publication
# years are kept for 1970-2005, papers are counted for 2006-2015.
time_span_to_observe_discovery = np.arange(1970, 2006)
time_span_to_observe_publications = np.arange(2006, 2016)

# Per-gene first-publication years; years outside the discovery window become
# NaN. `where` builds the masked columns directly instead of assigning NaN
# through `.loc`, which would silently upcast an integer-typed year column in
# place (deprecated since pandas 2.1).
initial_years = pd.concat(
    [first_publication_year, first_single_publication_year], axis=1
).apply(lambda years: years.where(years.isin(time_span_to_observe_discovery)))

# Restrict the gene-paper links to papers published inside the publication
# window before computing `fame` (its `papers` and `attention` columns are
# used below).
df_l = df_m[df_m['pubdate_year'].isin(time_span_to_observe_publications)]
fame = nar.count_papers_and_attention(
    ref_genes,
    gene2pubmed[gene2pubmed['pubmed_id'].isin(df_l['pubmed_id'])])


def fun(x):
    """Return log10(x + 1), so that zero counts map to 0."""
    return np.log10(x + 1)


# One row per gene: fame measures, discovery years and log-scaled counts.
master = pd.concat([fame, initial_years], axis=1)
master['log_papers'] = fun(master['papers'])
master['log_attention'] = fun(master['attention'])

In [None]:
# Pairwise Spearman rank correlations between the columns of `master`.
master.corr(method='spearman')

In [None]:
# Year of first publication vs. log10(papers + 1).
make_hex(master.loc[:, ['first_year', 'log_papers']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/till_and_after_2005_fy_vs_p.pdf')

In [None]:
# Write out the two columns shown in the first-year vs. log-papers plot,
# keeping the index of `master`.
if save_table:
    ret.export_full_frame(
        '170801_date_of_discovery_and_knowledge/till_and_after_2005_fy_vs_p_data.csv',
        master.loc[:, ['first_year', 'log_papers']],
        save_index=True,
    )

In [None]:
# Year of first publication vs. log10(attention + 1).
make_hex(master.loc[:, ['first_year', 'log_attention']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/till_and_after_2005_fy_vs_a.pdf')

In [None]:
# Year of first single-gene publication vs. log10(papers + 1).
make_hex(master.loc[:, ['first_single_year', 'log_papers']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/till_and_after_2005_fsy_vs_p.pdf')

In [None]:
# Year of first single-gene publication vs. log10(attention + 1).
make_hex(master.loc[:, ['first_single_year', 'log_attention']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/till_and_after_2005_fsy_vs_a.pdf')

In [None]:
# Pooled observation windows (np.arange excludes the stop value): both the
# first-publication years and the counted papers span 1970-2015.
time_span_to_observe_discovery = np.arange(1970, 2016)
time_span_to_observe_publications = np.arange(1970, 2016)

# Per-gene first-publication years; years outside the discovery window become
# NaN. `where` builds the masked columns directly instead of assigning NaN
# through `.loc`, which would silently upcast an integer-typed year column in
# place (deprecated since pandas 2.1).
initial_years = pd.concat(
    [first_publication_year, first_single_publication_year], axis=1
).apply(lambda years: years.where(years.isin(time_span_to_observe_discovery)))

# Restrict the gene-paper links to papers published inside the publication
# window before computing `fame` (its `papers` and `attention` columns are
# used below).
df_l = df_m[df_m['pubdate_year'].isin(time_span_to_observe_publications)]
fame = nar.count_papers_and_attention(
    ref_genes,
    gene2pubmed[gene2pubmed['pubmed_id'].isin(df_l['pubmed_id'])])


def fun(x):
    """Return log10(x + 1), so that zero counts map to 0."""
    return np.log10(x + 1)


# One row per gene: fame measures, discovery years and log-scaled counts.
master = pd.concat([fame, initial_years], axis=1)
master['log_papers'] = fun(master['papers'])
master['log_attention'] = fun(master['attention'])

In [None]:
# Pairwise Spearman rank correlations between the columns of `master`.
master.corr(method='spearman')

In [None]:
# Year of first publication vs. log10(papers + 1).
make_hex(master.loc[:, ['first_year', 'log_papers']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/pooledyears_fy_vs_p.pdf')

In [None]:
# Year of first publication vs. log10(attention + 1).
make_hex(master.loc[:, ['first_year', 'log_attention']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/pooledyears_fy_vs_a.pdf')

In [None]:
# Year of first single-gene publication vs. log10(papers + 1).
make_hex(master.loc[:, ['first_single_year', 'log_papers']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/pooledyears_fsy_vs_p.pdf')

In [None]:
# Year of first single-gene publication vs. log10(attention + 1).
make_hex(master.loc[:, ['first_single_year', 'log_attention']])

if save_images:
    ret.export_image(
        '170801_date_of_discovery_and_knowledge/pooledyears_fsy_vs_a.pdf')