In [None]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

import matplotlib as mpl
# Global matplotlib settings applied to every figure drawn below.
# fonttype 42 asks the PDF backend for TrueType fonts (see matplotlib rcParams docs).
mpl.rcParams['pdf.fonttype'] = 42
# NOTE(review): assumes an 'Arial' font is installed on the machine running this notebook.
mpl.rcParams['font.family'] = 'Arial'

import numpy as np
import pandas as pd
# import seaborn as sns

from access_literature_data import medline
from access_science_shared import standardizer

import matplotlib.pyplot as plt

In [None]:
import sys
# Extend the import search path with the project's ./../src/ directory.
# The two imports below are presumably resolved from there, so this line must
# run first — TODO confirm the modules' location.
sys.path.append('./../src/')
import nar171107f_citation_network as cit
import resci_tools as ret

In [None]:
# Output switches read by the export cells further down:
# figures are not written to disk, tables are.
save_images, save_tables = False, True

In [None]:
# get full citation network of research publications linked to pubmed
# Later cells read the 'citing' and 'cited' columns of this frame.
pmid_2_pmid = cit.gene_pmid_research_gene_pmid_research()

In [None]:
# Publication year per PubMed ID, restricted to unambiguous research records
# across all taxa. The query options are collected first, then passed on.
medline_year_query = dict(
    columns_sql='''
        medline.pubmed_id,
        medline.pubdate_year''',
    taxon_id='all',
    kind='research',
    unambiguous=True,
)
df_m = medline.select_medline_wos_records(**medline_year_query)

In [None]:
# For every PubMed ID: is it linked to a human gene (taxon 9606) and/or to a
# gene of any other taxon? A paper linked to both ends up with two rows.
gene2pubmed = medline.gene2pubmed(taxon_id='all')
taxon_pubmed = gene2pubmed.assign(
    has_human=gene2pubmed['taxon_ncbi'].eq(9606),
    has_not_human=gene2pubmed['taxon_ncbi'].ne(9606),
)
taxon_flag_columns = ['pubmed_id', 'has_human', 'has_not_human']
taxon_pubmed = taxon_pubmed[taxon_flag_columns].drop_duplicates()

In [None]:
# Reference gene set and gene-linked research papers for taxon 9606 (human).
taxon_id = 9606

# 'orp': official nomenclature, literature, protein-coding
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

ref_gene2pubmed = medline.gene2pubmed(
    taxon_id,
    ['pubmed_id', 'gene_ncbi'],
    paper_kind='research',
)

In [None]:
# Year in which each reference gene was first reported.
import nar170902_discovery_year as yr

initial_year = yr.get_year_of_discovery(taxon_id, ref_genes).reset_index()

# Rename 'first_year' so it can act as a join key against publication years,
# and tag every (gene, year) pair as an initial report.
initial = (
    initial_year
    .loc[:, ['gene_ncbi', 'first_year']]
    .rename(columns={'first_year': 'pubdate_year'})
    .assign(initial_report=True)
)

In [None]:
# Inner join on the columns both frames share: keeps only publications that
# are linked to a human reference gene (one row per paper/gene pair).
df_m_human_gene = df_m.merge(ref_gene2pubmed)

In [None]:
# Attach the initial-report flag: a (paper, gene) row is flagged when the
# paper's publication year equals the gene's first-report year. Rows without a
# match keep a missing value in 'initial_report'.
df_m_human_gene = df_m_human_gene.merge(
    initial,
    on=['pubdate_year', 'gene_ncbi'],
    how='left',
)

In [None]:
# After the left merge 'initial_report' is either True or missing. Turn it
# into a proper boolean column with an explicit comparison instead of
# fillna(False): the implicit object -> bool downcast that fillna performs is
# deprecated in recent pandas, and without it the column would stay object-typed.
# The comparison is idempotent, so re-running the cell is safe.
df_m_human_gene['initial_report'] = df_m_human_gene['initial_report'].eq(True)

In [None]:
# Drop the gene dimension: keep paper, year and the initial-report flag only.
paper_level_columns = ['pubmed_id', 'pubdate_year', 'initial_report']
df_m_human = df_m_human_gene.loc[:, paper_level_columns].drop_duplicates()

In [None]:
# Collapse to one row per paper: a paper counts as an initial report if it is
# one for at least one gene.
# NOTE(review): the later cells visible here merge df_m_human_gene rather than
# this frame.
df_m_human = (
    df_m_human
    .groupby(['pubmed_id', 'pubdate_year'])
    .agg(any)
    .reset_index()
)

In [None]:
# Annotate every citation edge with the citing paper's year, gene and
# initial-report flag. The join key 'pubmed_id' duplicates 'citing' and is
# removed afterwards. `columns=` is used because passing the axis positionally
# (`.drop('pubmed_id', 1)`) is rejected by pandas >= 2.0.
master = pd.merge(
    pmid_2_pmid,
    df_m_human_gene,
    left_on='citing',
    right_on='pubmed_id',
).drop(columns='pubmed_id')

In [None]:
# Annotate the cited side of each edge with its human / non-human flags.
# Left join: edges whose cited paper has no entry in taxon_pubmed are kept
# with missing flags. Note that the result carries a 'pubmed_id' column again,
# this time holding the cited paper's ID.
master = master.merge(
    taxon_pubmed,
    how='left',
    left_on='cited',
    right_on='pubmed_id',
)

In [None]:
# Per citing paper: summed initial-report flags and summed human / non-human
# flags over its rows in master.
count_columns = ['initial_report', 'has_human', 'has_not_human']
m = (
    master
    .groupby(['citing'])[count_columns]
    .agg(sum)
    .reset_index()
)

In [None]:
# True when a citing paper has strictly more non-human than human flags.
m['excess_not_human'] = m['has_not_human'].gt(m['has_human'])

In [None]:
# Binarise the per-paper values (anything > 0 becomes True) and attach the
# publication year of each citing paper.
flags_by_citing = m.set_index('citing').gt(0).reset_index()
year_by_citing = master[['citing', 'pubdate_year']].drop_duplicates()
a = flags_by_citing.merge(year_by_citing)

In [None]:
import seaborn as sns

In [None]:
# Fraction of papers with excess non-human citations, per publication year
# 1980-2000, split by whether the paper is an initial report.
years_shown = np.arange(1980, 2001)
sns.barplot(
    data=a[a['pubdate_year'].isin(years_shown)],
    x='pubdate_year',
    y='excess_not_human',
    hue='initial_report',
)

if save_images:
    ret.export_image('180701_whom_do_initial_reports_cite/more_non_human_by_year.pdf')

In [None]:
# Same comparison pooled over all years: initial reports vs. other papers.
sns.barplot(
    data=a,
    x='initial_report',
    y='excess_not_human',
)

if save_images:
    ret.export_image('180701_whom_do_initial_reports_cite/more_non_human.pdf')

In [None]:
from access_biology_data import meta



In [None]:
# Initial reports (citing papers) that cite at least one paper linked to a
# non-human gene.
# The IDs are taken from 'citing': after the left merge with taxon_pubmed,
# master['pubmed_id'] holds the *cited* paper's ID, so selecting it would
# filter pmid_2_pmid['citing'] below by the wrong side of the edge.
# `== True` is kept on purpose: the flag columns may contain missing values.
initial_reports_citing_non_human = master.loc[
    (master['has_not_human'] == True) & (master['initial_report'] == True),
    'citing'
].unique()

# All citation edges going out from those initial reports.
d = pmid_2_pmid[pmid_2_pmid['citing'].isin(initial_reports_citing_non_human)]

# Fractional weight per gene2pubmed row: 1 / (number of rows of that paper),
# so each cited paper contributes a total weight of one.
v = gene2pubmed['pubmed_id'].value_counts().to_frame('value')
v.index.name = 'pubmed_id'
v = v.reset_index()
v['value'] = 1 / v['value']

# Attach weights to the gene links and join them onto the cited side.
g = pd.merge(gene2pubmed, v)
dd = pd.merge(
    d,
    g.drop('gene_ncbi', axis=1).rename(columns={'pubmed_id': 'cited'})
)

# Only non-human taxa are of interest here.
dd = dd[dd['taxon_ncbi'] != 9606]

# Summed weight per taxon, largest first, plus its share of the total.
orig = dd[['taxon_ncbi', 'value']].groupby('taxon_ncbi').agg(sum).sort_values('value', ascending=False)
orig['share'] = orig['value'] / orig['value'].sum()

# Show the ten most-cited taxa with their names.
for t in orig.index[:10]:
    print(t, meta.taxon_name(t))

In [None]:
# Keep a snapshot of the per-taxon shares computed above; a later cell
# reassigns `orig` for a second group of initial reports.
orig_non_human = orig.copy()

In [None]:
# Second group: initial reports (citing papers) with at least one cited paper
# whose row is flagged has_not_human == False.
# NOTE(review): the variable name is reused from the previous cell although
# the condition here is has_not_human == False.
# The IDs are taken from 'citing': after the left merge with taxon_pubmed,
# master['pubmed_id'] holds the *cited* paper's ID, so selecting it would
# filter pmid_2_pmid['citing'] below by the wrong side of the edge.
# Rows with a missing flag do not satisfy `== False` and are left out.
initial_reports_citing_non_human = master.loc[
    (master['has_not_human'] == False) & (master['initial_report'] == True),
    'citing'
].unique()

# All citation edges going out from those initial reports.
d = pmid_2_pmid[pmid_2_pmid['citing'].isin(initial_reports_citing_non_human)]

# Fractional weight per gene2pubmed row: 1 / (number of rows of that paper),
# so each cited paper contributes a total weight of one.
v = gene2pubmed['pubmed_id'].value_counts().to_frame('value')
v.index.name = 'pubmed_id'
v = v.reset_index()
v['value'] = 1 / v['value']

# Attach weights to the gene links and join them onto the cited side.
g = pd.merge(gene2pubmed, v)
dd = pd.merge(
    d,
    g.drop('gene_ncbi', axis=1).rename(columns={'pubmed_id': 'cited'})
)

# Only non-human taxa are of interest here.
dd = dd[dd['taxon_ncbi'] != 9606]

# Summed weight per taxon, largest first, plus its share of the total.
orig = dd[['taxon_ncbi', 'value']].groupby('taxon_ncbi').agg(sum).sort_values('value', ascending=False)
orig['share'] = orig['value'] / orig['value'].sum()

# Show the ten most-cited taxa with their names.
for t in orig.index[:10]:
    print(t, meta.taxon_name(t))

In [None]:
# Put both per-taxon tables side by side (taxa present in both only); the
# suffixes tell the two 'value' / 'share' column sets apart.
m = orig_non_human.reset_index().merge(
    orig.reset_index(),
    on='taxon_ncbi',
    suffixes=('_citing_organims', '_also_citing_humans'),
)

In [None]:
# log2 ratio of the two shares: positive when a taxon's share is larger in
# the first group than in the second.
share_ratio = m['share_citing_organims'].div(m['share_also_citing_humans'])
m['enrichment'] = np.log2(share_ratio)

In [None]:
# Index by taxon ID so names can be attached by index in the following cells.
m = m.set_index(keys='taxon_ncbi')

In [None]:
# Name of every taxon, in the order of m's index.
organism_names = list(map(meta.taxon_name, m.index))

In [None]:
# Single-column frame of organism names, aligned to m's taxon index.
patch_name = pd.DataFrame({'organism_name': organism_names}, index=m.index)

In [None]:
# Append the organism names as an extra column (aligned on the taxon index).
mm = pd.concat([m, patch_name], axis='columns')

In [None]:
# Keep only the columns needed for the figure and the exported table.
report_columns = [
    'organism_name',
    'share_citing_organims',
    'share_also_citing_humans',
    'enrichment',
]
mm = mm[report_columns]

In [None]:
# Largest share in the first group comes first.
mm = mm.sort_values(by='share_citing_organims', ascending=False)

In [None]:
# log10 of the first group's share, stored as an additional column.
mm['log_share'] = np.log10(mm['share_citing_organims'])

In [None]:
# Scatter (no regression line) of share vs. enrichment for the ten taxa with
# the largest share.
sns.lmplot(
    data=mm.head(10),
    x='enrichment',
    y='share_citing_organims',
    fit_reg=False,
)

if save_images:
    ret.export_image('180701_whom_do_initial_reports_cite/share_of_most_cited.pdf')

In [None]:
# Display the ten top-ranked taxa.
mm.head(10)

In [None]:
# Table for export: organism name plus both shares, with reader-friendly
# column headers. The rename key must match the actual column name
# 'share_citing_organims' (spelled as in the merge suffix); the previous key
# 'share_citing_organism' matched nothing, so that column was never renamed.
out = mm[['organism_name', 'share_citing_organims', 'share_also_citing_humans']].rename(
    columns={
        'share_citing_organims': 'share_citing_model_organism',
        'organism_name': 'model organism',
    }
)

if save_tables:
    ret.export_full_frame('180701_whom_do_initial_reports_cite/data.csv', out)