In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import glob
import os
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

# from access_economic_data import nih
from access_literature_data import medline, wos
from access_science_shared import standardizer

In [None]:
sys.path.append('./../src/')
import nar170604f_occurences as nar

import ana170508f_human_citations as ana

import resci_tools as ret

In [None]:
import resci_inout as inout


In [None]:
# ealiest_year = 1980

# years_for_citation = 3

# Settings

In [None]:
# Taxon used by all taxon-dependent queries further down (reference genes,
# gene2pubmed, Medline/WoS records).
# NOTE(review): presumably an NCBI taxonomy id (9606 = Homo sapiens) — confirm.
taxon_id = 9606

In [None]:
# When True, plotting cells export their figure through ret.export_image.
save_images = False

# When True, citations and yearly citation ranks are added to the paper
# table and the citation plots are drawn.
get_citations = True
years_for_citation = 3     #  <----------    # only applies if get_citations is True

# Taxon-independent datasets

## Import of taxon-independent datasets

### Full MedLine

In [None]:
%%time
# Gene-linked Medline research articles of any taxon: retrieve the
# pubmed_id, the matching wos_id and the number of authors.
sql_columns_any_taxon = '''
            medline.pubmed_id,
            ut2pmid.ut AS wos_id,
            medline.amount_of_authors AS authors'''

df_m_any_taxon = medline.select_medline_wos_records(
    columns_sql=sql_columns_any_taxon,
    taxon_id='all',
    kind='research',
    unambiguous=True,
)

### WoS of last authors to Year

In [None]:
%%time
# Load the pre-queried WoS/DAIS table. Per the original note it holds, for
# every person occurring in gene-linked WoS, the wos id, dais id and
# publication year of each publication on which that dais is the last
# author (no further constraint, e.g. on the number of authors).
wos_dais_of_medline_pis = pd.read_csv(
    inout.get_internal_path('wos_dais/all_queried_wos_with_dais.csv.gz'),
    low_memory=False,
)

### Gene-linked WoS - DAIS

In [None]:
%%time
# For every person occurring in gene-linked WoS: retrieve wos id, dais id
# and authorship position, ordered by paper and position.
wos_dais_gene_linked = (
    wos.dais('gene-linked')
    .sort_values(['wos_id', 'position'])
    .reset_index(drop=True)
    .rename(columns={'position': 'authorship_position'})
)

position = wos_dais_gene_linked['authorship_position']

# A row is the last author of its paper when the following row belongs to
# a different wos_id (the very last row always closes a paper).
paper_ids = wos_dais_gene_linked['wos_id'].values
closes_paper = np.append(paper_ids[1:] != paper_ids[:-1], True)

# Labels are applied in increasing priority: 'senior' overwrites 'first'
# and 'second' (e.g. for one- and two-author papers); everything else
# stays 'middle'.
authorship = pd.Series('middle', index=wos_dais_gene_linked.index, dtype=object)
authorship.loc[position == 1] = 'first'
authorship.loc[position == 2] = 'second'
authorship.loc[closes_paper] = 'senior'

wos_dais_gene_linked['authorship'] = authorship

### WoS DAIS for Medline Research articles

In [None]:
# Keep only those WoS/DAIS rows whose wos_id belongs to the Medline
# research-article table loaded above (the main corpus of interest).
in_medline_research = wos_dais_gene_linked['wos_id'].isin(
    df_m_any_taxon['wos_id'])
wos_dais_gene_linked_resarch = wos_dais_gene_linked.loc[in_medline_research]

## Derivative taxon-independent datasets

In [None]:
%%time
# For all gene-linked WoS records, check whether the number of authors in
# WoS (position of the last listed author) equals the Medline author count.
wos_dais_gene_linked = (
    wos_dais_gene_linked
    .sort_values(['wos_id', 'authorship_position'])
    .reset_index(drop=True)
)

# After sorting, the last row of each wos_id carries the highest position.
paper_ids = wos_dais_gene_linked['wos_id'].values
ends_paper = np.append(paper_ids[1:] != paper_ids[:-1], True)
last_positions = wos_dais_gene_linked.loc[
    ends_paper, ['wos_id', 'authorship_position']]

# pd.merge without `on` joins on the columns shared by both frames.
joined = pd.merge(last_positions, df_m_any_taxon)
counts_agree = joined['authorship_position'] == joined['authors']

wos_medline_matching_amount_of_authors = (
    joined
    .loc[counts_agree, ['wos_id', 'pubmed_id', 'authors']]
    .rename(columns={'authors': 'authors_equal_in_dais_and_medline'})
)

### Define amount of publications as PI

In [None]:
# Defining PIs is tricky: disciplines follow different traditions and WoS
# does not allow distinguishing between publication types. Here a paper
# counts towards PI status when the author's position on it is at least
# `minimal_team_size_to_count_as_pi`; the number of such papers is then
# counted for every dais_id.

minimal_team_size_to_count_as_pi = 2

reaches_team_size = (
    wos_dais_of_medline_pis['authorship_position']
    >= minimal_team_size_to_count_as_pi
)
papers_as_pi = wos_dais_of_medline_pis.loc[
    reaches_team_size, 'dais_id'].value_counts()

# Taxon-dependent datasets

## Import of taxon-dependent datasets

## Taxon-specific gene2pubmed

In [None]:
# Import reference genes and the gene-to-pubmed links for the given taxon.
# Note that this is done on a per-taxon level, as some lesser model
# organisms do not support official nomenclature (and thus might return
# less confident genes).
# NOTE(review): the meaning of the 'rpo' argument is defined inside
# standardizer.reference_genes and is not visible here — confirm.
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
# Only links from research papers to the reference genes are requested.
gene2pubmed = medline.gene2pubmed(
    taxon_id= taxon_id,
    paper_kind='research',
    ref_genes=ref_genes)

### Taxon-specific extended WoS profile (including amount of genes)

In [None]:
# Query Medline/WoS records of the selected taxon (research articles only).
columns_sql = '''
            medline.pubmed_id,
            medline.pubdate_year,
            medline.amount_of_authors AS authors,
            ut2pmid.ut AS wos_id'''
kind = 'research'
unambiguous = True
# All years are requested when citations are going to be attached.
years_range = 'all' if get_citations else None

df_m = medline.select_medline_wos_records(
    columns_sql,
    years_range=years_range,
    taxon_id=taxon_id,
    kind=kind,
    unambiguous=unambiguous,
)

# Restrict to papers that are linked to at least one gene.
is_gene_linked = df_m['pubmed_id'].isin(gene2pubmed['pubmed_id'])
df_m = df_m.loc[is_gene_linked]

# Citations are added before de-duplication, the yearly rank afterwards.
columns_to_use = ['pubmed_id', 'wos_id', 'pubdate_year', 'authors']
if get_citations:
    df_m = ana.add_citations(df_m, years_to_include=years_for_citation)
    columns_to_use = columns_to_use + ['citations']

df_m = df_m[columns_to_use].drop_duplicates()

if get_citations:
    df_m = ana.add_yearly_citation_rank(df_m)

# < ---- have to check with other citation analysis
has_authors = df_m['authors'] > 0
df_m = df_m.loc[has_authors]

# Number of genes linked to every paper, attached as the 'genes' column.
genes_per_paper = gene2pubmed['pubmed_id'].value_counts().to_frame('genes')
df_m_specified_taxon = df_m.merge(
    genes_per_paper,
    left_on='pubmed_id',
    right_index=True,
    how='inner',
)

### Taxon specific cumulative annual fame of genes

In [None]:
# Cumulative fame of genes per year, and the median (previous-year) rank
# of the genes within each paper.
span = np.arange(1980, 2016)
fame_of_interest = 'attention'

df_h = medline.select_medline_records(
    columns_sql='''
        medline.pubmed_id,
        medline.pubdate_year''',
    taxon_id=taxon_id,
    kind='research',
)

# pd.merge without `on` joins on the columns shared by both frames.
gene2pubmed_and_year = pd.merge(gene2pubmed, df_h[['pubmed_id', 'pubdate_year']])

# One column per year: fame accumulated over all papers published up to
# and including that year.
historic_fame = pd.concat(
    [
        nar.count_papers_and_attention(
            ref_genes,
            gene2pubmed_and_year[gene2pubmed_and_year['pubdate_year'] <= year]
        )[fame_of_interest].rename(year)
        for year in span
    ],
    axis=1)

# Rank 1 goes to the largest fame value; dividing by the number of rows
# turns ranks into fractions.
historic_rank = historic_fame.rank(ascending=False) / historic_fame.shape[0]

df_ph = pd.merge(
    df_h[['pubmed_id', 'pubdate_year']],
    gene2pubmed[['gene_ncbi', 'pubmed_id']])

# Long table of ranks, keyed by the year FOLLOWING the one the rank was
# computed for, so that a paper is matched with the rank its genes had in
# the year before publication.
df_ranks_for_future = pd.concat([
    historic_rank[year]
    .to_frame('rank')
    .reset_index(drop=False)
    .assign(year_and_one=year + 1)
    for year in span
])

m = pd.merge(
    df_ph,
    df_ranks_for_future,
    left_on=['gene_ncbi', 'pubdate_year'],
    right_on=['gene_ncbi', 'year_and_one'])

# `.median()` replaces `.agg(np.median)`: same result, without relying on
# the deprecated practice of passing a NumPy reducer to groupby.agg.
median_rank_of_genes_within_paper = m[['pubmed_id', 'rank']].groupby(
    'pubmed_id').median()

## Derivative taxon-dependent datasets

### Earliest occurrences of genes

In [None]:
# Earliest publication year per gene: over all papers ('first_year') and
# over papers linked to exactly one gene ('first_solo_year').
d = pd.merge(
    df_m_specified_taxon[['pubmed_id', 'pubdate_year', 'genes']],
    gene2pubmed[['gene_ncbi', 'pubmed_id']])

is_single_gene_paper = d['genes'] == 1

# `.min()` replaces `.agg(min)`: passing the Python builtin to agg is
# deprecated in recent pandas and is announced to change behaviour.
first_year = (
    d.groupby('gene_ncbi')['pubdate_year']
    .min()
    .rename('first_year')
    .reset_index()
)
first_solo_year = (
    d.loc[is_single_gene_paper]
    .groupby('gene_ncbi')['pubdate_year']
    .min()
    .rename('first_solo_year')
    .reset_index()
)

# Outer join: genes without any single-gene paper keep a missing
# 'first_solo_year'.
genes_earliest_years = pd.merge(
    first_year,
    first_solo_year,
    on='gene_ncbi',
    how='outer')

## Personalized author statistics

In [None]:
# Collect information about individual authors. Note that this section
# might be expanded subsequently.

# NOTE(review): this constant is never read; the filter below uses
# `minimal_team_size_to_count_as_pi` (both are 2 at present). Confirm which
# threshold is intended before changing either value.
minimal_team_size_to_count_as_non_solo = 2

reaches_team_size = (
    wos_dais_of_medline_pis['authorship_position']
    >= minimal_team_size_to_count_as_pi
)

# Earliest publication year per dais_id among the qualifying rows.
# `.min()` replaces `.agg(min)`: passing the Python builtin to agg is
# deprecated in recent pandas and is announced to change behaviour.
first_year_as_non_solo_last = (
    wos_dais_of_medline_pis
    .loc[reaches_team_size, ['pubdate_year', 'dais_id']]
    .groupby('dais_id')
    .min()
)

# Join first year and paper count on the dais_id index. The 'index' entry
# of the rename covers the case where reset_index yields an unnamed index
# column.
b = pd.merge(
    first_year_as_non_solo_last,
    papers_as_pi.to_frame('papers_as_pi'),
    left_index=True,
    right_index=True,
    how='inner').reset_index().rename(
        columns={
            'index': 'dais_id',
            'pubdate_year': 'first_year_as_pi'})

# Outer join adds every dais_id seen in the gene-linked research articles;
# those without qualifying papers get a paper count of 0.
author_stats = pd.merge(
    b,
    wos_dais_gene_linked_resarch[['dais_id']].drop_duplicates(),
    how='outer')

author_stats['papers_as_pi'] = author_stats['papers_as_pi'].fillna(0)

# Analyses

## Preprocessing

In [None]:
# One row per (paper, gene) with the paper's publication year.
mlg = pd.merge(
    df_m_specified_taxon[['pubmed_id', 'pubdate_year']],
    gene2pubmed[['gene_ncbi', 'pubmed_id']])


def _papers_published_in(year_column):
    """Return the unique pubmed_ids whose publication year equals the
    `year_column` value (from genes_earliest_years) of one of their genes."""
    hits = pd.merge(
        mlg,
        genes_earliest_years[['gene_ncbi', year_column]],
        left_on=['gene_ncbi', 'pubdate_year'],
        right_on=['gene_ncbi', year_column],
        how='inner')
    return hits['pubmed_id'].unique()


papers_with_first_year = _papers_published_in('first_year')
papers_with_first_solo_year = _papers_published_in('first_solo_year')

paper_stats = df_m_specified_taxon.copy()

# Flags: paper appears in the first (solo) year of at least one of its genes.
paper_stats['premiere'] = paper_stats['pubmed_id'].isin(papers_with_first_year)
paper_stats['premiere_solo'] = paper_stats['pubmed_id'].isin(papers_with_first_solo_year)

# Coarse grouping by the number of genes linked to the paper.
gene_count = paper_stats['genes']
for label, members in (
        ('1', gene_count == 1),
        ('2-10', (gene_count > 1) & (gene_count <= 10)),
        ('11+', gene_count > 10)):
    paper_stats.loc[members, 'gene_group'] = label

# Finer grouping: single-gene papers are split by solo-premiere status.
coarse_group = paper_stats['gene_group']
is_solo_premiere = paper_stats['premiere_solo']
for label, members in (
        ('1_premiere', (coarse_group == '1') & is_solo_premiere),
        ('1_no_premiere', (coarse_group == '1') & ~is_solo_premiere),
        ('2-10', coarse_group == '2-10'),
        ('11+', coarse_group == '11+')):
    paper_stats.loc[members, 'detailed_gene_group'] = label

In [None]:
# Attach the dais_id of the first author to every paper; papers without a
# row labelled 'first' are dropped by the inner join.
is_first_author = wos_dais_gene_linked_resarch['authorship'] == 'first'
first_authors = (
    wos_dais_gene_linked_resarch
    .loc[is_first_author, ['wos_id', 'dais_id']]
    .rename(columns={'dais_id': 'dais_first_author'})
)

paper_stats = paper_stats.merge(first_authors, on='wos_id', how='inner')

In [None]:
# Cast pubmed_id to float.
# NOTE(review): presumably done so the key dtype matches
# median_rank_of_genes_within_paper in the merge that follows — confirm.
paper_stats['pubmed_id'] = paper_stats['pubmed_id'].astype(float)

In [None]:
# Ranks by gene: attach the median rank of each paper's genes as
# 'conventionality_rank' (left join; papers without a rank keep NaN).
paper_conventionality = (
    median_rank_of_genes_within_paper
    .reset_index()
    .rename(columns={'rank': 'conventionality_rank'})
)
paper_stats = pd.merge(paper_stats, paper_conventionality, how='left')

# Bin index = floor(rank * num_bins).
num_bins = 5
paper_stats['conventionality_bin'] = np.floor(
    paper_stats['conventionality_rank'] * num_bins)

In [None]:
# Rank by papers within the same year: percentile rank of
# 'conventionality_rank' among papers sharing a publication year.
paper_stats['conventionality_rank_rank'] = (
    paper_stats
    .groupby('pubdate_year')['conventionality_rank']
    .rank(pct=True)
)

# Bin index = floor(percentile * num_bins).
num_bins = 5
paper_stats['conventionality_rank_rank_bin'] = np.floor(
    paper_stats['conventionality_rank_rank'] * num_bins)

In [None]:
required_minimal_amount_of_papers_as_pi = 3

# Combine every paper with the statistics of its first author.
pool = paper_stats.merge(
    author_stats,
    left_on='dais_first_author',
    right_on='dais_id',
    how='inner')

first_pi_year = pool['first_year_as_pi']
paper_year = pool['pubdate_year']

# A first author "will be PI" when the first PI year is not before the
# paper and enough PI papers were counted for that author.
pool['will_be_pi'] = (
    (first_pi_year >= paper_year)
    & (pool['papers_as_pi'] >= required_minimal_amount_of_papers_as_pi)
)

# < --------- important: drop papers whose first author already had a
# first PI year before the paper's publication year.
was_pi_before = first_pi_year < paper_year
pool = pool.loc[~was_pi_before, :]

### Do first authors of different types of papers have different prospects to advance to PI?

In [None]:
# One row per distinct (first author, year, gene group, outcome), 1980 onwards.
mini = (
    pool[['dais_first_author', 'pubdate_year', 'detailed_gene_group', 'will_be_pi']]
    .drop_duplicates()
    .loc[lambda frame: frame['pubdate_year'] >= 1980]
)

In [None]:
# Keep only (gene group, year) combinations backed by at least 50 rows.
group_sizes = (
    mini
    .groupby(['detailed_gene_group', 'pubdate_year'])
    .size()
    .reset_index(name='n_rows')
)
keeper = group_sizes.loc[
    group_sizes['n_rows'] >= 50,
    ['detailed_gene_group', 'pubdate_year']]

In [None]:
# Share of first authors that later become PI, per year and gene group
# (restricted to the well-populated combinations listed in `keeper`).
ax = sns.pointplot(
    data=mini.merge(keeper),
    x='pubdate_year',
    y='will_be_pi',
    hue='detailed_gene_group',
    hue_order=['11+', '2-10', '1_no_premiere', '1_premiere'],
    n_boot=10000,
)

# Show only every fifth x tick label.
for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
    label.set_visible(tick_index % 5 == 0)

ax.set_ylim(0, 1)

if save_images:
    ret.export_image('170806_first_authorship_and_pi_transition/likelihood_to_transition_to_pi.pdf')

## Do different types of papers have different amounts of citations?

In [None]:
if get_citations:
    # One row per distinct paper / gene group / outcome, 1980 onwards.
    citation_columns = [
        'pubmed_id', 'pubdate_year', 'citations',
        'yearly_citation_rank', 'detailed_gene_group', 'will_be_pi']
    mini = pool[citation_columns].drop_duplicates()
    mini = mini.loc[mini['pubdate_year'] >= 1980]

    # Median yearly citation rank per year and gene group.
    ax = sns.pointplot(
        data=mini,
        x='pubdate_year',
        y='yearly_citation_rank',
        hue='detailed_gene_group',
        hue_order=['11+', '2-10', '1_no_premiere', '1_premiere'],
        estimator=np.median,
        n_boot=10000,
    )

    # Show only every fifth x tick label.
    for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
        label.set_visible(tick_index % 5 == 0)

    ax.set_ylim(0, 1)

In [None]:
if get_citations:

    # Reuses `mini` from the previous cell; keep only (gene group, year)
    # combinations backed by at least 50 rows.
    group_sizes = (
        mini
        .groupby(['detailed_gene_group', 'pubdate_year'])
        .size()
        .reset_index(name='n_rows')
    )
    keeper = group_sizes.loc[
        group_sizes['n_rows'] >= 50,
        ['detailed_gene_group', 'pubdate_year']]

    # Median citation count per year and gene group.
    ax = sns.pointplot(
        data=mini.merge(keeper),
        x='pubdate_year',
        y='citations',
        hue='detailed_gene_group',
        hue_order=['11+', '2-10', '1_no_premiere', '1_premiere'],
        estimator=np.median,
        n_boot=10000,
    )

    ax.set_ylim(0, 50)

    # Show only every fifth x tick label.
    for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
        label.set_visible(tick_index % 5 == 0)

    if save_images:
        ret.export_image('170806_first_authorship_and_pi_transition/median_citations.pdf')

## Are papers of future PIs differentially cited?

In [None]:
if get_citations:

    # All pool rows from 1980 onwards (no de-duplication in this view).
    mini = pool[['pubdate_year', 'citations', 'yearly_citation_rank', 'will_be_pi']]
    mini = mini.loc[mini['pubdate_year'] >= 1980]

    # Median yearly citation rank per year, split by later PI status.
    ax = sns.pointplot(
        data=mini,
        x='pubdate_year',
        y='yearly_citation_rank',
        hue='will_be_pi',
        estimator=np.median,
        n_boot=10000,
    )

    # Show only every fifth x tick label.
    for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
        label.set_visible(tick_index % 5 == 0)

    ax.set_ylim(0, 1)

    if save_images:
        ret.export_image('170806_first_authorship_and_pi_transition/citations_of_future_pis.pdf')

## Do first authors of papers with more unconventional genes have a lower chance of becoming PI (as implied in Hoffmann)?

In [None]:
# One row per distinct (first author, year, conventionality bin, outcome),
# 1980 onwards.
bin_columns = [
    'dais_first_author',
    'pubdate_year',
    'conventionality_bin',
    'will_be_pi']
mini = pool[bin_columns].drop_duplicates()
mini = mini.loc[mini['pubdate_year'] >= 1980]

# Keep only (bin, year) combinations backed by at least 50 rows.
group_sizes = (
    mini
    .groupby(['conventionality_bin', 'pubdate_year'])
    .size()
    .reset_index(name='n_rows')
)
keeper = group_sizes.loc[
    group_sizes['n_rows'] >= 50,
    ['conventionality_bin', 'pubdate_year']]

# Share of first authors that later become PI, per year and bin.
ax = sns.pointplot(
    data=mini.merge(keeper),
    x='pubdate_year',
    y='will_be_pi',
    hue='conventionality_bin',
    hue_order=[4, 3, 2, 1, 0],
    n_boot=10000,
)

# Show only every fifth x tick label.
for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
    label.set_visible(tick_index % 5 == 0)

if save_images:
    ret.export_image('170806_first_authorship_and_pi_transition/gene_conventionality_bins.pdf')

In [None]:
# One row per distinct (first author, year, conventionality bin, outcome),
# 1980 onwards.
bin_columns = [
    'dais_first_author',
    'pubdate_year',
    'conventionality_bin',
    'will_be_pi']
mini = pool[bin_columns].drop_duplicates()
mini = mini.loc[mini['pubdate_year'] >= 1980]

# Keep only (bin, year) combinations backed by at least 50 rows.
group_sizes = (
    mini
    .groupby(['conventionality_bin', 'pubdate_year'])
    .size()
    .reset_index(name='n_rows')
)
keeper = group_sizes.loc[
    group_sizes['n_rows'] >= 50,
    ['conventionality_bin', 'pubdate_year']]

# Only the two outermost bins (0 and 4) are compared here.
in_extreme_bin = mini['conventionality_bin'].isin([0, 4])

ax = sns.pointplot(
    data=mini.loc[in_extreme_bin].merge(keeper),
    x='pubdate_year',
    y='will_be_pi',
    hue='conventionality_bin',
    hue_order=[4, 0],
    n_boot=10000,
)

# Show only every fifth x tick label.
for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
    label.set_visible(tick_index % 5 == 0)

if save_images:
    ret.export_image('170806_first_authorship_and_pi_transition/gene_extreme_conventionality_bins.pdf')

In [None]:
# One row per distinct (first author, year, conventionality bin, outcome),
# restricted to publication years from 2000 onwards.
bin_columns = [
    'dais_first_author',
    'pubdate_year',
    'conventionality_bin',
    'will_be_pi']
mini = pool[bin_columns].drop_duplicates()
mini = mini.loc[mini['pubdate_year'] >= 2000]

# Keep only (bin, year) combinations backed by at least 50 rows.
group_sizes = (
    mini
    .groupby(['conventionality_bin', 'pubdate_year'])
    .size()
    .reset_index(name='n_rows')
)
keeper = group_sizes.loc[
    group_sizes['n_rows'] >= 50,
    ['conventionality_bin', 'pubdate_year']]

# Only the two outermost bins (0 and 4) are compared here.
in_extreme_bin = mini['conventionality_bin'].isin([0, 4])

ax = sns.pointplot(
    data=mini.loc[in_extreme_bin].merge(keeper),
    x='pubdate_year',
    y='will_be_pi',
    hue='conventionality_bin',
    hue_order=[4, 0],
    n_boot=10000,
)

# Show only every fifth x tick label.
for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
    label.set_visible(tick_index % 5 == 0)

if save_images:
    ret.export_image('170806_first_authorship_and_pi_transition/gene_extreme_conventionality_bins_since_2000.pdf')

In [None]:
# Overrides the save_images flag from the settings cell: from here on,
# plotting cells export their figure through ret.export_image.
# NOTE(review): re-running earlier plotting cells after this one will also
# export their figures — confirm this mid-notebook override is intended.
save_images = True

In [None]:
# One row per distinct (first author, year, conventionality bin, outcome),
# restricted to publication years from 2000 onwards.
bin_columns = [
    'dais_first_author',
    'pubdate_year',
    'conventionality_bin',
    'will_be_pi']
mini = pool[bin_columns].drop_duplicates()
mini = mini.loc[mini['pubdate_year'] >= 2000]

# Keep only (bin, year) combinations backed by at least 50 rows.
group_sizes = (
    mini
    .groupby(['conventionality_bin', 'pubdate_year'])
    .size()
    .reset_index(name='n_rows')
)
keeper = group_sizes.loc[
    group_sizes['n_rows'] >= 50,
    ['conventionality_bin', 'pubdate_year']]

# The two outermost bins plus the central one (0, 2 and 4).
in_selected_bin = mini['conventionality_bin'].isin([0, 2, 4])

ax = sns.pointplot(
    data=mini.loc[in_selected_bin].merge(keeper),
    x='pubdate_year',
    y='will_be_pi',
    hue='conventionality_bin',
    hue_order=[4, 2, 0],
    dodge=True,
    n_boot=10000,
)

# Show only every fifth x tick label.
for tick_index, label in enumerate(ax.xaxis.get_ticklabels()):
    label.set_visible(tick_index % 5 == 0)

if save_images:
    ret.export_image('180322_first_authorship_and_pi_transition_detailed/gene_extreme_conventionality_bins_with_center_since_2000.pdf')