In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload imported modules automatically before each cell runs, so edits to
# the local analysis modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import re
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from access_biology_data import meta
from access_literature_data import medline
from access_science_shared import standardizer

sys.path.append('./../src/')
import ana170508f_human_citations as ana

import resci_tools as ret

In [None]:
# Apply seaborn's default styling with all font elements scaled up 2x.
sns.set(font_scale=2)

In [None]:
# Notebook-wide settings read by the cells below.
# NCBI Taxonomy ID of the organism to analyse (9606 = Homo sapiens).
taxon_id = 9606
# Passed to the loader as `years_to_include`.
years_to_monitor = 8   # as in Uzzi et al. 2013
# When True, every figure cell also calls ret.export_image(...).
save_image = True

In [None]:
# Load the reference gene set and the literature table, then derive the
# per-paper columns used below ('yearly_citation_rank', 'team_scale').
# NOTE(review): the helpers live in project modules not visible here; the
# column names are inferred from how later cells use the result — confirm.
ref_genes = standardizer.reference_genes(
    taxon_id, 'rop')
df_medline_wos = ana.load_medline_wos_of_taxon(   # will filter for research
    taxon_id, years_to_include=years_to_monitor, kind='research', unambiguous=True)
df_medline_wos = ana.filter_for_papers_with_reference_genes(
    taxon_id, df_medline_wos, ref_genes)
df_medline_wos = ana.add_yearly_citation_rank(
    df_medline_wos)
df_medline_wos = ana.add_team_scale(
    df_medline_wos)
gene2pubmed = ana.load_shared_gene2pubmed(
    taxon_id, df_medline_wos, ref_genes)
# No `on=` is given, so pandas joins on every column name the two frames
# share (default inner join).
# NOTE(review): presumably only 'pubmed_id' is shared — verify.
gene2medlinewos = pd.merge(
    gene2pubmed, df_medline_wos)

In [None]:
# Count how many gene links each paper has: one row per pubmed_id, with the
# number of its occurrences in gene2medlinewos stored in column 'genes'.
genes_per_paper = (
    gene2medlinewos['pubmed_id']
    .value_counts()
    .to_frame('genes')
    .rename_axis('pubmed_id')
)

In [None]:
# Attach the per-paper gene count ('genes') to the literature table; the
# inner join keeps only papers that appear in genes_per_paper.
master = df_medline_wos.merge(
    genes_per_paper,
    how='inner',
    left_on='pubmed_id',
    right_index=True)

In [None]:
def get_high_fraction_impact(df, threshold=0.95):
    """Return the fraction of rows whose yearly citation rank is high.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'yearly_citation_rank' column.
    threshold : float, optional
        Minimal rank (inclusive) that counts as high impact. Defaults to
        0.95, the cut-off used throughout this notebook.

    Returns
    -------
    float
        Number of rows with rank >= threshold divided by the total number
        of rows. Rows with a missing rank count in the denominator only.
        NaN is returned when df has no rows.
    """
    s = df['yearly_citation_rank']
    total = len(s)
    if total == 0:
        # Nothing to rate; report "undefined" instead of raising on 0 / 0.
        return float('nan')
    # Summing the boolean mask counts the rows at or above the threshold.
    return int((s >= threshold).sum()) / total

In [None]:
# For every team scale, and for every gene count g observed in that team
# scale, compute over the papers with at least g genes:
#   - 'fraction_high_impact': share of high-impact papers
#   - 'papers': number of such papers
# One frame per team scale is collected in `agg`.
agg = []

for t in master['team_scale'].unique():
    dff = master.loc[master['team_scale'] == t, :]

    # Collect plain records and build the frame once, rather than growing
    # a DataFrame cell by cell through .loc (slow, and it would store the
    # integer paper count as float).
    records = []
    for g in sorted(dff['genes'].unique()):
        dfff = dff[dff['genes'] >= g]
        records.append({
            'minimal_amount_of_genes': g,
            'team_scale': t,
            'fraction_high_impact': get_high_fraction_impact(dfff),
            'papers': dfff.shape[0],
        })

    agg.append(pd.DataFrame(
        records,
        columns=[
            'minimal_amount_of_genes',
            'team_scale',
            'fraction_high_impact',
            'papers']))

In [None]:
# Scatter of the high-impact fraction against the minimal gene count, one
# colour per team scale. Points backed by fewer than this many papers are
# dropped before plotting.
minimal_amount_of_papers = 100

d = pd.concat(agg, axis=0)
# sns.lmplot returns a FacetGrid rather than an Axes (despite the name
# `ax`), hence the set_xlabels / set_ylabels calls below.
# fit_reg=False leaves only the scatter points.
ax = sns.lmplot(
    x='minimal_amount_of_genes', 
    y='fraction_high_impact', 
    data=d[d['papers']>=minimal_amount_of_papers], 
    hue='team_scale',
    hue_order=['single','pair', 'team'],
    fit_reg=False,
    aspect=1.5
)

# plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))



ax.set_xlabels('Minimal amount of genes')
ax.set_ylabels('Fraction high impact')

# NOTE(review): ret.export_image presumably saves the current figure under
# the given relative path — confirm in resci_tools.
if save_image:
    ret.export_image(
        '170630_number_of_genes_and_high_impact/increasing_minimal_amount_of_genes_taxon_{}.pdf'.format(
            taxon_id))

In [None]:
def add_gene_category(df_m):
    """Bin the per-paper gene count into a coarse 'gene_category' label.

    Parameters
    ----------
    df_m : pandas.DataFrame
        Must contain a 'genes' column holding the number of genes per
        paper (expected to be positive integer counts).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is modified in place: the
        column 'gene_category' is (re)written with one of the strings
        '1', '2-10', '11-100', '101-1000', '1001+'. Rows whose gene count
        is missing or not positive get NaN.

    Notes
    -----
    The column is always created and fully overwritten, so re-running the
    function never leaves stale labels behind and downstream code can rely
    on the column being present even if no row falls into any bin.
    """
    # Right-closed bins: (0, 1], (1, 10], (10, 100], (100, 1000], (1000, inf)
    bin_edges = [0, 1, 10, 100, 1000, np.inf]
    bin_labels = ['1', '2-10', '11-100', '101-1000', '1001+']

    categories = pd.cut(
        df_m['genes'],
        bins=bin_edges,
        labels=bin_labels,
        right=True)

    # Store plain strings (object dtype) rather than a Categorical, and
    # assign positionally so no index alignment takes place.
    df_m['gene_category'] = categories.astype(object).to_numpy()

    return df_m

In [None]:
# Label every paper with its gene-count bin (column 'gene_category').
master = add_gene_category(master)

In [None]:
# Distribution of the yearly citation rank per gene-count category, split
# by team scale. `order` pins the categories to ascending gene count;
# without it seaborn uses the order in which the labels first appear in
# the data, which would not match the bar plot further down.
ax = sns.boxplot(
    x='gene_category',
    y='yearly_citation_rank',
    hue='team_scale',
    data=master,
    order=[
        '1',
        '2-10',
        '11-100',
        '101-1000',
        '1001+'],
    hue_order=['single','pair', 'team'],
    notch=True)
# Place the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

ax.set_xlabel('Amount of genes')
ax.set_ylabel('Annual citation rank')

# NOTE(review): ret.export_image presumably saves the current figure under
# the given relative path — confirm in resci_tools.
if save_image:
    ret.export_image(
        '170630_number_of_genes_and_high_impact/gene_category_rank_taxon_{}.pdf'.format(
            taxon_id))

In [None]:
# log10(citations + 1): the +1 keeps papers with zero citations finite.
# Computed on the whole column at once instead of a Python-level
# apply(lambda ...) call per row.
master['add_one_then_log'] = np.log10(master['citations'] + 1)

In [None]:
# Distribution of log10(citations + 1) per gene-count category, split by
# team scale. `order` pins the categories to ascending gene count; without
# it seaborn uses the order in which the labels first appear in the data,
# which would not match the bar plot further down.
ax = sns.boxplot(
    x='gene_category',
    y='add_one_then_log',
    hue='team_scale',
    data=master,
    order=[
        '1',
        '2-10',
        '11-100',
        '101-1000',
        '1001+'],
    hue_order=['single','pair', 'team'],
    notch=True)
# Place the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

ax.set_xlabel('Amount of genes')
ax.set_ylabel('Citations + 1, log10')

# NOTE(review): ret.export_image presumably saves the current figure under
# the given relative path — confirm in resci_tools.
if save_image:
    ret.export_image(
        '170630_number_of_genes_and_high_impact/gene_category_citations_taxon_{}.pdf'.format(
            taxon_id))

In [None]:
# Boolean flag: True for papers whose yearly citation rank is at least 0.95.
master['is_high_impact'] = master['yearly_citation_rank'].ge(0.95)

In [None]:
# Mean of the boolean 'is_high_impact' flag (i.e. the fraction of
# high-impact papers) per gene-count category and team scale; seaborn's
# barplot draws an error bar around each mean by default.
# NOTE(review): unlike the box plots above, no axis labels are set here, so
# the raw column names appear on the figure — confirm this is intended.
ax = sns.barplot(
    x='gene_category',
    y='is_high_impact',
    data=master,
    hue='team_scale',
    order=[
        '1',
        '2-10',
        '11-100',
        '101-1000',
        '1001+'],       
    hue_order=[
        'single', 'pair', 'team']
)
# Place the legend outside the axes, to the right.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

# NOTE(review): ret.export_image presumably saves the current figure under
# the given relative path — confirm in resci_tools.
if save_image:
    ret.export_image(
        '170630_number_of_genes_and_high_impact/fraction_high_impact_taxon_{}_with_error.pdf'.format(
            taxon_id))