In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import sys

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from access_economic_data import nih
from access_literature_data import medline, wos
from access_science_shared import standardizer

In [None]:
sys.path.append('./../src/')
# import nar170605f_funding as nar_funding
import nar170604f_occurences as nar_attention
import ana170508f_human_citations as ana
import resci_tools as ret

In [None]:
# Seaborn styling with enlarged fonts. A later cell calls sns.set(font_scale=0.5)
# before the first figure of this notebook is drawn.
sns.set(font_scale=2)

In [None]:
# Analysis parameters consumed by the cells below.
taxon_id = 9606  # presumably the NCBI taxonomy ID for human — TODO confirm
fame_of_interest = 'attention'  # key selected from the result of nar_attention.count_papers_and_attention()
span = np.arange(1980, 2016)  # years 1980..2015 (np.arange excludes the stop value)

In [None]:
# When True, every plotting cell below also writes its figure via ret.export_image().
save_images = True

# Load datasets

In [None]:
# Reference genes and gene <-> publication links for the chosen taxon.
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
gene2pubmed = medline.gene2pubmed(
    taxon_id,
    paper_kind='research',
    ref_genes=ref_genes)

# Publication records; the ut2pmid.ut column is exposed under the name wos_id.
df_m = medline.select_medline_wos_records(
    columns_sql='''
        medline.pubmed_id,
        medline.pubdate_year,
        ut2pmid.ut AS wos_id''',
    taxon_id=taxon_id,
    kind='research',
    unambiguous=True
)

# Keep the identifier/year columns, restrict to papers that occur in
# gene2pubmed, and drop duplicated rows.
df_m = (
    df_m[['pubmed_id', 'pubdate_year', 'wos_id']]
    .loc[lambda records: records['pubmed_id'].isin(gene2pubmed['pubmed_id'])]
    .drop_duplicates()
)

# Number of gene2pubmed rows per paper, attached to every record.
genes_per_paper = gene2pubmed['pubmed_id'].value_counts().to_frame('genes_per_paper')
df_m = df_m.merge(genes_per_paper, left_on='pubmed_id', right_index=True, how='left')

In [None]:
# Attach the publication year to every gene <-> paper link.
gene2pubmed_and_year = gene2pubmed.merge(df_m[['pubmed_id', 'pubdate_year']])

# For each year in span, compute the chosen fame measure from all links whose
# paper was published up to and including that year (cumulative view).
agg = []
for y in span:
    pa = nar_attention.count_papers_and_attention(
        ref_genes,
        gene2pubmed_and_year.loc[gene2pubmed_and_year['pubdate_year'] <= y]
    )[fame_of_interest]
    pa.name = y  # the year becomes the column label after concatenation
    agg.append(pa)

# One column per year.
historic_fame = pd.concat(agg, axis=1)

In [None]:
# Rank within every year column (ascending), scaled by the number of rows of
# historic_fame so that the values do not exceed 1.
historic_rank = historic_fame.rank(ascending=True).div(len(historic_fame))

# Fuse by merging

In [None]:
# Combine each paper's publication year with the genes linked to it in
# gene2pubmed (default merge on the columns both frames share).
df_pm = df_m[['pubmed_id', 'pubdate_year']].merge(
    gene2pubmed[['gene_ncbi', 'pubmed_id']])

In [None]:
# Long table of ranks: the rank column of year y is tagged with y + 1
# ('year_and_one'), the key the next cell matches against pubdate_year.
agg = [
    historic_rank[y]
    .to_frame('rank')
    .reset_index(drop=False)
    .assign(year_and_one=y + 1)
    for y in span
]

df_ranks_for_future = pd.concat(agg)

In [None]:
# Give every (paper, gene) pair the rank whose 'year_and_one' equals the
# paper's publication year, i.e. the rank computed for the preceding year.
m = df_pm.merge(
    df_ranks_for_future,
    left_on=['gene_ncbi', 'pubdate_year'],
    right_on=['gene_ncbi', 'year_and_one'])

In [None]:
# Median rank over all genes of a paper, indexed by pubmed_id.
# Uses the built-in group-by median rather than passing np.median to .agg(),
# which recent pandas versions flag with a FutureWarning.
median_rank_of_genes_within_paper = m[['pubmed_id', 'rank']].groupby('pubmed_id').median()

# Obtain secondary features

In [None]:
# Attach the per-paper median rank to the publication records. The default
# inner merge keeps only papers for which a median rank exists.
master = df_m.merge(
    median_rank_of_genes_within_paper,
    left_on='pubmed_id',
    right_index=True)

In [None]:
def add_gene_category(df_m):
    """Label every row with the size class of its ``genes_per_paper`` value.

    The string column ``gene_category`` is written as follows:

        n == 1            -> '1'
        1 < n <= 10       -> '2-10'
        10 < n <= 100     -> '11-100'
        100 < n <= 1000   -> '101-1000'
        n > 1000          -> '1001+'

    Rows that match no class (for example missing counts) keep their previous
    value, or NaN when the column is newly created. The column is always
    present afterwards, even for an empty frame.

    Parameters
    ----------
    df_m : pandas.DataFrame
        Frame with a numeric ``genes_per_paper`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    genes_per_paper = df_m['genes_per_paper']

    # Create the column up front (object dtype, all NaN) so that downstream
    # code can rely on it even when no row falls into any class.
    if 'gene_category' not in df_m.columns:
        df_m['gene_category'] = pd.Series(np.nan, index=df_m.index, dtype=object)

    category_masks = [
        ('1', genes_per_paper == 1),
        ('2-10', (genes_per_paper > 1) & (genes_per_paper <= 10)),
        ('11-100', (genes_per_paper > 10) & (genes_per_paper <= 100)),
        ('101-1000', (genes_per_paper > 100) & (genes_per_paper <= 1000)),
        ('1001+', genes_per_paper > 1000),
    ]

    for label, mask in category_masks:
        # Series.any() is vectorised; the builtin any() would iterate in Python.
        if mask.any():
            df_m.loc[mask, 'gene_category'] = label

    return df_m

# Tag every paper in master with its gene-count class ('gene_category' column).
master = add_gene_category(master)

# Retrieve stats on citing papers

In [None]:
# Make the helper package under 'unpublishable/' importable; the path is
# relative to the working directory of the notebook.
sys.path.append('./../../../../unpublishable/')

from mongodb.mongoConnect import MongoConnection
# NOTE(review): merged_papers_settings is not defined anywhere in this notebook,
# so it presumably arrives through this wildcard import — consider importing it
# by name once the contents of mongosetup are confirmed.
from mongodb.mongosetup import *

# Connection handle queried by get_master_for_category().
papers_con = MongoConnection(merged_papers_settings)

In [None]:
def get_master_for_category(list_wos_id, name_of_category):
    """Return the rows of ``master`` listed in the ``citations`` of given papers.

    Every id in ``list_wos_id`` is looked up (field ``UT``) in the collection
    behind the global ``papers_con``. The ``citations`` lists of the matching
    documents are pooled, and the rows of the global ``master`` frame whose
    ``wos_id`` occurs in that pool are returned.
    NOTE(review): judging by the 'referring_*' labels used by the callers,
    ``citations`` presumably holds the ids of papers citing the document —
    confirm against the database schema.

    Parameters
    ----------
    list_wos_id : list
        Identifiers to look up in the ``UT`` field.
    name_of_category : str
        Value written to the ``plotting_category`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Independent copy of the selected ``master`` rows with a
        ``plotting_category`` column; ``master`` itself is not modified.

    Warns
    -----
    UserWarning
        If documents lacking a ``UT`` or ``citations`` field were skipped.
    """
    import warnings

    cursor = papers_con.collection.find(
        {"UT": {"$in": list_wos_id}},
        {"UT": 1, "citations": 1}
    )

    referring_papers = dict()
    problems = 0
    for item in cursor:
        try:
            referring_papers[item['UT']] = item['citations']
        except KeyError:
            # Document without the expected field: skip it, but keep count so
            # the omission is reported instead of silently swallowed.
            problems += 1

    if problems:
        warnings.warn(
            "get_master_for_category(%r): skipped %d document(s) without "
            "'UT' or 'citations'" % (name_of_category, problems),
            stacklevel=2)

    # Flatten to a set: membership is all that matters for isin().
    referring_publications = {
        wos_id
        for citations in referring_papers.values()
        for wos_id in citations
    }

    # Filter first, then copy: avoids duplicating the whole frame and gives the
    # .loc assignment below an independent object to write to.
    master_citing = master[master['wos_id'].isin(referring_publications)].copy()
    master_citing.loc[:, 'plotting_category'] = name_of_category

    return master_citing

In [None]:
# For every gene-count class: take the wos_id values of master's papers in that
# class and fetch the master rows found through their 'citations' entries.
list_wos_id = list(master.loc[master['gene_category'] == '1', 'wos_id'].values)
master_1 = get_master_for_category(list_wos_id, 'referring_1')

list_wos_id = list(master.loc[master['gene_category'] == '2-10', 'wos_id'].values)
master_2 = get_master_for_category(list_wos_id, 'referring_2-10')

list_wos_id = list(master.loc[master['gene_category'] == '11-100', 'wos_id'].values)
master_11 = get_master_for_category(list_wos_id, 'referring_11-100')

list_wos_id = list(master.loc[master['gene_category'] == '101-1000', 'wos_id'].values)
master_101 = get_master_for_category(list_wos_id, 'referring_101-1000')

list_wos_id = list(master.loc[master['gene_category'] == '1001+', 'wos_id'].values)
master_1001 = get_master_for_category(list_wos_id, 'referring_1001+')

In [None]:
## note that filtering for single genes doesn't change much!

# Restrict every referring frame to single-gene papers (gene_category '1') and
# mark the subset by appending '_sg' to its plotting label.
master_1_sg = (
    master_1[master_1['gene_category'] == '1']
    .assign(plotting_category=lambda d: d['plotting_category'] + '_sg'))

master_2_sg = (
    master_2[master_2['gene_category'] == '1']
    .assign(plotting_category=lambda d: d['plotting_category'] + '_sg'))

master_11_sg = (
    master_11[master_11['gene_category'] == '1']
    .assign(plotting_category=lambda d: d['plotting_category'] + '_sg'))

master_101_sg = (
    master_101[master_101['gene_category'] == '1']
    .assign(plotting_category=lambda d: d['plotting_category'] + '_sg'))

master_1001_sg = (
    master_1001[master_1001['gene_category'] == '1']
    .assign(plotting_category=lambda d: d['plotting_category'] + '_sg'))

In [None]:
# Copy of master in which every row carries the plotting label 'all'.
master_all = master.assign(plotting_category='all')

In [None]:
# Label master's own rows by their gene-count class. This writes to master in
# place; master_all and the referring frames are separate copies.
master['plotting_category'] = master['gene_category']

In [None]:
# Stack all plotting groups into one long frame keyed by 'plotting_category'.
pool = pd.concat([
    master_all,
    master,
    # The unfiltered referring frames (master_1, master_2, master_11,
    # master_101, master_1001) are not pooled; only their single-gene
    # ('_sg') subsets are.
    master_1_sg,
    master_2_sg,
    master_11_sg,
    master_101_sg,
    master_1001_sg
])

In [None]:
# List the category labels present in pool (compare with `order` in the next cell).
pool['plotting_category'].unique()

In [None]:
# Plot order: 'all' first, then each gene-count class immediately followed by
# the single-gene subset of the papers referring to that class.
order = ['all'] + [
    label
    for category in ('1', '2-10', '11-100', '101-1000', '1001+')
    for label in (category, 'referring_' + category + '_sg')
]

In [None]:
# Re-apply the seaborn settings with a small font scale, replacing the
# font_scale=2 chosen near the top of the notebook.
sns.set(font_scale=0.5)

In [None]:
# Notched box plot of 'rank' for every plotting category, arranged as in `order`.
sns.boxplot(
    data=pool,
    x='plotting_category',
    y='rank',
    order=order,
    color='navajowhite',
    notch=True)

# Optionally write the figure to disk.
if save_images:
    ret.export_image(
        '170808_conventionality_and_prevalence_of_publiction_types/conventionality_per_type.pdf')

In [None]:
# Same box plot, limited to 'all' and the five gene-count classes.
sns.boxplot(
    data=pool,
    x='plotting_category',
    y='rank',
    order=['all', '1', '2-10', '11-100', '101-1000', '1001+'],
    color='navajowhite',
    notch=True)

# Optionally write the figure to disk.
if save_images:
    ret.export_image(
        '170808_conventionality_and_prevalence_of_publiction_types/conventionality_per_standard_type.pdf')

In [None]:
# Same box plot, limited to the single-gene subsets of the referring papers.
sns.boxplot(
    data=pool,
    x='plotting_category',
    y='rank',
    order=[
        'referring_1_sg',
        'referring_2-10_sg',
        'referring_11-100_sg',
        'referring_101-1000_sg',
        'referring_1001+_sg'],
    color='navajowhite',
    notch=True)

# Optionally write the figure to disk.
if save_images:
    ret.export_image(
        '170808_conventionality_and_prevalence_of_publiction_types/conventionality_per_citing_type.pdf')

In [None]:
# Share of master's rows per gene-count class; the denominator is the total
# number of rows in master.
g = master['gene_category'].value_counts().div(len(master))

In [None]:
# Pie chart of the share of papers per gene-count class. Drawn through the
# `plt` namespace imported at the top of the notebook instead of re-importing
# individual pyplot names (one of them unused) in the middle of the analysis.
plt.axis('equal');
plt.pie(g.values, labels=g.index, colors=sns.color_palette("Set2", 20));


# Optionally write the figure to disk.
if save_images:
    ret.export_image('170808_conventionality_and_prevalence_of_publiction_types/pie_chart_types.pdf')

In [None]:
# Display the per-class shares computed above.
g

In [None]:
# Absolute number of rows in master per gene-count class.
master['gene_category'].value_counts() 