In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Load (or reload) the autoreload extension and use mode 2, which re-imports
# modules before each cell runs so edits to imported code take effect without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

from access_biology_data import annotation
from access_biology_data import meta, relations
from access_biology_data import properties as pr
from access_literature_data import medline
from access_science_shared import standardizer

In [None]:
import seaborn as sns
# Scale seaborn's font elements by a factor of 2 for every figure drawn below.
sns.set(font_scale=2)

In [None]:
# Extend the import path with ../src (relative to the notebook's working
# directory) -- presumably where the two project modules imported below
# live; TODO confirm.
import sys
sys.path.append('./../src/')
import nar170604f_occurences as nar

import resci_tools as ret

In [None]:
# Output switches read by the export steps at the end of the notebook.
save_tables = True    # write the pivoted fractions table to CSV
save_images = False   # do not export the figure to PDF

In [None]:
# Fetch publication records through the project's medline helper. Requested
# columns: the id aliased to wos_id, the PubMed id, the publication year and
# the NCBI taxon attached to the publication. `years_range` is deliberately
# not passed, so the helper's own default applies.
df_m = medline.select_medline_wos_records(
    kind='research',
    taxon_id='all',
    unambiguous=True,
    columns_sql='''
                ut2pmid.ut AS wos_id,
                medline.pubmed_id,
                medline.pubdate_year,
                pubmed2taxon.taxon_ncbi''',
)

In [None]:
# Flag rows whose NCBI taxon id is 9606 (the id this notebook treats as
# human); any other value, including a missing taxon, yields False.
df_m['is_human'] = df_m['taxon_ncbi'].eq(9606)

In [None]:
# Reduce to the three columns used downstream and collapse repeated rows, so
# each (publication, year, is_human) combination appears once.
df_m = (
    df_m
    .loc[:, ['wos_id', 'pubdate_year', 'is_human']]
    .drop_duplicates()
)

In [None]:
from access_literature_data import wos

In [None]:
# Load the 'gene-linked' subset from the project's wos helper and keep only
# the person identifier (dais_id) and the publication identifier (wos_id).
dais = wos.dais(subset='gene-linked').loc[:, ['dais_id', 'wos_id']]

In [None]:
# Report how many distinct person identifiers occur in the gene-linked subset.
n_people = len(set(dais['dais_id']))
print(n_people, ' people have participated in published gene-linked science')

In [None]:
# Attach persons to publication records. wos_id is the only column the two
# frames share, so it is the join key of this inner merge.
df = df_m.merge(dais, on='wos_id')

In [None]:
# Drop the publication id and de-duplicate: one row per person, year and
# is_human value, i.e. at most two rows per person and year.
df = (
    df
    .loc[:, ['dais_id', 'pubdate_year', 'is_human']]
    .drop_duplicates()
)

In [None]:
# Count the rows each person has in each year. Because df holds one row per
# (person, year, is_human), this is the number of distinct is_human values
# (1 or 2) for that person-year.
categories_per_year = (
    df
    .groupby(['dais_id', 'pubdate_year'])
    .size()
    .reset_index(name='categories')
)

In [None]:
# Person-years with both is_human values present; copied so that a column can
# be added without touching categories_per_year.
has_both_values = categories_per_year['categories'].eq(2)
human_and_not_human = categories_per_year.loc[has_both_values].copy()

In [None]:
# Every row in this frame is a person-year with human and non-human records.
human_and_not_human['category'] = 'both'

In [None]:
# Left-join the 'both' label onto every person-year row. The shared columns
# dais_id and pubdate_year are the join keys; rows without a match keep a
# missing category, which is filled in by the following steps.
df = df.merge(
    human_and_not_human[['dais_id', 'pubdate_year', 'category']],
    on=['dais_id', 'pubdate_year'],
    how='left',
)

In [None]:
# Mask: human rows that have not been labelled 'both'.
f = df['is_human'].eq(True) & df['category'].isna()

In [None]:
# Label the masked rows as human-only; all other rows keep their value.
df['category'] = df['category'].mask(f, 'human')

In [None]:
# Mask: non-human rows that have not been labelled 'both'.
f = df['is_human'].eq(False) & df['category'].isna()

In [None]:
# Label the masked rows as non-human-only; all other rows keep their value.
df['category'] = df['category'].mask(f, 'not_human')

In [None]:
# Sanity check: number of rows per category (rows of df, not distinct people).
df.loc[:, 'category'].value_counts()

In [None]:
# Collapse to one row per person, year and category.
# A person-year labelled 'both' still has two rows at this point, one with
# is_human True and one with is_human False. De-duplicating over all columns
# would therefore keep both rows, and those people would be counted twice in
# the per-category counts and in the yearly totals computed below.
# Dropping is_human first makes each person count once per year.
df = df[['dais_id', 'pubdate_year', 'category']].drop_duplicates()

In [None]:
# Grouping keys for the yearly tally: publication year, then category.
c = [
    'pubdate_year',
    'category',
]

In [None]:
# Rows of df per (year, category); the count lands in a column named 0,
# which is renamed in the next step.
d = df.groupby(c).size().reset_index()

In [None]:
# Give the size column produced by the group-by a descriptive name.
d = d.rename({0: 'counts'}, axis='columns')

In [None]:
# Rows of df per publication year; used as the denominator of the fractions.
people_in_year = df.loc[:, 'pubdate_year'].value_counts()

In [None]:
# Turn the yearly totals into a two-column frame: pubdate_year, total_people.
# Naming the index before resetting it gives the same column name whether or
# not value_counts() already labelled the index.
a = (
    people_in_year
    .rename_axis('pubdate_year')
    .reset_index(name='total_people')
)

In [None]:
# Attach the yearly total to every (year, category) count; pubdate_year is
# the only column the two frames share.
e = d.merge(a, on='pubdate_year')

In [None]:
# Share of each category within its year.
e['fraction'] = e['counts'].div(e['total_people'])

In [None]:
# Restrict to publication years 1970 through 2015 (the upper bound of
# np.arange is exclusive).
in_plot_window = e['pubdate_year'].isin(np.arange(1970, 2016))
g = e.loc[in_plot_window]

In [None]:
import matplotlib.pyplot as plt

In [None]:
import matplotlib.ticker as ticker


In [None]:
# Point plot of the yearly fraction of publishing people per category.
ax = sns.pointplot(
    x='pubdate_year',
    y='fraction',
    hue='category',
    data=g,
    hue_order=['not_human', 'both', 'human'])

# pointplot treats the years as categories and draws them at integer
# positions 0..n-1 in ascending order. Place one tick per decade by looking
# up the position of each decade year, instead of relabelling whatever ticks
# a MultipleLocator happens to generate (which needed a dummy first label).
years = np.sort(g['pubdate_year'].unique())
decade_positions = [i for i, year in enumerate(years) if year % 10 == 0]
ax.set_xticks(decade_positions)
ax.set_xticklabels([int(years[i]) for i in decade_positions])

plt.ylim((0, 1))
# Pass the on/off flag positionally: the keyword was `b` in older Matplotlib
# and `visible` in newer releases, and `b=` is rejected by current versions.
ax.grid(True, which='major')
plt.xlabel('Year')
plt.ylabel('Fraction of publishing workforce')
# Put a legend to the right of the current axis
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

if save_images:
    ret.export_image('170922_composition_of_biomedical_workforce/fraction_people_on_model_organisms.pdf')

In [None]:
# Wide table for export: one row per year, one column per category, cells
# holding the fraction.
out = g.pivot(
    index='pubdate_year',
    columns='category',
    values='fraction',
)

In [None]:
# Hand the pivoted table to the project's export helper when table export is
# switched on.
if save_tables:
    ret.export_full_frame(
        '170922_composition_of_biomedical_workforce/fraction_people_on_model_organisms_data.csv',
        out,
    )