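Measuring the Gini coefficient of publication attention for the top taxa. Note that inequality is increasing even within the covered genes (those with at least one publication in the given year).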

In [None]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42


import numpy as np
import pandas as pd
import seaborn as sns

from access_literature_data import medline
from access_science_shared import standardizer

import matplotlib.pyplot as plt


In [None]:
import sys
sys.path.append('./../src/')

from calculators import gini

import resci_tools as ret

In [None]:
# Output switches read by the export steps further down:
# save_images gates the figure export, save_tables gates the CSV export.
save_images, save_tables = False, True

In [None]:
# Scale seaborn's font elements by a factor of 2 for the figures drawn afterwards.
sns.set(font_scale=2)

In [None]:
# Taxon identifiers to analyse; each one becomes a column of the result tables.
# NOTE(review): presumably NCBI Taxonomy IDs (e.g. 9606 = human) — confirm.
taxa = [
    9606,
    10090,
    10116,
    7227,
    511145,
    559292,
    3702,
    7955,
    9913,
    9031,
]

In [None]:
# Years to analyse: 1980 through 2015 (np.arange excludes the stop value).
span = np.arange(1980, 2016)

# Two independent, initially empty year-by-taxon tables:
#   df_gini_covered - Gini over genes with a non-zero share in the year
#   df_gini_all     - Gini over every reference gene (zeros included)
df_gini_covered, df_gini_all = (
    pd.DataFrame(index=span, columns=taxa) for _ in range(2)
)


In [None]:
# For every taxon, fill df_gini_covered / df_gini_all with one Gini coefficient
# per year, describing how unevenly that year's publications are spread over
# the taxon's reference genes.
for taxon_id in taxa:
    ref_genes = standardizer.reference_genes(taxon_id, 'rp')  # research, protein-coding

    # Gene <-> paper links, restricted to the reference genes
    gene2pubmed = medline.gene2pubmed(
        taxon_id, ['pubmed_id', 'gene_ncbi'], paper_kind='research')
    gene2pubmed = gene2pubmed[gene2pubmed['gene_ncbi'].isin(ref_genes)]
    ml = medline.select_medline_records(
            columns_sql='''
                medline.pubmed_id,
                medline.pubdate_year''',
            taxon_id=taxon_id,
            kind='research',
            )

    # Attach the publication year; pd.merge joins on the column(s) shared by
    # both frames (presumably only pubmed_id - depends on what medline returns).
    gene2pubmed = pd.merge(gene2pubmed, ml)

    # Fractional counting: a paper occurring in k rows contributes 1/k per row.
    # The weight does not depend on the year, so it is computed once per taxon
    # instead of being recomputed and re-merged inside the year loop.
    rows_per_paper = gene2pubmed['pubmed_id'].map(
        gene2pubmed['pubmed_id'].value_counts())
    gene2pubmed['value_per_paper'] = 1 / rows_per_paper

    for y in span:
        in_year = gene2pubmed['pubdate_year'] == y
        if not in_year.any():
            # No papers in this year: leave both table cells unset (NaN)
            continue

        yearly = gene2pubmed.loc[in_year, ['gene_ncbi', 'value_per_paper']]

        # Share of the year's total weight that falls on each gene
        share = (
            yearly.groupby('gene_ncbi')['value_per_paper'].sum()
            / yearly['value_per_paper'].sum()
        )

        # Bring in reference genes without any paper this year as zeros.
        # reindex is used because .loc with a list raises KeyError on
        # pandas >= 1.0 as soon as one label is missing.
        share = share.reindex(ref_genes).fillna(0)
        share = share.sort_values(ascending=False)

        df_gini_covered.loc[y, taxon_id] = gini(share[share > 0].values)
        df_gini_all.loc[y, taxon_id] = gini(share.values)

In [None]:
# Preview the first five rows of the all-genes Gini table
df_gini_all.head(n=5)

In [None]:
# Line plot of the all-genes Gini coefficient over time, one line per taxon.

# Request one hue per plotted column so that no two lines share a color.
# At least 12 hues are kept, so figures with up to 12 taxa look as before.
n_hues = max(12, df_gini_all.shape[1])
sns.set_palette(sns.color_palette("hls", n_hues))

ax = df_gini_all.plot()
# Put the legend outside the axes, to the right of the plot area
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))

ax.set_xlabel('Year')
ax.set_ylabel('Gini Coefficient')
if save_images:
    ret.export_image('170626_gini_attention_years/top_taxa.pdf')

In [None]:
# Name the row index 'year' (in place, on the existing index object)
df_gini_all.index.rename('year', inplace=True)

In [None]:
# Write the table to disk when table export is enabled.
# Only the column for taxon 9606 is exported; the double brackets keep the
# selection a one-column DataFrame rather than a Series.
if save_tables:
    selected = df_gini_all[[9606]]
    ret.export_full_frame('170626_gini_attention_years/data.csv', selected)
