In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load IPython's autoreload extension and reload all modules before each cell
# is executed, so edits to the imported analysis modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import re
import sys

import numpy as np
import pandas as pd
import seaborn as sns

from access_biology_data import meta, relations
from access_literature_data import medline
from access_science_shared import standardizer

sys.path.append('./../src/')
import ana170508f_human_citations as ana
import nar170604f_occurences as nar

import matplotlib.pyplot as plt

import resci_tools as ret

In [None]:
# Taxon to analyse; 9606 is the NCBI Taxonomy id of Homo sapiens.
taxon_id = 9606
# Passed as `years_to_include` to ana.add_citations when citations are attached.
years_for_citation = 8
# Inclusive lower bound applied to `pubdate_year`. The (misspelled) name is kept
# as is because the record-filtering cell refers to it.
ealiest_year = 1980

# When True, the first figure is written to disk through ret.export_image.
save_images = True

# Prepare datasets

In [None]:
# Reference gene set for the taxon.
# NOTE(review): the meaning of the 'rpo' selector is defined in
# access_science_shared.standardizer and is not visible here — confirm.
ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
# Gene-to-publication links, requested with paper_kind='research' and the
# reference genes; only its 'pubmed_id' column is used further down.
gene2pubmed = medline.gene2pubmed(taxon_id, paper_kind='research', ref_genes = ref_genes)

In [None]:
# Build the per-publication table in a single pipeline. The order of the steps
# matters: citations are attached before the year / gene filters, and the
# yearly citation rank is computed last, on the filtered set only.
df_m = (
    medline.select_medline_wos_records(
        columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year,
            medline.amount_of_authors,
            medline.j_name_s''',
        years_range='all',
        taxon_id=taxon_id,
        kind='research',
        unambiguous=True)
    # exclude consortia paper (-1)
    .loc[lambda rec: rec['amount_of_authors'] > 0]
    .pipe(ana.add_citations, years_to_include=years_for_citation)
    .loc[:, ['pubmed_id', 'pubdate_year', 'amount_of_authors', 'j_name_s', 'citations']]
    # drop everything published before the configured first year
    .loc[lambda rec: rec['pubdate_year'] >= ealiest_year]
    # keep only publications that are linked to at least one gene
    .loc[lambda rec: rec['pubmed_id'].isin(gene2pubmed['pubmed_id'])]
    .drop_duplicates()
    .pipe(ana.add_yearly_citation_rank)
)

In [None]:
# Load the BioGRID interaction table for the configured taxon; the columns
# needed by this notebook are selected and renamed in the next cell.
biogrid = relations.biogrid(taxon_id)

In [None]:
# BioGRID column header -> short column name used throughout this notebook.
# NOTE(review): BioGRID's tab-delimited format describes Interactor A as the
# bait and Interactor B as the prey/hit, which looks reversed relative to this
# mapping — confirm before relying on the separate prey / bait counts.
names_to_use = {
    'Entrez Gene Interactor A': 'entrez_prey',
    'Entrez Gene Interactor B': 'entrez_bait',
    'Experimental System': 'system',
    'Throughput': 'throughput',
    'Pubmed ID': 'pubmed_id',
}

# Keep only the mapped columns (in mapping order) and apply the short names.
biogrid = biogrid[list(names_to_use)].rename(columns=names_to_use)

In [None]:
# Keep interactions whose two partners are both reference genes, then collapse
# rows that became exact repeats.
prey_in_reference = biogrid['entrez_prey'].isin(ref_genes)
bait_in_reference = biogrid['entrez_bait'].isin(ref_genes)
f = prey_in_reference & bait_in_reference
biogrid = biogrid.loc[f, :].drop_duplicates()

In [None]:
# Per publication: number of distinct prey genes ('preys') and distinct bait
# genes ('baits'), indexed by pubmed_id.
# The built-in nunique() does the counting in vectorised code instead of
# calling a Python lambda once per group and column. It ignores missing
# values; none are expected here because both id columns were filtered
# against ref_genes (assuming ref_genes itself holds no missing values).
c_prey_bait = (
    biogrid
    .groupby('pubmed_id')[['entrez_prey', 'entrez_bait']]
    .nunique()
    .rename(columns={'entrez_prey': 'preys', 'entrez_bait': 'baits'})
)

In [None]:
# Per publication: number of distinct genes in any role ('genes'), indexed by
# pubmed_id. Bait and prey ids are stacked into one 'entrez' column first so a
# gene that occurs in both roles of the same publication is counted once.
# nunique() does the counting in vectorised code (no per-group Python lambda)
# and ignores missing values, which are not expected after the reference-gene
# filter.
c_all = (
    pd.concat([
        biogrid[['entrez_bait', 'pubmed_id']].rename(columns={'entrez_bait': 'entrez'}),
        biogrid[['entrez_prey', 'pubmed_id']].rename(columns={'entrez_prey': 'entrez'}),
    ])
    .groupby('pubmed_id')[['entrez']]
    .nunique()
    .rename(columns={'entrez': 'genes'})
)

In [None]:
# Put the prey/bait counts and the overall gene counts side by side (both are
# indexed by pubmed_id), then turn the index back into a regular column.
gene_counts = pd.concat(
    [c_prey_bait, c_all],
    axis='columns',
).reset_index()

In [None]:
# One row per distinct (system, throughput, publication) combination, with the
# per-publication gene counts attached. 'pubmed_id' is the only column the two
# frames share, so it is named explicitly as the join key.
experiments_per_paper = biogrid[['system', 'throughput', 'pubmed_id']].drop_duplicates()
biogrid_plus = experiments_per_paper.merge(gene_counts, on='pubmed_id')

# Analysis

In [None]:
# Inner join of the publication records with the BioGRID annotations. No key is
# given, so pandas joins on all columns the two frames have in common
# (presumably just 'pubmed_id' — confirm against the columns that
# ana.add_yearly_citation_rank adds). A publication appears once for each of
# its (system, throughput) combinations.
d = pd.merge(df_m, biogrid_plus)

In [None]:
# Flag publications whose yearly citation rank is at least 0.95.
# NOTE(review): this presumably means "top 5 % of its publication year";
# the scale of the rank is defined in ana.add_yearly_citation_rank — confirm.
d['is_hitpaper'] = d['yearly_citation_rank'].ge(0.95)

In [None]:
def add_gene_category(df_m):
    """Label every row with the size class of its gene count.

    Sets the column 'gene_category' from the numeric column 'genes':

        genes == 1            -> '1'
        1 < genes <= 10       -> '2-10'
        10 < genes <= 100     -> '11-100'
        100 < genes <= 1000   -> '101-1000'
        genes > 1000          -> '1001+'

    Rows that fall into none of the classes (for example a count below 1 or
    a missing count) are left as NaN.

    The column is always created, even when the frame is empty or no row
    matches any class, so that code which reads 'gene_category' afterwards
    does not fail with a KeyError.

    Parameters
    ----------
    df_m : pandas.DataFrame
        Frame with a 'genes' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in, with 'gene_category' set.
    """
    genes = df_m['genes']

    # (label, membership mask) for each size class; the classes are disjoint.
    size_classes = [
        ('1', genes == 1),
        ('2-10', (genes > 1) & (genes <= 10)),
        ('11-100', (genes > 10) & (genes <= 100)),
        ('101-1000', (genes > 100) & (genes <= 1000)),
        ('1001+', genes > 1000),
    ]

    # Start from an all-missing object column so that string labels can be
    # written without a dtype change and unmatched rows stay NaN.
    df_m['gene_category'] = np.full(len(df_m), np.nan, dtype=object)
    for label, in_class in size_classes:
        df_m.loc[in_class, 'gene_category'] = label

    return df_m

In [None]:
# Attach the 'gene_category' size class to every row; the function writes the
# column into `d` itself and returns the same frame.
d = add_gene_category(d)

In [None]:
# Fraction of hit papers per gene-count class (bar height is the mean of the
# boolean 'is_hitpaper' column within each class).
# NOTE(review): `d` holds one row per (publication, system, throughput)
# combination, so a publication with several combinations contributes several
# rows to the mean — confirm that this weighting is intended.
gene_category_order = ['1', '2-10', '11-100', '101-1000', '1001+']
sns.barplot(
    x='gene_category',
    y='is_hitpaper',
    data=d,
    order=gene_category_order,
    color='darkorange')

if save_images:
    # The path contains no replacement fields; the taxon is not part of the
    # file name.
    ret.export_image(
        '170711_biogrid_and_scientometrics/biogrid_fraction_high_impact.pdf')

In [None]:
# Same fraction of hit papers per gene-count class, split by the BioGRID
# 'throughput' annotation (one bar colour per throughput value).
sns.barplot(
    data=d,
    x='gene_category',
    y='is_hitpaper',
    hue='throughput',
    order=['1', '2-10', '11-100', '101-1000', '1001+'])