Looking at human-specific genes reveals that they are studied particularly little.

In [None]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

import matplotlib as mpl
# Set matplotlib's PDF font type to 42.
# NOTE(review): per the matplotlib docs, 42 selects TrueType embedding (instead of Type 3),
# which is usually chosen so text stays editable in exported PDFs — confirm that is the intent.
mpl.rcParams['pdf.fonttype'] = 42

import numpy as np
import pandas as pd
import seaborn as sns

from access_literature_data import medline
from access_science_shared import standardizer

from scipy.stats import mannwhitneyu

import matplotlib.pyplot as plt

In [None]:
import sys

# Extend the import path with ../src (relative to the working directory);
# presumably that is where the nar* and resci_tools modules imported below live.
sys.path.append('./../src/')
import nar170604f_occurences as nar_attention
import nar170605f_funding as nar_funding

import resci_tools as ret


# NOTE: this is the same module as `nar_attention` above, bound to a second
# alias; the paper-counting cell further down calls it through `nar`.
import nar170604f_occurences as nar

# Export toggles: `save_images` guards the ret.export_image calls and
# `save_table` guards the ret.export_full_frame call later in the notebook.
save_images = False
save_table = True

In [None]:
# Apply seaborn's settings with a font scale of 2 for the figures drawn below.
sns.set(font_scale=2)

In [None]:
# NCBI taxonomy ID of the organism analysed in this notebook (9606, human).
taxon_id = 9606

# Reference gene set for that taxon; 'orp' selects the subset described in the
# trailing comment.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')  # official nomenclature, research, protein-coding


In [None]:
from access_biology_data import meta, relations

In [None]:

# Load the HomoloGene table. The cells below read its 'taxon_ncbi',
# 'gene_ncbi' and 'homologene_group' columns.
hg = relations.homologene()

In [None]:
# Largest gene_ncbi value among the HomoloGene rows of the analysed taxon.
# It is used below as an upper bound when filtering `ref_genes`.
# The taxon is taken from `taxon_id` (defined above) instead of repeating the
# literal 9606, so this cell stays consistent with the other taxon-dependent calls.
highest_gene = hg.loc[hg['taxon_ncbi'] == taxon_id, 'gene_ncbi'].max()

In [None]:
# Convert to a NumPy array so the element-wise comparison and boolean-mask
# indexing in the next cells can be applied.
ref_genes = np.array(ref_genes)

In [None]:
# Boolean mask: True where a reference gene ID does not exceed the highest
# gene ID of the analysed taxon found in HomoloGene.
# NOTE(review): presumably this drops genes registered after the HomoloGene
# release, which could not have been assigned to a group — confirm.
f = ref_genes <= highest_gene

In [None]:
# Keep only the reference genes selected by the mask `f`.
ref_genes = ref_genes[f]

In [None]:
# Back to a plain list; it is passed to `isin` and to
# nar.count_papers_and_attention in the following cell.
ref_genes = list(ref_genes)

In [None]:

# Gene-to-publication links for the analysed taxon (paper_kind='research'),
# restricted to rows whose gene is in `ref_genes`.
gene2pubmed = medline.gene2pubmed(
    taxon_id,
    ['pubmed_id', 'gene_ncbi'],
    paper_kind='research',
).loc[lambda links: links['gene_ncbi'].isin(ref_genes)]

# Per-gene summary; later cells read its 'papers' and 'attention' columns.
papers = nar.count_papers_and_attention(ref_genes, gene2pubmed)

In [None]:
# HomoloGene groups that contain at least one gene from a taxon other than the
# analysed one. The taxon comes from `taxon_id` (defined above) rather than a
# repeated literal 9606, keeping this filter consistent with the rest of the notebook.
nh = hg.loc[hg['taxon_ncbi'] != taxon_id, 'homologene_group'].unique()

In [None]:
# HomoloGene rows whose group is absent from `nh`, i.e. groups that never
# occur for any taxon other than human.
in_no_model_organism = hg.loc[~hg['homologene_group'].isin(nh)]

In [None]:
# Flag every row of `papers` whose index value occurs among the gene_ncbi IDs
# of the human-only HomoloGene groups.
# NOTE(review): assumes `papers` is indexed by gene_ncbi — confirm against
# nar.count_papers_and_attention. Genes that are missing from HomoloGene
# altogether receive False here, the same value as genes with non-human homologs.
papers['no_model'] = papers.index.isin(in_no_model_organism['gene_ncbi'])

In [None]:
# log10-transformed copies of the paper count and attention columns, used by
# the distribution plots, boxplots and the final test below.
# np.log10 is applied to the whole column at once instead of element by
# element through apply/lambda; the resulting values are the same.
# NOTE(review): a zero in either column yields -inf — confirm none occur.
papers['log_papers'] = np.log10(papers['papers'])
papers['log_attention'] = np.log10(papers['attention'])

In [None]:
# Distribution of log10 attention for genes flagged as no_model.
# NOTE(review): seaborn documents distplot as deprecated (histplot/displot are
# the suggested replacements); kept here so the figure does not change.
sns.distplot(papers.loc[papers['no_model'], 'log_attention'])

In [None]:
# Number of rows in `papers` flagged as no_model (True counts as 1).
papers['no_model'].sum()

In [None]:
# Distribution of log10 attention for genes NOT flagged as no_model.
# NOTE(review): seaborn documents distplot as deprecated (histplot/displot are
# the suggested replacements); kept here so the figure does not change.
sns.distplot(papers.loc[~papers['no_model'], 'log_attention'])

In [None]:
# Notched boxplot of log10 paper counts, split by the no_model flag.
sns.boxplot(
    data=papers,
    x='no_model',
    y='log_papers',
    notch=True,
)

# Export only when figure saving is switched on.
# NOTE(review): 'kowledge' in the path looks like a typo for 'knowledge'; it is
# left untouched because it determines where the file is written.
if save_images:
    ret.export_image('170928_kowledge_about_human_specific_genes/papers_homologs.pdf')

In [None]:
# Notched boxplot of log10 attention, split by the no_model flag.
sns.boxplot(
    data=papers,
    x='no_model',
    y='log_attention',
    notch=True,
)

# Export only when figure saving is switched on.
# NOTE(review): 'kowledge' in the path looks like a typo for 'knowledge'; it is
# left untouched because it determines where the file is written.
if save_images:
    ret.export_image('170928_kowledge_about_human_specific_genes/attention_homologs.pdf')

In [None]:
# Number of rows in `papers` flagged as no_model (same count as displayed earlier).
papers['no_model'].sum()

In [None]:
# Data behind the papers boxplot: raw paper counts plus the no_model flag,
# keeping the index of `papers`.
out = papers.loc[:, ['papers', 'no_model']]

# Write the table (index included) only when table export is switched on.
if save_table:
    ret.export_full_frame(
        '170928_kowledge_about_human_specific_genes/papers_homologs_data.csv',
        out,
        save_index=True,
    )

In [None]:
# Two-sided Mann-Whitney U test on raw attention: genes not flagged as
# no_model versus genes flagged as no_model.
# The boolean 'no_model' column is used directly as the row mask (and negated
# with ~) instead of being compared against True/False.
attention_no_model = papers.loc[papers['no_model'], 'attention'].values
attention_in_model = papers.loc[~papers['no_model'], 'attention'].values
mannwhitneyu(attention_in_model, attention_no_model, alternative='two-sided')

In [None]:
# Two-sided Mann-Whitney U test on log10 paper counts: genes not flagged as
# no_model versus genes flagged as no_model.
# NOTE(review): despite their names, the two arrays below hold 'log_papers'
# values, not attention, and they overwrite the arrays of the previous cell.
# The names are kept so the notebook's namespace is unchanged; consider renaming.
# The boolean 'no_model' column is used directly as the row mask (and negated
# with ~) instead of being compared against True/False.
attention_no_model = papers.loc[papers['no_model'], 'log_papers'].values
attention_in_model = papers.loc[~papers['no_model'], 'log_papers'].values
mannwhitneyu(attention_in_model, attention_no_model, alternative='two-sided')