In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
# (Re)load the autoreload extension and reload all modules before each
# execution, so edits to imported helper modules take effect without a
# kernel restart.
%reload_ext autoreload
%autoreload 2

In [None]:
import matplotlib as mpl

# Figure export settings: font type 42 for PDF output (TrueType embedding,
# per the matplotlib rcParams documentation) and Arial as the font family.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
import os
import sys

import numpy as np
import pandas as pd

from access_biology_data import relations


from access_science_shared import standardizer
from access_literature_data import medline

sys.path.append('./../src')

import resci_inout as inout
import nar170604f_occurences as nar_attention


import seaborn as sns
import resci_tools as ret

In [None]:
# Switch for the export step: the final table is only written to disk
# (via ret.export_full_frame) when this is True.
save_media = True

In [None]:
# Attention cutoff: entries whose attention value is strictly below this
# threshold are counted as "unstudied" in the cells further down.
threshold_for_unstudied = 1

In [None]:
# Reference gene set and research literature for a single taxon.
# 9606 is the NCBI taxonomy id the later cells treat as human.
# NOTE(review): `gene2pubmed` and `df_targets` are not referenced again in
# the cells visible here — confirm whether they are still needed.
taxon_id = 9606
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

gene2pubmed = medline.gene2pubmed(
    taxon_id,
    paper_kind='research',
    ref_genes=ref_genes,
)
df_targets = nar_attention.count_papers_and_attention(ref_genes, gene2pubmed)

In [None]:
# Gene-to-publication links requested for all taxa (taxon_id='all') and
# restricted to paper_kind='research'. Later cells rely on its 'pubmed_id'
# and 'gene_ncbi' columns.
gene2pubmed_research = medline.gene2pubmed(taxon_id='all', paper_kind='research')

In [None]:
# Count how many rows of gene2pubmed_research reference each publication
# (presumably one row per gene-publication link — verify against loader).
#
# The column naming is done with rename_axis/reset_index(name=...) instead of
# renaming the columns produced by to_frame().reset_index(): the latter
# depends on the pandas version (value_counts() labels its result
# 'index'/'pubmed_id' before pandas 2.0 but 'pubmed_id'/'count' from 2.0 on),
# which would silently turn the publication ids into the 'value' column.
# Result columns: 'pubmed_id', 'value'.
value_of_pubmed_id = (
    gene2pubmed_research['pubmed_id']
    .value_counts()
    .rename_axis('pubmed_id')
    .reset_index(name='value')
)

In [None]:
# Turn the per-publication count into its reciprocal (1 / count), i.e. the
# share of a publication attributed to each row that references it.
# NOTE: not idempotent — re-running this cell alone inverts the values back.
value_of_pubmed_id['value'] = value_of_pubmed_id['value'].rdiv(1)

In [None]:
# Attach the per-publication value to every gene-publication row.
# No `on=` is given, so pandas joins (inner) on the column names shared by
# both frames — presumably just 'pubmed_id'; verify.
gene2pubmed_research = gene2pubmed_research.merge(value_of_pubmed_id)

In [None]:
# Attention per gene: sum of the per-publication values over all rows of a
# gene. Result is a DataFrame indexed by 'gene_ncbi' with one column 'value'.
# GroupBy.sum() is used instead of .agg(sum): passing the Python builtin to
# agg is deprecated in recent pandas and slated to change behaviour.
extended_attention = (
    gene2pubmed_research[['gene_ncbi', 'value']]
    .groupby('gene_ncbi')
    .sum()
)

In [None]:

# Load the HomoloGene relations table. Later cells use its
# 'homologene_group' and 'taxon_ncbi' columns; it presumably also carries
# 'gene_ncbi' (the key shared with the attention table) — TODO confirm.
hg = relations.homologene()

In [None]:
# Left-join the per-gene attention onto the HomoloGene table so that every
# HomoloGene row is kept; rows without a match get a missing 'value'.
# The join key is whatever column names both frames share.
hg_attention = hg.merge(extended_attention.reset_index(), how='left')

In [None]:
# Rows that found no attention entry in the left join count as zero attention.
hg_attention = hg_attention.assign(value=hg_attention['value'].fillna(0))

In [None]:
# For each (homologene_group, taxon) pair keep the highest attention value
# among its rows, giving one row per pair.
# GroupBy.max() replaces .agg(max): passing the Python builtin to agg is
# deprecated in recent pandas and slated to change behaviour.
hg_max_attention = (
    hg_attention[['homologene_group', 'taxon_ncbi', 'value']]
    .groupby(['homologene_group', 'taxon_ncbi'])
    .max()
    .reset_index()
)

In [None]:
# Wide layout: one row per homologene group, one column per taxon, cells hold
# the maximal attention (missing where a group has no entry for that taxon).
piv = hg_max_attention.pivot(
    index='homologene_group',
    columns='taxon_ncbi',
    values='value',
)

In [None]:
# Mask of homologene groups whose taxon-9606 (human) attention is below the
# threshold; groups with no value in that column compare as False.
f = piv[9606].lt(threshold_for_unstudied)

In [None]:
# Keep only the rows (homologene groups) selected by the mask.
# NOTE: overwrites `piv`; re-run the pivot cell before re-running this one.
piv = piv[f]

In [None]:
# Per taxon: number of retained homologene groups that have a value at all.
total_genes = piv.count().to_frame('homologs')

In [None]:
# Per taxon: number of retained groups whose attention is below the threshold
# (missing cells compare as False and are not counted).
unstudied_genes = (
    piv.lt(threshold_for_unstudied)
    .sum()
    .to_frame('unstudied_homologs')
)

In [None]:
# Put both per-taxon counts side by side (they share the same taxon index)
# and move the taxon index into a regular column.
vis = total_genes.join(unstudied_genes).reset_index()

In [None]:
from access_biology_data import meta

In [None]:
# Look up a name for every taxon id via meta.taxon_name.
vis['taxon_name'] = vis['taxon_ncbi'].apply(meta.taxon_name)

In [None]:
# Fraction (0..1) of a taxon's homologs that fall below the threshold.
vis['ratio'] = vis['unstudied_homologs'].div(vis['homologs'])

In [None]:
# Build the export table: pick the columns of interest and give them
# reader-facing headers. At this point 'unstudied homologs [%]' still holds
# the 0..1 fraction from `vis['ratio']`.
table_to_export = vis[
    ['taxon_name', 'homologs', 'ratio']
].rename(columns={
    'taxon_name': 'taxon',
    'homologs': 'homologs of unstudied human genes',
    'ratio': 'unstudied homologs [%]'
})

# Drop the 'Homo sapiens' row. The explicit .copy() makes the result an
# independent frame, so the column assignments in the following cells do not
# trigger pandas' SettingWithCopyWarning on a filtered slice.
f = table_to_export.loc[:, 'taxon'] == 'Homo sapiens'
table_to_export = table_to_export.loc[~f, :].copy()

In [None]:
# Mapping from full taxon names (including strain / cultivar designations)
# to the shorter species-level names used in the exported table.
names_to_simplify = {
    'Anopheles gambiae str. PEST': 'Anopheles gambiae',
    'Oryza sativa Japonica Group': 'Oryza sativa',
    'Magnaporthe oryzae 70-15': 'Magnaporthe oryzae',
    'Neurospora crassa OR74A': 'Neurospora crassa',
    'Schizosaccharomyces pombe 972h-': 'Schizosaccharomyces pombe',
    'Saccharomyces cerevisiae S288c': 'Saccharomyces cerevisiae',
    'Kluyveromyces lactis NRRL Y-1140': 'Kluyveromyces lactis',
    'Eremothecium gossypii ATCC 10895': 'Eremothecium gossypii'
}

In [None]:
# Swap in the simplified name where one is defined; other names stay as-is.
table_to_export['taxon'] = table_to_export['taxon'].map(
    lambda name: names_to_simplify.get(name, name))

In [None]:
# Order taxa from most to fewest homologs.
table_to_export = table_to_export.sort_values(
    by='homologs of unstudied human genes',
    ascending=False,
)

In [None]:
# Convert the fraction to a percentage rounded to whole numbers.
# NOTE: not idempotent — re-running this cell multiplies by 100 again.
table_to_export['unstudied homologs [%]'] = (
    table_to_export['unstudied homologs [%]'].mul(100).round(0)
)

In [None]:
# Write the table to disk only when exporting is switched on.
if save_media:
    ret.export_full_frame(
        df=table_to_export,
        p='180106_unstudied_in_organisms/table_on_uns.xlsx',
        save_index=False,
        insert_date_time=True,
    )

In [None]:
# Show the final per-taxon table as the cell output.
table_to_export