In [1]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import numpy as np
import pandas as pd

from access_science_shared import standardizer
from access_literature_data import medline

sys.path.append('./../../src')

import resci_inout as inout
import nar170604f_occurences as nar_attention
import nar170823f_prediction_datasets as pred
import nar170830f_predictions as pred_manager


import nar170605f_funding as funding

In [3]:
# Analysis window and organism handed to the funding lookup below.
earliest_year = 1985  # start of the analysis time span
latest_year = 2015  # end of the analysis time span
taxon_id = 9606  # taxon under study; dataset names built later say "human" - confirm this is the NCBI taxonomy ID

# Funding table from the project-local `funding` module. Later cells filter its
# index against the reference genes and read its 'budget_for_attention' column.
m_full = funding.get_extended_funding_info(taxon_id, earliest_year, latest_year)

Multiplier: 3.690837118213259
Amount of MedLine articles: 118362


# Shared settings

In [4]:
# Naming settings shared by the dataset helpers defined further down.
project_base = '180116f_predict_money'  # forwarded as `project_base` to pred_manager.make_base
date_stamp = '180116'  # leading component of every dataset sub_name built by supermaker

In [5]:
# NOTE(review): taxon_id is assigned the same value in the funding cell above.
taxon_id = 9606
# Reference gene collection from the project-local standardizer.
# NOTE(review): the meaning of the 'orp' selector is not visible here - confirm.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

# NOTE(review): gene2pubmed is not read anywhere else in the visible cells.
gene2pubmed = medline.gene2pubmed(taxon_id, paper_kind='research', ref_genes = ref_genes)
# Boolean mask over m_full rows whose index value belongs to ref_genes.
f = m_full.index.isin(ref_genes)
# Prediction targets: a one-column frame restricted to the reference genes.
df_targets = m_full.loc[f, ['budget_for_attention']]


# Transform applied to every individual target value by maker (base-10 logarithm).
target_normalizer = lambda x: np.log10(x)

def maker(sub_name, u_features, df_target, target_normalizer):
    """Create one prediction base dataset through ``pred_manager.make_base``.

    Parameters
    ----------
    sub_name
        Forwarded unchanged as ``sub_name``.
    u_features
        Forwarded unchanged as ``u_features``.
    df_target : pandas.DataFrame
        Target values; a transformed copy is forwarded as ``df_target``,
        the frame passed in is not modified.
    target_normalizer : callable
        Applied to every individual element of ``df_target``.

    Notes
    -----
    ``project_base`` and ``ref_genes`` are read from the notebook globals.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; prefer the new name and fall back on older pandas.
    # The check is made on the class because attribute access on an instance
    # of an older pandas would resolve to a column that happens to be
    # called 'map'.
    if hasattr(type(df_target), 'map'):
        normalized_target = df_target.map(target_normalizer)
    else:
        normalized_target = df_target.applymap(target_normalizer)

    pred_manager.make_base(
        project_base = project_base,
        sub_name = sub_name,
        ref_genes = ref_genes,
        u_features = u_features,
        df_target = normalized_target)
    
def get_u(list_of_categories):
    """Merge the ``features`` entries of the requested categories into one dict.

    Categories are processed in the order given, so when two categories share
    a key the later category's value wins. ``features`` is looked up in the
    notebook globals at call time; an unknown category raises ``KeyError``.
    """
    merged = {}
    # Lazy lookup keeps the original lookup/update interleaving.
    selected = (features[category] for category in list_of_categories)
    for feature_block in selected:
        merged.update(feature_block)
    return merged

def supermaker(of_interest, feature_to_predict):
    """Build the dataset for one combination of feature categories and target.

    The dataset name is assembled from the global ``date_stamp``, the category
    names concatenated without a separator, and the target column name. The
    merged features of all categories and the single selected target column
    are then handed to ``maker`` together with the global
    ``target_normalizer``.
    """
    category_tag = ''.join(of_interest)
    dataset_name = '{}_human_{}_log_{}'.format(
        date_stamp, category_tag, feature_to_predict)
    merged_features = get_u(of_interest)
    maker(
        dataset_name,
        merged_features,
        df_targets[[feature_to_predict]],  # double brackets keep a DataFrame
        target_normalizer,
    )

# Specific datasets

In [6]:
%%time
# Feature collections keyed by category name. get_u looks categories up in this
# dict and supermaker concatenates the keys verbatim into dataset names, so the
# key spelling is part of the output naming.
# Each value comes from a project-local `pred.retreive_*` loader; get_u feeds it
# to dict.update, so each is presumably a mapping of feature name -> data (confirm).
# The recorded output of this cell shows roughly five minutes of wall time.
features = {
    'Bio': pred.retreive_biophysics(ref_genes, taxon_id),
    'Exp': pred.retreive_human_experiments(ref_genes, taxon_id),

    'Yearhomprec': pred.retreive_homologene_discoveries(ref_genes, taxon_id),
    'Yearhomall': pred.retreive_homologene_description(ref_genes, taxon_id),
    'Yearfirstpaper': pred.retreive_years_of_first_paper(ref_genes, taxon_id),
    'Lithom': pred.retreive_homologene_literature(ref_genes, taxon_id),
    # The two entries below are disabled and therefore not available to get_u.
#     'Litint': pred.retreive_literature_of_rolland_2014_interactors(ref_genes, taxon_id),
#    'Dis': pred.retreive_human_disease(ref_genes, taxon_id)
    'Omim': pred.retreive_all_omim_disease(ref_genes, taxon_id),
    'Unifieddiseases': pred.retreive_all_unified_disease(ref_genes, taxon_id),
}

CPU times: user 3min 56s, sys: 1min 4s, total: 5min 1s
Wall time: 5min 10s


In [7]:
%%time
# Category combinations that get one dataset each, in generation order.
base_sets = (
    ('Lithom',),
    ('Yearhomall',),
    ('Omim',),
    ('Unifieddiseases',),
    ('Bio', 'Exp'),
    ('Bio', 'Exp', 'Yearhomall'),
)

for feature_to_predict in ['budget_for_attention']:

    for of_interest in base_sets:
        # A fresh list per call, as with the literal lists used before.
        supermaker(list(of_interest), feature_to_predict)

    # Full model: biophysics + experiments + homology years, extended by one
    # disease annotation source at a time.
    for g in ['Omim', 'Unifieddiseases']:
        supermaker(['Bio', 'Exp', 'Yearhomall', g], feature_to_predict)

CPU times: user 5min 7s, sys: 19.4 s, total: 5min 26s
Wall time: 5min 33s


In [8]:
# Prints a fixed completion marker; this cell does nothing else.
print('done')

done
