In [1]:
%matplotlib inline  
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import numpy as np
import pandas as pd

from access_science_shared import standardizer
from access_literature_data import medline

sys.path.append('./../../src')

import resci_inout as inout
import nar170604f_occurences as nar_attention
import nar170823f_prediction_datasets as pred
import nar170830f_predictions as pred_manager

# Shared settings

In [3]:
# Project identifier; forwarded to pred_manager.make_base as `project_base`.
project_base = '170919f_predict_human_fame'
# Prefix of every dataset `sub_name` assembled in supermaker.
date_stamp = '170919'

In [4]:
# Taxon whose genes are modelled (the dataset names below label it "human").
taxon_id = 9606
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

# Gene <-> publication links restricted to research papers on the reference
# genes, then condensed into the per-gene target columns used for prediction.
gene2pubmed = medline.gene2pubmed(
    taxon_id,
    paper_kind='research',
    ref_genes=ref_genes)
df_targets = nar_attention.count_papers_and_attention(ref_genes, gene2pubmed)


def target_normalizer(x):
    """Return the base-10 logarithm of ``x``."""
    return np.log10(x)

def maker(sub_name, u_features, df_target, target_normalizer):
    """Normalize the targets cell by cell and hand one dataset to pred_manager.

    Parameters
    ----------
    sub_name : str
        Forwarded to ``pred_manager.make_base`` as ``sub_name``.
    u_features : object
        Feature collection, forwarded unchanged as ``u_features``
        (``supermaker`` passes the dict returned by ``get_u``).
    df_target : pandas.DataFrame
        Target values; every cell is passed through ``target_normalizer``.
    target_normalizer : callable
        Function applied to each individual cell of ``df_target``.

    Notes
    -----
    ``project_base`` and ``ref_genes`` are read from the notebook globals.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class (not the instance) so a
    # column that happens to be called "map" cannot shadow it on old pandas.
    if hasattr(type(df_target), 'map'):
        apply_cellwise = df_target.map
    else:
        apply_cellwise = df_target.applymap

    pred_manager.make_base(
        project_base=project_base,
        sub_name=sub_name,
        ref_genes=ref_genes,
        u_features=u_features,
        df_target=apply_cellwise(target_normalizer))
    
def get_u(list_of_categories, feature_sets=None):
    """Merge the feature collections of several categories into one dict.

    Parameters
    ----------
    list_of_categories : iterable
        Category keys to look up; merged in the order given.
    feature_sets : mapping, optional
        Maps category key -> feature collection (anything ``dict.update``
        accepts). When omitted, the notebook-level ``features`` dict is used
        and is looked up at call time, so it must exist by then.

    Returns
    -------
    dict
        A new dict; if two categories share a key, the later category wins.

    Raises
    ------
    KeyError
        If a requested category is not present in ``feature_sets``.
    """
    if feature_sets is None:
        feature_sets = features

    merged = {}
    for category in list_of_categories:
        merged.update(feature_sets[category])
    return merged

def supermaker(of_interest, feature_to_predict):
    """Build one dataset from the given feature categories for one target.

    ``of_interest`` is a list of category keys; they are concatenated without
    a separator to form part of the dataset name. ``feature_to_predict`` is
    the column of ``df_targets`` used as the single target.
    """
    categories_tag = ''.join(of_interest)
    sub_name = '{}_human_{}_log_{}'.format(
        date_stamp, categories_tag, feature_to_predict)
    maker(
        sub_name,
        get_u(of_interest),
        df_targets[[feature_to_predict]],  # double brackets keep a DataFrame
        target_normalizer)

# Specific datasets

In [5]:
%%time
# One feature collection per category tag. Every loader is invoked as
# loader(ref_genes, taxon_id), in the order listed here.
features = {
    category: loader(ref_genes, taxon_id)
    for category, loader in (
        ('Bio', pred.retreive_biophysics),
        ('Exp', pred.retreive_human_experiments),
        ('Reg', pred.retreive_human_regulators),
        # ('Rel', pred.retreive_related_genes),
        ('Relhom', pred.retreive_related_homologenes),
        ('Relint', pred.retreive_related_interactors),
        # ('Dis', pred.retreive_human_disease),
        # ('Dom', pred.retreive_domains),
        ('Yearhomprec', pred.retreive_homologene_discoveries),
        ('Yearhomall', pred.retreive_homologene_description),
        ('Yearfirstpaper', pred.retreive_years_of_first_paper),
        ('Lithom', pred.retreive_homologene_literature),
        ('Litint', pred.retreive_literature_of_rolland_2014_interactors),
    )
}

CPU times: user 2min 36s, sys: 50.3 s, total: 3min 27s
Wall time: 3min 27s


In [6]:
%%time
# Category combinations to build, in build order. Each combination produces
# one dataset per target column.
feature_combinations = (
    # Every category on its own.
    # NOTE(review): 'Yearhomprec' has no single-category run although it is
    # used in a combination below -- confirm this omission is intentional.
    [[category] for category in [
        'Bio', 'Exp', 'Reg',
        'Relhom', 'Relint',
        'Yearhomall', 'Yearfirstpaper',
        'Lithom', 'Litint',
    ]]
    # The Bio + Exp baseline.
    + [['Bio', 'Exp']]
    # The baseline extended by exactly one further category.
    + [['Bio', 'Exp', extra] for extra in [
        'Relhom', 'Relint',
        'Yearhomprec', 'Yearhomall', 'Yearfirstpaper',
        'Lithom', 'Litint',
    ]]
    # The baseline extended by two further categories.
    + [['Bio', 'Exp', 'Yearfirstpaper', 'Lithom']]
)

for feature_to_predict in ['attention', 'papers']:
    for of_interest in feature_combinations:
        supermaker(of_interest, feature_to_predict)


CPU times: user 9min 25s, sys: 13.9 s, total: 9min 39s
Wall time: 9min 41s


In [7]:
print('done')

done
