In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
# (Re)load the autoreload extension and reload all modules before each cell
# is executed, so that edits to the project modules imported below take
# effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import numpy as np
import pandas as pd

from access_science_shared import standardizer
from access_literature_data import medline

sys.path.append('./../../src')

import resci_inout as inout
import nar170604f_occurences as nar_attention
import nar170823f_prediction_datasets as pred
import nar170830f_predictions as pred_manager

# Shared settings

In [3]:
# Name of the project that is handed to pred_manager.make_base() for every
# dataset created in this notebook.
project_base = '180622f_predict_human_fame_with_main_features_restrict_to_complete'
# Prefix of every dataset name (sub_name) built in supermaker().
date_stamp = '180622'

In [4]:
# Taxon used for all look-ups below (NCBI Taxonomy 9606; the dataset names
# built in this notebook label it as "human").
taxon_id = 9606
# Initial reference gene set. The meaning of 'orp' is defined by
# standardizer.reference_genes and is not visible here.
# NOTE: this value is replaced further down by the genes listed in the
# previously written target table.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

In [5]:
# Location of a target table (presumably written by the earlier
# 170923f_predict_human_fame project -- confirm); its `gene_ncbi` column
# defines the genes used in the rest of this notebook.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine with this exact Dropbox layout; consider deriving it from a
# configurable base directory.
p = (
    '/Users/tstoeger/Dropbox/Work/dynamic/resci_support_data/'
    '170923f_predict_human_fame/170923_human_BioExp_log_papers/input/target.csv.gz')

In [6]:
# Load the target table; in the cells shown here only its `gene_ncbi`
# column is used.
df = pd.read_csv(p)

In [7]:
# Restrict the reference genes to those present in the loaded target table.
# This replaces the `ref_genes` obtained from standardizer.reference_genes()
# in the earlier cell.
ref_genes = list(df['gene_ncbi'].values)

In [8]:
# `taxon_id` is reused from the settings cell, while `ref_genes` is the gene
# list taken from the loaded target table rather than the complete
# standardizer.reference_genes(taxon_id, 'orp') set.
gene2pubmed = medline.gene2pubmed(
    taxon_id,
    paper_kind='research',
    ref_genes=ref_genes,
)
df_targets = nar_attention.count_papers_and_attention(ref_genes, gene2pubmed)


def target_normalizer(x):
    """Return the base-10 logarithm of ``x``.

    Passed to ``maker``, which applies it to every cell of the target table.
    """
    return np.log10(x)

def maker(sub_name, u_features, df_target, target_normalizer):
    """Normalize the targets and hand one dataset to ``pred_manager.make_base``.

    The notebook-level ``project_base`` and ``ref_genes`` are forwarded
    together with the arguments. What ``make_base`` does with them is
    defined in ``nar170830f_predictions`` and is not visible here.

    Parameters
    ----------
    sub_name : str
        Name of the dataset, forwarded as ``sub_name``.
    u_features : dict
        Feature tables, as assembled by ``get_u``; forwarded unchanged.
    df_target : pandas.DataFrame
        Target values. Every cell is passed through ``target_normalizer``;
        the input frame itself is not modified.
    target_normalizer : callable
        Transformation applied to each individual target value.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The method is looked up on the class so that, on older
    # pandas versions, a column that happens to be called 'map' cannot be
    # mistaken for it.
    if hasattr(pd.DataFrame, 'map'):
        normalized_target = df_target.map(target_normalizer)
    else:
        normalized_target = df_target.applymap(target_normalizer)

    pred_manager.make_base(
        project_base=project_base,
        sub_name=sub_name,
        ref_genes=ref_genes,
        u_features=u_features,
        df_target=normalized_target)
    
def get_u(list_of_categories, allowed_features):
    """Collect the feature tables of several categories, optionally filtered.

    Reads the notebook-level ``features`` mapping (category name -> mapping
    of table name -> table) and merges the tables of all requested
    categories into one dict. If two categories contain the same table name,
    the later category wins.

    Parameters
    ----------
    list_of_categories : iterable of str
        Keys of ``features`` to include; an unknown key raises ``KeyError``.
    allowed_features : 'any' or list-like of str
        The string ``'any'`` keeps every table unchanged. Otherwise only the
        columns listed here are kept, and tables without any allowed column
        are dropped. Any list-like accepted by ``Index.isin`` works (list,
        set, numpy array, pandas Index).

    Returns
    -------
    dict
        Table name -> table (presumably pandas DataFrames; the code only
        relies on ``.columns`` and ``.loc``).
    """
    u = {}
    for category in list_of_categories:
        u.update(features[category])

    # Only the literal string 'any' switches filtering off. Checking the type
    # first avoids an element-wise comparison (and the resulting
    # "truth value of an array is ambiguous" error) when a numpy array or
    # pandas Index of feature names is passed.
    if isinstance(allowed_features, str) and allowed_features == 'any':
        return u

    filtered = {}
    for name, table in u.items():
        keep = table.columns.isin(allowed_features)
        if keep.any():
            filtered[name] = table.loc[:, keep]
    return filtered

def supermaker(of_interest, feature_to_predict, allowed_features):
    """Create the dataset for one target column and one set of categories.

    The dataset name is ``<date_stamp>_human_<categories>_log_<target>``,
    where ``<categories>`` is the concatenation of ``of_interest``. The
    features come from ``get_u(of_interest, allowed_features)`` and the
    target is the single column ``feature_to_predict`` of the notebook-level
    ``df_targets``; both are passed to ``maker`` together with the
    notebook-level ``target_normalizer``.
    """
    categories_tag = ''.join(of_interest)
    sub_name = '{}_human_{}_log_{}'.format(
        date_stamp, categories_tag, feature_to_predict)

    selected_features = get_u(of_interest, allowed_features)
    # Double brackets keep the target as a one-column frame.
    target_frame = df_targets[[feature_to_predict]]

    maker(sub_name, selected_features, target_frame, target_normalizer)

# Specific datasets

In [9]:
# The fifteen features identified to be most important
# Column names that get_u() keeps when this list is passed as
# `allowed_features`; one entry per line, order unchanged.
features_to_use = [
    'Population variability Lek mis_z',
    'SignalP_swiss_or_trembl: cmax',
    'uhlen_2015_cells_log10fpkm: appendices_4b',
    'Population variability Lek lof_z',
    'uhlen_2015_cells_log10fpkm: liver_c',
    'Genbank__gene: SumACGT',
    'uhlen_2015_cells_log10fpkm: brain_3c',
    'uhlen_2015_fraction_detection_tissues',
    'Population variability Lek pNull',
    'Protein Itzhak Itzhak2016_Contribution to cell protein mass [ppm]',
    'uhlen_2015_cells_log10fpkm: adrenal_4d',
    'Aminoacids_swiss_or_trembl: gravy_ignoring_O_and_U',
    'Wang2015: KBM7 CS',
    'Aminoacids_swiss_or_trembl: basic',
    'Genbank_validated_RNA: full_SumACGT',
]

In [10]:
%%time
# Feature tables grouped by category. The keys are the category names that
# get_u() looks up when it assembles the features of a dataset; each value
# is whatever the corresponding pred.retreive_* helper returns for the
# current `ref_genes` and `taxon_id`.
features = {
    'Bio': pred.retreive_biophysics(ref_genes, taxon_id),
    'Exp': pred.retreive_human_experiments(ref_genes, taxon_id),
}

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


CPU times: user 22 s, sys: 1.38 s, total: 23.4 s
Wall time: 16.9 s


In [11]:
%%time
# Build one dataset per target column from the 'Bio' and 'Exp' feature
# categories, restricted to the columns named in `features_to_use`.
for feature_to_predict in ['papers']:
    supermaker(['Bio', 'Exp'], feature_to_predict, features_to_use)
    

CPU times: user 1.38 s, sys: 38.3 ms, total: 1.41 s
Wall time: 862 ms


In [12]:
# Visible marker that the notebook ran through to its last cell.
print('done')

done
