In [None]:
# Re-import edited modules automatically before each cell runs
# (autoreload mode 2 applies to all imported modules).
%load_ext autoreload
%autoreload 2

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import matplotlib as mpl
# Font type 42 embeds TrueType fonts, so text in exported PDFs stays editable.
mpl.rcParams['pdf.fonttype'] = 42
# Use Arial for all figure text.
mpl.rcParams['font.family'] = 'Arial'

In [None]:
# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
import glob
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [None]:
from access_biology_data import annotation, gwas_studies, meta, phenotype_collections, properties, relations
from access_literature_data import medline
from access_mixed_data import genealacart
from access_science_shared import standardizer, utils

In [None]:
import sys
sys.path.append('./../src/')

import nar170604f_occurences as nar_attention
import nar170830f_predictions as forec
import resci_inout as rinout
import resci_tools as ret

import nar170823f_prediction_datasets as pred

In [None]:
# Export switches checked before each export further down:
# figures are written only when save_images is True,
# tables only when save_tables is True.
save_images = False
save_tables = True

In [None]:
# 9606 is the NCBI taxonomy identifier for human.
taxon_id = 9606
# Reference gene set that every later table is aligned to.
# NOTE(review): the meaning of the 'orp' selector is defined in
# access_science_shared.standardizer and is not visible here — confirm.
ref_genes = standardizer.reference_genes(taxon_id, 'orp')

In [None]:
%%time
# Feature groups keyed by a short category label; get_u looks groups up by
# these keys. What each retriever returns is defined in
# nar170823f_prediction_datasets (imported as `pred`) and is not visible here.
features = {
    'Bio': pred.retreive_biophysics(ref_genes, taxon_id),
    'Exp': pred.retreive_human_experiments(ref_genes, taxon_id),
}

In [None]:
def get_u(list_of_categories):
    """Merge the feature groups named in ``list_of_categories`` into one dict.

    Each category is looked up in the module-level ``features`` mapping and
    folded into the result with ``dict.update`` in the order given, so when
    two groups share a key the entry from the later category wins.

    Raises:
        KeyError: if a category is not a key of ``features``.
    """
    merged = {}
    for category in list_of_categories:
        group = features[category]
        merged.update(group)
    return merged

In [None]:
# Keys of `features` whose groups are merged for this analysis.
of_interest = ['Bio', 'Exp']

In [None]:
# Single dict holding the entries of every selected feature group.
u_all = get_u(of_interest)



In [None]:
# Assemble one table (rows: genes, columns: features) from the selected groups.
u_features = u_all

if isinstance(u_features, dict):
    # Concatenating a dict side by side uses the dict keys as an outer column
    # level; the outer join keeps every gene that occurs in any of the tables.
    df = pd.concat(u_features, join='outer', axis=1)
else:
    df = u_features

# Drop the outer column level introduced by the dict keys.
# pd.MultiIndex is the public class; the private pandas.core.index module it
# used to be reached through was deprecated in pandas 1.0 and later removed.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.droplevel(level=0)

# Align rows to the reference genes. reindex inserts an all-NaN row for a
# reference gene that is absent from every feature table, so such genes are
# counted as missing in the statistics below. `.loc` with a list containing
# absent labels raises KeyError on pandas >= 1.0.
df = df.reindex(index=ref_genes)

In [None]:
# Per-feature share of genes without a value: missing rows / total rows.
a = df.isnull().sum() / len(df)

In [None]:
# Share of genes that have a value for every feature: complete rows / all rows.
len(df.dropna()) / len(df)

In [None]:
# Boolean mask shaped like df: True where a gene has no value for a feature.
value_is_nan = df.isnull()

In [None]:
# Cluster the missingness pattern of a random subset of genes.
# sample(frac=1, random_state=1) shuffles all rows reproducibly; head(1000)
# then keeps the first 1000 of them (fewer if the table is smaller).
sns.clustermap(
    data=value_is_nan.sample(frac=1, random_state=1).head(1000),
    metric='hamming',
)

# Write the figure only when image export is enabled.
if save_images:
    ret.export_image('180404_lost_genes/clustermap.pdf', insert_date_time=False)

In [None]:
# Write the per-feature missing fraction as a one-column table ('lost_genes')
# when table export is enabled.
if save_tables:
    ret.export_full_frame(
        '180404_lost_genes/fraction_lost.csv',
        a.to_frame(name='lost_genes'),
    )

In [None]:
# Spreadsheet that maps original feature names to manually curated ones.
# The location is machine-specific, so it can be overridden through the
# FEATURE_NAMES_XLSX environment variable; the original path remains the
# default, so the notebook behaves as before when the variable is unset.
p = os.environ.get(
    'FEATURE_NAMES_XLSX',
    '/Users/tstoeger/Dropbox/Work/manuscripts/genes/material/180702_transform_to_readable_columns/manually_curated_name_of_features.xlsx',
)

In [None]:
# Build the {original name: curated name} lookup from the spreadsheet.
# The two spreadsheet columns are labelled 'orig' and 'cleaned' on load;
# indexing by 'orig' and taking 'cleaned' yields the mapping as a plain dict.
renamer = (
    pd.read_excel(p, names=['orig', 'cleaned'])
    .set_index('orig')['cleaned']
    .to_dict()
)

In [None]:
# Relabel the feature columns with their curated names; columns without an
# entry in renamer keep their original label.
out = value_is_nan.rename(columns=renamer)

In [None]:
# Write the renamed missingness mask when table export is enabled.
if save_tables:
    ret.export_full_frame(
        '180404_lost_genes/data.csv',
        out,
    )