In [1]:
import numpy as np
import pandas as pd
from keras import backend as K
from keras.models import load_model
from keras.layers import Layer
import pickle
from scipy.stats import describe
from utils import shannon_entropy, get_sparsity

import os
# Enumerate GPUs in PCI bus order, then expose an empty device list to this
# process (presumably so the models run on CPU only -- confirm).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Seed handed to predict() for both generators below.
SEED = 256
# Threshold passed to get_sparsity() when computing the sparsity statistics.
TOL = 1e-4

class PhyloTransform(Layer):
    """Keras layer that multiplies its input by a fixed matrix.

    The matrix is wrapped in a float32 backend constant at construction time
    and applied with ``K.dot`` in ``call``.
    """

    def __init__(self, transform, **kwargs):
        """Store the projection matrix and its trailing shape.

        Args:
            transform: array-like exposing ``.shape``; assumed to be a 2-D
                matrix of shape (input_dim, output_dim) -- TODO confirm.
            **kwargs: forwarded unchanged to ``Layer.__init__``.
        """
        # Everything after the first axis of the matrix defines the
        # per-sample output shape reported by compute_output_shape().
        self.output_dim = transform.shape[1:]
        self.kernel = K.constant(transform, dtype='float32')
        super(PhyloTransform, self).__init__(**kwargs)

    def call(self, x):
        """Return the backend dot product of ``x`` with the stored kernel."""
        projected = K.dot(x, self.kernel)
        return projected

    def compute_output_shape(self, input_shape):
        """Keep the leading (batch) entry and append the stored output dims."""
        batch_axis = (input_shape[0],)
        return batch_axis + self.output_dim


def predict(generator, n_samples=1000, transform=None, seed=None):
    """Draw standard-normal latent vectors and run them through a generator.

    Args:
        generator: model object exposing ``inputs`` (the last entry of
            ``inputs[0].shape`` is used as the latent dimension) and a
            ``predict(z)`` method.
        n_samples: number of latent vectors to draw.
        transform: optional callable applied to the generator output before
            it is returned.
        seed: seed for the latent draws; ``None`` gives non-reproducible
            draws.

    Returns:
        Whatever ``generator.predict`` returns for the sampled noise, passed
        through ``transform`` when one is given.
    """
    # A private RandomState produces the same stream as the legacy
    # ``np.random.seed(seed)`` + ``np.random.normal(...)`` pair, but does not
    # touch the process-wide NumPy RNG. Previously every call reseeded the
    # global generator (from OS entropy when seed was None), silently
    # affecting unrelated code that relied on its own seeding.
    rng = np.random.RandomState(seed)

    # Coerce to a plain int so backend dimension objects are accepted as a
    # NumPy size entry.
    latent_dim = int(generator.inputs[0].shape[-1])
    z = rng.normal(0, 1, (n_samples, latent_dim))

    res = generator.predict(z)
    if transform is not None:
        res = transform(res)

    return res

## Load raw dataset
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a raw_data.pkl that you produced yourself.
# The context manager closes the file handle, which the bare open() call leaked.
with open("raw_data.pkl", 'rb') as raw_file:
    raw_data = pickle.load(raw_file)

# Every column after the first is taken as abundance data and divided by 100
# (presumably percentages -> fractions; confirm against the source tables).
dataset = raw_data.iloc[:, 1:].values / 100.
labels = raw_data["group"].values
taxa_list = raw_data.columns[1:]

# Split the original samples by their group label.
data_o_case = dataset[labels == 'case']
data_o_ctrl = dataset[labels == 'ctrl']

## Generate data
# Locations of the saved generator checkpoints for the case and ctrl groups.
GENERATOR_CASE_PATH = os.path.join(
    './micro_add_case_rerun',
    'stool_2_case_20191023T185054/',
    'models',
    'stool_2_case_463000_generator.h5',
)
GENERATOR_CTRL_PATH = os.path.join(
    './micro_add_ctrl_rerun',
    'stool_2_ctrl_20191024T092141',
    'models',
    'stool_2_ctrl_493000_generator.h5',
)

# The models are loaded without custom_objects; passing
# custom_objects={'PhyloTransform': PhyloTransform} was left disabled here.
generator_case = load_model(GENERATOR_CASE_PATH)
generator_ctrl = load_model(GENERATOR_CTRL_PATH)

# Draw 1000 synthetic samples per group, both with the shared SEED.
data_g_case = predict(generator_case, seed=SEED, n_samples=1000)
data_g_ctrl = predict(generator_ctrl, seed=SEED, n_samples=1000)

## Show data statistics
# Both summary tables use the same four cohorts in the same row order.
cohort_names = ['Original ctrl', 'GAN ctrl', 'Original case', 'GAN case']
cohort_arrays = [data_o_ctrl, data_g_ctrl, data_o_case, data_g_case]

print("Sparsity")
sparsity_rows = [describe(get_sparsity(samples, TOL)) for samples in cohort_arrays]
display(pd.DataFrame(sparsity_rows, index=cohort_names))

print("Shannon Entropy")
entropy_rows = [describe(shannon_entropy(samples)) for samples in cohort_arrays]
display(pd.DataFrame(entropy_rows, index=cohort_names))

## Show simulated case data (nothing is written to disk in this cell)
data_g_case

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.




Sparsity


Unnamed: 0,nobs,minmax,mean,variance,skewness,kurtosis
Original ctrl,248,"(0.8191933240611962, 0.9457579972183588)",0.892122,0.000268,-0.093724,1.696552
GAN ctrl,1000,"(0.7858136300417247, 0.9624478442280946)",0.872316,0.000642,0.042301,0.522135
Original case,148,"(0.8219749652294854, 0.9429763560500696)",0.887813,0.000424,0.309246,0.427984
GAN case,1000,"(0.7649513212795549, 0.9707927677329624)",0.866573,0.00124,0.059911,-0.264223


Shannon Entropy


  return -np.sum(np.where(x > tol, x * np.log(x), 0), axis=-1)


Unnamed: 0,nobs,minmax,mean,variance,skewness,kurtosis
Original ctrl,248,"(1.4803902227728476, 3.8584069736393483)",2.97272,0.156112,-1.144235,1.845062
GAN ctrl,1000,"(0.67656195, 3.8749592)",2.972613,0.185524,-1.010242,1.546131
Original case,148,"(1.7056071992903896, 3.8471567050900752)",3.077932,0.17017,-1.038462,0.986683
GAN case,1000,"(0.91057587, 3.9654448)",3.084654,0.217414,-1.323819,2.07317


In [2]:
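# NOTE: this cell is disabled and kept for provenance only. When run, it builds
# the combined case/ctrl species table and pickles it to "raw_data.pkl", the
# file the first cell loads.
# import pandas as pd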
# from scipy.stats import describe
# from utils import shannon_entropy, get_sparsity
# import pickle

# def load_data():
#     data_1 = pd.read_csv('./data/NielsenHB_2014.ctrl.metaphlan_bugs_list.stool.tsv', sep='\t')
#     data_2 = pd.read_csv('./data/NielsenHB_2014.case.metaphlan_bugs_list.stool.tsv', sep='\t')
#     ## extract species information only
#     name_1 = [_ for _ in data_1.index if _.split('|')[-1].startswith('s__')]
#     data_1_s = data_1.loc[name_1, :].transpose()
#     name_2 = [_ for _ in data_2.index if _.split('|')[-1].startswith('s__')]
#     data_2_s = data_2.loc[name_2, :].transpose()
    
#     rownames = data_1_s.index.values.tolist() + data_2_s.index.values.tolist()
#     data_0_s = pd.merge(data_1_s, data_2_s, 'outer')
#     data_0_s.index = pd.Index(rownames)
#     labels_0 = np.concatenate([np.zeros((len(data_1_s),1)), np.ones((len(data_2_s),1))])
#     df = data_0_s.copy()
#     df.insert(0, "group", [('case' if _ else 'ctrl')for _ in labels_0]) 
#     # print([data_1_s.shape, data_2_s.shape, data_0_s.shape])
#     # print(describe(shannon_entropy(data_1_s.values/100)))
#     # print(describe(shannon_entropy(data_2_s.values/100)))
#     # print(describe(shannon_entropy(data_0_s.values/100)))
    
#     dataset, colnames = (data_0_s.values, labels_0), data_0_s.columns
#     return dataset, colnames, df

# (dataset, labels), taxa_list, dataframe = load_data()
# pickle.dump(dataframe, open("raw_data.pkl", 'wb'))