In [1]:
import os
import random as rn
import sys

import pandas as pd
import numpy as np
from numpy.random import seed
from tensorflow import set_random_seed

from TL4HDR.data.preProcess import get_one_race, get_n_years, normalize_dataset, \
    standarize_dataset, get_dataset
from TL4HDR.examples.classify_util import run_mixture_cv, run_one_race_cv, \
    run_unsupervised_transfer_cv, run_supervised_transfer_cv, run_CCSA_transfer

Using TensorFlow backend.


In [2]:
# Environment knobs first: Keras backend selection and the Python hash seed.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so assigning
# it here presumably does not change hashing in the already-running kernel - confirm.
os.environ["KERAS_BACKEND"] = "tensorflow"
os.environ['PYTHONHASHSEED'] = '0'

# Seed the three independent random sources with the same value:
# Python's `random` (imported as rn), NumPy, and TensorFlow.
rn.seed(11111)
seed(11111)
set_random_seed(11111)

In [3]:
# Let pandas print wide/long frames in full: up to 1000 rows, 1000 columns,
# and 1000 characters per console line.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)

In [4]:
def run_cv(cancer_type, feature_type, target, years=3, groups=("WHITE", "BLACK")):
    """Train and cross-validate every classifier family for one dataset configuration.

    The dataset is loaded with ``get_dataset`` and standardized. From it four variants
    are derived, each restricted with ``get_n_years``: WHITE-only, BLACK-only, a
    normalized copy (used for CCSA) and the full mixture.

    Args:
        cancer_type: forwarded to ``get_dataset`` (e.g. 'BRCA').
        feature_type: forwarded to ``get_dataset``. When it contains 'mRNA' or
            'methylation' the feature count ``k`` is 200, otherwise -1.
        target: forwarded to ``get_dataset`` (e.g. 'OS').
        years: forwarded to ``get_n_years`` for every dataset variant.
        groups: forwarded to ``get_dataset``.

    Returns:
        ``None`` when the data is too small: fewer than 10 samples overall, fewer than
        5 samples for either race (checked before the year filter), fewer than 4
        distinct stratification labels, or any label with fewer than 5 samples.
        Otherwise the 6-tuple ``(mixture_classifiers, w_classifiers, b_classifiers,
        supervised_transfer_classifiers, unsupervised_transfer_classifiers,
        ccsa_transfer_models)`` exactly as returned by the ``run_*`` helpers.
    """
    print (cancer_type, feature_type, target, years)

    dataset = get_dataset(cancer_type=cancer_type, feature_type=feature_type, target=target, groups=groups)
    if dataset['X'].shape[0] < 10:
        return None
    dataset = standarize_dataset(dataset)

    # Per-race subsets; each must have at least 5 samples before the year filter.
    dataset_w = get_one_race(dataset, 'WHITE')
    if dataset_w['X'].shape[0] < 5:
        return None
    dataset_w = get_n_years(dataset_w, years)

    dataset_b = get_one_race(dataset, 'BLACK')
    if dataset_b['X'].shape[0] < 5:
        return None
    dataset_b = get_n_years(dataset_b, years)

    # The CCSA run uses a normalized copy taken before the mixture is year-filtered.
    dataset_tl = get_n_years(normalize_dataset(dataset), years)

    dataset = get_n_years(dataset, years)

    high_dimensional = 'mRNA' in feature_type or 'methylation' in feature_type
    k = 200 if high_dimensional else -1

    # Count samples per stratification label and bail out on sparse strata.
    X, Y, R, _y_sub, y_strat = dataset
    strata_df = pd.DataFrame(y_strat, columns=['RY'])
    strata_df['R'] = R
    strata_df['Y'] = Y
    print(X.shape)
    strata_counts = strata_df['RY'].value_counts()
    print (strata_counts)
    if len(strata_counts) < 4:
        return None
    strata_counts = dict(strata_counts)
    print (strata_counts)
    for stratum, n_samples in strata_counts.items():
        print (stratum, n_samples)
        if n_samples < 5:
            return None

    # Hyper-parameters shared by all non-CCSA runs; each run overrides what differs.
    shared = {'fold': 3, 'k': k, 'val_size': 0.0, 'batch_size': 20,
              'learning_rate': 0.01, 'dropout': 0.5,
              'L1_reg': 0.001, 'L2_reg': 0.001}
    params_mix = dict(shared, momentum=0.9, lr_decay=0.03, hidden_layers=[128, 64])
    params_w = dict(shared, lr_decay=0.0, hidden_layers=[128, 64])
    params_b = dict(shared, batch_size=4, lr_decay=0.0, hidden_layers=[128, 64])
    params_supervised = dict(shared, lr_decay=0.03, tune_epoch=500, tune_lr=0.002,
                             tune_batch=10, hidden_layers=[128, 64])
    params_unsupervised = dict(shared, learning_rate=0.001, lr_decay=0.03, dropout=0.0,
                               n_epochs=100, hidden_layers=[100])
    params_ccsa = {'fold': 3, 'n_features': k, 'alpha': 0.3, 'batch_size': 32,
                   'learning_rate': 0.01, 'hidden_layers': [100], 'dr': 0.0,
                   'momentum': 0.0, 'decay': 0.0, 'sample_per_class': 2}

    print("Begin classifier training")

    # Named cv_seed so the numpy `seed` function imported at the top is not shadowed.
    cv_seed = 0
    auc_mix, mixture_classifiers = run_mixture_cv(cv_seed, dataset, **params_mix)
    auc_w, w_classifiers = run_one_race_cv(cv_seed, dataset_w, **params_w)
    auc_w = auc_w.rename(columns={"Auc": "W_ind"})
    auc_b, b_classifiers = run_one_race_cv(cv_seed, dataset_b, **params_b)
    auc_b = auc_b.rename(columns={"Auc": "B_ind"})

    print("Supervised transfer")
    auc_supervised, supervised_transfer_classifiers = run_supervised_transfer_cv(
        cv_seed, dataset, **params_supervised)
    auc_supervised = auc_supervised.rename(columns={"TL_Auc": "XY_TL"})

    print("Unsupervised transfer")
    auc_unsupervised, unsupervised_transfer_classifiers = run_unsupervised_transfer_cv(
        cv_seed, dataset, **params_unsupervised)
    auc_unsupervised = auc_unsupervised.rename(columns={"TL_Auc": "X_TL"})

    print("CCSA transfer")
    auc_ccsa, ccsa_transfer_models = run_CCSA_transfer(cv_seed, dataset_tl, **params_ccsa)
    auc_ccsa = auc_ccsa.rename(columns={"TL_Auc": "CCSA_TL"})

    # Side-by-side summary; the unsupervised 'X_TL' column is intentionally left out.
    summary_columns = [auc_mix, auc_w['W_ind'], auc_b['B_ind'], auc_ccsa['CCSA_TL'],
                       auc_supervised['XY_TL']]
    df1 = pd.concat(summary_columns, sort=False, axis=1)

    print(df1)

    return (mixture_classifiers, w_classifiers, b_classifiers,
            supervised_transfer_classifiers, unsupervised_transfer_classifiers,
            ccsa_transfer_models)


In [5]:
# Train every model family on BRCA / Protein / OS with a 4-year horizon and keep
# all six returned collections for the export cells below.
(mixture_classifiers,
 w_classifiers,
 b_classifiers,
 supervised_transfer_classifiers,
 unsupervised_transfer_classifiers,
 ccsa_transfer_models) = run_cv('BRCA', 'Protein', 'OS', years=4)

('BRCA', 'Protein', 'OS', 4)
(811, 190)
(810, 192)
(322, 189)
1WHITE    216
0WHITE     57
1BLACK     39
0BLACK     10
Name: RY, dtype: int64
{'1BLACK': 39, '0WHITE': 57, '1WHITE': 216, '0BLACK': 10}
('1BLACK', 39)
('0WHITE', 57)
('1WHITE', 216)
('0BLACK', 10)
Begin classifier training
Supervised transfer
Unsupervised transfer


  theano.Param(corruption_level, default=0.2),
  theano.Param(learning_rate, default=0.1)


CCSA transfer
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Domain Adaptation Task: WHITE_to_BLACK
Creating pairs for repetition: 1 and sample_per_class: 2
('Training the model - Epoch 100', ' total trainings:', (1092, 189), (1092, 189))
Instructions for updating:
Use tf.cast instead.
10->20->30->40->50->60->70->80->90->99
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 189)          0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 100)          19000       input_1[0][0]                    
                                       

In [6]:
from NNet.utils.writeNNet import writeNNet

In [7]:
def get_weights(classifier):
    """Yield the transposed weight value of each hidden layer, in layer order.

    Each layer's ``W.get_value()`` result is passed through ``np.transpose``.
    """
    for hidden in classifier.hidden_layers:
        raw_weights = hidden.W.get_value()
        yield np.transpose(raw_weights)


def get_biases(classifier):
    """Yield the bias value (``b.get_value()``) of each hidden layer, in layer order."""
    for hidden in classifier.hidden_layers:
        bias_values = hidden.b.get_value()
        yield bias_values


def save_classifier(classifier, file_path):
    """Export a classifier's hidden layers to ``file_path`` through ``writeNNet``.

    Weights come from ``get_weights`` (already transposed) and biases from
    ``get_biases``. The number of network inputs is taken from the second dimension
    of the first weight matrix. Every input is bounded to [-1, 1] and described with
    mean 0 and range 2; the single trailing entry of the means/ranges lists is the
    output normalization (mean 0, range 1).

    NOTE(review): only ``classifier.hidden_layers`` is exported; if the classifier
    has a separate output layer it is not written - confirm this is intended.

    Args:
        classifier: object exposing ``hidden_layers`` as consumed by
            ``get_weights`` / ``get_biases``.
        file_path: destination path handed to ``writeNNet``.
    """
    weights = list(get_weights(classifier))
    biases = list(get_biases(classifier))
    num_inputs = weights[0].shape[1]

    # Log the leading dimension of every weight matrix and bias vector.
    print([len(w) for w in weights])
    print([len(b) for b in biases])

    input_mins = [-1] * num_inputs
    input_maxes = [1] * num_inputs
    # Both normalization lists hold one entry per input plus one for the output.
    # The means list used to carry one extra zero (num_inputs + 2 entries), which
    # disagreed with the ranges list.
    input_ranges = [2] * num_inputs + [1]  # 1 scales output
    input_means = [0] * num_inputs + [0]  # 0 added to output

    writeNNet(weights, biases, input_mins, input_maxes, input_means, input_ranges, file_path)


def save_classifiers(classifiers, file_path_prefix):
    """Save each classifier to ``<file_path_prefix><index>.nnet`` via ``save_classifier``.

    ``index`` is the classifier's zero-based position in ``classifiers``.
    """
    for index, model in enumerate(classifiers):
        target_path = file_path_prefix + str(index) + ".nnet"
        save_classifier(model, target_path)

In [8]:
# Export every classifier collection returned by run_cv, one .nnet file per
# classifier, each collection under its own file-name prefix.
save_classifiers(mixture_classifiers, "TL4HDR/Result/mixture_")
save_classifiers(w_classifiers, "TL4HDR/Result/w_")
# The BLACK-only models need their own "b_" prefix; writing them under "w_" would
# overwrite the WHITE-only files saved on the previous line.
save_classifiers(b_classifiers, "TL4HDR/Result/b_")
save_classifiers(supervised_transfer_classifiers, "TL4HDR/Result/supervised_transfer_")
save_classifiers(unsupervised_transfer_classifiers, "TL4HDR/Result/unsupervised_transfer_")

[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[128, 64]
[100]
[100]
[100]
[100]
[100]
[100]


In [9]:
def save_keras(model, file_path):
    """Export a Keras model's parameters to ``file_path`` through ``writeNNet``.

    ``model.get_weights()`` is assumed to return alternating weight matrices and
    bias vectors (weights at even positions, biases at odd positions). Each weight
    matrix is transposed before export, and the number of inputs is read from the
    second dimension of the first transposed matrix. Every input is bounded to
    [-1, 1] with mean 0 and range 2; the trailing entry of the means/ranges lists is
    the output normalization (mean 0, range 1).

    Args:
        model: object exposing ``get_weights()``.
        file_path: destination path handed to ``writeNNet``.
    """
    # Flat parameter list of the model.
    model_params = model.get_weights()

    # Split into weights and biases, assuming they alternate.
    weights = [w.T for w in model_params[0::2]]
    biases = model_params[1::2]

    num_inputs = weights[0].shape[1]

    # Min and max values used to bound the inputs.
    input_mins = [-1] * num_inputs
    input_maxes = [1] * num_inputs

    # Normalization lists: one entry per input plus a single shared entry for the
    # outputs. The means list used to carry one extra zero (num_inputs + 2 entries),
    # which disagreed with the ranges list.
    means = [0] * num_inputs + [0]
    ranges = [2] * num_inputs + [1]

    # Convert the file
    writeNNet(weights, biases, input_mins, input_maxes, means, ranges, file_path)

    
def save_kerases(models, file_path_prefix):
    """Save each model to ``<file_path_prefix><index>.model`` via ``save_keras``.

    ``index`` is the model's zero-based position in ``models``.
    """
    for index, keras_model in enumerate(models):
        target_path = file_path_prefix + str(index) + ".model"
        save_keras(keras_model, target_path)


In [10]:
# Export the CCSA transfer models, one file per model.
save_kerases(models=ccsa_transfer_models,
             file_path_prefix="TL4HDR/Result/ccsa_transfer_")

In [11]:
from MarabouRepo.maraboupy import Marabou

Instructions for updating:
non-resource variables are not supported in the long term




In [12]:
# Load the first exported CCSA model and ask the solver for an assignment in which
# the constrained output variable is at least 0.5.
net1 = Marabou.read_nnet("TL4HDR/Result/ccsa_transfer_0.model")
net1.setLowerBound(
    net1.outputVars[0][0][0],  # presumably the network's first output - confirm
    0.5,
)

exitCode, vals1, stats1 = net1.solve()

sat
input 0 = -0.5
input 1 = -0.5
input 2 = -0.5
input 3 = -0.5
input 4 = -0.5
input 5 = -0.5
input 6 = -0.5
input 7 = -0.5
input 8 = -0.5
input 9 = -0.5
input 10 = -0.5
input 11 = -0.5
input 12 = -0.5
input 13 = -0.5
input 14 = -0.5
input 15 = 0.5
input 16 = -0.5
input 17 = -0.5
input 18 = -0.5
input 19 = -0.5
input 20 = -0.5
input 21 = -0.5
input 22 = -0.5
input 23 = -0.5
input 24 = -0.00617805289676
input 25 = -0.5
input 26 = -0.5
input 27 = -0.5
input 28 = 0.160021926317
input 29 = -0.5
input 30 = -0.5
input 31 = -0.5
input 32 = -0.5
input 33 = -0.5
input 34 = -0.5
input 35 = -0.5
input 36 = -0.5
input 37 = -0.5
input 38 = -0.5
input 39 = -0.5
input 40 = -0.5
input 41 = -0.5
input 42 = -0.5
input 43 = -0.5
input 44 = -0.5
input 45 = -0.101392885351
input 46 = -0.5
input 47 = -0.5
input 48 = -0.5
input 49 = -0.5
input 50 = -0.131106159845
input 51 = -0.5
input 52 = -0.5
input 53 = -0.5
input 54 = -0.384522696888
input 55 = -0.5
input 56 = 0.0749539196571
input 57 = -0.130857231349
i

In [5]:
from scipy.io import loadmat
import pandas as pd
import numpy as np
# Load the simulated dataset from the .mat file and pull out its fields.
path = 'TL4HDR/simulation/PanGyn-DFI-5-base.mat'
A = loadmat(path)
data = A['data']

# Each field is stored nested, hence the [0][0] indexing before use.
de = np.squeeze(data['de'][0][0])
group = np.squeeze(data['group'][0][0]) - 1  # shift the group labels down by one
counts = data['counts'][0][0].transpose()

# After the transpose, columns are numbered 0..nGenes-1.
nGenes = counts.shape[1]
df = pd.DataFrame(counts, columns=range(nGenes))
print(df.describe())

               0           1            2            3            4    \
count  2000.000000  2000.00000  2000.000000  2000.000000  2000.000000   
mean      0.097000     0.10100     0.110000     0.102000     0.092000   
std       0.343001     0.33594     0.371442     0.338605     0.320293   
min       0.000000     0.00000     0.000000     0.000000     0.000000   
25%       0.000000     0.00000     0.000000     0.000000     0.000000   
50%       0.000000     0.00000     0.000000     0.000000     0.000000   
75%       0.000000     0.00000     0.000000     0.000000     0.000000   
max       4.000000     3.00000     4.000000     3.000000     3.000000   

               5            6            7            8            9    ...  \
count  2000.000000  2000.000000  2000.000000  2000.000000  2000.000000  ...   
mean      0.103000     0.095500     0.099000     0.102000     0.116500  ...   
std       0.335332     0.323159     0.340965     0.332643     0.367417  ...   
min       0.000000     0.0

In [6]:
# Function-call form of print: valid on both Python 2 and Python 3, and consistent
# with the print(df.describe()) call in the previous cell.
print(df.head())

   0    1    2    3    4    5    6    7    8    9    ...  190  191  192  193  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0  ...  0.0  0.0  0.0  1.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0  0.0   

   194  195  196  197  198  199  
0  0.0  0.0  1.0  0.0  0.0  0.0  
1  0.0  0.0  0.0  0.0  0.0  1.0  
2  0.0  0.0  1.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  1.0  0.0  0.0  0.0  

[5 rows x 200 columns]
