# Unnormalized log transformed: breast, prostate, thyroid

The unnormalized and log transformed breast, prostate and thyroid

In [1]:
from train_RNA_ResNet import ResNet
from keras import callbacks as cb
from Calibration_Util import FileIO as io
import os
import numpy as np

Using TensorFlow backend.
  return f(*args, **kwds)
You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
rnaNet = ResNet()

# GTEX as source and TCGA as target
source_file = 'unnorm-log-20PC-GTEX-breast-prostate-thyroid.csv'
target_file = 'unnorm-log-20PC-TCGA-breast-prostate-thyroid.csv'
source_path = os.path.join(io.DeepLearningRoot(), 'data/unnorm/breast-prostate-thyroid/' + source_file)
target_path = os.path.join(io.DeepLearningRoot(), 'data/unnorm/breast-prostate-thyroid/' + target_file)

# Make GTEX target and TCGA source
# target_file = 'unnorm-log-20PC-GTEX-breast-prostate-thyroid.csv'
# source_file = 'unnorm-log-20PC-TCGA-breast-prostate-thyroid.csv'
# source_path = os.path.join(io.DeepLearningRoot(), 'data/unnorm/' + source_file)
# target_path = os.path.join(io.DeepLearningRoot(), 'data/unnorm/' + target_file)

rnaNet.load_data(source_path=source_path,
                target_path=target_path)

In [3]:
print("\ngtex = source shape = " + str(rnaNet.source.shape))
print("tcga = target shape = " + str(rnaNet.target.shape))


gtex = source shape = (636, 20)
tcga = target shape = (211, 20)


In [4]:
import CostFunctions as cf
from keras import backend as K
from sklearn.cross_validation import train_test_split

tissue_map = {'breast': 0, 'thyroid':1, 'prostate':2}
tm = lambda t: tissue_map[t]
sample_ratio = 0.75

In [5]:
source_labels = rnaNet.source_df['tissue'].map(tm).values

source = rnaNet.source.astype('float32')
target = rnaNet.target.astype('float32')

target_train_df, target_test_df = train_test_split(rnaNet.target_df, test_size=0.1, random_state=42)

# sort values based on tissue
target_train_df = target_train_df.sort_values(['tissue'])
target_test_df = target_test_df.sort_values(['tissue'])

# extract tissue labels
target_train_labels = target_train_df.loc[:, 'tissue']
target_test_labels = target_test_df.loc[:, 'tissue']

target_train_counts = target_train_df['tissue'].value_counts()
target_test_counts = target_test_df['tissue'].value_counts()
print("target train counts")
print(target_train_counts)
print("\ntarget validation counts")
print(target_test_counts)

# extract values as numpy array
target_train = target_train_df.loc[:, "PC1":].values
target_test = target_test_df.loc[:, "PC1":].values

print("\ntarget train shape")
print(target_train.shape)
print("\ntarget validate shape")
print(target_test.shape)
print("")

mmd = cf.MMD(source, target)

target train counts
breast      98
thyroid     46
prostate    45
Name: tissue, dtype: int64

target validation counts
breast      12
thyroid      7
prostate     3
Name: tissue, dtype: int64

target train shape
(189, 20)

target validate shape
(22, 20)

setting scales using KNN
[17.795059671855313, 35.590119343710626, 71.180238687421252]
setting all scale weights to 1


In [6]:
sample1_low = 0
sample1_high = target_train_counts['breast']-1
sample1_size = int(target_train_counts['breast'] * sample_ratio)
sample1 = K.cast(K.round(K.random_uniform_variable(shape=tuple([sample1_size]), low=sample1_low, 
                                                   high=sample1_high)), 'int32')
sample1_labels = target_train_labels.iloc[K.eval(sample1)]
sample1_target_train = K.gather(target_train, sample1)

In [7]:
sample2_low = target_train_counts['breast']
sample2_high = sample2_low + target_train_counts['prostate']-1
sample2_size = int(target_train_counts['prostate'] * sample_ratio)
sample2 = K.cast(K.round(K.random_uniform_variable(shape=tuple([sample2_size]), low=sample2_low, 
                                                   high=sample2_high)), 'int32')

sample2_labels = target_train_labels.iloc[K.eval(sample2)]
sample2_target_train = K.gather(target_train, sample2)

In [8]:
sample3_low = target_train_counts['breast'] + target_train_counts['prostate']
sample3_high = target_train.shape[0] - 1
sample3_size = int(target_train_counts['thyroid'] * sample_ratio)
sample3 = K.cast(K.round(K.random_uniform_variable(shape=tuple([sample3_size]), low=sample3_low, 
                                                   high=sample3_high)), 'int32')

sample3_labels = target_train_labels.iloc[K.eval(sample3)]
sample3_target_train = K.gather(target_train, sample3)

In [9]:
#calculate the squared distance between x and y
def squaredDistance(X, Y):
    # X is nxd, Y is mxd, returns nxm matrix of all pairwise Euclidean distances
    # broadcasted subtraction, a square, and a sum.
    r = K.expand_dims(X, axis=1)
    return K.sum(K.square(r-Y), axis=-1)

# this will be self.MMDLayer and the labels will be y_true
def kernel(X, Y, weights, scales):
        #expand dist to a 1xnxm tensor where the 1 is broadcastable
        sQdist = K.expand_dims(squaredDistance(X,Y), 0) 
        #expand scales into a px1x1 tensor so we can do an element wise exponential
        scales = K.expand_dims(K.expand_dims(scales, -1), -1)
        #expand scales into a px1x1 tensor so we can do an element wise exponential
        weights = K.expand_dims(K.expand_dims(weights, -1), -1)
        #calculated the kernel for each scale weight on the distance matrix and sum them up
        return K.sum(weights * K.exp(-sQdist / (K.pow(scales, 2))), 0)

def cost(source, target):
        #calculate the 3 MMD terms
        xx = kernel(source, source)
        xy = kernel(source, target)
        yy = kernel(target, target)
        #calculate the bias MMD estimater (cannot be less than 0)
        MMD = K.mean(xx) - 2 * K.mean(xy) + K.mean(yy)
        #return the square root of the MMD because it optimizes better
        return K.sqrt(MMD)

In [29]:
weights = mmd.weights
scales = mmd.scales

source_index1 = np.where(np.isin(source_labels, 0))[0]
source_index2 = np.where(np.isin(source_labels, 1))[0]
source_index3 = np.where(np.isin(source_labels, 2))[0]
source_index1 = []
source1 = source[source_index1]
source2 = source[source_index2]
source3 = source[source_index3]

source1

array([], shape=(0, 20), dtype=float32)

In [40]:
# calculate MMD for tissue 1
sample1_target = K.cast(sample1_target_train, 'float32')
xx1 = kernel(source1.astype('float32'), source1.astype('float32'), weights, scales)
xy1 = kernel(source1.astype('float32'), sample1_target, weights, scales)
yy1 = kernel(sample1_target, sample1_target, weights, scales)
MMD1 = K.mean(xx1) - 2*K.mean(xy1) + K.mean(yy1)

# calculate MMD for tissue 2
sample2_target = K.cast(sample2_target_train, 'float32')
xx2 = kernel(source2.astype('float32'), source2.astype('float32'), weights, scales)
xy2 = kernel(source2.astype('float32'), sample2_target, weights, scales)
yy2 = kernel(sample2_target, sample2_target, weights, scales)
MMD2 = K.mean(xx2) - 2*K.mean(xy2) + K.mean(yy2)

# calculate MMD for tissue 3
sample3_target = K.cast(sample3_target_train, 'float32')
xx3 = kernel(source3.astype('float32'), source3.astype('float32'), weights, scales)
xy3 = kernel(source3.astype('float32'), sample3_target, weights, scales)
yy3 = kernel(sample3_target, sample3_target, weights, scales)
MMD3 = K.mean(xx3) - 2*K.mean(xy3) + K.mean(yy3)
print(np.nan_to_num(K.eval(MMD1)))
print(K.eval(MMD1))
print(K.eval(MMD2))
print(K.eval(MMD3))

0.0
nan
2.26908
2.64594


In [None]:
rnaNet.init_res_net()

In [None]:
# callbacks=[rnaNet.lrate, cb.EarlyStopping(monitor='val_loss', patience=100, mode='auto')]
# rnaNet.train(epochs=1000, callbacks=callbacks)

In [None]:
# from plots import scatter_plot, heatmap
# rnaNet.pca()
# %matplotlib inline

In [None]:
# scatter_plot(rnaNet.source_pca_df, rnaNet.target_pca_df, title="before")
# scatter_plot(rnaNet.calibrated_source_pca_df, rnaNet.target_pca_df, title="after")

In [None]:
# heatmap(rnaNet.source_df, rnaNet.target_df, title="before")
# heatmap(rnaNet.calibrated_source_df, rnaNet.target_df, title="after")

In [None]:
# import CostFunctions as cf
# from keras import backend as K

# source = rnaNet.source.astype('float32')
# target = rnaNet.target.astype('float32')
# calibrated_source = rnaNet.calibrated_source.astype('float32')

# mmd = cf.MMD(source, target, MMDTargetSampleSize=target.shape[0], n_neighbors=10)
# mmd_before = K.eval(mmd.cost(source, target))
# mmd_after = K.eval(mmd.cost(calibrated_source, target))

# print("MMD before: %0.10f" % mmd_before)
# print("MMD after: %0.10f" % mmd_after)

In [None]:
# save_file = 'calibrated-unnorm-log-20PC-GTEX-breast-prostate-thyroid.csv'
# save_path = os.path.join(io.DeepLearningRoot(), 'data/unnorm/breast-prostate-thyroid/' + save_file)

# rnaNet.save_calibrated(path=save_path)