In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import anndata as ad

## Load training data

In [302]:
## Load simulated pseudo-bulk dataset and labels.
## Pseudo-bulk GEPs were generated by using `simulate` function from Scaden
y = ad.read_h5ad('C:/Users/yw_ji/Documents/MSc Thesis/DATA/scaden_input/simulated_data/data_10000.h5ad')
y_data = np.expand_dims(y.X.astype(np.float32), axis=-1)
fractions = [y.obs[ctype] for ctype in y.uns["cell_types"]]
x_data = np.expand_dims(np.array(fractions, dtype=np.float32).transpose(), axis=-1)

In [29]:
x_data = pd.read_csv('C:/Users/yw_ji/Documents/MSc Thesis/DATA/SDY67/SDY67_250_label_pp.csv', index_col=0)
x_data = np.expand_dims(np.array(x_data, dtype=np.float32), axis=-1)

In [30]:
y_data = pd.read_csv('C:/Users/yw_ji/Documents/MSc Thesis/DATA/SDY67/SDY67_250_pp.csv', index_col=0)
y = ad.read_h5ad('C:/Users/yw_ji/Documents/MSc Thesis/DATA/scaden_input/simulated_data/data_10000.h5ad')
y_data = y_data.reindex(y.var.index)
y_data = np.expand_dims(np.array(y_data.transpose(), dtype=np.float32), axis=-1)

In [31]:
print(x_data.shape)
print(y_data.shape)

(250, 15, 1)
(250, 526, 1)


In [32]:
num_cell_types = x_data.shape[1]
num_sig_genes = y_data.shape[1]
train_size = int(x_data.shape[0]*0.9)

In [51]:
## Hyperparameters
batch_size = 10
learn_rate = 1e-4
num_steps = train_size
optmzr = tf.keras.optimizers.Adam(learn_rate)
loss_obj = tf.keras.losses.MSE


In [52]:
data = tf.data.Dataset.from_tensor_slices((x_data, y_data))
data = data.shuffle(100)
data_train = data.take(train_size).repeat().batch(batch_size)
data_valid = data.skip(train_size).repeat().batch(batch_size)

## Optimize the Signature Matrix (*S*)
Mathematical expression for deconvolution is 

> *M* = *S* x *F*

Where the mixture gene expression profile (*M*) of dimensions *g* x 1 is the linear combination of signature matrix (*S*) of dimensions *g* x *c* and cell fractions vector (*F*) of dimensions *c* x 1.
- *g* is number of features/genes; and 
- *c* is number of cell types of interest. 

Therefore, we use arbitrary number of simulated pseudo-bulk gene expression profiles (GEPs) with known cell type fractions to optimize the signature matrix (*S*). 

In [53]:
class SignatureMatrix(tf.keras.layers.Layer):
    def __init__(self, num_sig_genes):
        super(SignatureMatrix, self).__init__()
#         self.batch_size = batch_size
        self.units = num_sig_genes
    
    def build(self, input_shape):
        self.w = self.add_weight(
            'kernel',
            shape=(self.units, int(input_shape[-2])),
            initializer=tf.keras.initializers.RandomUniform(minval=0., maxval=1.),
            regularizer=tf.keras.regularizers.L1(),
#             constraint=tf.keras.constraints.NonNeg(),
            trainable=True,
        )
    
    def call(self, inputs):
        op = tf.linalg.LinearOperatorFullMatrix(self.w)
        self.add_loss(tf.reduce_mean(op.cond()))
        return tf.matmul(self.w, inputs)

In [54]:
class ScaleInterceptError(tf.keras.layers.Layer):
    def __init__(self):
        super(ScaleInterceptError, self).__init__()
    
    def build(self, input_shape):
        self.w = self.add_weight(
            'kernel',
            shape=[int(input_shape[-2]),int(input_shape[-1])],
            initializer=tf.keras.initializers.RandomUniform(minval=0., maxval=1.),
#             constraint=tf.keras.constraints.NonNeg(),
            trainable=True,
        )
        self.b = self.add_weight(
            'bias',
            shape=[int(input_shape[-2]),int(input_shape[-1])],
            initializer=tf.keras.initializers.Zeros(),
            trainable=True
        )
    
    def call(self, inputs):
        # element-wise multiplication and addition
        # W * X + B
        return tf.math.add(tf.math.multiply(self.w, inputs), self.b)

In [57]:
## Create model
inpt = tf.keras.Input(shape=(num_cell_types,1,))
otpt = SignatureMatrix(num_sig_genes)(inpt)
# otpt = ScaleInterceptError()(otpt)
# otpt = tf.keras.layers.ReLU()(otpt)
model = tf.keras.Model(inpt, otpt)
model.summary()

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 15, 1)]           0         
_________________________________________________________________
signature_matrix_4 (Signatur (None, 526, 1)            7890      
Total params: 7,890
Trainable params: 7,890
Non-trainable params: 0
_________________________________________________________________


###### Records at Epoch = 5

**Learning Rate**
- 1e0 = 1253.5981
- 1e-1 = 690.4854
- 1e-2 = 937.2850
- 1e-3 = 4284.4165

**Batch Size**
- 1 = 690.4854
- 10 = 837.7722
- 100 = 1590.9575

In [58]:
model.compile(optimizer=optmzr, loss=loss_obj)
model.fit(x=data_train, epochs=100000, steps_per_epoch=num_steps,
          validation_data=data_valid, validation_steps=25, verbose=0
         )

<tensorflow.python.keras.callbacks.History at 0x161671f4370>

In [61]:
model.losses

[<tf.Tensor 'functional_7/signature_matrix_4/Mean:0' shape=() dtype=float32>,
 <tf.Tensor: shape=(), dtype=float32, numpy=72888.19>]

## Making cell type fractions estimations

In [62]:
## Extract signature matrix (weights) and error (scale + intercept) from model
SigMat = model.layers[1].w
# model.layers[2].trainable = False
# Err = model.layers[2]

In [87]:
SigMatrix = tf.expand_dims(tf.transpose(SigMat), axis=0)

In [88]:
sample = np.expand_dims(y_data[1], axis=0)

In [89]:
## Signature matrix will be used as input data (x)
tmp = tf.data.Dataset.from_tensor_slices((SigMatrix, sample))

In [90]:
tmp = tmp.repeat().batch(10)

In [91]:
tmp

<BatchDataset shapes: ((None, 15, 526), (None, 526, 1)), types: (tf.float32, tf.float32)>

In [81]:
def SumToOne(w):
    mean = tf.reduce_mean(w)
    return tf.math.divide_no_nan(w, tf.expand_dims(mean, axis=-1))

In [82]:
class Deconvolution(tf.keras.layers.Layer):
    def __init__(self, num_cell_types):
        super(Deconvolution, self).__init__()
        self.units = num_cell_types
    
    def build(self, input_shape):
        self.w = self.add_weight(
            'kernel',
            shape=(1, self.units),
            initializer=tf.keras.initializers.RandomUniform(minval=0., maxval=1.),
            constraint=tf.keras.constraints.MinMaxNorm(),
            trainable=True,
        )
    
    def call(self, inputs):
        w = tf.math.divide_no_nan(self.w, tf.math.reduce_mean(self.w))
        return tf.matmul(w, inputs)

In [83]:
## Hyperparameters
learn_rate = 1
optmzr = tf.keras.optimizers.Adam(learn_rate)
loss_obj = tf.keras.losses.MSE

In [84]:
## Create model, compile, and fit
## Create model
inpt = tf.keras.Input(shape=(num_cell_types,num_sig_genes,))
otpt = Deconvolution(num_cell_types)(inpt)
# otpt = Err(otpt)
# otpt = tf.keras.layers.ReLU()(otpt)
model = tf.keras.Model(inpt, otpt)
model.summary()
# model.compile(optimizer=optmzr, loss=loss_obj)

Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 15, 526)]         0         
_________________________________________________________________
deconvolution (Deconvolution (None, 1, 526)            15        
Total params: 15
Trainable params: 15
Non-trainable params: 0
_________________________________________________________________


In [92]:
model.compile(optimizer=tf.keras.optimizers.Adam(1), loss=loss_obj)
model.fit(x=tmp, epochs=2, steps_per_epoch=1000)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1616aff1520>

In [94]:
def pearson_cor(logits, targets):
    mx = tf.reduce_mean(logits)
    my = tf.reduce_mean(targets)
    xm, ym = logits - mx, targets - my
    r_num = tf.reduce_sum(input_tensor=tf.multiply(xm, ym))
    r_den = tf.sqrt(
        tf.multiply(
            tf.reduce_sum(input_tensor=tf.square(xm)),
            tf.reduce_sum(input_tensor=tf.square(ym)),
        )
    )
    r = tf.divide(r_num, r_den)
    r = tf.maximum(tf.minimum(r, 1.0), -1.0)
    return r

In [95]:
pearson_cor(tf.math.divide_no_nan(model.layers[1].w, tf.math.reduce_mean(model.layers[1].w)), x_data[1])

<tf.Tensor: shape=(), dtype=float32, numpy=1.9629563e-07>

In [96]:
np.sum(model.layers[1].w.numpy())

-2.0081022

In [437]:
print(tf.math.divide_no_nan(model.layers[1].w, tf.math.reduce_mean(model.layers[1].w)))
print(x_data[1])

tf.Tensor(
[[ 0.5745999   0.6859211  -1.1074343   4.4670286   0.09732926 -0.21040112
   0.28751874  3.1787963   2.4708588   0.22963849  0.3261443 ]], shape=(1, 11), dtype=float32)
[[0.03412126]
 [0.08886534]
 [0.09182623]
 [0.05527785]
 [0.12784402]
 [0.14413412]
 [0.07295215]
 [0.09430918]
 [0.13334428]
 [0.02804618]
 [0.12927938]]


In [130]:
tmp = model.predict(np.expand_dims(x_data[0,:,:], axis=0))



In [142]:
model.layers[0].w

<tf.Variable 'signature_matrix_41/kernel:0' shape=(526, 11) dtype=float32, numpy=
array([[-0.0000000e+00, -0.0000000e+00,  4.2990665e+00, ...,
         5.6840925e+00,  1.9084435e+00, -0.0000000e+00],
       [-0.0000000e+00, -0.0000000e+00, -0.0000000e+00, ...,
        -0.0000000e+00, -0.0000000e+00, -0.0000000e+00],
       [ 6.6641030e+00,  2.7479631e+01, -0.0000000e+00, ...,
         3.0321952e-02, -0.0000000e+00,  7.1613103e-02],
       ...,
       [-0.0000000e+00,  2.0036123e+01, -0.0000000e+00, ...,
        -0.0000000e+00, -0.0000000e+00, -0.0000000e+00],
       [-0.0000000e+00, -0.0000000e+00, -0.0000000e+00, ...,
         7.4547729e+01, -0.0000000e+00, -0.0000000e+00],
       [-0.0000000e+00, -0.0000000e+00, -0.0000000e+00, ...,
        -0.0000000e+00, -0.0000000e+00, -0.0000000e+00]], dtype=float32)>

In [71]:
SignatureMatrix(num_sig_genes)(x_data[0,:,:])

<tf.Tensor: shape=(526, 1), dtype=float32, numpy=
array([[0.7130525 ],
       [0.36985388],
       [0.6936025 ],
       [0.49039376],
       [0.50110483],
       [0.40495902],
       [0.42510694],
       [0.5091285 ],
       [0.3566494 ],
       [0.41298544],
       [0.4442289 ],
       [0.36412162],
       [0.35499534],
       [0.33318052],
       [0.72463584],
       [0.47884583],
       [0.47194254],
       [0.4852562 ],
       [0.4514018 ],
       [0.38140747],
       [0.47344914],
       [0.39315426],
       [0.45081478],
       [0.27909294],
       [0.516873  ],
       [0.64744693],
       [0.5522732 ],
       [0.3950473 ],
       [0.48394987],
       [0.6058894 ],
       [0.4774903 ],
       [0.50655156],
       [0.50411314],
       [0.4787949 ],
       [0.5699633 ],
       [0.47783384],
       [0.4312151 ],
       [0.36890432],
       [0.6959908 ],
       [0.5621176 ],
       [0.56189036],
       [0.32389042],
       [0.33389255],
       [0.5089051 ],
       [0.6317799 ],
     