In [33]:
import os 
import getopt
import sys

import numpy as np
import h5py
import pickle
import random
import copy
import pandas as pd
import math 

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Normalization, Masking, Input, Lambda, concatenate, Bidirectional, Dense, Dropout, Flatten, Conv1D,BatchNormalization,  MaxPooling1D, Bidirectional, GRU, TimeDistributed
import tensorflow as tf
from tensorflow import keras


np.random.seed(1337) # for reproducibility
vocab = ["A", "G", "C", "T"]
indices = tf.range(len(vocab), dtype = tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab,indices)
table = tf.lookup.StaticVocabularyTable(table_init, 1)
record_defaults = [
    tf.constant([''], dtype=tf.string),
    tf.constant([''], dtype=tf.string),
    tf.constant([''], dtype=tf.string),
    tf.constant([''], dtype=tf.string),  
    tf.constant([''], dtype=tf.string),
    tf.constant([''], dtype=tf.string),
    tf.constant([''], dtype=tf.string),  
    tf.constant([''], dtype=tf.string),
]

# Nadav dataset

def data_reader(file, batch_size=100, n_parse_threads = 4):
    dataset = tf.data.TextLineDataset(file).skip(1)
    dataset=dataset.map(preprocess, num_parallel_calls = n_parse_threads)
    return dataset.batch(batch_size).prefetch(1)

def preprocess(record):
    fields = tf.io.decode_csv(record, record_defaults=record_defaults)
    chars = tf.strings.bytes_split(fields[0])
    chars_indeces = table.lookup(chars)
    X = tf.one_hot(chars_indeces, depth = len(vocab))
    Y = tf.stack(fields[1:])
    Y= tf.where(tf.equal(Y,  "NA"), ["-1"], Y)
    Y = tf.strings.to_number(Y, tf.float32)
    return X,Y

# Get first item of the dataset to get the shape of the input data
for element in data_reader("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/LibA_wide_pivot.csv"):
    input_shape = element[0].shape
    output_shape = element[1].shape

print(input_shape)
print(output_shape)

(78, 262, 4)
(78, 7)


In [34]:



inputs = Input(input_shape[1],input_shape[2], name="inputs")
#layer = Masking(mask_value=-1.)(inputs)
layer = Conv1D(250, kernel_size=7, strides=1, activation='relu', name="conv1")(inputs)  # 250 7 relu
layer = Dropout(0.3)(layer)
layer = BatchNormalization()(layer)
layer = Conv1D(250, 8, strides=1, activation='softmax', name="conv2")(layer)  # 250 8 softmax
layer = BatchNormalization()(layer)
layer = MaxPooling1D(pool_size=2, strides=None, name="maxpool1")(layer)
layer = Dropout(0.3)(layer)
layer = Conv1D(250, 3, strides=1, activation='softmax', name="conv3")(layer)  # 250 3 softmax
layer = BatchNormalization()(layer)
layer = Dropout(0.3)(layer)
layer = Conv1D(100, 2, strides=1, activation='softmax', name="conv4")(layer)  # 100 3 softmax
layer = BatchNormalization()(layer)
layer = MaxPooling1D(pool_size=1, strides=None, name="maxpool2")(layer)
layer = Dropout(0.3)(layer)
layer = Flatten()(layer)
layer = Dense(300, activation='sigmoid')(layer)  # 300
layer = Dropout(0.3)(layer)
layer = Dense(200, activation='sigmoid')(layer)  # 300
predictions = Dense(1, activation='linear')(layer)

# Create trainable variables for mean and standard deviation
initial_mean = tf.constant(0.0, dtype=tf.float32)
initial_stddev = tf.constant(1.0, dtype=tf.float32)
mean_initializer = Constant(initial_mean)
variance_initializer = Constant(initial_stddev)

norm_prediction = Normalization(axis=-1, 
                              mean_initializer=mean_initializer, 
                              variance_initializer=variance_initializer)(predictions)



model = Model(inputs=inputs, outputs=norm_prediction)
model.summary()

model.compile(optimizer="adam",
              loss="mean_squared_error",
              metrics=["mse", "mae", "mape"],
              )

histories = {}
histories=model.fit(data_reader('/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/mean_with_sequence_ENCFF616IAQ_2col_train.csv',batch_size=100),
                        epochs=30,
                        validation_data=data_reader('/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/mean_with_sequence_ENCFF616IAQ_2col_validation.csv',batch_size=100),
                        callbacks=None,
                        verbose=2)

predicted = model.predict(data_reader('/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/mean_with_sequence_ENCFF616IAQ_2col_test.csv',
                                            batch_size=100))

test_data = data_reader('/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/mean_with_sequence_ENCFF616IAQ_2col_test.csv',batch_size=100)
test_tensor = X = np.empty(shape=[0,1])
for batch in test_data:
    test_tensor = np.append(test_tensor, batch[1])

import math
def pearson_correlation(x, y):
    n = len(x)
    # Calculate the mean of x and y
    mean_x = sum(x) / n
    mean_y = sum(y) / n
    
    # Calculate the numerator and denominators of the correlation coefficient
    numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    denominator_x = math.sqrt(sum((xi - mean_x) ** 2 for xi in x))
    denominator_y = math.sqrt(sum((yi - mean_y) ** 2 for yi in y))
    
    # Calculate the correlation coefficient
    correlation = numerator / (denominator_x * denominator_y)
    return correlation
    
corr_coefficient = pearson_correlation(predicted.flatten(), test_tensor)

ValueError: Input 0 of layer "conv1" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (4, 262)

In [None]:
import pandas as pd
histories = {}
for i in range(10):
    ct="EoBaso"
    print("Fold %02d in %s" % (i+1,ct))
    histories[ct]=model.fit(data_reader("/home/felix/cluster/lvelten/Analysis/SCG4SYN/LibA_HSC/analysis/complete_run1_5cellstates/005_deep/data/%s/train%02d.csv" % (ct, i+1),batch_size=32),
                            epochs=30,
                            validation_data=data_reader("/home/felix/cluster/lvelten/Analysis/SCG4SYN/LibA_HSC/analysis/complete_run1_5cellstates/005_deep/data/%s/test%02d.csv" % (ct,i+1),batch_size=32),
                            callbacks=None,
                            verbose=2)
    predicted = model.predict(data_reader("/home/felix/cluster/lvelten/Analysis/SCG4SYN/LibA_HSC/analysis/complete_run1_5cellstates/005_deep/data/%s/valid%02d.csv" % (ct, i+1),
                                              batch_size=100))
    break


In [None]:
def create_plots(history):
        plt.plot(history.history["pearson_r"])
        plt.plot(history.history["val_pearson_r"])
    else :
        plt.plot(history.history[metric[4:]])
        plt.plot(history.history[metric])
    plt.title('model metric')
    plt.ylabel(metric[4:])
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig(foldername + 'metric.png')
    plt.clf()

    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig(foldername + 'loss.png')
    plt.clf()