In [8]:
import os 
import getopt
import sys

import numpy as np
import h5py
import pickle
import random
import copy
import pandas as pd
import math 

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Lambda, concatenate, Bidirectional, Dense, Dropout, Flatten, Conv1D,BatchNormalization,  MaxPooling1D, Bidirectional, GRU, TimeDistributed
import tensorflow as tf
from tensorflow import keras


np.random.seed(1337) # for reproducibility
vocab = ["A", "G", "C", "T"]
indices = tf.range(len(vocab), dtype = tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab,indices)
table = tf.lookup.StaticVocabularyTable(table_init, 1)
defs = [0.] * 1 + [tf.constant([], dtype = "string")]

# Nadav dataset

def data_reader(file, batch_size=100, n_parse_threads = 4):
    dataset = tf.data.TextLineDataset(file).skip(1)
    dataset=dataset.map(preprocess, num_parallel_calls = n_parse_threads)
    return dataset.batch(batch_size).prefetch(1)

def preprocess(record):
    fields = tf.io.decode_csv(record, record_defaults=defs)
    chars = tf.strings.bytes_split(fields[1])
    chars_indeces = table.lookup(chars)
    X = tf.one_hot(chars_indeces, depth = len(vocab))
    Y = fields[0]
    return X,Y


In [9]:
# CROSS VALIDATION (10 fold)
from sklearn.model_selection import train_test_split, KFold

# Split the data in three partitions
file="/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/LibA_wide_pivot_state3.csv"
whole_data = pd.read_csv("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/LibA_wide_pivot_state3.csv")

kf = KFold(n_splits = 10, shuffle = True, random_state = 2008)
result = next(kf.split(whole_data), None)

o=1
for i in kf.split(whole_data):
    train = whole_data.iloc[i[0]]
    test =  whole_data.iloc[i[1]]
    
    train, validation = train_test_split(whole_data, test_size=0.10, random_state=42)
    
    train.to_csv("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/CV"+str(o)+"_LibA_wide_pivot_state3_train.csv", index=False)
    test.to_csv("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/CV"+str(o)+"_LibA_wide_pivot_state3_test.csv", index=False)
    validation.to_csv("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/CV"+str(o)+"_LibA_wide_pivot_state3_validation.csv", index=False)
    o+=1
        

In [10]:
df_test_overall  = pd.DataFrame(columns=['State_3E', "seq", "prediction"])
print(df_test_overall)
corr_list = []

for i in range(1,11):
    
    input_path_train = "/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/CV"+str(i)+"_LibA_wide_pivot_state3_train.csv"
    input_path_valid = "/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/CV"+str(i)+"_LibA_wide_pivot_state3_validation.csv"
    input_path_test = "/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/CV"+str(i)+"_LibA_wide_pivot_state3_test.csv"
    
    df_test = pd.read_csv(input_path_test)

    # Get first item of the dataset to get the shape of the input data
    for element in data_reader(input_path_train):
        input_shape = element[0].shape

    inputs = Input(shape=(input_shape[1],input_shape[2]), name="inputs")
    layer = Conv1D(250, kernel_size=7, strides=1, activation='relu', name="conv1")(inputs)  # 250 7 relu
    layer = Dropout(0.3)(layer)
    layer = BatchNormalization()(layer)
    layer = Conv1D(250, 8, strides=1, activation='softmax', name="conv2")(layer)  # 250 8 softmax
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(pool_size=2, strides=None, name="maxpool1")(layer)
    layer = Dropout(0.3)(layer)
    layer = Conv1D(250, 3, strides=1, activation='softmax', name="conv3")(layer)  # 250 3 softmax
    layer = BatchNormalization()(layer)
    layer = Dropout(0.3)(layer)
    layer = Conv1D(100, 2, strides=1, activation='softmax', name="conv4")(layer)  # 100 3 softmax
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(pool_size=1, strides=None, name="maxpool2")(layer)
    layer = Dropout(0.3)(layer)
    layer = Flatten()(layer)
    layer = Dense(300, activation='sigmoid')(layer)  # 300
    layer = Dropout(0.3)(layer)
    layer = Dense(200, activation='sigmoid')(layer)  # 300
    predictions = Dense(1, activation='linear')(layer)

    model = Model(inputs=inputs, outputs=predictions)
    model.summary()

    model.compile(optimizer="adam",
                loss="mean_squared_error",
                metrics=["mse", "mae", "mape"],
                )

    history=model.fit(data_reader(input_path_train, batch_size=100),
                            epochs=20,
                            validation_data=data_reader(input_path_valid,batch_size=100),
                            callbacks=None,
                            verbose=1)

    predicted = model.predict(data_reader(input_path_test,
                                                batch_size=100))

    test_data = data_reader(input_path_test,batch_size=100)
    test_tensor = X = np.empty(shape=[0,1])
    for batch in test_data:
        test_tensor = np.append(test_tensor, batch[1])

    df_test["prediction"] = predicted
    df_test_overall = df_test_overall.append(df_test, ignore_index=True)
    
    def pearson_correlation(x, y):
        n = len(x)
        # Calculate the mean of x and y
        mean_x = sum(x) / n
        mean_y = sum(y) / n
        
        # Calculate the numerator and denominators of the correlation coefficient
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator_x = math.sqrt(sum((xi - mean_x) ** 2 for xi in x))
        denominator_y = math.sqrt(sum((yi - mean_y) ** 2 for yi in y))
        
        # Calculate the correlation coefficient
        correlation = numerator / (denominator_x * denominator_y)
        return correlation
        
    corr_coefficient = pearson_correlation(predicted.flatten(), test_tensor)
    corr_list.append(corr_coefficient)

df_test_overall.to_csv("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/LibA_wide_pivot_state3_test_predicted_cv10fold.csv", index=False)

Empty DataFrame
Columns: [State_3E, seq, prediction]
Index: []
Model: "model_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 262, 4)]          0         
                                                                 
 conv1 (Conv1D)              (None, 256, 250)          7250      
                                                                 
 dropout_50 (Dropout)        (None, 256, 250)          0         
                                                                 
 batch_normalization_40 (Ba  (None, 256, 250)          1000      
 tchNormalization)                                               
                                                                 
 conv2 (Conv1D)              (None, 249, 250)          500250    
                                                                 
 batch_normalization_41 (Ba  (None, 249, 250)          1000  

2023-10-17 15:46:25.808644: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 15565383368236152796
  df_test_overall = df_test_overall.append(df_test, ignore_index=True)


Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 262, 4)]          0         
                                                                 
 conv1 (Conv1D)              (None, 256, 250)          7250      
                                                                 
 dropout_55 (Dropout)        (None, 256, 250)          0         
                                                                 
 batch_normalization_44 (Ba  (None, 256, 250)          1000      
 tchNormalization)                                               
                                                                 
 conv2 (Conv1D)              (None, 249, 250)          500250    
                                                                 
 batch_normalization_45 (Ba  (None, 249, 250)          1000      
 tchNormalization)                                        

2023-10-17 15:46:30.452854: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 15565383368236152796


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

KeyboardInterrupt: 

In [None]:
df_test_overall.to_csv("/home/felix/cluster/fpacheco/Data/Robert_data/processed_data/10fold_cv/LibA_wide_pivot_state3_test_predicted_cv10fold.csv", index=False)
df_test_overall

Unnamed: 0,State_3E,seq,prediction
0,-0.007714,AGGACCGGATCAACTAAACAACTCAAACAAGGGCTAATATAACCCA...,0.005275
1,0.137953,AGGACCGGATCAACTAAACACTAGTCATACTTAAAAATTGCAAGGA...,0.005275
2,-0.048706,AGGACCGGATCAACTAAACAGGTTCTGACGTATGCTCCTCTATGGA...,0.005274
3,-0.052804,AGGACCGGATCAACTAAACCCGAGCCTGCCTAGCCCTAGCTTCTCT...,0.005274
4,0.213652,AGGACCGGATCAACTAAACGGAGCAGAGTTAGTGTCAGGTCAAAAA...,0.005275
...,...,...,...
8473,0.167100,AGGACCGGATCAACTTTTCCGCCTTTTATTATCAGGACTTCACGGG...,-0.049231
8474,0.099489,AGGACCGGATCAACTTTTCGCTCATTAGTACAGGGTATAACGGAAG...,-0.049231
8475,-0.046939,AGGACCGGATCAACTTTTGGTCGGTTGACGGTCGCCTTGATTATTC...,-0.049233
8476,0.093662,AGGACCGGATCAACTTTTTTATCTGGTTATCATTCTAGTCTAGTGC...,-0.049234


In [None]:
corr_list

[0.6109516526851736,
 0.5577751589096964,
 0.5824317832428231,
 0.646415735678849,
 0.6531711700731101,
 0.6220736243472968,
 0.5739929657529798,
 0.5967556394910829,
 0.6648464327652903,
 0.6024677659508967]

In [None]:
# CROSS VALIDATION
from sklearn.model_selection import train_test_split, KFold

# Split the data in three partitions
whole_data=pd.read_csv("/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/mean_with_sequence_ENCFF616IAQ.csv")

kf = KFold(n_splits = 10, shuffle = True, random_state = 2008)
result = next(kf.split(whole_data), None)

o=1
for i in kf.split(whole_data):
    train = whole_data.iloc[i[0]]
    test =  whole_data.iloc[i[1]]
    
    train, validation = train_test_split(whole_data, test_size=0.10, random_state=42)
    
    train[["meanVal", "sequence"]].to_csv("/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/CV"+str(o)+"_mean_with_sequence_ENCFF616IAQ_train.csv", index=False)
    test[["meanVal", "sequence"]].to_csv("/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/CV"+str(o)+"_mean_with_sequence_ENCFF616IAQ_test.csv", index=False)
    validation[["meanVal", "sequence"]].to_csv("/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/CV"+str(o)+"_mean_with_sequence_ENCFF616IAQ_validation.csv", index=False)
    o+=1
        

In [None]:
df_test_overall  = pd.DataFrame(columns=["meanVal", "Sequence"])
print(df_test_overall)
corr_list = []

for i in range(1,10):
    
    input_path_train = "/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/CV"+str(i)+"_mean_with_sequence_ENCFF616IAQ_train.csv"
    input_path_valid = "/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/CV"+str(i)+"_mean_with_sequence_ENCFF616IAQ_validation.csv"
    input_path_test = "/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/CV"+str(i)+"_mean_with_sequence_ENCFF616IAQ_test.csv"
   
    df_test = pd.read_csv(input_path_test)

    # Get first item of the dataset to get the shape of the input data
    for element in data_reader(input_path_train):
        input_shape = element[0].shape

    inputs = Input(shape=(input_shape[1],input_shape[2]), name="inputs")
    layer = Conv1D(250, kernel_size=7, strides=1, activation='relu', name="conv1")(inputs)  # 250 7 relu
    layer = Dropout(0.3)(layer)
    layer = BatchNormalization()(layer)
    layer = Conv1D(250, 8, strides=1, activation='softmax', name="conv2")(layer)  # 250 8 softmax
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(pool_size=2, strides=None, name="maxpool1")(layer)
    layer = Dropout(0.3)(layer)
    layer = Conv1D(250, 3, strides=1, activation='softmax', name="conv3")(layer)  # 250 3 softmax
    layer = BatchNormalization()(layer)
    layer = Dropout(0.3)(layer)
    layer = Conv1D(100, 2, strides=1, activation='softmax', name="conv4")(layer)  # 100 3 softmax
    layer = BatchNormalization()(layer)
    layer = MaxPooling1D(pool_size=1, strides=None, name="maxpool2")(layer)
    layer = Dropout(0.3)(layer)
    layer = Flatten()(layer)
    layer = Dense(300, activation='sigmoid')(layer)  # 300
    layer = Dropout(0.3)(layer)
    layer = Dense(200, activation='sigmoid')(layer)  # 300
    predictions = Dense(1, activation='linear')(layer)

    model = Model(inputs=inputs, outputs=predictions)
    model.summary()

    model.compile(optimizer="adam",
                loss="mean_squared_error",
                metrics=["mse", "mae", "mape"],
                )

    history=model.fit(data_reader(input_path_train, batch_size=1024),
                            epochs=20,
                            validation_data=data_reader(input_path_valid,batch_size=100),
                            callbacks=None,
                            verbose=1)

    predicted = model.predict(data_reader(input_path_test,
                                                batch_size=100))

    test_data = data_reader(input_path_test,batch_size=100)
    test_tensor = X = np.empty(shape=[0,1])
    for batch in test_data:
        test_tensor = np.append(test_tensor, batch[1])

    df_test["prediction"] = predicted
    df_test_overall = df_test_overall.append(df_test, ignore_index=True)
    
    def pearson_correlation(x, y):
        n = len(x)
        # Calculate the mean of x and y
        mean_x = sum(x) / n
        mean_y = sum(y) / n
        
        # Calculate the numerator and denominators of the correlation coefficient
        numerator = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
        denominator_x = math.sqrt(sum((xi - mean_x) ** 2 for xi in x))
        denominator_y = math.sqrt(sum((yi - mean_y) ** 2 for yi in y))
        
        # Calculate the correlation coefficient
        correlation = numerator / (denominator_x * denominator_y)
        return correlation
        
    corr_coefficient = pearson_correlation(predicted.flatten(), test_tensor)
    corr_list.append(corr_coefficient)

df_test_overall.to_csv("/home/felix/cluster/fpacheco/Data/Nadav_lab/K562/10fold_cv/mean_with_sequence_ENCFF616IAQ_test_predicted_cv10fold.csv", index=False)