# Data Analysis-- Supervised Learning
* Created on Mon Sep. 29 2021 by Shangying Wang
* Last Modified: April 6, 2023
* This code predicts phenotypes from combinations of motifs.
* It uses a convolutional neural network (CNN) followed by an LSTM.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import csv
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="ticks", color_codes=True)
#import pdb
from platform import python_version
print(python_version())

In [None]:
from tensorflow.keras.optimizers import SGD, Adam
from keras import layers,Sequential
from keras.layers import Input, Dense, Dropout, Flatten, Average, BatchNormalization, LSTM, TimeDistributed
from tensorflow.keras.layers import Conv1D,MaxPool1D, concatenate
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.initializers import RandomNormal,HeNormal,GlorotNormal,HeUniform,LecunNormal,LecunUniform,Orthogonal
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from itertools import product
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
print(tf.__version__)

# For arrayed data analysis

## Load data from csv file
- CSV files available at https://www.science.org/doi/suppl/10.1126/science.abq0225/suppl_file/science.abq0225_data_s1_to_s3.zip
- Change schema to match version used in original commits
- Change motif values to match those used in original commits (i.e., change 17 to 14)

In [None]:
# Fix the global random seeds so the row shuffles below and the later training
# runs are repeatable.
tf.keras.utils.set_random_seed(1)

# Map the published column headers onto the names used throughout this notebook.
rename_dict = {'Initial CAR T Cell Number': 'Cell Number',
               'motif i': 'motif',
               'motif j': 'motif.1',
               'motif k': 'motif.2',
               'motif l': 'motif.3',
               'motif m': 'motif.4',
               'Cytotoxicity (Nalm 6 Survival)': 'Nalm 6 Cytotoxicity',
               'Stemness (% IL7Ra+ KLRG1-)': 'IL7RaKLRG1 stemness'}
# Names of the motif columns after renaming; only these hold motif identifiers.
motif_columns = ['motif', 'motif.1', 'motif.2', 'motif.3', 'motif.4']


def _load_arrayed_csv(path):
    """Read one supplementary CSV, shuffle its rows and normalise its schema.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with renamed columns, the column at position 7 removed
        and motif id 17 recoded as 14.
    """
    frame = pd.read_csv(path, encoding='unicode_escape', sep=',')
    frame = frame.sample(frac=1).reset_index(drop=True)
    frame.rename(columns=rename_dict, inplace=True)
    # NOTE(review): positional drop -- presumably an unused extra column; verify
    # against the CSV schema.
    frame.drop([frame.columns[7]], axis=1, inplace=True)
    # Recode motif 17 as 14 in the motif columns only. A frame-wide replace would
    # also rewrite any cell count or phenotype value that happens to equal 17.
    present = [name for name in motif_columns if name in frame.columns]
    if present:
        frame[present] = frame[present].replace(17, 14)
    return frame


# Train is loaded (and shuffled) before test, as in the original cell.
train_data = _load_arrayed_csv('science.abq0225_data_s2.csv')
test_data = _load_arrayed_csv('science.abq0225_data_s3.csv')
test_data

In [None]:
# Data statistics.
# Pool both splits so the scaling constant and the summary cover every sample.
all_data = pd.concat([test_data, train_data])
max_cell = max(all_data['Cell Number'])

# Scale the cell-number column of every frame by the pooled maximum.
for frame in (all_data, train_data, test_data):
    frame['Cell Number'] = frame['Cell Number'] / max_cell

stats_df = all_data.describe()
stats_df

In [None]:
# Skewness of the cytotoxicity read-out over the pooled train + test data.
cytotoxicity_values = all_data['Nalm 6 Cytotoxicity']
skew_vals1 = cytotoxicity_values.skew()
skew_vals1

In [None]:
# Skewness of the stemness read-out over the pooled train + test data
# (before any log transform).
stemness_values = all_data['IL7RaKLRG1 stemness']
skew_vals2 = stemness_values.skew()
skew_vals2

In [None]:
# Histogram of the stemness phenotype before and after a log1p transform.
# `new_data` is a working copy of the pooled data; its stemness column is
# left log1p-transformed when this cell finishes.
pheno = ['IL7RaKLRG1 stemness']
pp = 0
column = pheno[pp]
new_data = all_data.copy()

plt.figure(figsize=[10, 20])

# Left panel: raw values.
plt.subplot(3, 2, 1)
new_data[column].hist(bins=10)
plt.xlabel('value', fontsize=20)
plt.ylabel('frequency', fontsize=20)
plt.title('before np.log1p', fontsize=20)

# Right panel: values after log1p (y-label intentionally omitted).
plt.subplot(3, 2, 2)
new_data[column] = np.log1p(new_data[column])
new_data[column].hist(bins=10)
plt.xlabel('value', fontsize=20)
plt.title('after np.log1p', fontsize=20)

In [None]:
# Skewness of the stemness column in `new_data`, i.e. after the log1p transform.
transformed_stemness = new_data['IL7RaKLRG1 stemness']
skew_vals2 = transformed_stemness.skew()
skew_vals2

In [None]:
# Apply the same log1p transform to the stemness column of both splits.
# NOTE: this overwrites the column, so re-running the cell transforms it again.
for frame in (train_data, test_data):
    frame['IL7RaKLRG1 stemness'] = np.log1p(frame['IL7RaKLRG1 stemness'])

In [None]:
# Summary statistics of the training split in its current state
# (overwrites the pooled `stats_df` computed earlier).
stats_df = train_data.describe()
stats_df

## Deep Neural Network for Nalm 6 Cytotoxicity

In [None]:
# Number of motif positions per sample; the motif ids are read from
# columns 1..num_motifs of the data frames.
num_motifs = 5
# Distinct motif identifiers found in the pooled data. One one-hot channel is
# allocated per distinct identifier.
# NOTE(review): `to_categorical` is later called with num_classes=num_class, which
# presumably requires the ids to be exactly 0..num_class-1 -- confirm.
motif_values = np.unique(new_data.iloc[:, 1:(num_motifs + 1)])
num_class = len(motif_values)
motif_values

In [None]:
# Largest cytotoxicity value in the pooled data; used below to scale the
# regression target and to rescale predictions.
cytotoxicity_column = new_data['Nalm 6 Cytotoxicity']
max_y = np.max(cytotoxicity_column)
max_y

In [None]:
# Column layout used here: column 0 -> ICN_*, columns 1..num_motifs -> X_*.
# The target is the cytotoxicity column divided by max_y.
motif_slice = slice(1, num_motifs + 1)

ICN_train = train_data.iloc[:, :1]
X_train = train_data.iloc[:, motif_slice]
Y_train = train_data['Nalm 6 Cytotoxicity'] / max_y

ICN_test = test_data.iloc[:, :1]
X_test = test_data.iloc[:, motif_slice]
Y_test = test_data['Nalm 6 Cytotoxicity'] / max_y

### One-hot encoding for 14 linear motifs: 

In [None]:
# One-hot encode the motif ids of each split with num_class channels.
X_train_channel, X_test_channel = (
    to_categorical(split, num_classes=num_class) for split in (X_train, X_test)
)
# 3D tensor with shape (batch_size, steps, features/channels)
print(X_test_channel.shape)

In [None]:
# Width of the regression output; also scales the hidden Dense layer (50*output_dim).
output_dim = 1
# Mini-batch size passed to model.fit.
batch_size = 10

In [None]:
# Summary statistics of the scaled training target (Y_train was divided by max_y).
stats_df = Y_train.describe()
stats_df

### CNN+LSTM 

In [None]:
# Two-input regression network:
#   * motif branch: Conv1D -> LSTM -> Flatten over the one-hot motif sequence
#   * scalar branch: the single-column ICN input
# The two branches are concatenated and fed to a small dense head.
# Hyper-parameter note kept from the original cell: 64, 4, 0.1, 64, 0.5, 4

# Motif input: num_motifs steps, num_class channels per step.
input_position = Input(shape=(num_motifs, num_class), name='input_position')
# Scalar input. The shape has to be a real 1-tuple: `(1)` is just the integer 1,
# which is not guaranteed to be accepted as a shape by every Keras version.
input_ICN = Input(shape=(1,), name='input_ICN')

# `input_shape` is not needed on a layer that is called on an existing tensor,
# so it is omitted here. The `1.0 *` scaling is kept from the original.
x = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(1.0 * input_position)
# return_sequences=True keeps one LSTM output per step; Flatten then
# concatenates them into a single feature vector.
x = LSTM(4, return_sequences=True, dropout=0.3)(x)
x = Flatten()(x)
model1 = keras.Model(inputs=input_position, outputs=x)

# Combine the output of the two branches.
combined = concatenate([model1.output, input_ICN])

# Apply a fully connected layer and then a regression prediction on the
# combined outputs.
z = Dense(50 * output_dim, activation='relu')(combined)
z = Dropout(0.5)(z)
# NOTE(review): ReLU on the output restricts predictions to >= 0; presumably
# intended because the scaled target is non-negative -- confirm.
z = Dense(output_dim, activation='relu')(z)

# The model accepts the inputs of the two branches and outputs a single value.
model = keras.Model(inputs=[input_position, input_ICN], outputs=z)
model.summary()

In [None]:
# Compile and train the cytotoxicity model.
# Optimised loss: mean absolute error; mean squared error is tracked as a metric.
# (Alternatives tried by the original author: MeanSquaredLogarithmicError loss;
#  RectifiedAdam, Adam and SGD-with-momentum optimisers.)
regression_loss = keras.losses.MeanAbsoluteError()
rmsprop = keras.optimizers.RMSprop(learning_rate=1e-4)
model.compile(
    loss=regression_loss,
    optimizer=rmsprop,
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

# No validation data is passed; the test split is only used for evaluation later.
train_inputs = [X_train_channel, ICN_train]
history = model.fit(
    train_inputs,
    Y_train,
    batch_size=batch_size,
    epochs=1200,
    verbose=1,
)

In [None]:
# Evaluate on both splits; index 1 of each result is the MSE metric
# (index 0 is the loss). The first printed line is train, the second is test.
train_scores, test_scores = (
    model.evaluate(inputs, targets, verbose=0)
    for inputs, targets in (
        ([X_train_channel, ICN_train], Y_train),
        ([X_test_channel, ICN_test], Y_test),
    )
)
for scores in (train_scores, test_scores):
    print("MSE:", scores[1])

In [None]:
# Training-set MSE per epoch on a log-scaled y axis.
# (No validation curve: the model was fitted without validation data.)
mse_per_epoch = history.history['mean_squared_error']
p1 = plt.plot(mse_per_epoch)
plt.title('mean_squared_error')
plt.xlabel('No. epoch')
plt.ylabel('MSE')
plt.yscale('log')
plt.show()

In [None]:
# Predict on both splits and undo the max_y scaling of the target.
pred_train, pred_test = (
    model.predict(inputs) * max_y
    for inputs in ([X_train_channel, ICN_train], [X_test_channel, ICN_test])
)

In [None]:
from sklearn.metrics import r2_score
title_list = ['Nalm 6 Cytotoxicity', 'IL7RaKLRG1 stemness']
# Index into title_list only (the prediction column is always column 0).
i = 0


def _scatter_against_truth(panel, truth, pred, split, title):
    """Draw one prediction-vs-ground-truth panel with an identity line.

    Parameters
    ----------
    panel : int
        1-based subplot position in a 1x2 grid.
    truth : array-like
        Ground-truth values, one per sample.
    pred : array-like
        Predictions; if 2-D, column 0 is used.
    split : str
        Split name inserted into the axis labels.
    title : str
        First line of the panel title.
    """
    # Work on flat 1-D float arrays so min/max are plain scalars and the
    # scatter, identity line and correlation all see the same values.
    truth = np.asarray(truth, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float)
    if pred.ndim > 1:
        pred = pred[:, 0]

    plt.subplot(1, 2, panel)
    plt.scatter(truth, pred)
    lower = min(truth.min(), pred.min())
    upper = max(truth.max(), pred.max())
    xline = np.linspace(lower, upper, 10)
    plt.plot(xline, xline, color='red')
    plt.xlabel('Ground Truth (' + split + ')', fontsize=20)
    plt.ylabel('Predictions (' + split + ')', fontsize=20)
    # The value shown as R^2 is the squared Pearson correlation, not
    # sklearn's r2_score.
    corr = np.corrcoef(truth, pred)[0, 1]
    r_squared = corr**2
    plt.title(title + '\n' + 'R^2=' + str(r_squared)[:5], fontsize=20)
    plt.axis('square')


plt.figure(figsize=[15, 70])
# Undo the max_y scaling of the targets before plotting.
_scatter_against_truth(1, Y_train * max_y, pred_train, 'Training', title_list[i])
_scatter_against_truth(2, Y_test * max_y, pred_test, 'Test', title_list[i])

## Ensemble Predictions

In [None]:
## merge inputs
# Flatten each one-hot motif tensor to one row per sample, then append the
# ICN column as the last feature.
n_train = np.shape(X_train_channel)[0]
n_test = np.shape(X_test_channel)[0]
flat_train = np.reshape(X_train_channel, [n_train, -1])
flat_test = np.reshape(X_test_channel, [n_test, -1])
combine_input_train = np.concatenate([flat_train, ICN_train], axis=1)
combine_input_test = np.concatenate([flat_test, ICN_test], axis=1)

In [None]:
# Average the predictions of the saved cytotoxicity models and rescale by max_y.
# NOTE(review): the saved models are fed the single flattened `combine_input_*`
# array rather than the two-input format used by the model built above;
# presumably they were trained on that flat layout -- confirm.
n_ensemble = 10
model_path = 'saved_model/position_encoding_arrayed_data_04062023_CNN_LSTM_Cytotoxicity_{}.h5'

sum_test = None
sum_train = None
for member in range(1, n_ensemble + 1):
    # `model` ends up bound to the last ensemble member, as before.
    model = keras.models.load_model(model_path.format(member))
    member_test = model.predict(combine_input_test)
    member_train = model.predict(combine_input_train)
    # Accumulate in member order (same summation order as the unrolled version).
    sum_test = member_test if sum_test is None else sum_test + member_test
    sum_train = member_train if sum_train is None else sum_train + member_train

pred_test = sum_test / n_ensemble * max_y
pred_train = sum_train / n_ensemble * max_y
df = pd.DataFrame(np.around(pred_test, 3), columns=['Column_1'])
df.T

In [None]:
from sklearn.metrics import r2_score
title_list = ['Nalm 6 Cytotoxicity', 'IL7RaKLRG1 stemness']
# Index into title_list only (the prediction column is always column 0).
i = 0


def _scatter_against_truth(panel, truth, pred, split, title):
    """Draw one prediction-vs-ground-truth panel with an identity line.

    Parameters
    ----------
    panel : int
        1-based subplot position in a 1x2 grid.
    truth : array-like
        Ground-truth values, one per sample.
    pred : array-like
        Predictions; if 2-D, column 0 is used.
    split : str
        Split name inserted into the axis labels.
    title : str
        First line of the panel title.
    """
    # Work on flat 1-D float arrays so min/max are plain scalars and the
    # scatter, identity line and correlation all see the same values.
    truth = np.asarray(truth, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float)
    if pred.ndim > 1:
        pred = pred[:, 0]

    plt.subplot(1, 2, panel)
    plt.scatter(truth, pred)
    lower = min(truth.min(), pred.min())
    upper = max(truth.max(), pred.max())
    xline = np.linspace(lower, upper, 10)
    plt.plot(xline, xline, color='red')
    plt.xlabel('Ground Truth (' + split + ')', fontsize=20)
    plt.ylabel('Predictions (' + split + ')', fontsize=20)
    # The value shown as R^2 is the squared Pearson correlation, not
    # sklearn's r2_score.
    corr = np.corrcoef(truth, pred)[0, 1]
    r_squared = corr**2
    plt.title(title + '\n' + 'R^2=' + str(r_squared)[:5], fontsize=20)
    plt.axis('square')


plt.figure(figsize=[15, 70])
# Undo the max_y scaling of the targets before plotting.
_scatter_against_truth(1, Y_train * max_y, pred_train, 'Training', title_list[i])
_scatter_against_truth(2, Y_test * max_y, pred_test, 'Test', title_list[i])

## Deep Neural Network for IL7RaKLRG1 stemness

In [None]:
# Largest stemness value in `new_data` (whose stemness column was
# log1p-transformed in the histogram cell); replaces the cytotoxicity max_y.
stemness_column = new_data['IL7RaKLRG1 stemness']
max_y = np.max(stemness_column)
max_y

In [None]:
# Regression targets for the stemness model: the stemness column of each
# split divided by max_y.
target_column = 'IL7RaKLRG1 stemness'
Y_train, Y_test = (split[target_column] / max_y for split in (train_data, test_data))

In [None]:
# Two-input regression network for the stemness target:
#   * motif branch: Conv1D -> LSTM -> Flatten over the one-hot motif sequence
#   * scalar branch: the single-column ICN input
# The two branches are concatenated and fed to a small dense head.
# Hyper-parameter note kept from the original cell: 64, 4, 0.1, 64, 0.5, 4

# Motif input: num_motifs steps, num_class channels per step.
input_position = Input(shape=(num_motifs, num_class), name='input_position')
# Scalar input. The shape has to be a real 1-tuple: `(1)` is just the integer 1,
# which is not guaranteed to be accepted as a shape by every Keras version.
input_ICN = Input(shape=(1,), name='input_ICN')

# `input_shape` is not needed on a layer that is called on an existing tensor,
# so it is omitted here. The `1.0 *` scaling is kept from the original.
x = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(1.0 * input_position)
# return_sequences=True keeps one LSTM output per step; Flatten then
# concatenates them into a single feature vector.
x = LSTM(4, return_sequences=True, dropout=0.3)(x)
x = Flatten()(x)
model1 = keras.Model(inputs=input_position, outputs=x)

# Combine the output of the two branches.
combined = concatenate([model1.output, input_ICN])

# Apply a fully connected layer and then a regression prediction on the
# combined outputs.
z = Dense(50 * output_dim, activation='relu')(combined)
z = Dropout(0.5)(z)
# NOTE(review): ReLU on the output restricts predictions to >= 0; presumably
# intended because the scaled target is non-negative -- confirm.
z = Dense(output_dim, activation='relu')(z)

# The model accepts the inputs of the two branches and outputs a single value.
model = keras.Model(inputs=[input_position, input_ICN], outputs=z)
model.summary()

In [None]:
# Compile and train the stemness model.
# Here mean squared error is both the optimised loss and the tracked metric.
# (Alternatives tried by the original author: MeanSquaredLogarithmicError loss;
#  RectifiedAdam, Adam and SGD-with-momentum optimisers.)
regression_loss = keras.losses.MeanSquaredError()
rmsprop = keras.optimizers.RMSprop(learning_rate=1e-4)
model.compile(
    loss=regression_loss,
    optimizer=rmsprop,
    metrics=[tf.keras.metrics.MeanSquaredError()],
)

# No validation data is passed; the test split is only used for evaluation later.
train_inputs = [X_train_channel, ICN_train]
history = model.fit(
    train_inputs,
    Y_train,
    batch_size=batch_size,
    epochs=1200,
    verbose=1,
)

In [None]:
# Evaluate on both splits; index 1 of each result is the MSE metric
# (index 0 is the loss). The first printed line is train, the second is test.
train_scores, test_scores = (
    model.evaluate(inputs, targets, verbose=0)
    for inputs, targets in (
        ([X_train_channel, ICN_train], Y_train),
        ([X_test_channel, ICN_test], Y_test),
    )
)
for scores in (train_scores, test_scores):
    print("MSE:", scores[1])

In [None]:
# Training-set MSE per epoch on a log-scaled y axis.
# (No validation curve: the model was fitted without validation data.)
mse_per_epoch = history.history['mean_squared_error']
p1 = plt.plot(mse_per_epoch)
plt.title('mean_squared_error')
plt.xlabel('No. epoch')
plt.ylabel('MSE')
plt.yscale('log')
plt.show()

In [None]:
# Predict on both splits, undo the max_y scaling, then invert the log1p
# transform that was applied to the stemness column.
# np.expm1 is the exact inverse of np.log1p and keeps precision for values
# close to zero, where exp(x) - 1 loses digits.
pred_train = np.expm1(model.predict([X_train_channel, ICN_train]) * max_y)
pred_test = np.expm1(model.predict([X_test_channel, ICN_test]) * max_y)

In [None]:
# Quick look at the dimensions of the test predictions.
pred_test.shape

In [None]:
from sklearn.metrics import r2_score
title_list = ['Nalm 6 Cytotoxicity', 'IL7RaKLRG1 stemness']
# Index into title_list only (the prediction column is always column 0).
i = 1


def _scatter_against_truth(panel, truth, pred, split, title):
    """Draw one prediction-vs-ground-truth panel with an identity line.

    Parameters
    ----------
    panel : int
        1-based subplot position in a 1x2 grid.
    truth : array-like
        Ground-truth values, one per sample.
    pred : array-like
        Predictions; if 2-D, column 0 is used.
    split : str
        Split name inserted into the axis labels.
    title : str
        First line of the panel title.
    """
    # Work on flat 1-D float arrays so min/max are plain scalars and the
    # scatter, identity line and correlation all see the same values.
    truth = np.asarray(truth, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float)
    if pred.ndim > 1:
        pred = pred[:, 0]

    plt.subplot(1, 2, panel)
    plt.scatter(truth, pred)
    lower = min(truth.min(), pred.min())
    upper = max(truth.max(), pred.max())
    xline = np.linspace(lower, upper, 10)
    plt.plot(xline, xline, color='red')
    plt.xlabel('Ground Truth (' + split + ')', fontsize=20)
    plt.ylabel('Predictions (' + split + ')', fontsize=20)
    # The value shown as R^2 is the squared Pearson correlation, not
    # sklearn's r2_score.
    corr = np.corrcoef(truth, pred)[0, 1]
    r_squared = corr**2
    plt.title(title + '\n' + 'R^2=' + str(r_squared)[:5], fontsize=20)
    plt.axis('square')


plt.figure(figsize=[15, 70])
# Targets are scaled log1p values: undo the max_y scaling, then invert log1p
# with np.expm1 (its exact inverse).
_scatter_against_truth(1, np.expm1(Y_train * max_y), pred_train, 'Training', title_list[i])
_scatter_against_truth(2, np.expm1(Y_test * max_y), pred_test, 'Test', title_list[i])

## Ensemble Predictions

In [None]:
# Average the predictions of the saved stemness models and rescale by max_y.
# The result is still on the log1p scale; the plotting cell inverts it.
# NOTE(review): the saved models are fed the single flattened `combine_input_*`
# array rather than the two-input format used by the model built above;
# presumably they were trained on that flat layout -- confirm.
n_ensemble = 10
model_path = 'saved_model/position_encoding_arrayed_data_04062023_CNN_LSTM_stemness_{}.h5'

sum_test = None
sum_train = None
for member in range(1, n_ensemble + 1):
    # `model` ends up bound to the last ensemble member, as before.
    model = keras.models.load_model(model_path.format(member))
    member_test = model.predict(combine_input_test)
    member_train = model.predict(combine_input_train)
    # Accumulate in member order (same summation order as the unrolled version).
    sum_test = member_test if sum_test is None else sum_test + member_test
    sum_train = member_train if sum_train is None else sum_train + member_train

pred_test = sum_test / n_ensemble * max_y
pred_train = sum_train / n_ensemble * max_y
df = pd.DataFrame(np.around(pred_test, 3), columns=['Column_1'])
df.T

In [None]:
from sklearn.metrics import r2_score
title_list = ['Nalm 6 Cytotoxicity', 'IL7RaKLRG1 stemness']
# Index into title_list only (the prediction column is always column 0).
i = 1


def _scatter_against_truth(panel, truth, pred, split, title):
    """Draw one prediction-vs-ground-truth panel with an identity line.

    Parameters
    ----------
    panel : int
        1-based subplot position in a 1x2 grid.
    truth : array-like
        Ground-truth values, one per sample.
    pred : array-like
        Predictions; if 2-D, column 0 is used.
    split : str
        Split name inserted into the axis labels.
    title : str
        First line of the panel title.
    """
    # Work on flat 1-D float arrays so min/max are plain scalars and the
    # scatter, identity line and correlation all see the same values.
    truth = np.asarray(truth, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float)
    if pred.ndim > 1:
        pred = pred[:, 0]

    plt.subplot(1, 2, panel)
    plt.scatter(truth, pred)
    lower = min(truth.min(), pred.min())
    upper = max(truth.max(), pred.max())
    xline = np.linspace(lower, upper, 10)
    plt.plot(xline, xline, color='red')
    plt.xlabel('Ground Truth (' + split + ')', fontsize=20)
    plt.ylabel('Predictions (' + split + ')', fontsize=20)
    # The value shown as R^2 is the squared Pearson correlation, not
    # sklearn's r2_score.
    corr = np.corrcoef(truth, pred)[0, 1]
    r_squared = corr**2
    plt.title(title + '\n' + 'R^2=' + str(r_squared)[:5], fontsize=20)
    plt.axis('square')


# Bring the ensemble predictions back from the log1p scale. np.expm1 is the
# exact inverse of np.log1p.
# NOTE: this overwrites pred_train / pred_test, so re-running this cell without
# re-running the ensemble cell applies the inverse transform twice.
pred_train = np.expm1(pred_train)
pred_test = np.expm1(pred_test)

plt.figure(figsize=[15, 70])
# Targets are scaled log1p values: undo the max_y scaling, then invert log1p.
_scatter_against_truth(1, np.expm1(Y_train * max_y), pred_train, 'Training', title_list[i])
_scatter_against_truth(2, np.expm1(Y_test * max_y), pred_test, 'Test', title_list[i])