In [None]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten,Conv1D,InputLayer,Convolution1D,MaxPooling1D,BatchNormalization,Concatenate,Input
import scipy.stats as stats
from sklearn import preprocessing
import random
import pandas as pd
import numpy as np
import os

# Module-level random seed; train_model() hands it to set_seed() right before fitting.
seed=42

def set_seed(seed):
    """Seed the random number generators used in this file.

    Seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator with ``seed``, writes ``seed`` to the
    ``PYTHONHASHSEED`` environment variable and switches on TensorFlow op
    determinism.
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    # Stored as a string because environment values must be text.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.config.experimental.enable_op_determinism()

def one_hot_encode(df, col='seq', seq_len=44):
    """One-hot encode the sequences stored in ``df[col]``.

    Every sequence is cut to at most ``seq_len`` characters, lower-cased and
    mapped character by character to a 4-wide row in a, c, g, t order; ``n``
    maps to an all-zero row. Sequences shorter than ``seq_len`` are padded on
    the right with all-zero rows. Any other character raises ``KeyError``.

    Returns an array of shape ``(len(df), seq_len, 4)``.
    """
    lookup = {
        'a': [1, 0, 0, 0],
        'c': [0, 1, 0, 0],
        'g': [0, 0, 1, 0],
        't': [0, 0, 0, 1],
        'n': [0, 0, 0, 0],
    }
    blank = lookup['n']

    # Every slot is overwritten in the loop below, so an uninitialised buffer is fine.
    encoded = np.empty([len(df), seq_len, 4])
    truncated = df[col].str[:seq_len]
    for row, raw in enumerate(truncated):
        rows = [lookup[base] for base in raw.lower()]
        # A list times a non-positive count is [], so full-length sequences
        # get no padding.
        rows.extend([blank] * (seq_len - len(rows)))
        encoded[row] = np.array(rows)
    return encoded

def r2(x,y):
    """Return the squared correlation coefficient of a linear regression of ``y`` on ``x``."""
    fit = stats.linregress(x, y)
    return fit.rvalue ** 2

def test_data(df, model, test_seq, obs_col, output_col='pred'):
    """Store the model's predictions, mapped back to the scale of ``obs_col``, in ``df``.

    A ``StandardScaler`` is fitted on ``df[obs_col]``; ``model.predict`` is run
    on ``test_seq`` and the result is passed through the scaler's inverse
    transform, i.e. the predictions are treated as standardised values of
    ``obs_col``.

    The result is written into ``df[output_col]`` in place and ``df`` is
    returned as well.
    """
    # The scaler is only used to undo standardisation, so it is fitted on the
    # observed column of this very frame.
    observed = df[obs_col].values.reshape(-1, 1)
    scaler = preprocessing.StandardScaler().fit(observed)

    raw_pred = model.predict(test_seq, verbose=0)
    df.loc[:, output_col] = scaler.inverse_transform(raw_pred.reshape(-1, 1))
    return df

def get_cnn_model(seq_len=118,kernel_size=5, border_mode='same'):
    """Build and compile a 1-D CNN with three convolutions and two dense layers.

    The network takes input of shape ``(seq_len, 4)`` and ends in a single
    linear output unit. It is compiled with mean-squared-error loss and Adam;
    no training happens here.

    seq_len     -- length of the input sequence axis.
    kernel_size -- kernel size shared by all three convolutions.
    border_mode -- padding mode passed to every convolution.
    """
    conv_kwargs = dict(activation="relu", padding=border_mode, kernel_size=kernel_size)
    dense_kwargs = dict(kernel_initializer='glorot_normal', bias_initializer='zeros')

    stack = [
        Conv1D(name="conv_1", input_shape=(seq_len, 4), filters=100, **conv_kwargs),
        Conv1D(name="conv_2", filters=100, **conv_kwargs),
        Dropout(0.15, name="dropout_1"),
        Conv1D(name="conv_3", filters=200, **conv_kwargs),
        Dropout(0.15, name="dropout_2"),
        Flatten(name="flatten_1"),
        Dense(150, name="dense_1", **dense_kwargs),
        Activation('relu', name="act_1"),
        Dropout(0.15, name="dropout_out1"),
        Dense(1, name="dense_2", **dense_kwargs),
        Activation('linear', name="act_2"),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99, epsilon=1e-06)
    model.compile(loss="mean_squared_error", optimizer=optimizer)
    return model

def get_cnn1_model(seq_len=118,kernel_size=5, border_mode='same'):
    """Build and compile a 1-D CNN with four convolutions and two max-pooling stages.

    The network takes input of shape ``(seq_len, 4)`` and ends in a single
    linear output unit. ``dense_1`` is created without an activation argument.
    The model is compiled with mean-squared-error loss and Adam; no training
    happens here.

    seq_len     -- length of the input sequence axis.
    kernel_size -- kernel size shared by all four convolutions.
    border_mode -- padding mode passed to the convolutions and the pooling layers.
    """
    conv_kwargs = dict(activation="relu", padding=border_mode, kernel_size=kernel_size)
    pool_kwargs = dict(pool_size=2, strides=2, padding=border_mode)

    stack = [
        Conv1D(name="conv_1", input_shape=(seq_len, 4), filters=50, **conv_kwargs),
        Conv1D(name="conv_2", filters=100, **conv_kwargs),
        MaxPooling1D(name="maxpooling1", **pool_kwargs),
        Dropout(0.30),
        Conv1D(name="conv_3", filters=200, **conv_kwargs),
        MaxPooling1D(name="maxpooling2", **pool_kwargs),
        Conv1D(name="conv_4", filters=100, **conv_kwargs),
        BatchNormalization(),
        Dropout(0.30),
        Flatten(),
        Dense(100, name="dense_1", kernel_initializer='glorot_normal'),
        Dropout(0.30),
        Dense(1, name="dense_2", activation='linear', kernel_initializer='glorot_normal'),
    ]

    model = Sequential()
    for layer in stack:
        model.add(layer)

    optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99, epsilon=1e-06)
    model.compile(loss="mean_squared_error", optimizer=optimizer)
    return model

def get_cnn2_model(seq_len=118,kernel_size=5,conv_layers=3,reg_lambda=0.0):
    """Build and compile a 1-D CNN made of ``conv_layers`` repeated stages.

    Each stage is two 'same'-padded ReLU convolutions followed by max pooling
    (pool size 2, stride 2). The first convolution of a stage always has 120
    filters; the second has ``120 * 2**stage_index`` filters. Every stage
    except the first is followed by 25 % dropout. The stages feed a 50-unit
    ReLU dense layer, 25 % dropout and a single linear output unit.

    The model is compiled with mean-squared-error loss and Adam; no training
    happens here.

    seq_len     -- length of the input sequence axis (input shape is ``(seq_len, 4)``).
    kernel_size -- kernel size shared by all convolutions.
    conv_layers -- number of convolution stages.
    reg_lambda  -- L2 factor applied to every convolution and dense kernel.
    """
    base_filters = 120

    def conv(tag, filters):
        # One 'same'-padded ReLU convolution with its own L2 regulariser instance.
        return Conv1D(activation="relu", padding='same', filters=filters,
                      kernel_size=kernel_size, name=tag,
                      kernel_regularizer=regularizers.l2(reg_lambda))

    model = Sequential()
    model.add(Input(shape=(seq_len, 4)))

    for stage in range(conv_layers):
        model.add(conv(f'conv_{stage + 1}_0', base_filters))
        model.add(conv(f'conv_{stage + 1}_1', base_filters * 2 ** stage))
        model.add(MaxPooling1D(pool_size=2, strides=2, padding='same'))
        if stage > 0:
            model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(50, name='dense_1', activation='relu',
                    kernel_regularizer=regularizers.l2(reg_lambda)))
    model.add(Dropout(0.25))
    model.add(Dense(1, name='dense_2', activation='linear',
                    kernel_regularizer=regularizers.l2(reg_lambda)))

    optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99, epsilon=1e-06)
    model.compile(loss="mean_squared_error", optimizer=optimizer)
    return model

def train_model(model,e_train,e_test,seq_len=118,label="score"):
    """Fit ``model`` on ``e_train`` and write predictions into both frames.

    Steps:
      1. One-hot encode the ``seq`` column of both frames to ``seq_len`` positions.
      2. Standardise ``label`` separately in each frame into a new
         ``scaled_rl`` column (each frame gets its own ``StandardScaler``).
      3. Seed the RNGs with the module-level ``seed`` and fit for up to 50
         epochs (batch size 64) with early stopping on ``val_loss``
         (patience 10, best weights restored).
      4. Predict on both frames and store the predictions, mapped back to the
         scale of ``label``, in a ``pred`` column.

    Both ``e_train`` and ``e_test`` are modified in place (``scaled_rl`` and
    ``pred`` columns are added or overwritten).

    NOTE(review): ``e_test`` is passed as ``validation_data``, so early
    stopping and the restored weights are selected on the same frame that
    later receives the ``pred`` column.

    model   -- compiled Keras model accepting input of shape ``(seq_len, 4)``.
    e_train -- training frame with ``seq`` and ``label`` columns.
    e_test  -- validation/test frame with ``seq`` and ``label`` columns.
    seq_len -- number of sequence positions to encode.
    label   -- name of the target column (defaults to ``"score"``).

    Returns the ``History`` object produced by ``model.fit``.
    """
    seq_e_train = one_hot_encode(e_train, seq_len=seq_len)
    seq_e_test = one_hot_encode(e_test, seq_len=seq_len)

    # Each frame is standardised with its own scaler; test_data() refits an
    # identical scaler per frame to undo this on the predictions.
    for frame in (e_test, e_train):
        frame.loc[:, 'scaled_rl'] = preprocessing.StandardScaler().fit_transform(
            frame.loc[:, label].values.reshape(-1, 1))

    earlyStop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, mode='min', verbose=1,
                              restore_best_weights=True)
    set_seed(seed)
    history = model.fit(seq_e_train, e_train['scaled_rl'], batch_size=64, epochs=50, callbacks=[earlyStop],
                        validation_data=(seq_e_test, e_test["scaled_rl"]), verbose=1)

    # test_data() writes the 'pred' column in place, so its return value is not needed.
    test_data(df=e_test, model=model, obs_col=label, test_seq=seq_e_test)
    test_data(df=e_train, model=model, obs_col=label, test_seq=seq_e_train)
    return history
    

In [None]:
import pandas as pd
import numpy as np
import os
import random
import Levenshtein

from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold
# ---- Experiment configuration and data loading -----------------------------
# Seed used as random_state for the splitter in the loop further down.
seed=42
# Only referenced by the commented-out loaders below.
data_dir="/data/home/jinyalong/data/sev_240624/results"
# Reference sequence; upper-cased before being compared with df.seq.
original_seq = 'atcccgggtgaggcatcccaccatcctcagtcacagagagacccaatctaccatcagcatcagccagtaaagattaagaaaaacttagggtgaaagaaatttcacctaacacggcgca'
original_seq=original_seq.upper()
# Alternative (disabled) loader: concatenate several per-library CSVs, sum the
# counts per sequence and filter on minimum plasmid/RNA counts.
# lst = [f"pl{i}-1-{j}" for i in (3,5,6) for j in range(1,3)]
# lst = [f"pl{i}-1-{j}" for i in (3,6) for j in range(1,3)]
# dfs = []
# for name in lst:
#     cdf = pd.read_csv(f"{data_dir}/{name}/{name}_final.csv")
#     dfs.append(cdf)
# df = pd.concat(dfs)
# seq_len=118
# df = df.groupby("seq")[["plasmid_counts","rna_counts"]].sum().reset_index()
# df = df[(df["plasmid_counts"]>40)&(df["rna_counts"]>10)]
# name,seq_len="pl3-1-2_core" # 118,24
# name,seq_len,suff = "pl3-1-2",118,""
# Active dataset: file-name prefix, encoded sequence length, optional file-name suffix.
name,seq_len,suff="SFV_0719",85,""
# df = pd.read_csv(f"{data_dir}/{name}/{name}_final.csv")
datadir=f"/data/home/jinyalong/data/5UTR/{name}_"
# datadir=f"/data/home/jinyalong/data/sev_240624/results/{name}/{name}_"
# Training pool and the fixed held-out frame, read from "<datadir>train<suff>.csv"
# and "<datadir>test<suff>.csv".
df = pd.read_csv(f"{datadir}train{suff}.csv")
e_test= pd.read_csv(f"{datadir}test{suff}.csv")
# Edit distance of every training sequence to the reference sequence.
# NOTE(review): the comparison is against the upper-cased reference, so this
# presumably expects df.seq to be upper-case too - confirm.
df['distance'] = df.seq.apply(lambda x: Levenshtein.distance(original_seq, x))
# Bin edges: 0, s, 2s, ... below 4*s with s = max_distance // 5, closed off by
# +inf. right=False makes the bins left-closed; labels run 1..number_of_bins.
# NOTE(review): s is 0 when the largest distance is below 5 - TODO confirm this
# cannot happen for the datasets in use.
distance_bins = np.arange(0, df.distance.max() // 5 * 4, df.distance.max() // 5)
distance_bins = np.append(distance_bins, np.inf)
df['distance_cat'] = pd.cut(df.distance, bins=distance_bins, labels=list(range(1, len(distance_bins))), right=False)
# Sequence length; used as the stratification variable by the k-fold split below.
df["length"]=df["seq"].apply(len)
# Disabled: derive the score from raw counts and stratify by edited position.
# frac = df["plasmid_counts"].sum()/df["rna_counts"].sum()
# df["abs_score"]=df["rna_counts"]/df["plasmid_counts"]
# df["score"]=np.log(df["abs_score"]*frac)
# print(df[df.seq==original_seq])
# df = df[df.seq!=original_seq]
# nums = [0]*(len(original_seq))
# def edit_pos_cat(v):
#     idxs = [i for i in range(len(original_seq)) if v[i]!=original_seq[i]] 
#     mval = min([nums[i] for i in idxs])
#     hit = [i for i, n in enumerate(nums) if n==mval]
#     nums[hit[0]]+=1
#     return hit[0]


# df["edit_pos"] = df.seq.apply(edit_pos_cat)
# # print(nums)
# df["edit_pos"]= df.edit_pos.apply(lambda x : x-1 if x==117 else x)
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.05, random_state=seed)
# Keep a handle on the full training pool: `df` is rebound to a subsample in
# the loop that follows.
rdf = df
# ---- Learning curve ---------------------------------------------------------
# For growing subsamples of the training pool (rdf), train one model per
# stratified fold (5 folds, stratified on sequence length) and score each one
# on the fixed held-out frame e_test. Only the training part of each fold is
# used; the fold's own test indices are ignored.
ns, ss = [], []
label="score"

# model.save() below writes into this directory, so make sure it exists.
os.makedirs("./models/tmp", exist_ok=True)

# Never ask sample() for more rows than the pool holds (it would raise); for
# pools of at least 14907 rows the sample sizes are unchanged.
max_samples = min(14908, len(rdf) + 1)
for nsamples in range(500, max_samples, 500):
    # Seeded so that the subsample drawn for a given size is reproducible.
    df = rdf.sample(nsamples, random_state=seed)
    split = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    for idx,(train_idx, test_idx) in enumerate(split.split(df, df["length"])):
        e_train = df.iloc[train_idx].reset_index(drop=True)
        if idx==0:
            print(f"Samples {nsamples} ", e_train.shape, e_test.shape)
        # Alternative architectures:
        # model = get_cnn_model(seq_len=seq_len,kernel_size=6)
        # model = get_cnn2_model(seq_len=seq_len,kernel_size=6,reg_lambda=1e-4)
        model = get_cnn1_model(seq_len=seq_len,kernel_size=5)
        # train_model() adds/overwrites the 'pred' column of e_test in place.
        train_model(model,e_train,e_test,seq_len=seq_len)
        # NOTE: every fold writes to the same path, so only the last model survives.
        model.save(f"./models/tmp/{name}.keras")
        r = r2(e_test[label], e_test['pred'])
        pr =  stats.pearsonr(e_test[label], e_test['pred'])
        ns.append(nsamples)
        ss.append(pr[0])
        print(f'\t KFold {idx} test r-squared = ', r, "pearsonR = ", pr[0])

# One row per (sample size, fold): held-out Pearson r.
pdf = pd.DataFrame(data={"samples":ns,"score":ss})
pdf

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot title: the prefix of `name` before the first underscore, or "SEV" when
# there is none ...
name = name.split("_")[0] if "_"in name else "SEV"
# ... which is then unconditionally overridden by this fixed title.
name = "SFV"
fig,ax=plt.subplots()
# Point plot of the held-out Pearson r against training-set size, one
# categorical x position per distinct sample size.
plot=sns.pointplot(data=pdf,x="samples", y="score",scale=0.5,ax=ax)
plot.set_title(f"{name}")
# Label every third sample size. Positions and labels are derived together
# from the data so that a label can never end up on the wrong position and no
# tick falls outside the plotted range. Numeric categories are placed in
# ascending order, hence the sort.
sample_sizes = sorted(pdf["samples"].unique())
tick_positions = list(range(0, len(sample_sizes), 3))
plot.set_xticks(tick_positions)
plot.set_xticklabels([str(sample_sizes[i]) for i in tick_positions])
plt.show()