In [13]:
from torch import nn, optim, utils
import torch
import optuna 
import os
import numpy as np
import pandas as pd
import random
import scipy.stats as stats


def setup_seed(seed):
     torch.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
     np.random.seed(seed)
     random.seed(seed)

def r2(x,y):
    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
    return r_value**2
    
def one_hot_encode(df, col='seq', seq_len=44):
    # Dictionary returning one-hot encoding of nucleotides.
    nuc_d = {'a' :[1 ,0 ,0 ,0] ,'c' :[0 ,1 ,0 ,0] ,'g' :[0 ,0 ,1 ,0] ,'t' :[0 ,0 ,0 ,1], 'n' :[0 ,0 ,0 ,0]}
    vectors = np.empty([len(df), seq_len, 4])
    for i, seq in enumerate(df[col].str[:seq_len]):
        seq = seq.lower()
        a = np.array([nuc_d[x] for x in seq])
        vectors[i] = a
    return vectors

class CNN(nn.Module):

    def __init__(self,  trial, input_size=44, layers=3):
        super(CNN, self).__init__()
        self.conv = nn.Sequential()
        self.input_size = input_size
        filters = trial.suggest_int("filters", low=32,high=128,step=32)
        kernel_size = trial.suggest_int("kernel_size", low=4,high=12,step=4)
        hidden_size = trial.suggest_int("hidden_size", low=32,high=128,step=32)
        for i in range(layers):
            input_size = filters if i else 4
            self.conv.append(nn.Sequential(
                nn.Conv1d(in_channels=input_size, out_channels=filters, kernel_size=(kernel_size,), padding="same"),
                nn.LeakyReLU(),
                ))
        self.dense = nn.Linear(filters*self.input_size, hidden_size)
        self.dropout = nn.Dropout(p=0.2)
        self.output = nn.Linear(hidden_size, 1)
        self.init()

    def init(self):
        # nn.init.xavier_uniform_(self.dense.weight, gain=nn.init.calculate_gain('relu'))
        # nn.init.xavier_uniform_(self.output.weight, gain=nn.init.calculate_gain('relu'))
        nn.init.kaiming_uniform_(self.dense.weight)
        nn.init.kaiming_uniform_(self.output.weight)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = torch.flatten(x, 1)
        x = self.dense(x)
        x = torch.relu(x)
        x = self.output(x)
        # x = torch.tanh(x)
        return x

setup_seed(1337)

In [17]:
from sklearn import preprocessing
from torch.utils.data import Dataset
import torch
from torch.utils.data import DataLoader


class VEE5UTRDataset(Dataset):

    def __init__(self, X, y):
        self.X = torch.tensor(X,dtype=torch.float)
        self.y = torch.tensor(y.values,dtype=torch.float).reshape((-1, 1))

    def __getitem__(self, item):
        return self.X[item], self.y[item]

    def __len__(self):
        return len(self.y)
    

e_train=pd.read_csv("train.csv")
e_test =pd.read_csv("test.csv")
df = pd.concat([e_train,e_test])
e_train = df.sample(frac=0.8)
e_test= df[~df.index.isin(e_train.index)]
seq_len =118
print(e_train.shape, e_test.shape)
seq_e_train = one_hot_encode(e_train,seq_len=seq_len)
seq_e_test = one_hot_encode(e_test, seq_len=seq_len)

e_test.loc[:,'scaled_rl'] = preprocessing.StandardScaler().fit_transform(e_test.loc[:,'abs_score'].values.reshape(-1,1))
e_train.loc[:,'scaled_rl'] = preprocessing.StandardScaler().fit_transform(e_train.loc[:,'abs_score'].values.reshape(-1,1))

train = VEE5UTRDataset(seq_e_train, e_train["scaled_rl"])


(4748, 11) (1091, 11)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
def evaluate(df, model, test_seq, obs_col, output_col='pred'):
    '''Predict mean ribosome load using model and test set UTRs'''

    # Scale the test set mean ribosome load
    scaler = preprocessing.StandardScaler()
    scaler.fit(df[obs_col].values.reshape(-1, 1))
    model.eval()
    # Make predictions
    test_seq = torch.tensor(test_seq, dtype=torch.float)
    predictions = model(test_seq).reshape(-1, 1).detach().numpy()
    # Inverse scaled predicted mean ribosome load and return in a column labeled 'pred'
    df.loc[:, output_col] = scaler.inverse_transform(predictions)
    return df


def objective(trial):
    model = CNN(trial, input_size=seq_len)
    lr = trial.suggest_float("lr", 1e-5, 1e-3,log=True)
    batch_size=trial.suggest_int("batch_size", 64, 256,step=64)
    train_loader = DataLoader(train, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08)
    criterion = torch.nn.MSELoss()
    for epoch in range(epochs):
        model.train()
        for idx, (data_x, data_y) in enumerate(train_loader):
            data_x = data_x.to(torch.float32)
            data_y = data_y.to(torch.float32)
            outputs = model(data_x)
            optimizer.zero_grad()
            loss = criterion(data_y, outputs)
            loss.backward()
            optimizer.step()
            edf = evaluate(e_test, model, seq_e_test, 'abs_score', output_col='pred')
            r = r2(edf['abs_score'], edf['pred'])
            return r
epochs=20
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)
print('Accuracy: {}'.format(study.best_value))
print("Best hyperparameters: {}".format(study.best_params))

[I 2024-06-15 02:13:29,159] A new study created in memory with name: no-name-664a7ca6-18ee-43cc-a69c-c30bd79e142a


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

[I 2024-06-15 02:13:30,494] Trial 0 finished with value: 0.06333113887822948 and parameters: {'filters': 64, 'kernel_size': 12, 'hidden_size': 96, 'lr': 0.0004983777673004299, 'batch_size': 64}. Best is trial 0 with value: 0.06333113887822948.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setti



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

[I 2024-06-15 02:13:46,877] Trial 13 finished with value: 0.0014961454149013296 and parameters: {'filters': 96, 'kernel_size': 4, 'hidden_size': 96, 'lr': 0.00012047218717374065, 'batch_size': 128}. Best is trial 0 with value: 0.06333113887822948.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

[I 2024-06-15 02:13:47,220] Trial 14 finished with value: 0.027886833975653714 and parameters: {'filters': 32, 'kernel_size': 8, 'hidden_size': 64, 'lr': 0.0005231014566559692, 'batch_size': 192}. Best is trial 0 with va

Accuracy: 0.06333113887822948
Best hyperparameters: {'filters': 64, 'kernel_size': 12, 'hidden_size': 96, 'lr': 0.0004983777673004299, 'batch_size': 64}


In [12]:
df = study.trials_dataframe().drop(['state','datetime_start','datetime_complete','duration','number'], axis=1)
# df.tail(5)
# optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_param_importances(study)
# optuna.visualization.plot_contour(study, params=['batch_size', 'lr'])