## Pretrained Encoder (keras)

In [1]:
import logging
import pandas as pd
import numpy as np
import pyreadstat
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
import tensorflow as tf
from keras.layers import Dense
from keras.models import Sequential, load_model
from keras.optimizers import SGD
from matplotlib import pyplot

# summarize the performance of the fit model
def summarize_model(model, history, trainX, trainy, testX, testy, figname):
    # evaluate the model
    _, train_acc = model.evaluate(trainX, trainy, verbose=0)
    _, test_acc = model.evaluate(testX, testy, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))
    # plot loss during training
    pyplot.subplot(211)
    pyplot.title('MSE')
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    # plot accuracy during training
    pyplot.subplot(212)
    pyplot.title('R2')
    pyplot.plot(history.history['coeff_determination'], label='train')
    pyplot.plot(history.history['val_coeff_determination'], label='test')
    pyplot.legend()
    pyplot.savefig(figname, dpi=300)
    pyplot.show()

# standalone MLP
def fit_model(trainX, trainy, testX, testy):
    # define model
    model = Sequential()
    model.add(Dense(50, input_dim=28, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(28, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(14, activation='relu', kernel_initializer='he_uniform'))
    model.summary()
    # compile model
    model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=[tf.keras.metrics.MeanAbsolutePercentageError()])
    # fit model
    history = model.fit(trainX.values, trainy.values, validation_data=(testX.values, testy), epochs=100, verbose=1)
    return model, history

In [7]:
to_predict = 'Product titer (g/L)'
species = 'YL'
if_scale = False

# source
if if_scale:
    x_train_source = pd.read_csv(f'../data/{species}_xtrain_scaled.csv', index_col=0)
    y_train_source = pd.read_csv(f'../data/{species}_ytrain_scaled.csv', index_col=0).squeeze()
    x_test_source = pd.read_csv(f'../data/{species}_xtest_scaled.csv', index_col=0)
    y_test_source = pd.read_csv(f'../data/{species}_ytest_scaled.csv', index_col=0).squeeze()
else:
    x_train_source = pd.read_csv(f'../data/{species}_xtrain.csv', index_col=0)
    y_train_source = pd.read_csv(f'../data/{species}_ytrain.csv', index_col=0).squeeze()
    x_test_source = pd.read_csv(f'../data/{species}_xtest.csv', index_col=0)
    y_test_source = pd.read_csv(f'../data/{species}_ytest.csv', index_col=0).squeeze()
    
product_class_source = pd.read_csv(f'../data/{species}_product_class.csv', index_col=0)
print(x_train_source.shape)

# target
species = 'RT'
if if_scale:
    x_train_target = pd.read_csv(f'../data/{species}_xtrain_scaled.csv', index_col=0)
    y_train_target = pd.read_csv(f'../data/{species}_ytrain_scaled.csv', index_col=0).squeeze()
    x_test_target = pd.read_csv(f'../data/{species}_xtest_scaled.csv', index_col=0)
    y_test_target = pd.read_csv(f'../data/{species}_ytest_scaled.csv', index_col=0).squeeze()
else:
    x_train_target = pd.read_csv(f'../data/{species}_xtrain.csv', index_col=0)
    y_train_target = pd.read_csv(f'../data/{species}_ytrain.csv', index_col=0).squeeze()
    x_test_target = pd.read_csv(f'../data/{species}_xtest.csv', index_col=0)
    y_test_target = pd.read_csv(f'../data/{species}_ytest.csv', index_col=0).squeeze()
    
product_class_target = pd.read_csv(f'../data/{species}_product_class.csv', index_col=0)
#print(x_train_target.drop(columns=['plot']).shape)

(3752, 28)


In [14]:
# fit model on source dataset
model, history = fit_model(x_train_source, y_train_source, x_test_source, y_test_source)
encoded_source = model.predict(x_train_source)
#summarize_model(model, history, x_train_source, y_train_source, x_test_source, y_test_source, './figures/source-mlp.jpg')

# save model to file
model.save('encoder.h5')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 50)                1450      
                                                                 
 dense_4 (Dense)             (None, 28)                1428      
                                                                 
 dense_5 (Dense)             (None, 14)                406       
                                                                 
Total params: 3,284
Trainable params: 3,284
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10

Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100


Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [15]:
model = load_model('encoder.h5')

model.fit(x_train_target.drop(columns=['Species']), y_train_target, validation_data=(x_test_target.drop(columns=['Species']), y_test_target), epochs=10)

# predict encoded target data
encoded_target = model.predict(x_train_target.drop(columns=['Species']))

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Please report this to

## Models for pretrained encoder & autoencoder (pytorch)

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torchvision import datasets, transforms
from scipy.spatial.distance import pdist, squareform
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

def normalize_2d(tensor, mean, std):
    tensor = tensor - mean[:, None]
    tensor = tensor / std[:, None]
    return tensor

class SourceDataset(Dataset):
    def __init__(self, dataframe, x_features, to_predict, mean=None, std=None):
        self.dataframe = dataframe
        self.x_features = x_features
        self.to_predict = to_predict
        self.mean = mean
        self.std = std

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        data = np.array(self.dataframe.iloc[index][self.x_features])
        label = np.array(self.dataframe.iloc[index][self.to_predict])
        
        data = torch.tensor(data, dtype=torch.float32)
        if self.mean is not None and self.std is not None:
            data = normalize_1d(data, self.mean, self.std)

        return data, label

In [5]:
# euclidean
class TripletLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    def forward(self, anchor: torch.Tensor, positive: torch.Tensor, negative: torch.Tensor) -> torch.Tensor:
        distance_positive = (anchor - positive).pow(2).sum(1)
        distance_negative = (anchor - negative).pow(2).sum(1)
        #print(distance_positive - distance_negative)
        losses = torch.relu(distance_positive - distance_negative + self.margin)
        return losses.mean()

class Autoencoder(nn.Module):
    def __init__(self, input_size, hidden_size, latent_size):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, latent_size),
            nn.BatchNorm1d(latent_size),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, input_size),
            nn.BatchNorm1d(input_size),
            nn.ReLU()
        )
        self.regressor = nn.Linear(latent_size, 1)
        
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        regression_output = self.regressor(encoded)
        return decoded, regression_output

# custom Euclidean distance
def euclidean_distance(x1, x2):
    return torch.sqrt(torch.sum((x1 - x2) ** 2, dim=-1))

batch_size = 64


# source
to_predict = 'Product titer (g/L)'
source_species = 'Yarrowia'
target_species = 'Rhodo'

df = pd.read_excel("../impactDB/TL training data YL.xlsx", header=1)

# Count the semicolons in 'genes_modified_updated'
df['Genes modified'] = df['Genes modified'].fillna('').apply(lambda x: x.count(';'))

# Count the ones in 'gene_overexpression' and 'heterologous_gene'
df['Genes overexpressed'] = df['Genes overexpressed'].fillna('').apply(lambda x: str(x).count('1'))
df['Heterologous genes'] = df['Heterologous genes'].fillna('').apply(lambda x: str(x).count('1'))
df['Genes deleted'] = df['Genes deleted'].fillna('').apply(lambda x: str(x).count('1'))

# Remove 'dataID' and 'paper_number' columns
df = df.drop(['dataID', 'paper_number','product_name'], axis=1)

# Impute all NaN data as zero
imputer = SimpleImputer(strategy='constant', fill_value=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

scaler = MinMaxScaler()

# Fit and transform the original DataFrame
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

#row_sums = df_imputed.sum(axis=1)
#source_norm = df_imputed / row_sums[:, np.newaxis]
x_source = df_scaled.drop([to_predict], axis=1)

# Calculate pairwise Euclidean distances
distances = pdist(df_imputed.values, metric='euclidean')
# excluding the anchor itself
pairwise_distances = squareform(distances) + np.eye(len(df_scaled)) * 1e10
pairwise_distances_neg = squareform(distances)
print(pairwise_distances.shape)

triplets_source = []
with tqdm(total=len(df_imputed)) as progress_bar:
    for i, anchor in x_source.iterrows():
        # positive sample = closest to the anchor
        positive_idx = np.argmin(pairwise_distances[i])
        positive = x_source.iloc[positive_idx]

        # negative sample = farthest from the anchor
        negative_idx = np.argmax(pairwise_distances_neg[i])
        negative = x_source.iloc[negative_idx]

        triplets_source.append((anchor, positive, negative))
        progress_bar.update(1)
triplets_array = np.array(triplets_source)
triplet_data = torch.tensor(triplets_array, dtype=torch.float32)
print(triplet_data.shape)

dataset = TensorDataset(triplet_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

 20%|███████████████▌                                                             | 946/4690 [00:00<00:00, 9451.74it/s]

(4690, 4690)


100%|████████████████████████████████████████████████████████████████████████████| 4690/4690 [00:00<00:00, 6904.51it/s]

torch.Size([4690, 3, 29])





In [3]:
# pre-training
input_dim = x_source.shape[1]
num_epochs = 10
learning_rate = 0.0001
hidden_dim = 64
latent_dim = 28
l1_weight = 0.1

ae = Autoencoder(input_dim, hidden_dim, latent_dim)
triplet_loss = TripletLoss()
#triplet_loss = nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance(), margin=1)
optimizer = optim.Adam(ae.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for triplets in dataloader:
        triplets = triplets[0]
        #print(triplets)
        anchors, positives, negatives = triplets[:, 0, :], triplets[:, 1, :], triplets[:, 2, :]

        anchors_encoded = ae(anchors)
        positives_encoded = ae(positives)
        negatives_encoded = ae(negatives)

        loss = triplet_loss(anchors_encoded, positives_encoded, negatives_encoded)
        
        l1_penalty = torch.norm(ae.encoder[2].weight, p=1)
        l1_penalty += torch.norm(ae.decoder[2].weight, p=1)

        total_loss = loss + l1_weight * l1_penalty

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

torch.save(ae.encoder.state_dict(), 'pretrained_ae.pth')

Epoch [1/10], Loss: 0.9501
Epoch [2/10], Loss: 0.2906
Epoch [3/10], Loss: 0.0101
Epoch [4/10], Loss: 0.0349
Epoch [5/10], Loss: 0.0219
Epoch [6/10], Loss: 0.0210
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0012
Epoch [10/10], Loss: 0.0000


In [4]:
# target
df = pd.read_excel("../impactDB/TL training data RT.xlsx", header=1)

# Set the target column
target = 'Product_titer(g/L)'
species = 'Rhodo'

# Count the semicolons in 'genes_modified_updated'
df['genes_modified_updated'] = df['genes_modified_updated'].fillna('').apply(lambda x: x.count(';'))

# Count the ones in 'gene_overexpression' and 'heterologous_gene'
df['gene_overexpression'] = df['gene_overexpression'].apply(lambda x: 1 if x == 1 else 0)
df['heterologous_gene'] = df['heterologous_gene'].apply(lambda x: 1 if x == 1 else 0)
df['gene_deletion'] = df['gene_deletion'].apply(lambda x: 1 if x == 1 else 0)

# Remove 'dataID' and 'paper_number' columns
df = df.drop(['dataID', 'paper_number','product_name'], axis=1)

# Impute all NaN data as zero
imputer = SimpleImputer(strategy='constant', fill_value=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

scaler = MinMaxScaler()

# Fit and transform the original DataFrame
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

#row_sums = df_imputed.sum(axis=1)
#target_norm = df_imputed / row_sums[:, np.newaxis]
x_target = df_scaled.drop([to_predict], axis=1)
batch_size = 4

distances = pdist(df_imputed.values, metric='euclidean')
pairwise_distances = squareform(distances) + np.eye(len(df_scaled)) * 1e10
pairwise_distances_neg = squareform(distances)
print(pairwise_distances.shape)

triplets_target = []
with tqdm(total=len(df_scaled)) as progress_bar:
    for i, anchor in x_target.iterrows():
        positive_idx = np.argmin(pairwise_distances[i])
        positive = x_target.iloc[positive_idx]

        negative_idx = np.argmax(pairwise_distances_neg[i])
        negative = x_target.iloc[negative_idx]

        triplets_target.append((anchor, positive, negative))
        progress_bar.update(1)
triplets_array = np.array(triplets_target)
triplet_data = torch.tensor(triplets_array, dtype=torch.float32)
print(triplet_data.shape)

dataset = TensorDataset(triplet_data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# transferring
transfer_autoencoder = Autoencoder(input_dim, hidden_dim, latent_dim)
transfer_autoencoder.load_state_dict(ae.state_dict())

# Reset the optimizer
optimizer = optim.Adam(transfer_autoencoder.parameters(), lr=0.001)

# encoder only
for param in transfer_autoencoder.encoder.parameters():
    param.requires_grad = False

for epoch in range(num_epochs):
    for data in dataloader:
        triplets = data[0]

        #print(triplets)
        anchors, positives, negatives = triplets[:, 0, :], triplets[:, 1, :], triplets[:, 2, :]

        anchors_encoded = transfer_autoencoder.encoder(anchors)
        positives_encoded = transfer_autoencoder.encoder(positives)
        negatives_encoded = transfer_autoencoder.encoder(negatives)
        
        anchors_decoded = transfer_autoencoder.decoder(anchors_encoded)
        positives_decoded = transfer_autoencoder.decoder(positives_encoded)
        negatives_decoded = transfer_autoencoder.decoder(negatives_encoded)

        loss = triplet_loss(anchors_decoded, positives_decoded, negatives_decoded)
        
        l1_penalty = torch.norm(transfer_autoencoder.encoder[2].weight, p=1)
        l1_penalty += torch.norm(transfer_autoencoder.decoder[2].weight, p=1)

        total_loss = loss + l1_weight * l1_penalty

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# save
reconstructed_anchors_list = []
reconstructed_positives_list = []
reconstructed_negatives_list = []
with torch.no_grad():
    for batch_data in dataloader:
        triplets = batch_data[0]
        anchors, positives, negatives = triplets[:, 0, :], triplets[:, 1, :], triplets[:, 2, :]

        anchors_encoded = transfer_autoencoder(anchors).detach().numpy()
        positives_encoded = transfer_autoencoder(positives).detach().numpy()
        negatives_encoded = transfer_autoencoder(negatives).detach().numpy()

        # Append the reconstructed samples to the lists
        reconstructed_anchors_list.append(anchors_encoded)
        reconstructed_positives_list.append(positives_encoded)
        reconstructed_negatives_list.append(negatives_encoded)

encoded_anchors = np.concatenate(reconstructed_anchors_list, axis=0)
encoded_df = pd.DataFrame(encoded_anchors)
encoded_df.to_csv('encoded_target_x.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 443/443 [00:00<00:00, 8760.75it/s]

(443, 443)
torch.Size([443, 3, 28])
Epoch [1/10], Loss: 0.0000





Epoch [2/10], Loss: 0.0000
Epoch [3/10], Loss: 0.0000
Epoch [4/10], Loss: 0.0000
Epoch [5/10], Loss: 0.0000
Epoch [6/10], Loss: 0.0000
Epoch [7/10], Loss: 0.0000
Epoch [8/10], Loss: 0.0000
Epoch [9/10], Loss: 0.0000
Epoch [10/10], Loss: 0.0000


In [6]:
np.savetxt("./data/encoded_target.csv", encoded_target, delimiter=",")
np.savetxt("./data/encoded_source.csv", encode_source, delimiter=",")

## Pretrain encoder only

In [None]:
class CustomDataset(Dataset):
    def __init__(self, data, y):
        self.data = data
        self.y = y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample_data = self.data[idx]
        sample_y = self.y[idx]

        return sample_data, sample_y

batch_size = 32

to_predict = 'Product titer (g/L)'
source_species = 'Yarrowia'
target_species = 'Rhodo'

df = pd.read_excel("../impactDB/TL training data YL.xlsx", header=1)
#df = df.dropna(axis=1, how="all")

# Count the semicolons in 'genes_modified_updated'
df['Genes modified'] = df['Genes modified'].fillna('').apply(lambda x: x.count(';'))

# Count the ones in 'gene_overexpression' and 'heterologous_gene'
df['Genes overexpressed'] = df['Genes overexpressed'].fillna('').apply(lambda x: str(x).count('1'))
df['Heterologous genes'] = df['Heterologous genes'].fillna('').apply(lambda x: str(x).count('1'))
df['Genes deleted'] = df['Genes deleted'].fillna('').apply(lambda x: str(x).count('1'))

# Remove 'dataID' and 'paper_number' columns
df = df.drop(['dataID', 'paper_number','product_name'], axis=1)

# Impute all NaN data as zero
imputer = SimpleImputer(strategy='constant', fill_value=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

scaler = MinMaxScaler()

# Fit and transform the original DataFrame
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

x = df_scaled.drop(to_predict, axis=1).values
y = df_scaled[[to_predict]].values

custom_dataset = CustomDataset(x, y)
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

input_dim = x.shape[1]
learning_rate = 0.1
hidden_dim = 30
latent_dim = 14
l1_weight = 0.1

# Add L2 regularization
weight_decay = 1e-5

ae = Autoencoder(input_dim, hidden_dim, latent_dim)
#triplet_loss = TripletLoss()
#triplet_loss = nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance(), margin=1)

mse_loss = nn.MSELoss()
l1_loss = nn.L1Loss()

optimizer = optim.Adam(ae.parameters(), lr=learning_rate, weight_decay=weight_decay)

num_epochs = 200
patience = 10
min_delta = 1e-4
best_loss = float('inf')
counter = 0

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for data, y in dataloader:
        data = data.float()
        y = y.float()
        
        optimizer.zero_grad()
        decoded, regression_output = ae(data)

        reconstruction_loss = mse_loss(decoded, data)
        regression_loss = l1_loss(regression_output, y)

        combined_loss = (1-l1_weight) * reconstruction_loss + l1_weight * regression_loss

        combined_loss.backward()
        optimizer.step()
        
        epoch_loss += combined_loss.item()
        
    # Early stopping
    if best_loss - epoch_loss > min_delta:
        best_loss = epoch_loss
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {combined_loss.item():.4f}')

torch.save(ae.state_dict(), 'pretrained_ae.pth')

# target
df = pd.read_excel("../impactDB/RT+YL.xlsx", header=1)
#df = df.dropna(axis=1, how="all")

# Set the target column
to_predict = 'Product titer (g/L)'

# Count the semicolons in 'genes_modified_updated'
df['Genes modified'] = df['Genes modified'].fillna('').apply(lambda x: x.count(';'))

# Count the ones in 'gene_overexpression' and 'heterologous_gene'
df['Genes overexpressed'] = df['Genes overexpressed'].fillna('').apply(lambda x: str(x).count('1'))
df['Heterologous genes'] = df['Heterologous genes'].fillna('').apply(lambda x: str(x).count('1'))
df['Genes deleted'] = df['Genes deleted'].fillna('').apply(lambda x: str(x).count('1'))

# Remove 'dataID' and 'paper_number' columns
df = df.drop(['dataID', 'paper_number','product_name'], axis=1)
# Impute all NaN data as zero
imputer = SimpleImputer(strategy='constant', fill_value=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)


scaler = MinMaxScaler()

# Fit and transform the original DataFrame
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

x = df_scaled.drop(to_predict, axis=1).values
y = df_scaled[[to_predict]].values

custom_dataset = CustomDataset(x, y)
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)



# transferring
transfer_autoencoder = Autoencoder(input_dim, hidden_dim, latent_dim)
transfer_autoencoder.encoder.load_state_dict(ae.encoder.state_dict())

for param in transfer_autoencoder.encoder.parameters():
    param.requires_grad = False

mse_loss = nn.MSELoss()
l1_loss = nn.L1Loss()

optimizer = optim.Adam(ae.parameters(), lr=learning_rate, weight_decay=weight_decay)

num_epochs = 200
patience = 10
min_delta = 1e-4
best_loss = float('inf')
counter = 0

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for data, y in dataloader:
        data = data.float()
        y = y.float()
        
        optimizer.zero_grad()
        decoded, regression_output = ae(data)

        reconstruction_loss = mse_loss(decoded, data)
        regression_loss = l1_loss(regression_output, y)

        combined_loss = (1-l1_weight) * reconstruction_loss + l1_weight * regression_loss

        combined_loss.backward()
        optimizer.step()
        
        epoch_loss += combined_loss.item()
        
    # Early stopping
    if best_loss - epoch_loss > min_delta:
        best_loss = epoch_loss
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {combined_loss.item():.4f}')

ae.eval()

# save
reconstructed_list = []
with torch.no_grad():
    for data, y in dataloader:
        data = data.float()
        y = y.float()
        # Forward pass
        encoded = transfer_autoencoder.(data)

        encoded = encoded.detach().numpy()

        # Append the reconstructed samples to the lists
        reconstructed_list.append(encoded)

encoded_x = np.concatenate(reconstructed_list, axis=0)
encoded_df = pd.DataFrame(encoded_x)
encoded_df.to_csv('encoded_target_x.csv', index=False)

## Pretrained encoder and transfer at latent space using VAE

In [None]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mean = nn.Linear(hidden_dim, latent_dim)
        self.fc2_log_var = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x):
        h = torch.relu(self.fc1(x))
        z_mean = self.fc2_mean(h)
        z_log_var = self.fc2_log_var(h)
        return z_mean, z_log_var

class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dim, output_dim):
        super(Decoder, self).__init__()
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, output_dim)

    def forward(self, z):
        h = torch.relu(self.fc3(z))
        x_recon = torch.sigmoid(self.fc4(h))
        return x_recon

class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim, latent_dim)
        self.decoder = Decoder(latent_dim, hidden_dim, input_dim)

    def reparameterize(self, z_mean, z_log_var):
        std = torch.exp(0.5 * z_log_var)
        eps = torch.randn_like(std)
        return z_mean + eps * std

    def forward(self, x):
        z_mean, z_log_var = self.encoder(x)
        z = self.reparameterize(z_mean, z_log_var)
        x_recon = self.decoder(z)
        return x_recon, z_mean, z_log_var

In [None]:
device = torch.device("cuda")

batch_size = 32

to_predict = 'Product titer (g/L)'
source_species = 'Yarrowia'
target_species = 'Rhodo'

df = pd.read_excel("../impactDB/TL training data YL.xlsx", header=1)
#df = df.dropna(axis=1, how="all")

# Count the semicolons in 'genes_modified_updated'
df['Genes modified'] = df['Genes modified'].fillna('').apply(lambda x: x.count(';'))

# Count the ones in 'gene_overexpression' and 'heterologous_gene'
df['Genes overexpressed'] = df['Genes overexpressed'].fillna('').apply(lambda x: str(x).count('1'))
df['Heterologous genes'] = df['Heterologous genes'].fillna('').apply(lambda x: str(x).count('1'))
df['Genes deleted'] = df['Genes deleted'].fillna('').apply(lambda x: str(x).count('1'))

# Remove 'dataID' and 'paper_number' columns
df = df.drop(['dataID', 'paper_number','product_name'], axis=1)

# Impute all NaN data as zero
imputer = SimpleImputer(strategy='constant', fill_value=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

scaler = MinMaxScaler()

# Fit and transform the original DataFrame
df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df_imputed.columns)

x = df_scaled.drop(to_predict, axis=1).values
y = df_scaled[[to_predict]].values

custom_dataset = CustomDataset(x, y)
dataloader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

input_dim = x.shape[1]
learning_rate = 0.1
hidden_dim = 30
latent_dim = 14
l1_weight = 0.1

# Add L2 regularization
weight_decay = 1e-5

model = VAE(input_dim, hidden_dim, latent_dim)
#triplet_loss = TripletLoss()
#triplet_loss = nn.TripletMarginWithDistanceLoss(distance_function=nn.PairwiseDistance(), margin=1)

mse_loss = nn.MSELoss()
l1_loss = nn.L1Loss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

num_epochs = 200
patience = 10
min_delta = 1e-4
best_loss = float('inf')
counter = 0

for epoch in range(num_epochs):
    epoch_loss = 0.0
    for data, y in dataloader:
        data = data.float()
        y = y.float()
        
        optimizer.zero_grad()
        decoded, regression_output = ae(data)

        reconstruction_loss = mse_loss(decoded, data)
        regression_loss = l1_loss(regression_output, y)

        combined_loss = (1-l1_weight) * reconstruction_loss + l1_weight * regression_loss

        combined_loss.backward()
        optimizer.step()
        
        epoch_loss += combined_loss.item()
        
    # Early stopping
    if best_loss - epoch_loss > min_delta:
        best_loss = epoch_loss
        counter = 0
    else:
        counter += 1

    if counter >= patience:
        print(f"Early stopping triggered at epoch {epoch+1}")
        break
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {combined_loss.item():.4f}')