In [9]:
# use PyTorch-1.7.1 kernel

# import glob

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
# import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Mathylation Main Algorithm -- Two-Layer MLP Regression

In [2]:
# Define a simple MLP model for regression
class MLPRegression(nn.Module):
    def __init__(self, input_dim):
        super(MLPRegression, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)  # No activation in the final layer (for regression)
        return x

# RMSE calculation
def compute_rmse(y_true, y_pred):
    return torch.sqrt(torch.mean((y_true - y_pred) ** 2))

# Pearson's correlation calculation
def compute_pearson(y_true, y_pred):
    y_true_mean = torch.mean(y_true)
    y_pred_mean = torch.mean(y_pred)
    covariance = torch.sum((y_true - y_true_mean) * (y_pred - y_pred_mean))
    y_true_var = torch.sum((y_true - y_true_mean) ** 2)
    y_pred_var = torch.sum((y_pred - y_pred_mean) ** 2)
    return covariance / torch.sqrt(y_true_var * y_pred_var)




def methylation_regression(data_array):
        
    X = data_array[:, :-1]
    y = data_array[:, -1]
    
    # Split the dataset into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Standardize the input features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Convert the data to PyTorch tensors
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)
    
    
    # Initialize the model, loss function, and optimizer
    input_dim = X_train.shape[1]  # Set input_dim dynamically
    model = MLPRegression(input_dim=input_dim)
    criterion = nn.MSELoss()  # Mean Squared Error for regression
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training loop
    num_epochs = 300
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        if (epoch+1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    
    # Testing loop
    model.eval()
    with torch.no_grad():
        predictions = model(X_test_tensor)
        test_loss = criterion(predictions, y_test_tensor)
        print(f'Test Loss (MSE): {test_loss.item():.4f}')
    
        # Compute RMSE
        rmse = compute_rmse(y_test_tensor, predictions)
        print(f'Test RMSE: {rmse.item():.4f}')
        
        # Compute Pearson's correlation coefficient
        pearson_r = compute_pearson(y_test_tensor, predictions)
        print(f'Pearson Correlation: {pearson_r.item():.4f}')

## Base Directory of embedding data file

In [3]:
base_dir = '../embeddings/methylation/embedding-csv/'

## Load Embedding file

In [4]:
def load_embedding_file(csv_filename):

    df=pd.read_csv(csv_filename)
        
    # column_names = ['CHROM', 'START', 'SIZE', 'y','dna']
    # column_names.extend([f'{i}' for i in range(1, df.shape[1]-4)])

    # df.columns = column_names
    
    # df = df.drop(columns=['CHROM', 'START', 'SIZE', 'dna'], axis=1)

    # first_col = df.iloc[:, 0]  
    # df = df.drop(df.columns[0], axis=1)  
    # df[first_col.name] = first_col 

    data_array = df.to_numpy()
    return data_array

### GPN

In [10]:
data_array = load_embedding_file(base_dir + 'methylation_gpn_embedding.csv')
methylation_regression(data_array)

Epoch [10/300], Loss: 5478.1318
Epoch [20/300], Loss: 3708.0911
Epoch [30/300], Loss: 1268.8960
Epoch [40/300], Loss: 1109.1757
Epoch [50/300], Loss: 771.9333
Epoch [60/300], Loss: 710.4431
Epoch [70/300], Loss: 665.7606
Epoch [80/300], Loss: 630.5590
Epoch [90/300], Loss: 613.3790
Epoch [100/300], Loss: 599.4257
Epoch [110/300], Loss: 586.7148
Epoch [120/300], Loss: 575.5618
Epoch [130/300], Loss: 565.4338
Epoch [140/300], Loss: 556.3537
Epoch [150/300], Loss: 548.4974
Epoch [160/300], Loss: 541.7836
Epoch [170/300], Loss: 536.3010
Epoch [180/300], Loss: 531.9857
Epoch [190/300], Loss: 528.5250
Epoch [200/300], Loss: 525.6436
Epoch [210/300], Loss: 523.1539
Epoch [220/300], Loss: 520.9193
Epoch [230/300], Loss: 518.8572
Epoch [240/300], Loss: 516.9494
Epoch [250/300], Loss: 515.1584
Epoch [260/300], Loss: 513.4771
Epoch [270/300], Loss: 511.8884
Epoch [280/300], Loss: 510.3866
Epoch [290/300], Loss: 508.9584
Epoch [300/300], Loss: 507.6008
Test Loss (MSE): 509.8556
Test RMSE: 22.5800


### DNABERT2

In [11]:
data_array = load_embedding_file(base_dir + 'methylation_dnabert2_embedding.csv')
methylation_regression(data_array)

Epoch [10/300], Loss: 5723.6484
Epoch [20/300], Loss: 4773.3950
Epoch [30/300], Loss: 2942.0813
Epoch [40/300], Loss: 1250.2733
Epoch [50/300], Loss: 896.9476
Epoch [60/300], Loss: 751.0995
Epoch [70/300], Loss: 695.1877
Epoch [80/300], Loss: 653.6878
Epoch [90/300], Loss: 628.3942
Epoch [100/300], Loss: 609.0859
Epoch [110/300], Loss: 594.7344
Epoch [120/300], Loss: 582.6096
Epoch [130/300], Loss: 572.7758
Epoch [140/300], Loss: 564.2759
Epoch [150/300], Loss: 556.9548
Epoch [160/300], Loss: 550.5713
Epoch [170/300], Loss: 544.9264
Epoch [180/300], Loss: 539.8746
Epoch [190/300], Loss: 535.2977
Epoch [200/300], Loss: 531.1212
Epoch [210/300], Loss: 527.2679
Epoch [220/300], Loss: 523.6884
Epoch [230/300], Loss: 520.3441
Epoch [240/300], Loss: 517.1917
Epoch [250/300], Loss: 514.1804
Epoch [260/300], Loss: 511.2696
Epoch [270/300], Loss: 508.4750
Epoch [280/300], Loss: 505.7589
Epoch [290/300], Loss: 503.1231
Epoch [300/300], Loss: 500.5524
Test Loss (MSE): 523.0770
Test RMSE: 22.8709


### NT

In [12]:
data_array = load_embedding_file(base_dir + 'methylation_nt_embedding.csv')
methylation_regression(data_array)

Epoch [10/300], Loss: 5497.6504
Epoch [20/300], Loss: 3866.5767
Epoch [30/300], Loss: 1862.0349
Epoch [40/300], Loss: 1542.0035
Epoch [50/300], Loss: 980.9959
Epoch [60/300], Loss: 767.3196
Epoch [70/300], Loss: 702.6310
Epoch [80/300], Loss: 660.0431
Epoch [90/300], Loss: 634.3981
Epoch [100/300], Loss: 618.7062
Epoch [110/300], Loss: 606.4528
Epoch [120/300], Loss: 596.6179
Epoch [130/300], Loss: 588.3116
Epoch [140/300], Loss: 580.9419
Epoch [150/300], Loss: 574.2425
Epoch [160/300], Loss: 568.1375
Epoch [170/300], Loss: 562.5609
Epoch [180/300], Loss: 557.4078
Epoch [190/300], Loss: 552.6269
Epoch [200/300], Loss: 548.1791
Epoch [210/300], Loss: 543.9837
Epoch [220/300], Loss: 539.9894
Epoch [230/300], Loss: 536.2083
Epoch [240/300], Loss: 532.5889
Epoch [250/300], Loss: 529.1090
Epoch [260/300], Loss: 525.7535
Epoch [270/300], Loss: 522.4729
Epoch [280/300], Loss: 519.2582
Epoch [290/300], Loss: 516.1113
Epoch [300/300], Loss: 513.0177
Test Loss (MSE): 548.9774
Test RMSE: 23.4303


### HyenaDNA

In [13]:
data_array = load_embedding_file(base_dir + 'methylation_hyena_embedding.csv')
methylation_regression(data_array)

Epoch [10/300], Loss: 5801.6841
Epoch [20/300], Loss: 5104.1182
Epoch [30/300], Loss: 3661.4353
Epoch [40/300], Loss: 1930.1376
Epoch [50/300], Loss: 1232.4852
Epoch [60/300], Loss: 991.7443
Epoch [70/300], Loss: 855.2886
Epoch [80/300], Loss: 757.1953
Epoch [90/300], Loss: 701.2607
Epoch [100/300], Loss: 664.4675
Epoch [110/300], Loss: 642.6724
Epoch [120/300], Loss: 626.2875
Epoch [130/300], Loss: 613.9024
Epoch [140/300], Loss: 603.8984
Epoch [150/300], Loss: 595.9269
Epoch [160/300], Loss: 589.4467
Epoch [170/300], Loss: 584.0226
Epoch [180/300], Loss: 579.3590
Epoch [190/300], Loss: 575.2871
Epoch [200/300], Loss: 571.7029
Epoch [210/300], Loss: 568.5120
Epoch [220/300], Loss: 565.6276
Epoch [230/300], Loss: 562.9646
Epoch [240/300], Loss: 560.4799
Epoch [250/300], Loss: 558.1511
Epoch [260/300], Loss: 555.9483
Epoch [270/300], Loss: 553.8674
Epoch [280/300], Loss: 551.8893
Epoch [290/300], Loss: 550.0082
Epoch [300/300], Loss: 548.2188
Test Loss (MSE): 555.5245
Test RMSE: 23.5696

### Caduceus

In [14]:
data_array = load_embedding_file(base_dir + 'methylation_hyena_embedding.csv')
methylation_regression(data_array)

Epoch [10/300], Loss: 5733.0522
Epoch [20/300], Loss: 4855.4473
Epoch [30/300], Loss: 3151.5430
Epoch [40/300], Loss: 1522.1169
Epoch [50/300], Loss: 1183.0845
Epoch [60/300], Loss: 936.1277
Epoch [70/300], Loss: 833.7037
Epoch [80/300], Loss: 740.3685
Epoch [90/300], Loss: 693.1351
Epoch [100/300], Loss: 658.9875
Epoch [110/300], Loss: 637.5254
Epoch [120/300], Loss: 621.0016
Epoch [130/300], Loss: 607.8286
Epoch [140/300], Loss: 597.2402
Epoch [150/300], Loss: 588.8301
Epoch [160/300], Loss: 582.0900
Epoch [170/300], Loss: 576.5182
Epoch [180/300], Loss: 571.7691
Epoch [190/300], Loss: 567.7322
Epoch [200/300], Loss: 564.1944
Epoch [210/300], Loss: 561.0672
Epoch [220/300], Loss: 558.2411
Epoch [230/300], Loss: 555.6592
Epoch [240/300], Loss: 553.2743
Epoch [250/300], Loss: 551.0432
Epoch [260/300], Loss: 548.9457
Epoch [270/300], Loss: 546.9633
Epoch [280/300], Loss: 545.0789
Epoch [290/300], Loss: 543.2823
Epoch [300/300], Loss: 541.5673
Test Loss (MSE): 549.9398
Test RMSE: 23.4508