In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit # train_test_split, 
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')

%run utility.py

# Record the installed PyTorch build alongside the results.
print(torch.__version__)

# Prefer the GPU when one is visible; models and batches in the cells below
# are moved onto this device with `.to(device)`.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

### Main Homo-Sapiens Classification Algorithm

### Build ANN model

In [None]:
# Define the MLP model (from previous examples)
class TwoLayerMLP(nn.Module):
    """Fully connected classifier with two ReLU-activated hidden layers.

    Layout: input_size -> hidden_size1 -> hidden_size2 -> num_classes.
    `forward` returns the raw output of the last linear layer; no softmax is
    applied inside the model.
    """

    def __init__(self, input_size, hidden_size1,hidden_size2, num_classes):
        super().__init__()
        # Layers are created in this order (fc1, fc2, fc3, relu), which fixes
        # the order of the entries in state_dict() / parameters().
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of feature vectors to one score per class."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

### Training

In [None]:
# Training and validation loop

def trainModel(model, train_loader, val_loader):
    """Train `model` with Adam and report validation metrics after every epoch.

    Reads the module-level `learning_rate`, `num_epochs`, `criterion` and
    `device`, so the hyperparameter and device cells must have been run first.

    Args:
        model: network to optimise; it is updated in place.
        train_loader: iterable of (inputs, targets) batches used for the updates.
        val_loader: iterable of (inputs, targets) batches used only for reporting.

    Returns:
        The same `model` object, left in eval mode after the last epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0.0

        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()

            train_loss_sum += batch_loss.item()

        # Mean of the per-batch losses (not weighted by batch size).
        avg_loss = train_loss_sum / len(train_loader)

        # ---- validation pass (no gradient tracking) ----
        model.eval()
        val_loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x = batch_x.to(device)
                batch_y = batch_y.to(device)

                logits = model(batch_x)
                val_loss_sum += criterion(logits, batch_y).item()

                # Predicted class = position of the largest score in each row.
                predicted = logits.max(dim=1).indices
                n_seen += batch_y.size(0)
                n_correct += (predicted == batch_y).sum().item()

        avg_val_loss = val_loss_sum / len(val_loader)
        val_accuracy = n_correct / n_seen

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy * 100:.2f}%')

    print('Training completed.')
    return model

### Overall Evaluate

In [None]:
def Overall_Evaluate(model, test_loader):
    """Print the overall accuracy of `model` over every sample in `test_loader`.

    Batches are moved to the module-level `device`. Nothing is returned; the
    result is only printed.
    """
    model.eval()
    hits, seen = 0, 0

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            # Predicted class = position of the largest score in each row.
            predicted = model(batch_x).max(dim=1).indices
            seen += batch_y.size(0)
            hits += (predicted == batch_y).sum().item()

    accuracy = hits / seen
    print(f'Accuracy on test data: {accuracy * 100:.2f}%')

### Accuracy Evaluation Breakdown

In [None]:
def Breakdown_Evaluate(model, test_loader):
    """Compute and print the accuracy of `model` separately for each class.

    Reads the module-level `num_classes` and `device`.

    Args:
        model: classifier returning one score per class for each input row.
        test_loader: iterable of (inputs, targets) batches; targets are 1-D
            integer class labels in the range [0, num_classes).

    Returns:
        A list of exactly `num_classes` floats where entry i is the accuracy
        on class i. A class with no samples yields NaN, so every value stays
        at the position of its class and the list can be stored as one row of
        a frame that has one column per class.

    Raises:
        IndexError: if a label lies outside [0, num_classes).
    """
    model.eval()
    class_correct = torch.zeros(num_classes, dtype=torch.long)
    class_total = torch.zeros(num_classes, dtype=torch.long)

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)

            # Count on the CPU in one vectorised step per batch instead of
            # reading the tensors back one sample at a time.
            labels = targets.cpu()
            if labels.numel() and (int(labels.min()) < 0 or int(labels.max()) >= num_classes):
                raise IndexError(
                    f'class label outside the range [0, {num_classes}) found in test data'
                )
            hit_labels = labels[predicted.cpu() == labels]
            class_total += torch.bincount(labels, minlength=num_classes)
            class_correct += torch.bincount(hit_labels, minlength=num_classes)

    totals = class_total.tolist()
    corrects = class_correct.tolist()
    acc_data = []

    # Calculate and print accuracy for each class
    for i in range(num_classes):
        if totals[i] > 0:  # Avoid division by zero
            accuracy = corrects[i] / totals[i]
            print(f'Accuracy of class {i}: {accuracy * 100:.2f}%')
            acc_data.append(accuracy)
        else:
            print(f'No samples for class {i} in the test set.')
            # Keep the slot so later classes are not shifted to the left.
            acc_data.append(float('nan'))

    return acc_data

### AUC(Area Under the Curve) Evaluation Breakdown

In [None]:
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F

def Breakdown_Evaluate_AUC(model, test_loader):
    """Compute and print a one-vs-rest ROC AUC for each class.

    Reads the module-level `num_classes` and `device`. Class probabilities are
    obtained by applying softmax to the model outputs.

    Args:
        model: classifier returning one score per class for each input row.
        test_loader: iterable of (inputs, targets) batches with integer labels.

    Returns:
        A list of exactly `num_classes` floats where entry i is the AUC of
        class i versus all other classes. When the test data contains only
        positives or only negatives for a class, the AUC is undefined and NaN
        is stored, so every value stays at the position of its class.
    """
    model.eval()  # Set model to evaluation mode
    target_batches = []
    probability_batches = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)  # Move data to the device

            # Softmax turns the raw scores into per-class probabilities
            probabilities = F.softmax(model(inputs), dim=1)

            target_batches.append(targets.cpu())
            probability_batches.append(probabilities.cpu())

    # Concatenate all batches and hand plain NumPy arrays to scikit-learn
    all_targets = torch.cat(target_batches).numpy()
    all_outputs = torch.cat(probability_batches).numpy()

    auc_data = []

    # Compute AUC for each class (one-vs-rest)
    for i in range(num_classes):
        binarized_targets = (all_targets == i).astype(int)  # 1 if target is 'i', else 0
        n_positive = int(binarized_targets.sum())

        # AUC needs at least one positive and one negative sample
        if 0 < n_positive < binarized_targets.size:
            auc = roc_auc_score(binarized_targets, all_outputs[:, i])
            print(f'AUC of class {i}: {auc:.2f}')
            auc_data.append(auc)
        else:
            print(f'Not enough data for class {i} to compute AUC.')
            # Keep the slot so later classes are not shifted to the left.
            auc_data.append(float('nan'))

    return auc_data



### Hyperparameters

In [None]:
# --- model architecture ---
num_classes = 7                         # number of output classes
hidden_size1, hidden_size2 = 120, 64    # widths of the two hidden layers

# --- optimisation ---
learning_rate = 1e-3                    # Adam step size read by trainModel
num_epochs = 20                         # passes over the training data per run
# NOTE(review): the data-loader helpers in this notebook pass/default to a
# literal 32 rather than reading this variable.
batch_size = 32
criterion = nn.CrossEntropyLoss()       # loss used for training and validation

### Accuracy Records

In [None]:
# One column per class. The count comes from `num_classes` (hyperparameter
# cell above) so the frames always match the length of the per-class lists
# produced by the evaluation functions.
columns = [f'class{i}' for i in range(num_classes)]

# Per-class accuracy records, one frame per embedding model; each run appends one row.
df_acc_nt = pd.DataFrame(columns=columns)
df_acc_gpn = pd.DataFrame(columns=columns)
df_acc_dnabert2 = pd.DataFrame(columns=columns)
df_acc_hyena = pd.DataFrame(columns=columns)
df_acc_caduceus = pd.DataFrame(columns=columns)

### AUC Records (for the T-Test)

In [None]:
# One column per class. The count comes from `num_classes` (hyperparameter
# cell above) so the frames always match the length of the per-class lists
# produced by the evaluation functions.
columns = [f'class{i}' for i in range(num_classes)]

# Per-class AUC records, one frame per embedding model; each run appends one row.
df_auc_nt = pd.DataFrame(columns=columns)
df_auc_gpn = pd.DataFrame(columns=columns)
df_auc_dnabert2 = pd.DataFrame(columns=columns)
df_auc_hyena = pd.DataFrame(columns=columns)
df_auc_caduceus = pd.DataFrame(columns=columns)

# NOTE(review): not read by any cell visible in this notebook — confirm whether it is still needed.
runcount=10

### Train and evaluate (repeated runs)

In [None]:
def run_test(runcount, model, train_loader,val_loader, test_loader, df_acc, df_auc):
    """Train and evaluate `model` `runcount` times, recording per-class metrics.

    Each iteration trains on `train_loader`/`val_loader`, prints the overall
    test accuracy, then appends one row of per-class accuracies to `df_acc`
    and one row of per-class AUCs to `df_auc`. Both frames are modified in place.

    Note: the same model object is passed back into training on every
    iteration, so with `runcount` > 1 later runs continue from the weights of
    the previous run rather than from a fresh initialisation.

    Returns:
        The tuple (df_acc, df_auc).
    """
    for _ in range(runcount):
        model = trainModel(model, train_loader, val_loader)

        Overall_Evaluate(model, test_loader)

        # New rows go at label len(frame), i.e. directly after the current last row.
        df_acc.loc[len(df_acc)] = Breakdown_Evaluate(model, test_loader)
        df_auc.loc[len(df_auc)] = Breakdown_Evaluate_AUC(model, test_loader)

    return df_acc, df_auc

### Filter 

In [None]:
def filter(dfA,dfB):
    """Return the rows of `dfA` whose 'ROWID' value also occurs in `dfB['ROWID']`.

    Row order, index labels and columns of `dfA` are preserved.
    Note that this name shadows Python's built-in `filter` in the notebook.
    """
    wanted_ids = dfB['ROWID'].tolist()
    keep_row = dfA['ROWID'].isin(wanted_ids)
    return dfA[keep_row]

### Dataframe to dataloader

In [None]:
def df2dataloader(df_train, df_val, df_test, batch_size=32):
    """Convert train/validation/test DataFrames into PyTorch DataLoaders.

    In every frame the last column is taken as the integer class label and all
    preceding columns as float features.

    Args:
        df_train: training rows; served in shuffled order.
        df_val: validation rows; served in their original order.
        df_test: test rows; served in their original order.
        batch_size: number of samples per batch for all three loaders.
            Defaults to 32, the value that was previously hard-coded.

    Returns:
        The tuple (train_loader, val_loader, test_loader).
    """
    def _build_loader(frame, shuffle):
        # Features become float32, labels become int64 (torch.long).
        features = torch.tensor(frame.iloc[:, :-1].values, dtype=torch.float32)
        labels = torch.tensor(frame.iloc[:, -1].values, dtype=torch.long)
        return DataLoader(TensorDataset(features, labels), batch_size=batch_size, shuffle=shuffle)

    # Only the training data is shuffled; evaluation order stays deterministic.
    train_loader = _build_loader(df_train, shuffle=True)
    val_loader = _build_loader(df_val, shuffle=False)
    test_loader = _build_loader(df_test, shuffle=False)

    return train_loader, val_loader, test_loader

### Set base directory for embedding file

In [None]:
# Directory holding the per-model embedding CSV files. The One_Run_* functions
# prepend it to each 'homo_sapiens_*_embedding.csv' filename. The path is
# relative, so it is resolved against the notebook's working directory.
base_dir = '../embeddings/homo-sapiens/embedding-csv/'

### NT

In [None]:
def to_dataloader(X, y, batch_size=32, shuffle=True):
    """Wrap pandas features and labels in a PyTorch DataLoader.

    Args:
        X: pandas object whose `.values` holds the feature matrix (stored as float32).
        y: pandas object whose `.values` holds the class labels (stored as int64).
        batch_size: number of samples per batch.
        shuffle: whether the loader serves the samples in shuffled order.

    Returns:
        A DataLoader yielding (features, labels) batches.
    """
    features = torch.tensor(X.values, dtype=torch.float32)
    labels = torch.tensor(y.values, dtype=torch.long)
    return DataLoader(TensorDataset(features, labels), batch_size=batch_size, shuffle=shuffle)
    

def load_embedding_file_NT(csv_filename):
    """Read an embedding CSV and draw a fresh stratified train/val/test split.

    The columns are renamed to '0', '1', ... for the features, followed by
    'ROWID' and 'y' for the last two columns. 20% of the rows form the test
    set; 25% of the remainder forms the validation set (about 60/20/20
    overall). No random seed is fixed, so every call yields a different split.

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test). The X frames still
        contain the 'ROWID' column; the y objects hold the 'y' labels.
    """
    df = pd.read_csv(csv_filename)

    n_feature_cols = df.shape[1] - 2
    df.columns = [str(i) for i in range(n_feature_cols)] + ['ROWID', 'y']

    # Everything except the last column (this includes ROWID) vs. the label column
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]

    # First split: hold out the test set
    outer_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=None)
    train_val_idx, test_idx = next(outer_split.split(features, labels))
    X_train_val, X_test = features.iloc[train_val_idx], features.iloc[test_idx]
    y_train_val, y_test = labels.iloc[train_val_idx], labels.iloc[test_idx]

    # Second split: carve the validation set out of the remaining rows
    inner_split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=None)
    train_idx, val_idx = next(inner_split.split(X_train_val, y_train_val))
    X_train, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
    y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]

    return X_train, X_val, X_test, y_train, y_val, y_test


def One_Run_NT(input_size, df_acc_nt, df_auc_nt):
    """Draw a new data split, then train and evaluate one MLP on the NT embeddings.

    Args:
        input_size: number of input features of the MLP.
        df_acc_nt: frame that receives one new row of per-class accuracies.
        df_auc_nt: frame that receives one new row of per-class AUCs.

    Returns:
        (df_acc_nt, df_auc_nt, X_train_df, X_val_df, X_test_df). The three X
        frames still carry the 'ROWID' column, so the same split can be
        reproduced on the other embedding files by matching ROWID.
    """
    X_train_df, X_val_df, X_test_df, y_train, y_val, y_test =  load_embedding_file_NT(base_dir + 'homo_sapiens_nt_embedding.csv')

    # ROWID is an identifier, not a feature, so it is removed before training.
    dropcolumnns=['ROWID']
    X_train = X_train_df.drop(dropcolumnns, axis=1)
    X_val   = X_val_df.drop(dropcolumnns, axis=1)
    X_test  = X_test_df.drop(dropcolumnns, axis=1)

    # Only the training data is shuffled; validation and test data are served
    # in a fixed order, matching how df2dataloader builds its loaders.
    train_loader = to_dataloader(X_train, y_train)
    val_loader = to_dataloader(X_val, y_val, shuffle=False)
    test_loader = to_dataloader(X_test, y_test, shuffle=False)

    model = TwoLayerMLP(input_size, hidden_size1,hidden_size2,  num_classes).to(device)

    df_acc_nt, df_auc_nt = run_test(1,model, train_loader,val_loader, test_loader, df_acc_nt, df_auc_nt)
    return df_acc_nt, df_auc_nt, X_train_df, X_val_df, X_test_df
    

# Single NT run. It also publishes the split frames (X_train_df / X_val_df /
# X_test_df) that the other models' cells reuse.
df_acc_nt, df_auc_nt, X_train_df, X_val_df, X_test_df = One_Run_NT(
    input_size=1280, df_acc_nt=df_acc_nt, df_auc_nt=df_auc_nt
)

# Split sizes: train, validation, test (one per line)
print(len(X_train_df), len(X_val_df), len(X_test_df), sep='\n')

# Accuracy table, blank lines, AUC table
print(df_acc_nt, '\n', df_auc_nt, sep='\n')

##  Load embedding file

In [None]:
def load_embedding_file(csv_filename, X_train_df,X_val_df,X_test_df):
    """Load an embedding CSV and split it with an existing ROWID-based split.

    The columns are renamed to '0', '1', ... for the features, followed by
    'ROWID' and 'y' for the last two columns. Rows are assigned to train,
    validation or test according to whether their ROWID occurs in the
    corresponding reference frame; ROWID is then dropped.

    Returns:
        (train_loader, val_loader, test_loader) as built by df2dataloader.
    """
    df = pd.read_csv(csv_filename)

    n_feature_cols = df.shape[1] - 2
    df.columns = [str(i) for i in range(n_feature_cols)] + ['ROWID', 'y']

    # Select each partition by ROWID, then discard the identifier column so
    # that only the features and the trailing label column remain.
    df_train, df_val, df_test = (
        filter(df, reference).drop(columns=['ROWID'])
        for reference in (X_train_df, X_val_df, X_test_df)
    )

    return df2dataloader(df_train, df_val, df_test)

### GPN

In [None]:
def One_Run_GPN(input_size, df_acc, df_auc):
    """Train and evaluate one MLP on the GPN embeddings.

    The split is taken from the module-level X_train_df / X_val_df / X_test_df,
    which are assigned by the NT run, so an NT run must have been executed first.

    Args:
        input_size: number of input features of the MLP.
        df_acc: frame that receives one new row of per-class accuracies.
        df_auc: frame that receives one new row of per-class AUCs.

    Returns:
        The tuple (df_acc, df_auc).
    """
    embedding_csv = base_dir + 'homo_sapiens_gpn_embedding.csv'
    loaders = load_embedding_file(embedding_csv, X_train_df, X_val_df, X_test_df)
    classifier = TwoLayerMLP(input_size, hidden_size1, hidden_size2, num_classes).to(device)
    # loaders unpacks to (train_loader, val_loader, test_loader)
    return run_test(1, classifier, *loaders, df_acc, df_auc)

# Single GPN run with a 768-input MLP
df_acc_gpn, df_auc_gpn = One_Run_GPN(input_size=768, df_acc=df_acc_gpn, df_auc=df_auc_gpn)

# Accuracy table, blank lines, AUC table
print(df_acc_gpn, '\n', df_auc_gpn, sep='\n')

### DNABERT2

In [None]:
def One_Run_DNABERT2(input_size, df_acc, df_auc):
    """Train and evaluate one MLP on the DNABERT2 embeddings.

    The split is taken from the module-level X_train_df / X_val_df / X_test_df,
    which are assigned by the NT run, so an NT run must have been executed first.

    Args:
        input_size: number of input features of the MLP.
        df_acc: frame that receives one new row of per-class accuracies.
        df_auc: frame that receives one new row of per-class AUCs.

    Returns:
        The tuple (df_acc, df_auc).
    """
    embedding_csv = base_dir + 'homo_sapiens_dnabert2_embedding.csv'
    loaders = load_embedding_file(embedding_csv, X_train_df, X_val_df, X_test_df)
    classifier = TwoLayerMLP(input_size, hidden_size1, hidden_size2, num_classes).to(device)
    # loaders unpacks to (train_loader, val_loader, test_loader)
    return run_test(1, classifier, *loaders, df_acc, df_auc)

# Single DNABERT2 run with a 768-input MLP
df_acc_dnabert2, df_auc_dnabert2 = One_Run_DNABERT2(
    input_size=768, df_acc=df_acc_dnabert2, df_auc=df_auc_dnabert2
)

# Accuracy table, blank lines, AUC table
print(df_acc_dnabert2, '\n', df_auc_dnabert2, sep='\n')

### HyenaDNA

In [None]:
def One_Run_HYENADNA(input_size, df_acc, df_auc):
    """Train and evaluate one MLP on the HyenaDNA embeddings.

    The split is taken from the module-level X_train_df / X_val_df / X_test_df,
    which are assigned by the NT run, so an NT run must have been executed first.

    Args:
        input_size: number of input features of the MLP.
        df_acc: frame that receives one new row of per-class accuracies.
        df_auc: frame that receives one new row of per-class AUCs.

    Returns:
        The tuple (df_acc, df_auc).
    """
    embedding_csv = base_dir + 'homo_sapiens_hyena_embedding.csv'
    loaders = load_embedding_file(embedding_csv, X_train_df, X_val_df, X_test_df)
    classifier = TwoLayerMLP(input_size, hidden_size1, hidden_size2, num_classes).to(device)
    # loaders unpacks to (train_loader, val_loader, test_loader)
    return run_test(1, classifier, *loaders, df_acc, df_auc)

# Single HyenaDNA run with a 256-input MLP
df_acc_hyena, df_auc_hyena = One_Run_HYENADNA(input_size=256, df_acc=df_acc_hyena, df_auc=df_auc_hyena)

# Accuracy table, blank lines, AUC table
print(df_acc_hyena, '\n', df_auc_hyena, sep='\n')

### Caduceus

In [None]:
def One_Run_CADUCEUS(input_size, df_acc, df_auc):
    """Train and evaluate one MLP on the Caduceus embeddings.

    The split is taken from the module-level X_train_df / X_val_df / X_test_df,
    which are assigned by the NT run, so an NT run must have been executed first.

    Args:
        input_size: number of input features of the MLP.
        df_acc: frame that receives one new row of per-class accuracies.
        df_auc: frame that receives one new row of per-class AUCs.

    Returns:
        The tuple (df_acc, df_auc).
    """
    embedding_csv = base_dir + 'homo_sapiens_caduceus_embedding.csv'
    loaders = load_embedding_file(embedding_csv, X_train_df, X_val_df, X_test_df)
    classifier = TwoLayerMLP(input_size, hidden_size1, hidden_size2, num_classes).to(device)
    # loaders unpacks to (train_loader, val_loader, test_loader)
    return run_test(1, classifier, *loaders, df_acc, df_auc)

# Single Caduceus run with a 256-input MLP
df_acc_caduceus, df_auc_caduceus = One_Run_CADUCEUS(
    input_size=256, df_acc=df_acc_caduceus, df_auc=df_auc_caduceus
)

# Accuracy table, blank lines, AUC table
print(df_acc_caduceus, '\n', df_auc_caduceus, sep='\n')

## Perform T-Test

In [None]:
%%time

def clear_df(df):
    """Return a copy of `df` with every row removed and the columns kept.

    The frame passed in is left unchanged.
    """
    return df.drop(index=df.index)

# Discard the rows recorded by the single trial runs above so that the
# repeated experiment starts from empty record frames.
df_acc_nt, df_acc_gpn, df_acc_dnabert2, df_acc_hyena, df_acc_caduceus = [
    clear_df(frame)
    for frame in (df_acc_nt, df_acc_gpn, df_acc_dnabert2, df_acc_hyena, df_acc_caduceus)
]

df_auc_nt, df_auc_gpn, df_auc_dnabert2, df_auc_hyena, df_auc_caduceus = [
    clear_df(frame)
    for frame in (df_auc_nt, df_auc_gpn, df_auc_dnabert2, df_auc_hyena, df_auc_caduceus)
]


# 100 rounds. Each round draws a new split in the NT run; the other four models
# then reuse that split through the X_train_df / X_val_df / X_test_df it assigns.
# NOTE(review): the round count is the literal 100; the `runcount` variable
# defined earlier is not used here.
for i in range(100):
    print(f"====round {i + 1}======")

    # NT goes first because it defines the split for this round.
    df_acc_nt, df_auc_nt, X_train_df, X_val_df, X_test_df = One_Run_NT(
        input_size=1280, df_acc_nt=df_acc_nt, df_auc_nt=df_auc_nt
    )

    df_acc_gpn, df_auc_gpn = One_Run_GPN(input_size=768, df_acc=df_acc_gpn, df_auc=df_auc_gpn)

    df_acc_dnabert2, df_auc_dnabert2 = One_Run_DNABERT2(
        input_size=768, df_acc=df_acc_dnabert2, df_auc=df_auc_dnabert2
    )

    df_acc_hyena, df_auc_hyena = One_Run_HYENADNA(input_size=256, df_acc=df_acc_hyena, df_auc=df_auc_hyena)

    df_acc_caduceus, df_auc_caduceus = One_Run_CADUCEUS(
        input_size=256, df_acc=df_acc_caduceus, df_auc=df_auc_caduceus
    )

# Dump the per-class accuracy tables of all five models. The '\n' arguments
# together with sep='\n' put blank lines between consecutive tables.
print("====Print Accuracy======")
print(
    df_acc_nt, '\n',
    df_acc_gpn, '\n',
    df_acc_dnabert2, '\n',
    df_acc_hyena, '\n',
    df_acc_caduceus,
    sep='\n',
)

# Same layout for the per-class AUC tables.
print("====Print AUC======")
print(
    df_auc_nt, '\n',
    df_auc_gpn, '\n',
    df_auc_dnabert2, '\n',
    df_auc_hyena, '\n',
    df_auc_caduceus,
    sep='\n',
)


In [None]:
# Write every record frame to its own CSV file (no index column) in the
# current working directory: accuracy tables first, then AUC tables.
_csv_exports = [
    ('t_acc_nt.csv', df_acc_nt),
    ('t_acc_gpn.csv', df_acc_gpn),
    ('t_acc_dnabert2.csv', df_acc_dnabert2),
    ('t_acc_hyena.csv', df_acc_hyena),
    ('t_acc_caduceus.csv', df_acc_caduceus),
    ('t_auc_nt.csv', df_auc_nt),
    ('t_auc_gpn.csv', df_auc_gpn),
    ('t_auc_dnabert2.csv', df_auc_dnabert2),
    ('t_auc_hyena.csv', df_auc_hyena),
    ('t_auc_caduceus.csv', df_auc_caduceus),
]
for _csv_name, _frame in _csv_exports:
    _frame.to_csv(_csv_name, index=False)