## Lets use the embeddings generated from [1] and prepare lots of data for analysis in [3]

In [1]:
# Standard library imports
import glob
import json
import os
import shutil

# Related third-party imports
import numpy as np
import pandas as pd
import glob
import pacmap
from matplotlib import pyplot as plt
import seaborn as sns
import torch
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
import optuna

# Local application/library specific imports
from utils.standard_ds_loader import StandardDatasetLoader

  from .autonotebook import tqdm as notebook_tqdm


**Create and check directories. You will be able to make embeddings from a model with/without fine-tuning, and also be able to add additional features that resemble the presence of various important sequences in 3' end**

In [8]:
directories = [
    './hyena_embeds',
    './hyena_embeds_pe',
    './fine_tuned_hyena_embeds',
    './fine_tuned_hyena_embeds_pe'
]

subdirs = ['kfold_results', 'kfold_plots']

# Checks if the directories and subdirectories exist, and creates them if they don't
for dir in directories:
    if not os.path.exists(dir):
        os.makedirs(dir)
    for subdir in subdirs:
        full_path = os.path.join(dir, subdir)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
        assert os.path.exists(full_path), f"Failed to create {full_path}"

smart_read_csv = StandardDatasetLoader().smart_read_csv

def smart_read_embeddings(filepath):
    _, ext = os.path.splitext(filepath)
    print(filepath)
    # Check the file extension and use the appropriate library to load the data
    if ext in ['.pt', '.pth']:
        # Load PyTorch tensor
        data = torch.load(filepath)
        if isinstance(data, torch.Tensor):
            return data.shape[0]
        else:
            raise ValueError(f"File {filepath} does not contain a PyTorch tensor.")
    elif ext == '.npy':
        # Load Numpy array
        data = np.load(filepath)
        return data.shape[0]
    else:
        raise ValueError(f"Unsupported file extension {ext}. Supported extensions are .pt, .pth, and .npy.")

def test_X_y_paths(non_tuned_embeddings_path, tuned_embeddings_path, label_data_path):
    non_tuned_X_path = non_tuned_embeddings_path
    tuned_X_path = tuned_embeddings_path
    target_values_path = label_data_path
    
    target_df = smart_read_csv(target_values_path)
    assert target_df.shape[1] == 2, f"Wrong number of columns in target_values_path: {target_df.shape[1]}, should be 2, with column one containing the sequences, and column two containing the labels."
    df_length = len(target_df)
    
    # Getting the shape of the first dimension for embeddings
    non_tuned_length = smart_read_embeddings(non_tuned_X_path)
    tuned_length = smart_read_embeddings(tuned_X_path)
    
    # Now you can compare or do any other operations with df_length, non_tuned_length, and tuned_length
    # For instance:
    assert df_length == non_tuned_length == tuned_length, f"Mismatching lengths between dataframes: {df_length}, {non_tuned_length}, {tuned_length}, this means you didn't load the correct embeddings and their corresponding labels. You probably didn't load the updated datasets."


# Load your embeddings (X) paths, label (y) paths, and Scikit-Learn settings

**The embeddings were generated using a genomics foundational model. However, we did not use the neural net to infer on the sequences. This is simply a information-rich way of representing gene sequences, and a more accurate one than traditional one-hot-encoding.**

In [3]:
#=================================Load a couple parameters==============================

# Start off by instantiating the StandardDatasetLoader
# Select your dataset here. You can choose between 'standard' or path a datapath to your own custom dataset. The 'standard' option will load the dataset selected in [1].
dataset = 'standard' # or pass a path to your custom dataset here
standard_ds_loader = StandardDatasetLoader()
PATH_TO_GENERATED_EMBEDDINGS = os.path.expanduser(standard_ds_loader.path_to_generated_embeddings)

# Creating a function to load the hyena data process settings simply
def dataset_creator(dataset):
    
    if dataset == 'standard':
        n_folds, model_dict, model_list, SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING, FINE_TUNED_SEQS_EMBEDDINGS_PATH, OPTIMAL_SEQS_DATA_PATH, LABEL_DATA_PATH = standard_ds_loader.load_hyena_data_process_settings()

    else:
        print('Detected a custom datapath entry, will try loading using smart_read_csv')
        LABEL_DATA_PATH = dataset
        n_folds, model_dict, model_list, SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING, FINE_TUNED_SEQS_EMBEDDINGS_PATH, OPTIMAL_SEQS_DATA_PATH, _ = standard_ds_loader.load_hyena_data_process_settings()        

    return n_folds, model_dict, model_list, SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING, FINE_TUNED_SEQS_EMBEDDINGS_PATH, OPTIMAL_SEQS_DATA_PATH, LABEL_DATA_PATH

# Call the function
n_folds, model_dict, model_list, SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING, FINE_TUNED_SEQS_EMBEDDINGS_PATH, OPTIMAL_SEQS_DATA_PATH, LABEL_DATA_PATH = dataset_creator(dataset)

assert os.path.exists(SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING) and os.path.exists(FINE_TUNED_SEQS_EMBEDDINGS_PATH), f'Embedding paths {SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING} and {FINE_TUNED_SEQS_EMBEDDINGS_PATH} do not exist'
assert os.path.exists(LABEL_DATA_PATH), f'Label data path {LABEL_DATA_PATH} does not exist'

**Lets test that we loaded the correct X embeddings (fine-tuned and not fine-tuned) and target values**

**It will test the length of these to make sure you have the correct ds loaded**

In [5]:
test_X_y_paths(SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING, FINE_TUNED_SEQS_EMBEDDINGS_PATH, LABEL_DATA_PATH)

./data/ds_fine_tuned_embeddings/mmseqs_train_1.pth
./data/ds_fine_tuned_embeddings/mmseqs_train_6.pth


In [6]:
def process_and_save_embeddings(dataset):

    if dataset == 'UTR_regions':
        file_path = SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING
        directory_name = './hyena_embeds_pe'
    elif dataset == 'no_augment':
        file_path = SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING
        directory_name = './hyena_embeds'
    elif dataset == 'fine_tuned':
        file_path = FINE_TUNED_SEQS_EMBEDDINGS_PATH
        directory_name = './fine_tuned_hyena_embeds'
    elif dataset == 'fine_tuned_pe':
        file_path = FINE_TUNED_SEQS_EMBEDDINGS_PATH
        directory_name = './fine_tuned_hyena_embeds_pe'
    else:
        raise ValueError(f'Invalid dataset: {dataset}: must be one of [UTR_regions, no_augment, fine_tuned, fine_tuned_pe]')
    
    embeddings = torch.load(file_path)
    # Push to CPU
    embeddings = embeddings.cpu()
    
    # # Convert to numpy
    # embeddings = embeddings.numpy()
    
    #     # If it's already a numpy array, proceed directly to reshaping
    # if isinstance(embeddings, np.ndarray):
    #     X_reshaped = embeddings.reshape(len(embeddings), -1)
    #     print('X shape:', X_reshaped.shape)


    # Calculate the average along the last dimension (128)
    average_array = torch.mean(embeddings, dim=2).numpy()

    # Calculate the max and min along the last dimension (128)
    max_array = torch.max(embeddings, dim=2)[0].numpy()
    min_array = torch.min(embeddings, dim=2)[0].numpy()

    # Stack max and min arrays along the last dimension (2)
    max_min_array = np.stack([max_array, min_array], axis=2)

    # Reshape the max and min arrays to be 2D arrays
    min_max_array_reshaped = max_min_array.reshape(len(max_min_array), -1)

    # Print the shapes of the resulting NumPy arrays
    print("Average Array Shape:", average_array.shape)
    print("Max-Min Array Shape:", max_min_array.shape)
    print('min_max_array_reshaped shape:', min_max_array_reshaped.shape)

    # Create the directory if it doesn't already exist
    if not os.path.exists(directory_name):
        os.makedirs(directory_name)

    # Save the reshaped arrays to .npy files
    np.save(f'{directory_name}/reshaped_min_max_hyena.npy', min_max_array_reshaped)
    np.save(f'{directory_name}/reshaped_average_hyena.npy', average_array)
    print(f"Embeddings saved to {directory_name}/")

# Example usage with shalem_embeddings and shalem_embeds as the directory name
# process_and_save_embeddings(dataset='UTR_regions')


***Let's do Dimensionality Reduction as well***

In [7]:
def load_and_scale_data(data):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    return scaled_data

def process_embeddings_if_needed(embeddings):
    # Check if embeddings is a PyTorch tensor, and if so, convert to NumPy array
    if isinstance(embeddings, torch.Tensor):
        # Push to the CPU if it's on a GPU
        if embeddings.is_cuda:
            embeddings = embeddings.cpu()
        
        # Convert to numpy array
        embeddings = embeddings.numpy()

    # If it's already a numpy array, proceed directly to reshaping
    if isinstance(embeddings, np.ndarray):
        X_reshaped = embeddings.reshape(len(embeddings), -1)
        print('X shape:', X_reshaped.shape)
        
    else:
        print("The provided embeddings are neither a PyTorch tensor nor a NumPy array.")

    return X_reshaped

METHODS = ['pca', 'pacmap']
COMPONENTS = [2, 5, 10, 150]

def perform_dimensionality_reduction(X_reshaped, base_dir, methods, components):
    for method in methods:
        for num_components in components:
            print('Method and number of components:', method, num_components)

            if method == 'pca':
                file_name = os.path.join(base_dir, f'pca_components_{num_components}.npy')
            elif method == 'pacmap':
                if num_components == 150:
                    num_components = 100
                file_name = os.path.join(base_dir, f'pacmap_components_{num_components}.npy')
            else:
                print(f"Unsupported method: {method}")
                continue

            if os.path.exists(file_name):
                print(f"File {file_name} already exists. Skipping.")
                continue

            X_reshaped_scaled = StandardScaler().fit_transform(X_reshaped)

            if method == 'pca':
                dim_reduct = PCA(n_components=num_components)
            elif method == 'pacmap':
                # Make sure to import pacmap before using
                dim_reduct = pacmap.PaCMAP(n_components=num_components, n_neighbors=7)
            
            X_reduced = dim_reduct.fit_transform(X_reshaped_scaled)
            assert X_reduced.shape[1] == num_components, f'Expected {num_components} components, but got {X_reduced.shape[1]}'

            print('Shape of X array after reduction:', X_reduced.shape)
            assert not np.isnan(X_reduced).any(), f'The numpy array contains NaNs'
            
            np.save(file_name, X_reduced)
            print(f"Reduced data saved to {file_name}")

# Example usage, replace 'your_X_reshaped' with your actual reshaped data
# perform_dimensionality_reduction(your_X_reshaped)

In [9]:
# Call the function on shalem_embeds to perform dimensionality reduction
non_fine_tuned_embeddings = torch.load(SEQS_EMBEDDINGS_PATH_NO_FINE_TUNING)
process_and_save_embeddings(dataset='no_augment')
X_reshaped = process_embeddings_if_needed(non_fine_tuned_embeddings)
perform_dimensionality_reduction(X_reshaped, base_dir='./hyena_embeds', methods=METHODS, components=COMPONENTS)

Average Array Shape: (10447, 152)
Max-Min Array Shape: (10447, 152, 2)
min_max_array_reshaped shape: (10447, 304)
Embeddings saved to ./hyena_embeds/
X shape: (10447, 19456)
Method and number of components: pca 2
Shape of X array after reduction: (10447, 2)
Reduced data saved to ./hyena_embeds/pca_components_2.npy
Method and number of components: pca 5
Shape of X array after reduction: (10447, 5)
Reduced data saved to ./hyena_embeds/pca_components_5.npy
Method and number of components: pca 10
Shape of X array after reduction: (10447, 10)
Reduced data saved to ./hyena_embeds/pca_components_10.npy
Method and number of components: pca 150
Shape of X array after reduction: (10447, 150)
Reduced data saved to ./hyena_embeds/pca_components_150.npy
Method and number of components: pacmap 2
Shape of X array after reduction: (10447, 2)
Reduced data saved to ./hyena_embeds/pacmap_components_2.npy
Method and number of components: pacmap 5
Shape of X array after reduction: (10447, 5)
Reduced data s

***It's likely that the strong terminators have a Positioning Element (pe), Efficiency Element (EE), and PolyA Site (pas) within them. In the sequences, lets check this by creating more features, denoting the presence of these unique sequences***

***You likely don't need this, as I don't see much of a performance boost, but this pipeline can be edited easily if you want to use a more sophisticated scoring technique***

In [10]:
def produce_optimal_seq_dataset(target_df_path, path_save_opt_seqs):
    # Define the optimal sequences found in the 3' region of yeast genes, that are associated with highly expressed transcripts
    optimal_sequences = ['TATATA', 'AATAAA', 'CAAA']
    
    
    df_opt_seqs = smart_read_csv(target_df_path)
    # Add column names for shalem_df
    df_opt_seqs.columns = ['sequence', 'tpm']
    print('Head of ds: ', df_opt_seqs.head())


    df_opt_seqs['ee'] = 0
    df_opt_seqs['pe'] = 0
    df_opt_seqs['pas'] = 0

    for index, row in df_opt_seqs.iterrows():
        sequence = row['sequence']
        for i, optimal in enumerate(optimal_sequences):
            if optimal in sequence:
                if i == 0:
                    df_opt_seqs.at[index, 'ee'] = 1
                elif i == 1:
                    df_opt_seqs.at[index, 'pe'] = 1
                elif i == 2:
                    df_opt_seqs.at[index, 'pas'] = 1
    print('Head of df_opt_seqs: ', df_opt_seqs.head())

    # Save the updated dataframe to the file
    df_opt_seqs.to_csv(path_save_opt_seqs, sep="\t")
                        
#============Calling the function==================
produce_optimal_seq_dataset(target_df_path=LABEL_DATA_PATH, path_save_opt_seqs=OPTIMAL_SEQS_DATA_PATH)
#==================================================

# Create a function that will add the EE, PE and PAS columns to any loaded embedding
def stack_element_features(embedding, optimal_seqs_filepath):
    # Run the produce_optimal_seq_dataset function
    produce_optimal_seq_dataset(target_df_path=LABEL_DATA_PATH, path_save_opt_seqs=optimal_seqs_filepath)
    
    df = smart_read_csv(optimal_seqs_filepath)
    df_opt_seqs = df.iloc[:, 3:]
    print('Head of df_opt_seqs: ', df_opt_seqs.head())
    
    # Assert there there is only three columns, ee, pe and pas
    assert df_opt_seqs.shape[1] == 3, f'The df_opt_seqs is not comprised of three columns. It is supposed to be ee, pe and pas (3). df_opt_seqs: {df_opt_seqs.shape[1]}'
    assert (df_opt_seqs.dtypes == 'int64').all(), f'The df_opt_seqs is not comprised of integer values. It is supposed to be 1 or 0. df_opt_seqs: {df_opt_seqs.dtypes}'
    
    # Convert to numpy array
    opt_seqs_numpy = df_opt_seqs.to_numpy()
    
    additional_features_embeddings = np.hstack((embedding, opt_seqs_numpy))
    
    print('Shape of additional_features_embeddings: ', additional_features_embeddings.shape)
    print('Shape of embedding: ', embedding.shape)
    print('Shape of opt_seqs_numpy: ', opt_seqs_numpy.shape)
    
    return additional_features_embeddings

Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:                                              sequence       tpm  ee  pe  pas
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293   0   1    1
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170   1   1    1
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423   1   1    1
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952   0   0    1
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185   0   0    0


***Lets start defining functions for our 10-fold Cross Validation and Swarm/Scatter-plot generation***

In [12]:
def save_scatterplot(y_actual, y_predicted, model_name, file_name, file_basename, dataset):
    from sklearn.metrics import r2_score
    import seaborn as sns
    from sklearn.metrics import mean_squared_error
    

    mse_test = mean_squared_error(y_actual, y_predicted)
    pearson_r_test = np.corrcoef(np.array(y_actual).ravel(), np.array(y_predicted).ravel())[0, 1]
    r2 = 1 - (mse_test / np.var(np.array(y_actual)))
        
    # plot_title = f'Scatterplot for {model_name}/{num}-components - {method} Measured vs Predicted Values)\nR-squared: {r2:.2f}'
    plot_title = f'Scatterplot for {file_name}_{model_name}_{file_basename}: Measured vs Predicted Values)\nR-squared: {r2:.2f}\nPearson Correlation: {pearson_r_test:.2f}'
    
    # Create a DataFrame for plotting
    df = pd.DataFrame({'Measured Values': y_actual.ravel(), 'Predicted Values': y_predicted.ravel()})
    
    if file_basename == 'train':
        color = 'tab:orange'
    elif file_basename == 'test':
        color = 'tab:blue'

    # Create the scatterplot using Seaborn
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x='Measured Values', y='Predicted Values', color=color, alpha=0.5)
    plt.title(plot_title)
    plt.grid(True)
    
    if dataset == 'UTR_regions':
        plot_filename = f'./hyena_embeds_pe/kfold_plots/{file_name}_{model_name}_{file_basename}.png'
    elif dataset == 'no_augment':
        # Define the file path and name where you want to save the plot
        plot_filename = f'./hyena_embeds/kfold_plots/{file_name}_{model_name}_{file_basename}.png'
    elif dataset == 'fine_tuned':
        plot_filename = f'./fine_tuned_hyena_embeds/kfold_plots/{file_name}_{model_name}_{file_basename}.png'
    elif dataset == 'fine_tuned_pe':
        plot_filename = f'./fine_tuned_hyena_embeds_pe/kfold_plots/{file_name}_{model_name}_{file_basename}.png'

    # Save the plot as an image file (you can specify the file format, e.g., PNG, JPEG)
    plt.savefig(plot_filename)

    # Close the plot to release resources (optional)
    plt.close()
    
# Use case example
# run_cross_validation(model_name_function, hyperparams, X, y, n_folds, file_name=file_name_no_extension, feature_engineering=False)
# save_scatterplot(y_train_folds_concat, y_pred_train_folds_concat, feature_engineering=feature_engineering, model_name=model_name_function, file_name =file_name, file_basename='train')

def run_cross_validation(model_name_function, hyperparams, X, y, n_folds, file_name, dataset):
    
    # Make sure to reshape the target, if not already in (1, n_samples) format
    if len(y.shape) == 1:
        y = y.reshape(-1, 1)
            
    if model_name_function == 'xgb':
        model = XGBRegressor(**hyperparams)
    elif model_name_function == 'lr':
        model = LinearRegression(**hyperparams)
    else:
        raise ValueError("Invalid model name, choose from xgb, lr")
    
    # ========================MAKING ADJUSTMENTS TO SCALING PROCESS TO ENSURE NO DATA LEAKAGE!!!!==================
    scaler_x = StandardScaler()
    scaler_y = StandardScaler()
   
    y = scaler_y.fit_transform(y)


    # Create lists to store r2_train, r2_test, y_pred_train, and y_pred_test values for each fold
    r2_train_list = []
    r2_test_list = []

    # X_train_fold_list = []
    y_pred_train_fold_list = []
    y_train_fold_list = []
    # X_test_fold_list = []
    y_pred_test_fold_list = []
    y_test_fold_list = []

    RANDOM_STATE = 21
    # Perform K-fold cross-validation
    kf = KFold(n_splits=n_folds, random_state=RANDOM_STATE, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        
        print('Length of X-test fold: ', len(X_test_fold))
        print('Length of X-train fold: ', len(X_train_fold))
        
        # ==============================================USING NEW SCALING PROCESS=======================================
        X_train_fold = scaler_x.fit_transform(X_train_fold)
        X_test_fold = scaler_x.transform(X_test_fold)
        # =========================================================================================================
        
        model.fit(X_train_fold, y_train_fold)
        
        # Calculate R-squared on test set for this fold
        r2_test_fold = model.score(X_test_fold, y_test_fold)
        r2_test_list.append(r2_test_fold)
        
        # Calculate R-squared on train set for this fold
        r2_train_fold = model.score(X_train_fold, y_train_fold)
        r2_train_list.append(r2_train_fold)
        
        # Predict on the training set for this fold
        y_pred_train_fold = model.predict(X_train_fold)
        y_train_fold_list.append(y_train_fold)
        y_pred_train_fold_list.append(y_pred_train_fold)
        
        # Predict on the test set for this fold
        y_test_fold_list.append(y_test_fold)
        y_pred_test_fold = model.predict(X_test_fold)
        y_pred_test_fold_list.append(y_pred_test_fold)

    # Calculate the mean R-squared values for train and test sets
    mean_r2_train = np.mean(r2_train_list)
    mean_r2_test = np.mean(r2_test_list)

    # Concatenate the predicted values for training and testing sets
    y_pred_train_folds_concat = np.concatenate(y_pred_train_fold_list)
    y_train_folds_concat = np.concatenate(y_train_fold_list)
    y_pred_test_folds_concat = np.concatenate(y_pred_test_fold_list)
    y_test_folds_concat = np.concatenate(y_test_fold_list)
    print('Length of y_pred_train: ', len(y_pred_train_folds_concat))
    print('Length of y_pred_test: ', len(y_pred_test_folds_concat))

    # Create and save scatterplots for training and testing sets
    save_scatterplot(y_train_folds_concat, y_pred_train_folds_concat, model_name=model_name_function, file_name =file_name, file_basename='train', dataset=dataset)
    save_scatterplot(y_test_folds_concat, y_pred_test_folds_concat, model_name=model_name_function, file_name =file_name, file_basename='test', dataset=dataset)

    # Add the R-squared train/test list to df
    print('r2_train_list len:', len(r2_train_list))
    print('r2_test_list len:', len(r2_test_list))

    print('R2 Train List: ', r2_train_list)
    print('R2 Test List: ', r2_test_list)
    
    # Add the R-squared train/test list to df
    print('r2_train_list len:', len(r2_train_list))
    print('r2_test_list len:', len(r2_test_list))
    df_test = pd.DataFrame()
    df_train = pd.DataFrame()
    
    file_name = f'{file_name}_{model_name_function}'
    
    # Create the train R-squared column, and concatenate the test on axis 0 (vertical)
    df_test['r2'] = r2_train_list
    df_test['type'] = 'train'
    df_test['file_name'] = file_name

    df_train['r2'] = r2_test_list
    df_train['type'] = 'test'
    df_train['file_name'] = file_name
    
    # Concatenate the train and test sets (vertical)
    df = pd.concat([df_train, df_test], axis=0)
    assert len(df) == len(r2_test_list) + len(r2_train_list), f'The length of R2 df does not match the sum of r2_test_list ({len(r2_test_list)}) and r2_train_list ({len(r2_train_list)})'
        
    if dataset == 'UTR_regions':
        df.to_csv(f'./hyena_embeds_pe/kfold_results/R2_Values_{file_name}.csv', index=False)
    elif dataset == 'no_augment':
        df.to_csv(f'./hyena_embeds/kfold_results/R2_Values_{file_name}.csv', index=False)
    elif dataset == 'fine_tuned':
        df.to_csv(f'./fine_tuned_hyena_embeds/kfold_results/R2_Values_{file_name}.csv', index=False)
    elif dataset == 'fine_tuned_pe':
        df.to_csv(f'./fine_tuned_hyena_embeds_pe/kfold_results/R2_Values_{file_name}.csv', index=False)
    else:
        raise ValueError(f'Invalid dataset: {dataset}, choose from UTR_regions, no_augment, fine_tuned, fine_tuned_pe')


In [13]:
def glob_files(dataset):
    # Based on the dataset, get the list of reduced embeddings, for the 'pe' datasets, they will use their corresponding embeddings without the augmentation.
    # For example, the 'UTR_regions' dataset will use the non-fine-tuned embeddings, ./hyena_embeds/*.npy
    if dataset == 'UTR_regions':
        hyena_embeds_list = glob.glob(f'./hyena_embeds/*.npy')
    elif dataset == 'no_augment':
        hyena_embeds_list = glob.glob(f'./hyena_embeds/*.npy')
    elif dataset == 'fine_tuned':
        hyena_embeds_list = glob.glob(f'./fine_tuned_hyena_embeds/*.npy')
    elif dataset == 'fine_tuned_pe':
        hyena_embeds_list = glob.glob(f'./fine_tuned_hyena_embeds/*.npy')
    else:
        raise ValueError(f'Invalid dataset: {dataset}')
    
    hyena_embeds_list = [file for file in hyena_embeds_list if not os.path.isdir(file)]
    print('Hyena embeds list: ', hyena_embeds_list)
    
    return hyena_embeds_list


In [14]:
def run_cross_validation_for_files(hyena_embeds_list, model_list, model_dict, y, n_folds, dataset):
    for file_name in hyena_embeds_list:
        print('File Name: ', file_name)
        
        if not file_name.endswith(('.csv', '.npy')):
            print(f"Skipping invalid file type: {file_name}")
            continue

        file_name_no_extension = os.path.basename(file_name)[:-4]


        # Load X from the file
        if file_name.endswith('.csv'):
            X = pd.read_csv(file_name, header=None)
            assert not X.isnull().values.any(), f'NaN values in file: {file_name}'
        elif file_name.endswith('.npy'):
            X = np.load(file_name)
            assert not np.isnan(X).any(), f'NaN values in file: {file_name}'
        elif file_name.endswith('.pt') or file_name.endswith('.pth'):
            print(f"This is a torch file, and most likely a raw unprocessed embedding, skipping {file_name}.")
            continue
        
        # Lets concat the extra features is dataset 'UTR_regions' or 'fine_tuned_pe' is chosen
        if dataset == 'UTR_regions' or dataset == 'fine_tuned_pe':
            
            X = stack_element_features(embedding=X, optimal_seqs_filepath=OPTIMAL_SEQS_DATA_PATH)        

        print('File Name for validation: ', file_name_no_extension)
        assert os.stat(file_name).st_size > 0, f'Empty file: {file_name}'
        
        for model_name in model_list:
            print('Running model: ', model_name)
            hyperparams = model_dict[model_name]
            print('Model name and hyperparams: ', model_name, hyperparams)
            
            # Replace with actual function to run cross-validation
            run_cross_validation(model_name, hyperparams, X, y, n_folds, file_name=file_name_no_extension, dataset=dataset)

# Example usage (replace with actual values)
# run_cross_validation_for_files(hyena_embeds_list, model_list, model_dict, y, n_folds)

***Lets run the cross-validation for the hyena_embeds***


In [15]:
# Load target values
target_df = smart_read_csv(LABEL_DATA_PATH)
y = target_df.iloc[:, 1]
y_np = y.to_numpy()
print('y shape: ', y.shape)

y shape:  (10447,)


In [16]:
# Running the regular hyena-dna embeddings (no-fine-tuning but with no extra features)
hyena_embeds_list = glob_files('no_augment')
print('Hyena embeds list: ', hyena_embeds_list)

run_cross_validation_for_files(hyena_embeds_list, model_list, model_dict, y_np, n_folds, dataset='no_augment')

Hyena embeds list:  ['./hyena_embeds/reshaped_min_max_hyena.npy', './hyena_embeds/reshaped_average_hyena.npy', './hyena_embeds/pca_components_2.npy', './hyena_embeds/pca_components_5.npy', './hyena_embeds/pca_components_10.npy', './hyena_embeds/pca_components_150.npy', './hyena_embeds/pacmap_components_2.npy', './hyena_embeds/pacmap_components_5.npy', './hyena_embeds/pacmap_components_10.npy', './hyena_embeds/pacmap_components_100.npy']
Hyena embeds list:  ['./hyena_embeds/reshaped_min_max_hyena.npy', './hyena_embeds/reshaped_average_hyena.npy', './hyena_embeds/pca_components_2.npy', './hyena_embeds/pca_components_5.npy', './hyena_embeds/pca_components_10.npy', './hyena_embeds/pca_components_150.npy', './hyena_embeds/pacmap_components_2.npy', './hyena_embeds/pacmap_components_5.npy', './hyena_embeds/pacmap_components_10.npy', './hyena_embeds/pacmap_components_100.npy']
File Name:  ./hyena_embeds/reshaped_min_max_hyena.npy
File Name for validation:  reshaped_min_max_hyena
Running model:

Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1044
Length of X-train fold:  9403
Length of X-test fold:  1044
Length of X-train fold:  9403
Length of X-test fold:  1044
Length of X-train fold:  9403
Length of y_pred_train:  94023
Length of y_pred_test:  10447


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.28536136026002623, 0.28565862202217396, 0.28321684578063233, 0.21206457766038522, 0.268459094907883, 0.27951154419559865, 0.2848752478782898, 0.281704316211405, 0.22407514383331506, 0.2844883259314097]
R2 Test List:  [0.19898566809211238, 0.20888976759032496, 0.24039538490967338, 0.23180757388027584, 0.18471298767236177, 0.22788692303874425, 0.22630766868699392, 0.2574252006957486, 0.171756592002979, 0.2279299033455986]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
L

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9991150990333497, 0.9991979884796054, 0.9991126205247365, 0.9991091209403119, 0.9991744845145376, 0.999082879853456, 0.9992012250371866, 0.9990779004418953, 0.9991279155523611, 0.9990933770468949]
R2 Test List:  [0.36241581282510993, 0.3604771923579684, 0.35410248134991484, 0.3794995271167694, 0.3064733595326369, 0.37354027689003677, 0.33706144676952454, 0.35078867294157057, 0.3195902089434265, 0.3575068432159333]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/reshaped_average_hyena.npy
File Name for validation:  reshaped_average_hyena
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Len

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.2538990693864952, 0.24930907819567238, 0.2510177198096494, 0.24828032406787703, 0.2511096289293282, 0.24977941731587228, 0.255504813338448, 0.2508369621857872, 0.2504529334752982, 0.24909401650305674]
R2 Test List:  [0.19673582435820025, 0.23706021177034964, 0.22309578050735324, 0.24643918846101787, 0.222791966206228, 0.23586144367417428, 0.18070223338948033, 0.22901353318369932, 0.23000763326536589, 0.24040927453863148]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9984296902905355, 0.998612536078822, 0.9985345510774859, 0.9987324726234826, 0.9985477464572406, 0.998686894227337, 0.9986572572709917, 0.9987391953521934, 0.9987356333907029, 0.9985259377016653]
R2 Test List:  [0.3424507299458929, 0.41452751185614845, 0.3525313915927195, 0.3716747712836831, 0.33346702634013725, 0.38716819265204006, 0.3200419789106499, 0.37697271977865976, 0.3547927299564916, 0.3955752391437688]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_2.npy
File Name for validation:  pca_components_2
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test 

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.23230293374371114, 0.2304435364406373, 0.2314314604442017, 0.22853831135940617, 0.2307277545788311, 0.2289177185524699, 0.23469614047231357, 0.23210398273103627, 0.23078780163190116, 0.22828752883820524]
R2 Test List:  [0.21701834123619212, 0.2333697337642987, 0.22397243635151098, 0.2502768390356912, 0.23137531448162385, 0.24744558061096344, 0.19595353192410603, 0.21993645358320268, 0.23054858193571237, 0.25259936477475686]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  10

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.6618370916105443, 0.6675904773326667, 0.65947443183345, 0.6650232480875837, 0.6590089692976717, 0.6650777509821433, 0.6651304752139868, 0.6598976365056882, 0.6651354650515301, 0.6678949779460139]
R2 Test List:  [0.12550158599748307, 0.12236105439502254, 0.13913624359758858, 0.1948762543670467, 0.19177261742653295, 0.10346683414733504, 0.11469606160788104, 0.17725488144181623, 0.14359549417120832, 0.17455657760791454]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_5.npy
File Name for validation:  pca_components_5
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.2389871068732996, 0.2366358018987481, 0.2386395308179986, 0.23465982147317133, 0.23706078694609134, 0.23527110343031032, 0.24061414096241274, 0.23886950804870566, 0.23818725564678989, 0.23403476489691266]
R2 Test List:  [0.2212234388695976, 0.24218178796519862, 0.2229885318426923, 0.2594350826798989, 0.23877555788052407, 0.2547291855170495, 0.20702344475107815, 0.22378633962938954, 0.22820835357179714, 0.26487846972096907]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  104

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9218307232609229, 0.9208709624063615, 0.9150544262944371, 0.9117808439762308, 0.921233416090849, 0.9195913315319109, 0.9277860254286966, 0.9227218717423209, 0.9264075268881845, 0.9191628829045334]
R2 Test List:  [0.24014749257254087, 0.2556275729822639, 0.22186131870659642, 0.3052319642345649, 0.30555770686757733, 0.24788282605599166, 0.26466273083947245, 0.2606869179958331, 0.2555879268968567, 0.27201769020943556]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_10.npy
File Name for validation:  pca_components_10
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.24673212568992886, 0.2439231324514528, 0.24724315299990285, 0.2427934781367358, 0.24518701308885626, 0.24271036099358134, 0.24874185463199017, 0.24719382218554364, 0.24694897599355137, 0.24245168833468134]
R2 Test List:  [0.2321865585352103, 0.2571300352046564, 0.22579753467520292, 0.2668753451436018, 0.24594737674882639, 0.2681203702292173, 0.21436645485731276, 0.22962625055757857, 0.22969139337790057, 0.2696888210713526]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  104

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9759130047694806, 0.9753974014830448, 0.9767543664908658, 0.9769758755715594, 0.9773274455491416, 0.9791134856516248, 0.9794945710895469, 0.9751535343347499, 0.977163759038896, 0.977821510317628]
R2 Test List:  [0.2642967051696071, 0.304687714459859, 0.24824262168259537, 0.3055175183483745, 0.3311049368517375, 0.30351944386350727, 0.26869167993010457, 0.3500784317549207, 0.2928037926603503, 0.339638470984849]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_150.npy
File Name for validation:  pca_components_150
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.3470625857412106, 0.3412762818003596, 0.3469940154468978, 0.34287498745514056, 0.34544928346351766, 0.3428614405288297, 0.3463881151384668, 0.34658872154120735, 0.347016675592554, 0.3427300177471785]
R2 Test List:  [0.3039716812474996, 0.3555297615637224, 0.30336069927942777, 0.34235279654887574, 0.3188172882572994, 0.3428546579118076, 0.3103632525169584, 0.31144562214164007, 0.304336584734492, 0.342882501415137]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length o

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9992313994909131, 0.9992595975677435, 0.9992399865037631, 0.9992351979792281, 0.9992883502342158, 0.9992661810613617, 0.999236602918903, 0.999252249546181, 0.9992561302515376, 0.9992661022500519]
R2 Test List:  [0.32866179724675926, 0.4422147501491006, 0.3456482149544259, 0.3917793619778711, 0.37417104954917535, 0.39646332322405076, 0.34794104599536013, 0.3752430219526862, 0.3434064048119372, 0.3743033646264995]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_2.npy
File Name for validation:  pacmap_components_2
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.12365291789540833, 0.1247619980653254, 0.12077693328262273, 0.11871933164621296, 0.1212741692407423, 0.12067017591592177, 0.126711656437959, 0.12091965127150772, 0.11925319832036263, 0.11825048513948755]
R2 Test List:  [0.1013486802512854, 0.08960256141664957, 0.12672371744811795, 0.14463408353504936, 0.1231243392105067, 0.1281372181573568, 0.07409880276421188, 0.12565180596871617, 0.14059201934679078, 0.14931236834525174]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  104

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.6552480120734705, 0.6484443166979439, 0.66275678490255, 0.657091847416737, 0.6632193274139984, 0.6535250503582415, 0.66025954734767, 0.6488444569894756, 0.6462913415903568, 0.661105221946334]
R2 Test List:  [0.268241303896007, 0.3184962740489582, 0.24863558520573725, 0.3301045602145374, 0.2619375805936207, 0.3064230749725557, 0.20309728747423206, 0.3131572617102494, 0.276411733233592, 0.30675718696188514]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_5.npy
File Name for validation:  pacmap_components_5
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test f

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.12957258350011325, 0.12855203782739777, 0.12646684060199231, 0.12548949251155395, 0.12605889543193893, 0.12556650164670202, 0.1318876258287165, 0.12602925282488575, 0.12528886708896692, 0.1252798282743227]
R2 Test List:  [0.10289288352800341, 0.11096139400374061, 0.13043298110920432, 0.1391703042223177, 0.13546638132573852, 0.13913017296002128, 0.08217275290811799, 0.13441691109298648, 0.14092812114396047, 0.14135926195309145]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold: 

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.8133943149475403, 0.8108018422137406, 0.808231541838297, 0.8086709926820776, 0.8164131743527061, 0.8205340481283688, 0.8165144124504823, 0.8141519556043257, 0.8109190793634087, 0.8043207700108977]
R2 Test List:  [0.2994141081979833, 0.3766068644406012, 0.25894337578294113, 0.33391971023108435, 0.2611107220537886, 0.31180078018953217, 0.30603750658560436, 0.29438196705517505, 0.2983224726293603, 0.2984604995754384]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_10.npy
File Name for validation:  pacmap_components_10
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length 

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.13278661367978906, 0.13375225521242906, 0.13106558297177817, 0.12863042117495938, 0.13122327243588128, 0.12943462423087748, 0.13699367355617043, 0.1295721314224133, 0.12986118648542233, 0.12958415924103717]
R2 Test List:  [0.11664269216254464, 0.1057865372552167, 0.1310845412876328, 0.15309668372981777, 0.13061856568304397, 0.1460731500658775, 0.07878851322373803, 0.14407242764577488, 0.14200876468766455, 0.14450994627859648]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.8667593037187542, 0.8679912440171129, 0.86717460696773, 0.8684971405409477, 0.8749508151868692, 0.8762352384178796, 0.8717351759982044, 0.8702420845860759, 0.8700902156491591, 0.8746403840283306]
R2 Test List:  [0.30199760376611373, 0.3459316281115107, 0.31378989457321393, 0.3081077059385158, 0.33761153604577, 0.3020516929676864, 0.2744507947437108, 0.3329704649436571, 0.2743023438782093, 0.33288416402297993]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_100.npy
File Name for validation:  pacmap_components_100
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of 

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.2611099582392029, 0.2600564354431987, 0.2588000416370101, 0.25069667049102173, 0.25626673961076996, 0.2529499228127283, 0.2601163789429346, 0.2597116293730408, 0.25851986071923505, 0.2554724623441109]
R2 Test List:  [0.2128855660396729, 0.21764975121509889, 0.23220705689751553, 0.3063216008931384, 0.25599957143224383, 0.28682432109095746, 0.22335800883213974, 0.22819998634326433, 0.23611592494935263, 0.2633926670128329]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
L

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9342053948859831, 0.9395959253064302, 0.9406731413580321, 0.9385329216938976, 0.9429823501485871, 0.9372834833249971, 0.9396859399505315, 0.9396253015122182, 0.9366460138596651, 0.9392114626932954]
R2 Test List:  [0.3219454718538898, 0.3598411767573525, 0.31264002187451834, 0.3539604176994193, 0.31157315558852794, 0.2826760512430838, 0.2460594977471754, 0.3367030947776394, 0.3146247913647676, 0.3385194995958315]
r2_train_list len: 10
r2_test_list len: 10


**Now lets reproduce this for the fine tuned embeddings**   

In [17]:
# Call the function on shalem_embeds to perform dimensionality reduction
fine_tuned_embeddings = torch.load(FINE_TUNED_SEQS_EMBEDDINGS_PATH)
X_reshaped = process_embeddings_if_needed(fine_tuned_embeddings)
process_and_save_embeddings(dataset='fine_tuned')
perform_dimensionality_reduction(X_reshaped, base_dir='./fine_tuned_hyena_embeds', methods=METHODS, components=COMPONENTS)

hyena_embeds_list = glob_files('fine_tuned')
print('Hyena embeds list: ', hyena_embeds_list)

# Running the fine-tuned hyena-dna embeddings with no extra features
run_cross_validation_for_files(hyena_embeds_list, model_list, model_dict, y_np, n_folds, dataset='fine_tuned')

X shape: (10447, 19456)
Average Array Shape: (10447, 152)
Max-Min Array Shape: (10447, 152, 2)
min_max_array_reshaped shape: (10447, 304)
Embeddings saved to ./fine_tuned_hyena_embeds/
Method and number of components: pca 2
Shape of X array after reduction: (10447, 2)
Reduced data saved to ./fine_tuned_hyena_embeds/pca_components_2.npy
Method and number of components: pca 5
Shape of X array after reduction: (10447, 5)
Reduced data saved to ./fine_tuned_hyena_embeds/pca_components_5.npy
Method and number of components: pca 10
Shape of X array after reduction: (10447, 10)
Reduced data saved to ./fine_tuned_hyena_embeds/pca_components_10.npy
Method and number of components: pca 150
Shape of X array after reduction: (10447, 150)
Reduced data saved to ./fine_tuned_hyena_embeds/pca_components_150.npy
Method and number of components: pacmap 2
Shape of X array after reduction: (10447, 2)
Reduced data saved to ./fine_tuned_hyena_embeds/pacmap_components_2.npy
Method and number of components: pa

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.32405546092863213, 0.3185367011256127, 0.3207301645262203, 0.3198242526307049, 0.32148004533986285, 0.32229214947264717, 0.32106761733777456, 0.32257108431917847, 0.3176317413189744, 0.31953261524887877]
R2 Test List:  [0.2416856246544261, 0.28504404551836793, 0.2607944078925586, 0.2802935926263863, 0.24471120280978598, 0.2586954504451454, 0.2679304381240565, 0.2648628096800155, 0.2919116772124076, 0.2861319888160796]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Len

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9991306502128569, 0.9990416634813737, 0.9990961333242288, 0.9991589570058809, 0.9990752429364089, 0.9990845316652985, 0.9990969255984614, 0.9990872962359518, 0.9990897313621461, 0.9991910631645029]
R2 Test List:  [0.43491435096381714, 0.48030871990692126, 0.4505528642552469, 0.448995795331101, 0.4478605019796035, 0.4510102474208558, 0.4199971487768426, 0.45057072618461913, 0.4301018155973365, 0.4814293426103864]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/reshaped_average_hyena.npy
File Name for validation:  reshaped_average_hyena
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold: 

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.27669458702338945, 0.27800695089299765, 0.2800662760370821, 0.27927450320012115, 0.27852459099556925, 0.2781566108345108, 0.2828792700907433, 0.28013509268358516, 0.280114950308502, 0.27574541017889176]
R2 Test List:  [0.2470645927332603, 0.2609680507148898, 0.24321945946923662, 0.2518021706916611, 0.2558020147021779, 0.26262996761207913, 0.2196034965644409, 0.24648252566029516, 0.2454997679806934, 0.2847640183209227]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Len

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9986524860886221, 0.9988666467217104, 0.998703100102153, 0.9986560821385364, 0.9987292485730588, 0.9985959242973992, 0.9986348062381578, 0.9985433406281784, 0.9986948744547132, 0.9986930677493527]
R2 Test List:  [0.46090080814499346, 0.4395404149009031, 0.40942514447984546, 0.4489314243049839, 0.4185897286275545, 0.4522826046878622, 0.40137327491717434, 0.441317048437906, 0.43535355584926905, 0.4501977398346403]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_2.npy
File Name for validation:  pca_components_2
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5151395542485173, 0.5079163890461014, 0.5127199479271374, 0.5095891002102813, 0.5137783049909763, 0.5076752832659241, 0.5128745120819913, 0.5129213936442836, 0.5123607372704799, 0.5083238013377898]
R2 Test List:  [0.4763139463686432, 0.5426040455457914, 0.497638396448266, 0.5262443133552168, 0.48704597376258085, 0.5438844421417938, 0.4974580880247568, 0.4979863545387899, 0.5017649511621861, 0.5376041742430953]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.7547121012715752, 0.7496826488702075, 0.7539240880206886, 0.7560278088183352, 0.7546064165115954, 0.7541932992012971, 0.7638023749451148, 0.7553726128064092, 0.7511672553101221, 0.7516891995910321]
R2 Test List:  [0.3912529956132489, 0.460997264148566, 0.4033965412454761, 0.43109791515050533, 0.40859561879951134, 0.45536489316850404, 0.42393123654921017, 0.4738109343802951, 0.4366215288086033, 0.4590185133570861]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_5.npy
File Name for validation:  pca_components_5
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Lengt

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5225744814814484, 0.5151556591140178, 0.5204979144781774, 0.5164139818408737, 0.5215315509378184, 0.5154378606838153, 0.5198649482561508, 0.5196959306977841, 0.5202507194724415, 0.5157055094933107]
R2 Test List:  [0.48292618488955663, 0.5510132724076817, 0.500940583391687, 0.538091329726125, 0.49039119754820215, 0.5475701891169573, 0.5080148107843752, 0.509942480726653, 0.5042692541149115, 0.5447554620178046]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9530860301432229, 0.9525080138272463, 0.9557744299058658, 0.9536203427546461, 0.9534080592399466, 0.9503791496878432, 0.9532271724723266, 0.9524691326050003, 0.949261723164299, 0.9533300306863971]
R2 Test List:  [0.4404464687271774, 0.5236546001949844, 0.459264942433098, 0.4504043646870113, 0.47003620477955876, 0.5230232927507275, 0.48360407042108056, 0.5093215736552033, 0.4689075699518891, 0.5176588047928115]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_10.npy
File Name for validation:  pca_components_10
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5477337730180412, 0.5423413139301454, 0.5478283794505634, 0.543687272792716, 0.5475081519713833, 0.5429156975428615, 0.5462768623541387, 0.5457320001190302, 0.5476607783393626, 0.543150670659097]
R2 Test List:  [0.5238822921729835, 0.5735740180525848, 0.5218738931358203, 0.5599959564299835, 0.5244691214883037, 0.5676242354002912, 0.5376248812382332, 0.5422814417235415, 0.524913329377156, 0.5649492081220201]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-tr

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9860859657319259, 0.9868658786097596, 0.986207352355994, 0.9850014204837466, 0.9872832273703054, 0.9844037805643665, 0.9857643822441919, 0.9864450452661641, 0.984948991754664, 0.9858308715367844]
R2 Test List:  [0.47059747080239467, 0.5452150756062537, 0.4843068711130134, 0.5099966305708141, 0.5023253815265798, 0.5207671599052809, 0.5187141953532943, 0.5349986205561895, 0.5057355568560697, 0.5656569311441921]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_150.npy
File Name for validation:  pca_components_150
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Lengt

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5913112683797601, 0.5860730590825838, 0.5923335157871241, 0.5905110240384959, 0.5918229906616729, 0.5883216046158234, 0.5915282084926381, 0.5904626675767116, 0.5913669658451672, 0.5879688415453127]
R2 Test List:  [0.566385810116094, 0.6148842811439172, 0.5536227647411853, 0.5734190160858006, 0.5587469504110809, 0.5945348930872393, 0.5622912715860621, 0.5744728308380584, 0.5657170811717772, 0.595382650799888]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-t

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9993342161145062, 0.9993018959720924, 0.9993521142490117, 0.9993537619816135, 0.9993599392511535, 0.9993407969012642, 0.999356579658128, 0.9993228233229182, 0.999363402400263, 0.9993363366342971]
R2 Test List:  [0.5049732869933178, 0.5914923831895508, 0.5182385312959747, 0.5189825873152387, 0.5330753416521039, 0.5631694829049925, 0.5358503178283722, 0.5579964666148559, 0.5204017001349373, 0.5591117363707978]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_2.npy
File Name for validation:  pacmap_components_2
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Leng

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.1746053888740119, 0.17254740057236873, 0.1725458285169651, 0.16509343277001243, 0.1736267437895026, 0.1685881816661805, 0.17414047375991326, 0.17114218867625486, 0.16826151564367398, 0.1698420382837147]
R2 Test List:  [0.138041912258431, 0.1560994805723095, 0.1556606407677208, 0.22243646383103588, 0.14519097593697117, 0.19248955042106686, 0.14316152297082085, 0.16959068406014932, 0.19502708989920536, 0.18121940524374147]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.7260379321320998, 0.7214091550628667, 0.7196610609095325, 0.7231974939265096, 0.7363044358533235, 0.7259355770452947, 0.7267119845122791, 0.7160242900272803, 0.7299604289706294, 0.7184674448127502]
R2 Test List:  [0.3771501231567568, 0.49445040373922455, 0.41065163461459864, 0.4439579067477145, 0.40628188035633683, 0.4390029230971412, 0.4053570154543006, 0.4596795915988706, 0.4006071744582719, 0.47354525368798395]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_5.npy
File Name for validation:  pacmap_components_5
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  940

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.2444182667622381, 0.2396896533795977, 0.23915734468188754, 0.23555741436826205, 0.2419916623163384, 0.23459990087052074, 0.24295433281084544, 0.23988895475900196, 0.23979061854011574, 0.2383869518730729]
R2 Test List:  [0.19550579275450686, 0.2376227692952153, 0.24256471483524122, 0.27473940799711405, 0.2159296445687402, 0.28429482835533126, 0.209662018317066, 0.2368777255349147, 0.2374461390579613, 0.2501237473680088]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Le

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.8704542015728356, 0.8716603189731819, 0.870112603795199, 0.873836291380586, 0.8749710613029653, 0.8746625065476941, 0.8750169809193962, 0.8704994912965044, 0.8705566840730152, 0.8680107205173053]
R2 Test List:  [0.4308215618862552, 0.5262503773552967, 0.43523680741553894, 0.4783784079759501, 0.41954777267546195, 0.46463640609645196, 0.47860836220512015, 0.5024215607991742, 0.4253330164230481, 0.4830348651265137]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_10.npy
File Name for validation:  pacmap_components_10
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  940

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.2647264487087738, 0.26368276834870374, 0.26190851708946006, 0.25841582257300444, 0.263821949598514, 0.2579705005309585, 0.26465094082512275, 0.2610707417833993, 0.2618369584217396, 0.25873266382537485]
R2 Test List:  [0.2327256500138929, 0.24039251129103079, 0.25727464921835574, 0.28869042623934726, 0.2394131483223093, 0.2934913422943539, 0.23384976724567053, 0.2653345705511524, 0.2582954684806311, 0.28644230034705187]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Le

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.900877919171434, 0.89979020690936, 0.9058767890658267, 0.9057330827026329, 0.9000669197302358, 0.9060707687121675, 0.8998639679050457, 0.9034253978993855, 0.9038530604776271, 0.9016366397473244]
R2 Test List:  [0.42487134314713837, 0.4988115979093133, 0.4363311875218857, 0.4720564938634062, 0.4152451975110072, 0.44588945809205793, 0.47498214074045975, 0.481829214319471, 0.44451863529257296, 0.48642276177575694]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_100.npy
File Name for validation:  pacmap_components_100
Running model:  lr
Model name and hyperparams:  lr {'fit_intercept': True}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  94

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.41734629360971953, 0.41219827439359946, 0.4165670012264723, 0.4101874281123211, 0.41853003151204693, 0.4110843841330609, 0.41421141231220604, 0.41297822944456186, 0.41672624376809053, 0.40887314698759003]
R2 Test List:  [0.37333730066338344, 0.41789305014577227, 0.37818596960188067, 0.4380247178568458, 0.35775612065404916, 0.4295974859157774, 0.40293732385239633, 0.41369963676045085, 0.378762307362735, 0.44893342356909605]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  104

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9574998861144428, 0.9570122869988829, 0.9573990715583519, 0.955827498137855, 0.9590518305580027, 0.9564861585032663, 0.9570542361201553, 0.9559709675414783, 0.9576020389075833, 0.9572782607690481]
R2 Test List:  [0.4493670223405215, 0.5059905053979984, 0.44399920386206815, 0.45567427779818803, 0.44580636615743585, 0.47064638015968063, 0.4906937636783618, 0.49667774049189606, 0.4190553830596295, 0.4730306606677738]
r2_train_list len: 10
r2_test_list len: 10


***Let's run the regular hyena-dna embeds with the extra features as well as hyena-dna embeds from my fine-tuned weights***

In [18]:
# Running the regular hyena-dna embeddings (no-fine-tuning, but with extra features)
hyena_embeds_list = glob_files('UTR_regions')
print('Hyena embeds list: ', hyena_embeds_list)

run_cross_validation_for_files(hyena_embeds_list, model_list, model_dict, y_np, n_folds, dataset='UTR_regions')

Hyena embeds list:  ['./hyena_embeds/reshaped_min_max_hyena.npy', './hyena_embeds/reshaped_average_hyena.npy', './hyena_embeds/pca_components_2.npy', './hyena_embeds/pca_components_5.npy', './hyena_embeds/pca_components_10.npy', './hyena_embeds/pca_components_150.npy', './hyena_embeds/pacmap_components_2.npy', './hyena_embeds/pacmap_components_5.npy', './hyena_embeds/pacmap_components_10.npy', './hyena_embeds/pacmap_components_100.npy']
Hyena embeds list:  ['./hyena_embeds/reshaped_min_max_hyena.npy', './hyena_embeds/reshaped_average_hyena.npy', './hyena_embeds/pca_components_2.npy', './hyena_embeds/pca_components_5.npy', './hyena_embeds/pca_components_10.npy', './hyena_embeds/pca_components_150.npy', './hyena_embeds/pacmap_components_2.npy', './hyena_embeds/pacmap_components_5.npy', './hyena_embeds/pacmap_components_10.npy', './hyena_embeds/pacmap_components_100.npy']
File Name:  ./hyena_embeds/reshaped_min_max_hyena.npy
Head of ds:                                              sequenc

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.29677301174193815, 0.2939086054200274, 0.2919545195289268, 0.2867600024396463, 0.29354869838304143, 0.2933344248221257, 0.29372500538188007, 0.28964259405851756, 0.2948588818287483, 0.29217201155979944]
R2 Test List:  [0.201512478244271, 0.22404360688033143, 0.24538974058582608, 0.296448457990234, 0.22886258819900818, 0.23388498329911245, 0.230266800112921, 0.26894704161591765, 0.22144720857941336, 0.24311315027776947]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Le

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9991089528316504, 0.9992015867617514, 0.9991191087658404, 0.9990742151012281, 0.9991794544626308, 0.9991210155565495, 0.9990960095522817, 0.9991652871660265, 0.9991432584188391, 0.9990922707642256]
R2 Test List:  [0.3596582417160654, 0.3594176737730834, 0.36002405739537424, 0.3828910091557517, 0.3464569627148305, 0.37851210786154155, 0.3392698633623109, 0.3614830806038387, 0.33604119136551436, 0.34483234454729905]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/reshaped_average_hyena.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:            

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.26301366472200693, 0.2574261294047787, 0.25947882926352184, 0.25749031598422956, 0.2596282695933503, 0.25916523466724706, 0.26494709092236135, 0.2590045867230554, 0.2592376591305904, 0.25717297588449683]
R2 Test List:  [0.20115895714358834, 0.25137741830803473, 0.2341197079909605, 0.2505514396097416, 0.2334592362576352, 0.23768449383792523, 0.18247339651550742, 0.2420841060027542, 0.237939413380565, 0.2546628069003637]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Le

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9985109304337392, 0.9985960061776635, 0.9987194111837215, 0.9986704952938623, 0.9986021861777777, 0.9987532591512263, 0.998591165853684, 0.998556685077244, 0.9985814993645673, 0.998689559774927]
R2 Test List:  [0.3505792337589142, 0.4206848134809701, 0.326939728527593, 0.4013252537382527, 0.3517364466857602, 0.35902676414864365, 0.3466191549296125, 0.39246394736531665, 0.3401295865424133, 0.3796965599169838]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_2.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:                        

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.23987691119619547, 0.23711197637068093, 0.23912731968797352, 0.23594368553297684, 0.23730391779554727, 0.23587416946181594, 0.24208907607905572, 0.23906187775815335, 0.23817683510698096, 0.2345638080299398]
R2 Test List:  [0.2192366230944106, 0.2439311386269225, 0.2248057282221051, 0.2541683661034635, 0.24289297202780835, 0.25530934068207634, 0.19984293909050532, 0.22780646677736538, 0.23441056071735777, 0.26638543814903304]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.7379144034344551, 0.7376793673289443, 0.7370329842267064, 0.7356210036402497, 0.736410319827787, 0.7321730402574936, 0.7294029532899784, 0.7303530171604733, 0.7289347031908735, 0.7349114112159355]
R2 Test List:  [0.13616891884860527, 0.1514158822626539, 0.13972472101302225, 0.23520391949832176, 0.2252354199920379, 0.14280175101229853, 0.17081659827671614, 0.20535938522818376, 0.17388406840925674, 0.18605883389182887]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_5.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:               

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.24409557640879276, 0.24088894402599936, 0.2433522171081156, 0.2395590551273672, 0.2415834026012844, 0.2398895109405087, 0.24573975968944017, 0.24327362368858774, 0.24276386839351305, 0.23825993322797312]
R2 Test List:  [0.22117964053069128, 0.24992873390281278, 0.22666571124095125, 0.2614358851195493, 0.2441610872388914, 0.25920289689536213, 0.20662694828069872, 0.23008901293454886, 0.23297368130571594, 0.272860544641796]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9218562406942857, 0.9242063136140076, 0.9218211897753072, 0.9211631831707121, 0.9268312383835506, 0.9284819314448434, 0.9297587404285714, 0.9211524678832931, 0.930749956862712, 0.9300822158620476]
R2 Test List:  [0.2381017989294959, 0.2944726211908504, 0.22771686918263478, 0.3142712322110379, 0.30461218509249033, 0.2741506967523851, 0.2629718798786502, 0.28016884435245015, 0.2569157876141027, 0.2799877845256211]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_10.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:                   

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.251061725245892, 0.2478167072671965, 0.2511677178447813, 0.24717365496162924, 0.24906557496435178, 0.24691435248996885, 0.253181177498195, 0.2508563108301046, 0.2509887257018135, 0.24620406646631277]
R2 Test List:  [0.23317300190501022, 0.26229998284502143, 0.2307196715626032, 0.2675989140578722, 0.2513153416033864, 0.27042214137853615, 0.21412615903217513, 0.2365361441757271, 0.23353832512833572, 0.27606069014704904]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Len

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.977112921629398, 0.9761637088798207, 0.9767693644077327, 0.9779337041341984, 0.9777202771720868, 0.9781324876769906, 0.9806142770507924, 0.9765761842900597, 0.9766591648397641, 0.9790044125935391]
R2 Test List:  [0.2945056591711358, 0.30768391965772013, 0.25000830305740374, 0.32816621128597123, 0.33898957623907366, 0.2869804994708265, 0.25434949195867085, 0.3512795928793283, 0.31370286216449417, 0.36428915856975896]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pca_components_150.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:              

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.350111129148615, 0.34394925965623124, 0.3497360825126591, 0.34600771078410775, 0.3484877521127142, 0.34538167399064823, 0.35013637090392724, 0.3489627688377669, 0.35018995979670564, 0.34553567462027734]
R2 Test List:  [0.30478711904242517, 0.3605964665642054, 0.3077578896572327, 0.34305273328078056, 0.32023407086115985, 0.34918825327191805, 0.30505191659642383, 0.3183874792703525, 0.30416826838076727, 0.3466701122211766]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9992443365245982, 0.9992216190242527, 0.999234341803636, 0.9992601526026503, 0.9992807763285955, 0.9992622965213029, 0.9992351062338081, 0.9992173480640495, 0.9992020641385203, 0.9992988958121787]
R2 Test List:  [0.3467713674572015, 0.39887530331494336, 0.3706469216469186, 0.41043236698285535, 0.39733809792457964, 0.39867188397945674, 0.3694692043421651, 0.3688954833354742, 0.3715574848568318, 0.3773027289308043]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_2.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:                

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.13467486682219176, 0.13524568096964917, 0.13176153222858944, 0.13021574437299077, 0.13095261543133097, 0.13133439642486355, 0.136909033694528, 0.13158360800805968, 0.12998658016641085, 0.12750304429000114]
R2 Test List:  [0.10661164309019233, 0.09958667663947895, 0.1320400349032561, 0.14588725117779555, 0.14103210380818398, 0.13636490636643073, 0.08645024872253282, 0.1345717498746919, 0.14860482278668774, 0.1700850788534597]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.6939646599115499, 0.6838838622484409, 0.6953391973412318, 0.6867238297232181, 0.6954842206898351, 0.689704580801624, 0.6937141846132309, 0.6894755759661897, 0.679936561102005, 0.6904189152754686]
R2 Test List:  [0.2896530075425289, 0.36254103091188017, 0.2618438842670674, 0.34000333524673365, 0.2862230124951919, 0.310924157279011, 0.22645692772865622, 0.3140943979085338, 0.30204716289871947, 0.3310839647390931]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_5.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:                  

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.1366513507292022, 0.13522341249967684, 0.13354545914345084, 0.133098290995326, 0.1325215972881345, 0.1324434830743232, 0.1385275414096001, 0.13273399980401468, 0.13220315497374024, 0.13127561224788398]
R2 Test List:  [0.1064314586320596, 0.1182939406864143, 0.13383600094058545, 0.13820680631884152, 0.1449658447295722, 0.14444102020437055, 0.08941240444292842, 0.14162728469718855, 0.14623075979456213, 0.15459086474207984]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.8279009315674645, 0.8240441800250049, 0.8269021319585542, 0.821446345453827, 0.8245944538009986, 0.8289264970436018, 0.8271062794339029, 0.8237554825668754, 0.8236628915962099, 0.8235187194329194]
R2 Test List:  [0.29503052910588645, 0.3796296321599558, 0.2720830114641717, 0.32710887116190734, 0.2962741273593418, 0.3271565923737293, 0.29490532848578666, 0.28404233307212556, 0.29379004638191053, 0.30452990163238625]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_10.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:             

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.14607329745813968, 0.14554001910779135, 0.14420962550542515, 0.1417499482265575, 0.14345408770178225, 0.1418637861229053, 0.14955267207368328, 0.14165754687824184, 0.14214202836823453, 0.14086545249353255]
R2 Test List:  [0.12025020456475777, 0.12315463843708108, 0.13586341384481015, 0.15861923914608456, 0.1440221109819212, 0.15767307949595621, 0.08890141561120968, 0.1587344472050054, 0.1550372976663169, 0.16629852674475476]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.8810618795875398, 0.8805600775756375, 0.8755833737582576, 0.8782475351465007, 0.8812306785392969, 0.8812245575875696, 0.8763021994084551, 0.8758539127120022, 0.8791797240183972, 0.8780197809744649]
R2 Test List:  [0.2923550514102984, 0.35442661615791504, 0.32698593233924966, 0.33033155227290556, 0.3353431781284182, 0.31650514311404676, 0.2768417404547948, 0.3322459581502478, 0.2898146950278424, 0.33981338109083303]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./hyena_embeds/pacmap_components_100.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:            

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.26881394500447675, 0.266492447591413, 0.2656754679466836, 0.2586634250288785, 0.26324534224735274, 0.2601967089802154, 0.2677108135649032, 0.2659877775342364, 0.26566495611317364, 0.2619581287236804]
R2 Test List:  [0.21350869969436337, 0.22994001028250066, 0.24065233947862497, 0.3046854179254801, 0.2635032444369656, 0.29191387700371985, 0.2243484650773655, 0.24122036932533042, 0.2420263223169249, 0.27498915334316276]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Len

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.939583264442027, 0.9437929709597863, 0.9402397073828691, 0.938798523983521, 0.9425982860771781, 0.9416635409452613, 0.9424504750800438, 0.9405853639248088, 0.9437570689395727, 0.9424703423832504]
R2 Test List:  [0.32321790818878493, 0.3196125323260607, 0.3014023137056444, 0.35771081022729845, 0.2921137876013551, 0.26660744077133014, 0.241548118807552, 0.3321809627998602, 0.3011896267265525, 0.3500753457869836]
r2_train_list len: 10
r2_test_list len: 10


In [19]:
# Running the fine-tuned hyena-dna embeddings with extra features
hyena_embeds_list = glob_files('fine_tuned_pe')
print('Hyena embeds list: ', hyena_embeds_list)

run_cross_validation_for_files(hyena_embeds_list, model_list, model_dict, y_np, n_folds, dataset='fine_tuned_pe')

Hyena embeds list:  ['./fine_tuned_hyena_embeds/reshaped_min_max_hyena.npy', './fine_tuned_hyena_embeds/reshaped_average_hyena.npy', './fine_tuned_hyena_embeds/pca_components_2.npy', './fine_tuned_hyena_embeds/pca_components_5.npy', './fine_tuned_hyena_embeds/pca_components_10.npy', './fine_tuned_hyena_embeds/pca_components_150.npy', './fine_tuned_hyena_embeds/pacmap_components_2.npy', './fine_tuned_hyena_embeds/pacmap_components_5.npy', './fine_tuned_hyena_embeds/pacmap_components_10.npy', './fine_tuned_hyena_embeds/pacmap_components_100.npy']
Hyena embeds list:  ['./fine_tuned_hyena_embeds/reshaped_min_max_hyena.npy', './fine_tuned_hyena_embeds/reshaped_average_hyena.npy', './fine_tuned_hyena_embeds/pca_components_2.npy', './fine_tuned_hyena_embeds/pca_components_5.npy', './fine_tuned_hyena_embeds/pca_components_10.npy', './fine_tuned_hyena_embeds/pca_components_150.npy', './fine_tuned_hyena_embeds/pacmap_components_2.npy', './fine_tuned_hyena_embeds/pacmap_components_5.npy', './fine

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.3390844172884173, 0.3334079421993311, 0.3370914111573564, 0.33657153357506087, 0.3370466628667056, 0.3377439539574417, 0.33729322646324367, 0.3373025443667973, 0.33416928492240683, 0.33441375940519413]
R2 Test List:  [0.2614555223444708, 0.31067480572568096, 0.27654563352950734, 0.2878928708681485, 0.2737730972087362, 0.2690464005759847, 0.27555634141112295, 0.2835556027180397, 0.30536053764131, 0.30655739198991927]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Lengt

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9991695896328922, 0.9991027025274039, 0.9989934005648884, 0.9991504257705167, 0.9992069416997517, 0.9991391995198291, 0.9991044903190895, 0.9991379013116329, 0.9990072225174391, 0.9991663860877649]
R2 Test List:  [0.4458458281979861, 0.4786306964109286, 0.44621610103909337, 0.4478988743108069, 0.4490150275959093, 0.44701378602088115, 0.4229220953558338, 0.4361886218290809, 0.4567060931629643, 0.4898259570211213]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/reshaped_average_hyena.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:   

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.2932948198491042, 0.2913385623866146, 0.2937455986605003, 0.29288110891075914, 0.29275971500972653, 0.2922759681099254, 0.29615526262037606, 0.29371910492265874, 0.2942645539170593, 0.28924667512443536]
R2 Test List:  [0.2638198363194221, 0.2786413209165063, 0.2580454063987382, 0.26745157925216645, 0.2649159148334892, 0.27358185091207454, 0.23716923080335972, 0.2619815743674744, 0.2556059424013222, 0.30058928273677543]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Le

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9987508879284488, 0.9987017702374534, 0.9986780281110703, 0.998645368682851, 0.9987316921813558, 0.9986253091230801, 0.9987391221608838, 0.9986803972341829, 0.9986499436525182, 0.9987625688715507]
R2 Test List:  [0.45146774829944225, 0.4529421830167246, 0.42423744411478115, 0.45136853494661033, 0.42962058807172987, 0.459484009602144, 0.4083059817895355, 0.44487573226235133, 0.436855316356354, 0.4419737615509868]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_2.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:         

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5158408668189425, 0.5088897409135991, 0.5136245696167232, 0.5105743927641426, 0.5148127881111304, 0.5086327397824851, 0.513947355427802, 0.5139465898008191, 0.5134704266653991, 0.5092797737730175]
R2 Test List:  [0.47920439150649186, 0.5433964141745258, 0.4989854517743342, 0.5268727990018612, 0.48689323633563064, 0.5447183304778997, 0.4971863532994557, 0.49835687692822717, 0.5010994449044655, 0.5384385202848014]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.7963974816860668, 0.7932638110300071, 0.793223548687533, 0.7938449068501, 0.7947253069043709, 0.7944550289735552, 0.7972131418080882, 0.7924656938360758, 0.7979615992740178, 0.7934498438766917]
R2 Test List:  [0.4085343150598735, 0.47485896043706033, 0.40740404112447903, 0.4247602208465393, 0.4274999015034072, 0.44008666032137755, 0.4086059702830459, 0.48127881254993765, 0.43964750806930086, 0.45107382782055483]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_5.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:         

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5233102604847707, 0.5160416373186418, 0.5213511113114757, 0.5172294204187178, 0.5225347418984421, 0.5162758641390467, 0.5205346822494852, 0.5205886430836355, 0.5209736828677127, 0.5165825587242732]
R2 Test List:  [0.48407906455288907, 0.551094856541475, 0.5012271827584683, 0.5388148429948343, 0.48898204662302314, 0.5480441455728522, 0.5096335303920869, 0.5100310172965841, 0.5055171238386778, 0.544831668883258]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9521516675621562, 0.9548951605824156, 0.9541018264947732, 0.955931517380997, 0.9542338461568229, 0.951166619561213, 0.9545371003727888, 0.9537348560350263, 0.9507409519603348, 0.954462139153175]
R2 Test List:  [0.4605835692849565, 0.5382853036673783, 0.4728104406792317, 0.45112272587321034, 0.47414991627460057, 0.5134198914835842, 0.4853919010367165, 0.501346481088714, 0.48147975993150083, 0.5234624758533402]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_10.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:           

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.548679293374514, 0.5431134741758274, 0.5487203586514924, 0.5444210542269221, 0.5481785067863257, 0.5436741135102878, 0.5468773190166898, 0.5463849361929111, 0.5482586816153106, 0.5440208012372407]
R2 Test List:  [0.5224192875625164, 0.5738741096529392, 0.5209582832174864, 0.5606362373942451, 0.5254126604688248, 0.5680264468954255, 0.5390916753573138, 0.5434720328210498, 0.5265712380292018, 0.5643322470580789]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9860294689972446, 0.9855768146288153, 0.987555212359039, 0.9867794422883354, 0.9869494468828437, 0.9868889696384072, 0.986587612144067, 0.9878111237742054, 0.9860573335929745, 0.9867388941354482]
R2 Test List:  [0.4658046067686731, 0.5612610469131252, 0.48655408859121163, 0.5098290885619856, 0.490574872920876, 0.5198650490451293, 0.5236268620188183, 0.5226215199639803, 0.499068896129912, 0.5680515879002204]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pca_components_150.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:            

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.5916394294278524, 0.5863113279669941, 0.5927089791726914, 0.5907451531408523, 0.5922008443544104, 0.5886562016858142, 0.5917427974893508, 0.5907064405631905, 0.5916354210807705, 0.5882956628357414]
R2 Test List:  [0.5662003964213431, 0.6155498273631451, 0.5527010234157592, 0.574132875940051, 0.5580272626274823, 0.5942424678742884, 0.563047596878455, 0.5749928301965748, 0.5660077317014385, 0.5951359874617926]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-t

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9993223195771065, 0.99934045354707, 0.9993633855897285, 0.9993314883784156, 0.9993476630514179, 0.9993444669554064, 0.9993411604835004, 0.9992988916261952, 0.9993570700768808, 0.9993381842129668]
R2 Test List:  [0.49786443477927145, 0.5928306445456497, 0.5204901040428932, 0.5370744634492389, 0.5237204025238003, 0.5688136099827004, 0.543240498328083, 0.5606033762413222, 0.5383204956387343, 0.5579271836844686]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_2.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:          

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.20037699240185047, 0.19793062004340378, 0.19870154786645278, 0.19327450116140577, 0.1982341192295144, 0.1946535139094051, 0.20187270906720667, 0.19752438779871373, 0.19613684862260472, 0.1946979447164665]
R2 Test List:  [0.16886167969872057, 0.19019581100236949, 0.18285096197470052, 0.23159593711921223, 0.187387563496359, 0.22047212844077357, 0.15611135270550291, 0.19486208653293313, 0.20674526097147006, 0.21968244411179738]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.7474103169930817, 0.7454667420224894, 0.7478213010659833, 0.7467417369148059, 0.7523017976160487, 0.7424847915358586, 0.7455642547475159, 0.7418751415199989, 0.7515225189383938, 0.7429356587977252]
R2 Test List:  [0.3689085960146644, 0.5026453733597777, 0.40122497983571215, 0.4447568768988588, 0.39592102580842115, 0.4395715311765839, 0.4123957239632472, 0.4607276593675206, 0.4243021510936641, 0.46033735315444824]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_5.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:     

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.25340958554352655, 0.24805868070847636, 0.24775614412934532, 0.2451726169224332, 0.2501697307578925, 0.24369350386081412, 0.2519916112615217, 0.2485288708707677, 0.24922141983609014, 0.24630510340667477]
R2 Test List:  [0.20178102138799914, 0.2496739930357107, 0.25273757270369956, 0.27561525926595287, 0.23004106989718842, 0.2898334256466639, 0.21553934828000743, 0.2465726608000185, 0.24003531906883457, 0.2660040748436371]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.8801882923540313, 0.8771630000491242, 0.8775647425555057, 0.8813167046251654, 0.879066312216956, 0.8834664794051497, 0.8779152675582153, 0.875883873280434, 0.8782404692896941, 0.8791984090533383]
R2 Test List:  [0.4204519080033907, 0.5320827058763022, 0.4492906895657287, 0.4921236200906107, 0.4343636076933355, 0.4743520413802613, 0.4812171666018057, 0.4937879052317524, 0.42593405318978006, 0.4933856955452077]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_10.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:        

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.27302758437582186, 0.27116538419820035, 0.2696742227681902, 0.2667157142320149, 0.2710373618686235, 0.2663107283449563, 0.2732005907909697, 0.2689936023510159, 0.27024954498135245, 0.26599155210514447]
R2 Test List:  [0.23717247732647984, 0.25238085571588764, 0.2666438162681367, 0.29311421442520125, 0.25392668860943357, 0.2973378832119361, 0.23552405882933825, 0.2732846739807243, 0.26182967819393965, 0.30009520803583556]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9084475451723395, 0.9060075491194862, 0.9071629497545851, 0.9087873835201086, 0.9085509356836629, 0.9111573013571194, 0.9072127886467516, 0.903777146379934, 0.9094448139217076, 0.9059094628902195]
R2 Test List:  [0.41306970054185355, 0.5107778628985606, 0.43823086159538993, 0.4690814857675448, 0.42624174845972596, 0.4494992131847807, 0.4639350157240417, 0.4857919183210869, 0.44622281252137985, 0.49594131564264576]
r2_train_list len: 10
r2_test_list len: 10
File Name:  ./fine_tuned_hyena_embeds/pacmap_components_100.npy
Head of ds:                                              sequence       tpm
0  GGGGACCAGGTGCCGTAAGGTGCGGCTGGCCCAATGTGCGCCTATG... -1.226293
1  GGGGACCAGGTGCCGTAAGCCATGAATTAATGAATATCTTTACTTA...  0.086170
2  GGGGACCAGGTGCCGTAAGGTCACGCTTACATTCACGCCCTCCTCC... -1.842423
3  GGGGACCAGGTGCCGTAAGTTGCTATGCAGATGCTTTATACTTCTT...  0.371952
4  GGGGACCAGGTGCCGTAAGCATTGTTAGCTTTTTATGCATTATAAT... -1.729185
Head of df_opt_seqs:  

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.4185705600517443, 0.41328463149343675, 0.41765289163767194, 0.41133128017266274, 0.41984927341035216, 0.41243395337406863, 0.41541007264179364, 0.4140483957888076, 0.41765727401511044, 0.4099212083646442]
R2 Test List:  [0.37309089183641575, 0.41943270402022814, 0.37955598344503483, 0.4387900389702064, 0.3566330022894366, 0.42860981158684974, 0.40231996124906155, 0.41525105094985193, 0.38132513214373476, 0.45054166389283534]
r2_train_list len: 10
r2_test_list len: 10
Running model:  xgb
Model name and hyperparams:  xgb {'learning_rate': 0.22, 'max_depth': 10, 'reg_lambda': 1.8, 'reg_alpha': 0.89, 'n_jobs': -1}
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1045
Length of X-train fold:  9402
Length of X-test fold:  1

  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


r2_train_list len: 10
r2_test_list len: 10
R2 Train List:  [0.9576898730771394, 0.95803952505129, 0.9588792275015458, 0.9542446515563556, 0.9598287721706187, 0.9590934495881717, 0.9585577923943284, 0.9551241812835609, 0.958735957648332, 0.9591231499670899]
R2 Test List:  [0.43172398061719697, 0.48773060772810717, 0.43689451692418235, 0.4626155626646663, 0.4430972553577589, 0.4770837934402328, 0.4857915342817597, 0.49541350211008284, 0.4424033592126896, 0.4848627988092412]
r2_train_list len: 10
r2_test_list len: 10


# Now lets use Optuna to tune our hyperparameters :)

In [34]:
# Objective function for optimization
def objective(trial, X, y):
    classifier_name = trial.suggest_categorical('classifier', ['XGBoost', 'LinearRegression', 'Ridge', 'RandomForest'])
    
    if classifier_name == 'XGBoost':
        xgb_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
            'n_jobs': -1
        }
        clf = XGBRegressor(**xgb_params)

    elif classifier_name == 'LinearRegression':
        clf = LinearRegression(n_jobs=-1)

    elif classifier_name == 'Ridge':
        ridge_params = {
            'alpha': trial.suggest_float('alpha', 0.1, 10.0)
        }
        clf = Ridge(**ridge_params)

    elif classifier_name == 'RandomForest':
        rf_params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
        }
        clf = RandomForestRegressor(**rf_params, n_jobs=-1)
    
    # return cross_val_score(clf, X, y, n_jobs=-1, cv=3).mean()
    return cross_validate(clf, X, y, cv=4, n_jobs=-1, return_train_score=True)['test_score'].mean() # Use this instead, I believe the cross_val_score was returning training scores instead of test. You are supposed to tune hyperparameters based on the test!!!

def optimize_hyperparameters(file_list):
    embeddings_file_list = file_list
    results = {}

    label_df = smart_read_csv(LABEL_DATA_PATH)
    y = label_df.iloc[:, 1]
    y_np = y.to_numpy()
    print('y shape: ', y.shape)
    
    for file in embeddings_file_list:
        print(f'\nFile being processed in hyperparameter optimization:\n {file}')
        embeddings = np.load(file)
        # Push to CPU
        X = embeddings
        study = optuna.create_study(direction='maximize')
        study.optimize(lambda trial: objective(trial, X, y), n_trials=10) # Change n_trials to however many trials you have time for!

        results[file] = {
            "Value": study.best_value,
            "Params": study.best_params
        }

    return results

# Example of using the function
# file_path = "*.csv"  # Adjust to the actual path
# results = optimize_hyperparameters(file_path)
# print(results)

def save_results_to_json(results, dataset_name, file_path):
    # Creating the filename with the dataset name
    filename = f"{file_path}optuna_results_{dataset_name}.json"

    # Saving the results dictionary to a JSON file
    with open(filename, 'w') as json_file:
        json.dump(results, json_file)

    print(f"Results saved to {filename}")

def optimize_dataset(dataset_name):
    if dataset_name == 'no_augment':
        path = './hyena_embeds/'
        globbed_reduced_embeds_path = glob.glob(f'{path}*.npy')
        results = optimize_hyperparameters(globbed_reduced_embeds_path)
        save_results_to_json(results, dataset_name, path)
        
    elif dataset_name == 'UTR_regions':
        path = './hyena_embeds_pe/'
        globbed_reduced_embeds_path = glob.glob(f'{path}*.npy')
        results = optimize_hyperparameters(globbed_reduced_embeds_path)
        save_results_to_json(results, dataset_name, path)
        
    elif dataset_name == 'fine_tuned':
        path = './fine_tuned_hyena_embeds/'
        globbed_reduced_embeds_path = glob.glob(f'{path}*.npy')
        results = optimize_hyperparameters(globbed_reduced_embeds_path)
        save_results_to_json(results, dataset_name, path)
        
    elif dataset_name == 'fine_tuned_pe':
        path = './fine_tuned_hyena_embeds_pe/'
        globbed_reduced_embeds_path = glob.glob(f'{path}*.npy')
        results = optimize_hyperparameters(globbed_reduced_embeds_path)
        save_results_to_json(results, dataset_name, path)
        
    else:
        raise ValueError(f'Invalid dataset: {dataset_name}')

In [36]:
# Run optimize_dataset for each dataset
# for dataset_name in ['UTR_regions', 'no_augment', 'fine_tuned', 'fine_tuned_pe']:
#     optimize_dataset(dataset_name)

optimize_dataset('fine_tuned') # 10/26/23, THIS DOESN'T WORK FOR 'UTR_Regions' OR 'fine_tuned_pe' datasets, since the glob function isn't grabbing the right files

[I 2023-11-13 16:49:28,327] A new study created in memory with name: no-name-96f70fd3-3a97-4eec-9cc8-49db96bdaae5


y shape:  (7239,)

File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/reshaped_min_max_hyena.npy


[I 2023-11-13 16:49:30,627] Trial 0 finished with value: 0.3753377182772739 and parameters: {'classifier': 'LinearRegression'}. Best is trial 0 with value: 0.3753377182772739.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 16:49:38,918] Trial 1 finished with value: 0.5319692064159527 and parameters: {'classifier': 'XGBoost', 'n_estimators': 72, 'max_depth': 6, 'learning_rate': 0.025237409842203143}. Best is trial 1 with value: 0.5319692064159527.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 16:49:56,927] Trial 2 finished with value: 0.6695447545116925 and parameters: {'classifier': 'XGBoost', 'n_estimators': 181, 'max_depth': 6, 'learning_rate': 0.040612041055642284}. Best is trial 2 with value: 0.6695447545116925.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 16:53:08,461] Trial 3 finished with value: 0.594626909030014 


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/reshaped_average_hyena.npy


[I 2023-11-13 16:59:30,054] Trial 0 finished with value: 0.3791550317540814 and parameters: {'classifier': 'Ridge', 'alpha': 5.126251839464566}. Best is trial 0 with value: 0.3791550317540814.
[I 2023-11-13 16:59:30,945] Trial 1 finished with value: 0.598288769906769 and parameters: {'classifier': 'LinearRegression'}. Best is trial 1 with value: 0.598288769906769.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 16:59:51,641] Trial 2 finished with value: 0.7745241857083804 and parameters: {'classifier': 'XGBoost', 'n_estimators': 186, 'max_depth': 8, 'learning_rate': 0.11127686103961461}. Best is trial 2 with value: 0.7745241857083804.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:00:14,895] Trial 3 finished with value: 0.6985565573479633 and parameters: {'classifier': 'XGBoost', 'n_estimators': 131, 'max_depth': 8, 'learning_rate': 0.013085264118385283}. Best is trial 2 with


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pca_components_2.npy


[I 2023-11-13 17:02:49,277] Trial 5 finished with value: 0.7942793407119255 and parameters: {'classifier': 'XGBoost', 'n_estimators': 175, 'max_depth': 16, 'learning_rate': 0.15856492451735887}. Best is trial 1 with value: 0.8365786913038906.
[I 2023-11-13 17:02:49,292] Trial 6 finished with value: 0.8365786862421376 and parameters: {'classifier': 'Ridge', 'alpha': 6.225458342145928}. Best is trial 1 with value: 0.8365786913038906.
[I 2023-11-13 17:02:49,306] Trial 7 finished with value: 0.8365786891003146 and parameters: {'classifier': 'Ridge', 'alpha': 9.299134794454062}. Best is trial 1 with value: 0.8365786913038906.
[I 2023-11-13 17:02:49,886] Trial 8 finished with value: 0.8542852544540285 and parameters: {'classifier': 'RandomForest', 'n_estimators': 105, 'max_depth': 10}. Best is trial 8 with value: 0.8542852544540285.
[I 2023-11-13 17:02:49,901] Trial 9 finished with value: 0.8365786913038906 and parameters: {'classifier': 'LinearRegression'}. Best is trial 8 with value: 0.854


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pca_components_5.npy


[I 2023-11-13 17:02:50,393] Trial 0 finished with value: 0.8753243062722647 and parameters: {'classifier': 'RandomForest', 'n_estimators': 74, 'max_depth': 6}. Best is trial 0 with value: 0.8753243062722647.
[I 2023-11-13 17:02:50,408] Trial 1 finished with value: 0.8639795884498039 and parameters: {'classifier': 'Ridge', 'alpha': 4.736300667282341}. Best is trial 0 with value: 0.8753243062722647.
[I 2023-11-13 17:02:51,970] Trial 2 finished with value: 0.8843324711747322 and parameters: {'classifier': 'RandomForest', 'n_estimators': 195, 'max_depth': 12}. Best is trial 2 with value: 0.8843324711747322.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:02:54,980] Trial 3 finished with value: 0.8635313058830277 and parameters: {'classifier': 'XGBoost', 'n_estimators': 68, 'max_depth': 17, 'learning_rate': 0.04316480612946805}. Best is trial 2 with value: 0.8843324711747322.
[I 2023-11-13 17:02:54,994] Trial 4 finished with value: 0.


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pca_components_10.npy


[I 2023-11-13 17:03:02,494] Trial 2 finished with value: 0.890043870208671 and parameters: {'classifier': 'XGBoost', 'n_estimators': 105, 'max_depth': 10, 'learning_rate': 0.08938040995804694}. Best is trial 2 with value: 0.890043870208671.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:03:02,864] Trial 3 finished with value: 0.8956134080494078 and parameters: {'classifier': 'XGBoost', 'n_estimators': 172, 'max_depth': 5, 'learning_rate': 0.12489507605270828}. Best is trial 3 with value: 0.8956134080494078.
[I 2023-11-13 17:03:04,410] Trial 4 finished with value: 0.8902684714008093 and parameters: {'classifier': 'RandomForest', 'n_estimators': 161, 'max_depth': 8}. Best is trial 3 with value: 0.8956134080494078.
[I 2023-11-13 17:03:04,435] Trial 5 finished with value: 0.8889697493539122 and parameters: {'classifier': 'LinearRegression'}. Best is trial 3 with value: 0.8956134080494078.
[I 2023-11-13 17:03:04,460] Trial 6 finished


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pca_components_150.npy


[I 2023-11-13 17:03:09,568] Trial 0 finished with value: 0.8839092087636014 and parameters: {'classifier': 'XGBoost', 'n_estimators': 200, 'max_depth': 1, 'learning_rate': 0.0866954593743777}. Best is trial 0 with value: 0.8839092087636014.
[I 2023-11-13 17:03:36,464] Trial 1 finished with value: 0.8924860999212061 and parameters: {'classifier': 'RandomForest', 'n_estimators': 190, 'max_depth': 18}. Best is trial 1 with value: 0.8924860999212061.
[I 2023-11-13 17:03:36,619] Trial 2 finished with value: 0.9058755588962559 and parameters: {'classifier': 'LinearRegression'}. Best is trial 2 with value: 0.9058755588962559.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:03:54,829] Trial 3 finished with value: 0.8678989074812118 and parameters: {'classifier': 'XGBoost', 'n_estimators': 163, 'max_depth': 7, 'learning_rate': 0.01176196156093277}. Best is trial 2 with value: 0.9058755588962559.
  if is_sparse(data):
  if is_sparse(data):


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pacmap_components_2.npy


[I 2023-11-13 17:07:34,191] Trial 1 finished with value: 0.7649614156525945 and parameters: {'classifier': 'XGBoost', 'n_estimators': 195, 'max_depth': 6, 'learning_rate': 0.024728946103565613}. Best is trial 1 with value: 0.7649614156525945.
[I 2023-11-13 17:07:34,540] Trial 2 finished with value: 0.5481068654475181 and parameters: {'classifier': 'RandomForest', 'n_estimators': 95, 'max_depth': 3}. Best is trial 1 with value: 0.7649614156525945.
[I 2023-11-13 17:07:34,555] Trial 3 finished with value: 0.3058072106568476 and parameters: {'classifier': 'LinearRegression'}. Best is trial 1 with value: 0.7649614156525945.
[I 2023-11-13 17:07:34,994] Trial 4 finished with value: 0.4801008103117291 and parameters: {'classifier': 'RandomForest', 'n_estimators': 136, 'max_depth': 2}. Best is trial 1 with value: 0.7649614156525945.
[I 2023-11-13 17:07:35,009] Trial 5 finished with value: 0.3058072106568476 and parameters: {'classifier': 'LinearRegression'}. Best is trial 1 with value: 0.764961


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pacmap_components_5.npy


[I 2023-11-13 17:07:37,852] Trial 2 finished with value: 0.822940637533641 and parameters: {'classifier': 'RandomForest', 'n_estimators': 161, 'max_depth': 13}. Best is trial 2 with value: 0.822940637533641.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:07:42,904] Trial 3 finished with value: 0.7782462234324772 and parameters: {'classifier': 'XGBoost', 'n_estimators': 150, 'max_depth': 19, 'learning_rate': 0.011569188295293827}. Best is trial 2 with value: 0.822940637533641.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:07:43,043] Trial 4 finished with value: 0.6669646531140021 and parameters: {'classifier': 'XGBoost', 'n_estimators': 192, 'max_depth': 3, 'learning_rate': 0.022041873514697274}. Best is trial 2 with value: 0.822940637533641.
[I 2023-11-13 17:07:43,984] Trial 5 finished with value: 0.8228509945451947 and parameters: {'classifier': 'RandomForest', 'n_estim


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pacmap_components_10.npy


[I 2023-11-13 17:07:47,628] Trial 3 finished with value: 0.48289923864417234 and parameters: {'classifier': 'RandomForest', 'n_estimators': 129, 'max_depth': 2}. Best is trial 3 with value: 0.48289923864417234.
[I 2023-11-13 17:07:48,422] Trial 4 finished with value: 0.5705907733564839 and parameters: {'classifier': 'RandomForest', 'n_estimators': 121, 'max_depth': 3}. Best is trial 4 with value: 0.5705907733564839.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:07:48,490] Trial 5 finished with value: 0.4723987172561188 and parameters: {'classifier': 'XGBoost', 'n_estimators': 84, 'max_depth': 1, 'learning_rate': 0.034059461525698026}. Best is trial 4 with value: 0.5705907733564839.
[I 2023-11-13 17:07:48,505] Trial 6 finished with value: 0.3796733602315856 and parameters: {'classifier': 'Ridge', 'alpha': 6.944330060273312}. Best is trial 4 with value: 0.5705907733564839.
[I 2023-11-13 17:07:50,298] Trial 7 finished with value: 


File being processed in hyperparameter optimization:
 ./fine_tuned_hyena_embeds/pacmap_components_100.npy


[I 2023-11-13 17:07:52,495] Trial 1 finished with value: 0.5283961941575563 and parameters: {'classifier': 'Ridge', 'alpha': 2.360087751784299}. Best is trial 0 with value: 0.572024237567136.
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
[I 2023-11-13 17:07:55,613] Trial 2 finished with value: 0.8216851673545313 and parameters: {'classifier': 'XGBoost', 'n_estimators': 116, 'max_depth': 6, 'learning_rate': 0.07196475440422272}. Best is trial 2 with value: 0.8216851673545313.
[I 2023-11-13 17:07:55,759] Trial 3 finished with value: 0.572024237567136 and parameters: {'classifier': 'LinearRegression'}. Best is trial 2 with value: 0.8216851673545313.
[I 2023-11-13 17:07:55,885] Trial 4 finished with value: 0.500249859298502 and parameters: {'classifier': 'Ridge', 'alpha': 6.081787271750014}. Best is trial 2 with value: 0.8216851673545313.
[I 2023-11-13 17:08:06,220] Trial 5 finished with value: 0.8312847429100583 and parameters: {'classifier': 'Ran

Results saved to ./fine_tuned_hyena_embeds/optuna_results_fine_tuned.json


# Lets use the optimal configuration and test the val ds

**The below is going to call a separate function to take the top performer from Optuna, and test it on the Validation dataset**

In [37]:
from optimal_model_test import *

In [40]:
# THIS COULD BE AUTOMATED BY ADDING A SETTINGS.PY FILE
json_file_path = './fine_tuned_hyena_embeds/optuna_results_fine_tuned.json'
model, r2, mse = run_pipeline(json_file_path, y_train_path=LABEL_DATA_PATH, X_val_path=os.path.join(PATH_TO_GENERATED_EMBEDDINGS, 'val_40.pth'), y_val_path='./data/train_val_test_splits/val.csv', specific_model=None)

X shape: (1035, 19456)


In [41]:
print(f'The model, r2, and mse are: {model}, {r2}, {mse}')

The model, r2, and mse are: Ridge(alpha=9.733764055482645), 0.5341146568734052, 0.6174119691903348
