## General

### Installation

In [1]:
#!pip install git+https://www.github.com/huggingface/transformers
#!pip install git+https://github.com/huggingface/accelerate
#!pip install bitsandbytes
#!pip install einops
#!pip install --upgrade torch torchvision
#!pip install scikit-learn
#!pip install matplotlib
#!pip install datasets
#!pip install Bio
#!pip install pybedtools
#!pip install tabulate
#!pip install enformer-pytorch
#!pip install einops==0.5.0
#!pip install git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python


### Paths & CKPT

In [2]:
### Datasets
# hg19 fasta file
FASTA_FILE = "/data/Dcode/gaetano/repos/fasta_files/hg19.fa"

# training files
path_bios = '/data/Dcode/gaetano/repos/AI4Genomic/data/enhancers/biosamples/'

### Libraries

In [3]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoModel, BertForSequenceClassification
from transformers.models.bert.configuration_bert import BertConfig
from sklearn import metrics 
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from Bio import SeqIO
from pybedtools import BedTool
from transformers import EarlyStoppingCallback
import importlib

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import random
import gc

2024-11-06 13:52:45.595447: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 13:52:45.595484: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 13:52:45.596214: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 13:52:45.600818: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Clean GPUs

In [4]:
def clean_gpu():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    gc.collect()

clean_gpu()

### get fasta hg19 database

In [5]:
def get_chrom2seq(capitalize=True):

    chrom2seq = {}
    for seq in SeqIO.parse(FASTA_FILE, "fasta"):
        chrom2seq[seq.description.split()[0]] = seq.seq.upper() if capitalize else seq.seq

    return chrom2seq

chrom2seq = get_chrom2seq()

In [6]:
pos = 159174683        

chrom2seq['chr1'][pos-1]

'T'

## Model & Tokenizer & Datasetes

In [7]:
# architecture configuration
def get_hf_model_tokenizer(model_ckpt):
    
    if 'dnabert2' in model_ckpt:  # Only for DNABERT models
        model_ckpt =  "vivym/DNABERT-2-117M"
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        model = BertForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)


    elif 'Geneformer' in model_ckpt:  # Geneformer model
        tokenizer = AutoTokenizer.from_pretrained('tanoManzo/Geneformer_ft_Hepg2_1kbpHG19_DHSs_H3K27AC')
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)

    elif 'gena-' in model_ckpt:  # Gena models
        model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)
        gena_module_name = model.__class__.__module__
        
        if 'bigbird' in model_ckpt:  # BigBird model under Gena
            cls = getattr(importlib.import_module(gena_module_name), 'BigBirdForSequenceClassification')
        else:
            cls = getattr(importlib.import_module(gena_module_name), 'BertForSequenceClassification')
        
        model = cls.from_pretrained(model_ckpt, num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)

    else:  # Default case for other models
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)

    return model, tokenizer

### Get HepG2 data - sequences

In [8]:
def get_bios_sequences(bios_id, path_bios=path_bios, chrom2seq=chrom2seq):
    pos_beds = list(BedTool(f'{path_bios}{bios_id}_positive_1kb.bed'))
    ctrl_beds = list(BedTool(f'{path_bios}{bios_id}_control_1kb.bed'))

    pos_list = []
    ctrl_list = []
    for chr, start, end  in pos_beds:
        pos_list.append(str(chrom2seq[chr][int(start):int(end)]))

    for chr, start, end  in ctrl_beds:
        ctrl_list.append(str(chrom2seq[chr][int(start):int(end)]))

    ctrl_list = random.sample(ctrl_list, len(pos_list))
    seq_data = []
    seq_data.extend(pos_list)
    seq_data.extend(ctrl_list)

    labels_data = []
    labels_data.extend([1 for _ in range(len(pos_list))])
    labels_data.extend([0 for _ in range(len(ctrl_list))])

    return seq_data, labels_data

### Create dataframe and remove Ns seq

In [9]:
def get_clean_sample(seq_data, labels_data, sample_size):
    # Create DataFrame
    bioS = pd.DataFrame({'seq_data': seq_data, 'labels': labels_data})

    # Filter out rows with sequences consisting only of the same character (presumably Ns)
    bioS_no_Ns = bioS[bioS['seq_data'].apply(lambda x: len(set(x)) > 1)]

    # take a sample based on sample_size
    bioS_no_Ns_sampled = bioS_no_Ns.sample(round(len(bioS_no_Ns)*sample_size),random_state=10)
    bioS_no_Ns_sampled['labels'].value_counts()
    return bioS_no_Ns_sampled

### Split train/val/test

In [10]:
def datasets_split_train_val_test(bioS_no_Ns_sampled):
    # Get training data
    train_sequences_bioS = bioS_no_Ns_sampled['seq_data'].tolist()
    train_labels_bioS = bioS_no_Ns_sampled['labels'].tolist()

    # Split the dataset into a training and a validation dataset
    train_sequences_bioS, test_sequences_bioS, train_labels_bioS, test_labels_bioS = train_test_split(train_sequences_bioS,
                                                                                  train_labels_bioS, test_size=0.20, random_state=42)

    # Split the test data into validation and test sets
    validation_sequences_bioS, test_sequences_bioS, validation_labels_bioS, test_labels_bioS = train_test_split(test_sequences_bioS, test_labels_bioS, test_size=0.50, random_state=42)

    # Create datasets from dictionaries
    ds_train_bioS = Dataset.from_dict({"data": train_sequences_bioS, "labels": train_labels_bioS})
    ds_validation_bioS = Dataset.from_dict({"data": validation_sequences_bioS, "labels": validation_labels_bioS})
    ds_test_bioS = Dataset.from_dict({"data": test_sequences_bioS, "labels": test_labels_bioS})
    
    return ds_train_bioS, ds_validation_bioS, ds_test_bioS

### Tokenize the dataset

In [11]:
def get_tokenized_dataset(tokenizer, max_length=512):

    def tokenize_function(examples):
        # Encode sequences
        encoding = tokenizer(
            examples['data'],
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors="pt"
        )
        
        # Add labels to the encoding
        encoding['labels'] = examples['labels']
        
        return encoding
    
    # Tokenize and process the datasets
    tokenized_train = ds_train_bioS.map(tokenize_function, batched=True, remove_columns=["data"])
    tokenized_validation = ds_validation_bioS.map(tokenize_function, batched=True, remove_columns=["data"])
    tokenized_test = ds_test_bioS.map(tokenize_function, batched=True, remove_columns=["data"])

    return tokenized_train, tokenized_validation, tokenized_test

### Train Arguments 

In [12]:
# Define the function
def create_training_args(output_dir, batch_size, learning_rate, num_epochs, log_steps, token):
    training_args = TrainingArguments(
        hub_model_id= output_dir.split('/')[1],
        output_dir=output_dir,  # Directory to save model and logs
        per_device_train_batch_size=batch_size,  # Training batch size per device
        per_device_eval_batch_size=batch_size,  # Evaluation batch size per device
        learning_rate=learning_rate,  # Learning rate
        num_train_epochs=num_epochs,  # Number of training epochs
        logging_steps=log_steps,  # Logging interval
        logging_dir='./logs',  # Directory to store logs
        eval_strategy="steps",  # Evaluation strategy
        save_strategy="steps",  # Save strategy
        save_total_limit=3,  # Maximum number of saved models
        disable_tqdm=False,  # Enable tqdm progress bars
        load_best_model_at_end=True,  # Load best model at the end of training
        metric_for_best_model="f1_score",  # Metric to select the best model
        fp16=True,  # Enable mixed precision training
        #push_to_hub=True,  # Push model to Hugging Face hub
        hub_token=token  # Authentication token for Hugging Face hub
    )
    return training_args

### Metric

In [13]:
# Define the metric for the evaluation using f1, auc, and prc
def compute_metrics_classification_binary(eval_pred):
    """Computes F1, AUC, PRC, and other metrics for binary classification."""
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    # Get probability predictions for AUC and PRC calculation (assuming it's binary classification)
    prob_predictions = eval_pred.predictions[:, 1]  # assuming class 1 is positive
    references = eval_pred.label_ids
    
    r = {
        'f1_score': metrics.f1_score(references, predictions),
        'precision': metrics.precision_score(references, predictions),
        'recall': metrics.recall_score(references, predictions),
        'accuracy': metrics.accuracy_score(references, predictions),
        'auc': metrics.roc_auc_score(references, prob_predictions),  # AUC score
        'prc': metrics.average_precision_score(references, prob_predictions)  # PRC (average precision score)
    }
    
    return r


In [14]:
# Custom Trainer class to override the _save method
class CustomTrainer(Trainer):
    def _save(self, output_dir, state_dict=None):
        # Save the model with safe_serialization=False to avoid shared tensor issues
        self.model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

## Training

In [15]:
# Define the working device
device = torch.device("cuda")

# dataset sample size
# 0.1 = 10%, 0.2 = 20%, .. , 1.00 = 100
sample_size = 1.00

### Model 
# model name from huggingface.co/model name_id:model_name
#model_ckpt = 'czl/dnabert2'

#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-50m-multi-species'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-100m-multi-species'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-250m-multi-species'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-500m-multi-species'

#model_ckpt = 'InstaDeepAI/nucleotide-transformer-500m-1000g'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-500m-human-ref'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-2.5b-1000g'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-2.5b-multi-species'

#model_ckpt ='ctheodoris/Geneformer'

#model_ckpt = 'AIRI-Institute/gena-lm-bert-base-t2t'
#model_ckpt = 'AIRI-Institute/gena-lm-bert-large-t2t'
#model_ckpt = 'AIRI-Institute/gena-lm-bert-base-t2t-multi'
#model_ckpt = 'AIRI-Institute/gena-lm-bigbird-base-t2t'

model_ckpt = 'LongSafari/hyenadna-small-32k-seqlen-hf'
#model_ckpt = 'LongSafari/hyenadna-medium-160k-seqlen-hf'
#model_ckpt = 'LongSafari/hyenadna-medium-450k-seqlen-hf'
#model_ckpt = 'LongSafari/hyenadna-large-1m-seqlen-hf'


# Define configuration parameters
BATCH_SIZE = 8
LOG_STEPS = 500
LEARNING_RATE = 1e-5
NUM_EPOCHS = 200
TOKEN = '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'


# samples for fine-tuning
#'BioS2'=Hela, 'BioS45'=neural progenitor cell, 'BioS73'=hepg2, 'BioS74'=k562
bios_ids = ['BioS2', 'BioS45', 'BioS73', 'BioS74']



for bios_id in bios_ids:
    # load model and dataset
    model, tokenizer = get_hf_model_tokenizer(model_ckpt=model_ckpt)
    seq_data, labels_data = get_bios_sequences(bios_id, path_bios=path_bios, chrom2seq=chrom2seq)
    bioS_no_Ns_sampled = get_clean_sample(seq_data=seq_data, labels_data=labels_data, sample_size=sample_size)
    ds_train_bioS, ds_validation_bioS, ds_test_bioS = datasets_split_train_val_test(bioS_no_Ns_sampled=bioS_no_Ns_sampled)
    ds_tokenized_train, ds_tokenized_validation, ds_tokenized_test = get_tokenized_dataset(tokenizer, max_length=512)
    model.config.use_flash_attention = False  
    OUTPUT_DIR = f"ft/{model_ckpt.split('/')[1]}_ft_{bios_id}_1kbpHG19_DHSs_H3K27AC_one_shot"
    training_args = create_training_args(OUTPUT_DIR, BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS, LOG_STEPS, TOKEN)

    # Trainer
    trainer = Trainer(
        model=model.to(device),
        args=training_args,
        train_dataset=ds_tokenized_train,
        eval_dataset=ds_tokenized_validation,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_classification_binary,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )



    # Train the model
    trainer.train()
    # push to hub
    trainer.push_to_hub()
    # clean gpus
    clean_gpu()
    

Some weights of HyenaDNAForSequenceClassification were not initialized from the model checkpoint at LongSafari/hyenadna-small-32k-seqlen-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/476 [00:00<?, ? examples/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1 Score,Precision,Recall,Accuracy,Auc,Prc
500,0.3316,0.725684,0.677966,0.645161,0.714286,0.677966,0.752304,0.678095


RuntimeError: The weights trying to be saved contained shared tensors [{'hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.3.freq', 'hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.1.freq', 'hyena.backbone.layers.0.mixer.filter_fn.implicit_filter.5.freq'}, {'hyena.backbone.layers.1.mixer.filter_fn.implicit_filter.1.freq', 'hyena.backbone.layers.1.mixer.filter_fn.implicit_filter.5.freq', 'hyena.backbone.layers.1.mixer.filter_fn.implicit_filter.3.freq'}, {'hyena.backbone.layers.2.mixer.filter_fn.implicit_filter.5.freq', 'hyena.backbone.layers.2.mixer.filter_fn.implicit_filter.1.freq', 'hyena.backbone.layers.2.mixer.filter_fn.implicit_filter.3.freq'}, {'hyena.backbone.layers.3.mixer.filter_fn.implicit_filter.1.freq', 'hyena.backbone.layers.3.mixer.filter_fn.implicit_filter.5.freq', 'hyena.backbone.layers.3.mixer.filter_fn.implicit_filter.3.freq'}] that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.

In [None]:
trainer.predict(ds_tokenized_test)

[[-0.7211914   0.7060547 ]
 [ 2.0644531  -1.9433594 ]
 [-1.4130859   1.3886719 ]
 ...
 [-0.9267578   0.88720703]
 [ 0.17858887 -0.16992188]
 [-1.2646484   1.2451172 ]]


PredictionOutput(predictions=array([[-0.7211914 ,  0.7060547 ],
       [ 2.0644531 , -1.9433594 ],
       [-1.4130859 ,  1.3886719 ],
       ...,
       [-0.9267578 ,  0.88720703],
       [ 0.17858887, -0.16992188],
       [-1.2646484 ,  1.2451172 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.475721150636673, 'test_f1_score': 0.8197463768115942, 'test_precision': 0.7494824016563147, 'test_recall': 0.9045477261369316, 'test_accuracy': 0.7907465825446898, 'test_auc': 0.868190003722487, 'test_prc': 0.8528215378984432, 'test_runtime': 5.09, 'test_samples_per_second': 747.35, 'test_steps_per_second': 93.517})

In [None]:
from tabulate import tabulate

# Evaluate the model
eval_results = trainer.predict(ds_tokenized_test)


# Print evaluation results in a table format
print(tabulate(eval_results.metrics.items(), headers=["Metric", "Value"]))

[[-0.7211914   0.7060547 ]
 [ 2.0644531  -1.9433594 ]
 [-1.4130859   1.3886719 ]
 ...
 [-0.9267578   0.88720703]
 [ 0.17858887 -0.16992188]
 [-1.2646484   1.2451172 ]]
Metric                        Value
-----------------------  ----------
test_loss                  0.475721
test_f1_score              0.819746
test_precision             0.749482
test_recall                0.904548
test_accuracy              0.790747
test_auc                   0.86819
test_prc                   0.852822
test_runtime               5.0971
test_samples_per_second  746.309
test_steps_per_second     93.387


In [None]:
np.argmax(eval_results.predictions, axis=1)

array([1, 0, 1, ..., 1, 0, 1])