## General

### Installation

In [1]:
#!pip install git+https://www.github.com/huggingface/transformers
#!pip install git+https://github.com/huggingface/accelerate
#!pip install bitsandbytes
#!pip install einops
#!pip install --upgrade torch torchvision
#!pip install scikit-learn
#!pip install matplotlib
#!pip install datasets
#!pip install Bio
#!pip install pybedtools
#!pip install tabulate
#!pip install enformer-pytorch
#!pip install einops==0.5.0
#!pip install git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python


### Paths & CKPT

In [2]:
### Datasets
# hg19 fasta file
FASTA_FILE = "/data/Dcode/gaetano/repos/fasta_files/hg19.fa"

# training files
path_bios = '/data/Dcode/gaetano/repos/AI4Genomic/data/enhancers/biosamples/'

### Libraries

In [3]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, AutoModel, BertForSequenceClassification
from transformers.models.bert.configuration_bert import BertConfig
from sklearn import metrics 
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
from Bio import SeqIO
from pybedtools import BedTool
from transformers import EarlyStoppingCallback
import importlib

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn
import random
import gc

2024-09-29 12:15:35.125986: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-29 12:15:35.140278: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-29 12:15:35.140302: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-29 12:15:35.149955: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Clean GPUs

In [4]:
def clean_gpu():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    gc.collect()

clean_gpu()

### get fasta hg19 database

In [5]:
def get_chrom2seq(capitalize=True):

    chrom2seq = {}
    for seq in SeqIO.parse(FASTA_FILE, "fasta"):
        chrom2seq[seq.description.split()[0]] = seq.seq.upper() if capitalize else seq.seq

    return chrom2seq

chrom2seq = get_chrom2seq()

## Model & Tokenizer & Datasetes

In [6]:
# architecture configuration
def get_hf_model_tokenizer(model_ckpt):
    
    if 'dnabert2' in model_ckpt:  # Only for DNABERT models
        model_ckpt =  "vivym/DNABERT-2-117M"
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        model = BertForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)


    elif 'Geneformer' in model_ckpt:  # Geneformer model
        tokenizer = AutoTokenizer.from_pretrained('tanoManzo/Geneformer_ft_Hepg2_1kbpHG19_DHSs_H3K27AC')
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)

    elif 'gena-' in model_ckpt:  # Gena models
        model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)
        gena_module_name = model.__class__.__module__
        
        if 'bigbird' in model_ckpt:  # BigBird model under Gena
            cls = getattr(importlib.import_module(gena_module_name), 'BigBirdForSequenceClassification')
        else:
            cls = getattr(importlib.import_module(gena_module_name), 'BertForSequenceClassification')
        
        model = cls.from_pretrained(model_ckpt, num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)

    else:  # Default case for other models
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)
        model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)

    return model, tokenizer

### Get HepG2 data - sequences

In [7]:
def get_bios_sequences(bios_id, path_bios=path_bios, chrom2seq=chrom2seq):
    pos_beds = list(BedTool(f'{path_bios}{bios_id}_positive_1kb.bed'))
    ctrl_beds = list(BedTool(f'{path_bios}{bios_id}_control_1kb.bed'))

    pos_list = []
    ctrl_list = []
    for chr, start, end  in pos_beds:
        pos_list.append(str(chrom2seq[chr][int(start):int(end)]))

    for chr, start, end  in ctrl_beds:
        ctrl_list.append(str(chrom2seq[chr][int(start):int(end)]))

    ctrl_list = random.sample(ctrl_list, len(pos_list))
    seq_data = []
    seq_data.extend(pos_list)
    seq_data.extend(ctrl_list)

    labels_data = []
    labels_data.extend([1 for _ in range(len(pos_list))])
    labels_data.extend([0 for _ in range(len(ctrl_list))])

    return seq_data, labels_data

### Create dataframe and remove Ns seq

In [8]:
def get_clean_sample(seq_data, labels_data, sample_size):
    # Create DataFrame
    bioS = pd.DataFrame({'seq_data': seq_data, 'labels': labels_data})

    # Filter out rows with sequences consisting only of the same character (presumably Ns)
    bioS_no_Ns = bioS[bioS['seq_data'].apply(lambda x: len(set(x)) > 1)]

    # take a sample based on sample_size
    bioS_no_Ns_sampled = bioS_no_Ns.sample(round(len(bioS_no_Ns)*sample_size),random_state=10)
    bioS_no_Ns_sampled['labels'].value_counts()
    return bioS_no_Ns_sampled

### Split train/val/test

In [9]:
def datasets_split_train_val_test(bioS_no_Ns_sampled):
    # Get training data
    train_sequences_bioS = bioS_no_Ns_sampled['seq_data'].tolist()
    train_labels_bioS = bioS_no_Ns_sampled['labels'].tolist()

    # Split the dataset into a training and a validation dataset
    train_sequences_bioS, test_sequences_bioS, train_labels_bioS, test_labels_bioS = train_test_split(train_sequences_bioS,
                                                                                  train_labels_bioS, test_size=0.20, random_state=42)

    # Split the test data into validation and test sets
    validation_sequences_bioS, test_sequences_bioS, validation_labels_bioS, test_labels_bioS = train_test_split(test_sequences_bioS, test_labels_bioS, test_size=0.50, random_state=42)

    # Create datasets from dictionaries
    ds_train_bioS = Dataset.from_dict({"data": train_sequences_bioS, "labels": train_labels_bioS})
    ds_validation_bioS = Dataset.from_dict({"data": validation_sequences_bioS, "labels": validation_labels_bioS})
    ds_test_bioS = Dataset.from_dict({"data": test_sequences_bioS, "labels": test_labels_bioS})
    
    return ds_train_bioS, ds_validation_bioS, ds_test_bioS

### Tokenize the dataset

In [10]:
def get_tokenized_dataset(tokenizer, max_length=512):

    def tokenize_function(examples):
        # Encode sequences
        encoding = tokenizer(
            examples['data'],
            truncation=True,
            max_length=max_length,
            padding=True,
            return_tensors="pt"
        )
        
        # Add labels to the encoding
        encoding['labels'] = examples['labels']
        
        return encoding
    
    # Tokenize and process the datasets
    tokenized_train = ds_train_bioS.map(tokenize_function, batched=True, remove_columns=["data"])
    tokenized_validation = ds_validation_bioS.map(tokenize_function, batched=True, remove_columns=["data"])
    tokenized_test = ds_test_bioS.map(tokenize_function, batched=True, remove_columns=["data"])

    return tokenized_train, tokenized_validation, tokenized_test

### Train Arguments 

In [11]:
# Define the function
def create_training_args(output_dir, batch_size, learning_rate, num_epochs, log_steps, token):
    training_args = TrainingArguments(
        hub_model_id= output_dir.split('/')[1],
        output_dir=output_dir,  # Directory to save model and logs
        per_device_train_batch_size=batch_size,  # Training batch size per device
        per_device_eval_batch_size=batch_size,  # Evaluation batch size per device
        learning_rate=learning_rate,  # Learning rate
        num_train_epochs=num_epochs,  # Number of training epochs
        logging_steps=log_steps,  # Logging interval
        logging_dir='./logs',  # Directory to store logs
        eval_strategy="steps",  # Evaluation strategy
        save_strategy="steps",  # Save strategy
        save_total_limit=3,  # Maximum number of saved models
        disable_tqdm=False,  # Enable tqdm progress bars
        load_best_model_at_end=True,  # Load best model at the end of training
        metric_for_best_model="f1_score",  # Metric to select the best model
        fp16=True,  # Enable mixed precision training
        #push_to_hub=True,  # Push model to Hugging Face hub
        hub_token=token  # Authentication token for Hugging Face hub
    )
    return training_args

### Metric

In [12]:
# Define the metric for the evaluation using f1, auc, and prc
def compute_metrics_classification_binary(eval_pred):
    """Computes F1, AUC, PRC, and other metrics for binary classification."""
    predictions = np.argmax(eval_pred.predictions, axis=-1)
    # Get probability predictions for AUC and PRC calculation (assuming it's binary classification)
    prob_predictions = eval_pred.predictions[:, 1]  # assuming class 1 is positive
    references = eval_pred.label_ids
    
    r = {
        'f1_score': metrics.f1_score(references, predictions),
        'precision': metrics.precision_score(references, predictions),
        'recall': metrics.recall_score(references, predictions),
        'accuracy': metrics.accuracy_score(references, predictions),
        'auc': metrics.roc_auc_score(references, prob_predictions),  # AUC score
        'prc': metrics.average_precision_score(references, prob_predictions)  # PRC (average precision score)
    }
    
    return r


In [13]:
# Custom Trainer class to override the _save method
class CustomTrainer(Trainer):
    def _save(self, output_dir, state_dict=None):
        # Save the model with safe_serialization=False to avoid shared tensor issues
        self.model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False)
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

## Training

In [14]:
# Define the working device
device = torch.device("cuda")

# dataset sample size
# 0.1 = 10%, 0.2 = 20%, .. , 1.00 = 100
sample_size = 1.0

### Model 
# model name from huggingface.co/model name_id:model_name
model_ckpt = 'czl/dnabert2'

#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-50m-multi-species'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-100m-multi-species'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-250m-multi-species'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-v2-500m-multi-species'

#model_ckpt = 'InstaDeepAI/nucleotide-transformer-500m-1000g'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-500m-human-ref'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-2.5b-1000g'
#model_ckpt = 'InstaDeepAI/nucleotide-transformer-2.5b-multi-species'

#model_ckpt ='ctheodoris/Geneformer'

#model_ckpt = 'AIRI-Institute/gena-lm-bert-base-t2t'
#model_ckpt = 'AIRI-Institute/gena-lm-bert-large-t2t'
#model_ckpt = 'AIRI-Institute/gena-lm-bert-base-t2t-multi'
#model_ckpt = 'AIRI-Institute/gena-lm-bigbird-base-t2t'

#model_ckpt = 'LongSafari/hyenadna-small-32k-seqlen-hf'
#model_ckpt = 'LongSafari/hyenadna-medium-160k-seqlen-hf'
#model_ckpt = 'LongSafari/hyenadna-medium-450k-seqlen-hf'
#model_ckpt = 'LongSafari/hyenadna-large-1m-seqlen-hf'


# Define configuration parameters
BATCH_SIZE = 8
LOG_STEPS = 500
LEARNING_RATE = 1e-5
NUM_EPOCHS = 20
TOKEN = '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'


# samples for fine-tuning
#'BioS2'=Hela, 'BioS45'=neural progenitor cell, 'BioS73'=hepg2, 'BioS74'=k562
bios_ids = ['BioS2', 'BioS45', 'BioS73', 'BioS74']



for bios_id in bios_ids:
    # load model and dataset
    model, tokenizer = get_hf_model_tokenizer(model_ckpt=model_ckpt)
    seq_data, labels_data = get_bios_sequences(bios_id, path_bios=path_bios, chrom2seq=chrom2seq)
    bioS_no_Ns_sampled = get_clean_sample(seq_data=seq_data, labels_data=labels_data, sample_size=sample_size)
    ds_train_bioS, ds_validation_bioS, ds_test_bioS = datasets_split_train_val_test(bioS_no_Ns_sampled=bioS_no_Ns_sampled)
    ds_tokenized_train, ds_tokenized_validation, ds_tokenized_test = get_tokenized_dataset(tokenizer, max_length=512)
    model.config.use_flash_attention = False  
    OUTPUT_DIR = f"ft/{model_ckpt.split('/')[1]}_ft_{bios_id}_1kbpHG19_DHSs_H3K27AC"
    training_args = create_training_args(OUTPUT_DIR, BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS, LOG_STEPS, TOKEN)

    # Trainer
    trainer = Trainer(
        model=model.to(device),
        args=training_args,
        train_dataset=ds_tokenized_train,
        eval_dataset=ds_tokenized_validation,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_classification_binary,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
        )



    # Train the model
    trainer.train()
    # push to hub
    trainer.push_to_hub()
    # clean gpus
    clean_gpu()
    

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at vivym/DNABERT-2-117M and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.atten

Map:   0%|          | 0/47492 [00:00<?, ? examples/s]

Map:   0%|          | 0/5937 [00:00<?, ? examples/s]

Map:   0%|          | 0/5937 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1 Score,Precision,Recall,Accuracy,Auc,Prc
500,0.5965,0.583609,0.766823,0.649437,0.936008,0.698838,0.787468,0.766076
1000,0.574,0.546276,0.77314,0.697668,0.866921,0.73084,0.803018,0.787571
1500,0.5599,0.616237,0.774875,0.702381,0.864056,0.734378,0.806331,0.789331
2000,0.5516,0.543364,0.777971,0.670507,0.926457,0.720229,0.814983,0.802785
2500,0.5542,0.575944,0.642721,0.803631,0.535498,0.685026,0.815028,0.799642
3000,0.5508,0.585392,0.77357,0.649578,0.956065,0.703891,0.815331,0.804419
3500,0.5431,0.541378,0.781402,0.709535,0.869468,0.742631,0.819591,0.811347
4000,0.5416,0.55942,0.787512,0.70529,0.891436,0.745494,0.822425,0.809406
4500,0.5379,0.520911,0.787677,0.721707,0.866921,0.752737,0.827836,0.818341
5000,0.5364,0.559085,0.788534,0.705734,0.893346,0.746505,0.832339,0.821742


[[ 0.3371582   0.3322754 ]
 [-0.02272034  0.83447266]
 [ 0.8691406  -0.68603516]
 ...
 [ 0.43115234  0.14819336]
 [-0.06069946  0.8803711 ]
 [ 0.11700439  0.640625  ]]
[[ 0.52734375  0.17822266]
 [ 0.14074707  0.71728516]
 [ 0.9379883  -1.6523438 ]
 ...
 [ 0.79541016 -0.34033203]
 [ 0.15490723  0.69873047]
 [ 0.1940918   0.6484375 ]]
[[ 0.7753906  -0.03094482]
 [-0.7529297   1.5546875 ]
 [ 1.6494141  -1.9101562 ]
 ...
 [-0.00801086  0.8798828 ]
 [-0.5996094   1.4316406 ]
 [-0.609375    1.4375    ]]
[[ 0.41357422  0.6479492 ]
 [-0.17614746  1.1630859 ]
 [ 1.1630859  -1.6464844 ]
 ...
 [ 0.23400879  0.8300781 ]
 [ 0.01432037  1.0244141 ]
 [-0.06246948  1.0888672 ]]
[[ 0.9003906  -0.17651367]
 [ 0.4284668   0.53466797]
 [ 1.2802734  -1.1005859 ]
 ...
 [ 0.84033203 -0.0869751 ]
 [ 0.75634766  0.03735352]
 [ 0.6010742   0.2993164 ]]
[[ 0.05511475  0.8876953 ]
 [-0.39257812  1.2451172 ]
 [ 1.1845703  -1.5048828 ]
 ...
 [ 0.3154297   0.56640625]
 [-0.1772461   1.0761719 ]
 [-0.296875    1.178

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/357M [00:00<?, ?B/s]

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at vivym/DNABERT-2-117M and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.atten

Map:   0%|          | 0/19016 [00:00<?, ? examples/s]

Map:   0%|          | 0/2377 [00:00<?, ? examples/s]

Map:   0%|          | 0/2377 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1 Score,Precision,Recall,Accuracy,Auc,Prc
500,0.5979,0.522528,0.725717,0.773017,0.683871,0.730332,0.824176,0.819791
1000,0.5569,0.498722,0.773791,0.749245,0.8,0.755995,0.836869,0.831941
1500,0.5311,0.601863,0.755115,0.613293,0.982258,0.667648,0.836885,0.834984
2000,0.5364,0.507271,0.770531,0.769293,0.771774,0.760202,0.842218,0.836407
2500,0.5181,0.490671,0.767701,0.757457,0.778226,0.754312,0.844023,0.837833
3000,0.521,0.562298,0.786885,0.717608,0.870968,0.753891,0.842794,0.83827
3500,0.5122,0.505299,0.752534,0.789894,0.718548,0.753471,0.844189,0.838248
4000,0.5096,0.521762,0.781129,0.750371,0.814516,0.761885,0.846295,0.838861
4500,0.5031,0.494281,0.781392,0.735755,0.833065,0.756836,0.840889,0.837934
5000,0.4983,0.555903,0.779739,0.743777,0.819355,0.758519,0.838458,0.837062


[[-0.30908203  0.7573242 ]
 [ 1.0869141  -0.95166016]
 [ 0.5151367  -0.2355957 ]
 ...
 [-0.44482422  0.89746094]
 [ 0.93310547 -0.7524414 ]
 [ 0.11248779  0.22753906]]
[[-0.5629883   0.8676758 ]
 [ 1.1445312  -1.1533203 ]
 [ 0.23657227 -0.08416748]
 ...
 [-0.70410156  1.0283203 ]
 [ 1.0810547  -1.078125  ]
 [-0.19140625  0.4038086 ]]
[[-0.5029297   1.0849609 ]
 [ 1.2275391  -1.1054688 ]
 [-0.35742188  0.88671875]
 ...
 [-0.54345703  1.1298828 ]
 [ 0.578125   -0.27270508]
 [-0.2529297   0.7504883 ]]
[[-0.52001953  1.0585938 ]
 [ 1.3935547  -1.4589844 ]
 [ 0.32250977  0.00821686]
 ...
 [-0.54785156  1.0859375 ]
 [ 1.3652344  -1.390625  ]
 [-0.31860352  0.77978516]]
[[-0.47485352  0.97265625]
 [ 1.6396484  -1.1318359 ]
 [ 0.11773682  0.39404297]
 ...
 [-0.3774414   0.86083984]
 [ 1.5322266  -0.99072266]
 [ 0.07543945  0.4350586 ]]
[[-1.1855469   1.5488281 ]
 [ 1.8154297  -1.4677734 ]
 [-0.46069336  0.8354492 ]
 ...
 [-1.21875     1.5615234 ]
 [ 1.8056641  -1.4482422 ]
 [-0.87109375  1.228

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/357M [00:00<?, ?B/s]

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at vivym/DNABERT-2-117M and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.atten

Map:   0%|          | 0/21464 [00:00<?, ? examples/s]

Map:   0%|          | 0/2683 [00:00<?, ? examples/s]

Map:   0%|          | 0/2684 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1 Score,Precision,Recall,Accuracy,Auc,Prc
500,0.5511,0.557333,0.776451,0.644537,0.976257,0.699963,0.860375,0.862357
1000,0.5087,0.516541,0.789416,0.820648,0.760475,0.783451,0.863715,0.865632
1500,0.5156,0.506178,0.81168,0.76547,0.863827,0.78606,0.867527,0.868308
2000,0.5127,0.477202,0.816021,0.764491,0.875,0.789415,0.867921,0.869362
2500,0.4991,0.506254,0.810266,0.720261,0.925978,0.768543,0.871223,0.87192
3000,0.478,0.456872,0.811273,0.742178,0.894553,0.777861,0.871327,0.87227
3500,0.4808,0.458523,0.804073,0.808616,0.799581,0.792024,0.872346,0.873808
4000,0.4873,0.46721,0.811531,0.729654,0.914106,0.773388,0.874352,0.876085
4500,0.4649,0.475901,0.809684,0.721464,0.922486,0.768543,0.875104,0.876054


[[-0.26513672  0.35375977]
 [ 0.28808594 -0.5317383 ]
 [-0.82666016  1.1621094 ]
 ...
 [-0.6777344   0.9580078 ]
 [ 0.22058105 -0.42529297]
 [ 0.06591797 -0.18457031]]
[[ 0.60839844 -1.0566406 ]
 [ 1.0146484  -1.7470703 ]
 [-0.83447266  1.2382812 ]
 ...
 [-0.7138672   1.0820312 ]
 [ 1.0146484  -1.7490234 ]
 [ 0.99902344 -1.7441406 ]]
[[-0.34033203  0.328125  ]
 [ 1.4960938  -1.9462891 ]
 [-1.3330078   1.6064453 ]
 ...
 [-1.1816406   1.4189453 ]
 [ 1.5009766  -1.9501953 ]
 [ 1.46875    -1.9287109 ]]
[[-0.21789551  0.0914917 ]
 [ 1.3496094  -1.8095703 ]
 [-1.46875     1.6992188 ]
 ...
 [-1.2441406   1.4355469 ]
 [ 1.3701172  -1.8310547 ]
 [ 1.1992188  -1.6640625 ]]
[[-0.296875   0.3959961]
 [ 1.3779297 -1.8291016]
 [-1.4160156  1.6445312]
 ...
 [-1.3212891  1.5253906]
 [ 1.4345703 -1.8818359]
 [ 1.0224609 -1.4326172]]
[[-0.18591309  0.05059814]
 [ 1.0292969  -1.4716797 ]
 [-1.2480469   1.3925781 ]
 ...
 [-0.9785156   1.0419922 ]
 [ 1.0722656  -1.5136719 ]
 [ 0.64208984 -1.0546875 ]]
[[ 0

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/357M [00:00<?, ?B/s]

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at vivym/DNABERT-2-117M and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.attention.self.key.bias', 'bert.encoder.layer.1.attention.self.key.weight', 'bert.encoder.layer.1.atten

Map:   0%|          | 0/30427 [00:00<?, ? examples/s]

Map:   0%|          | 0/3803 [00:00<?, ? examples/s]

Map:   0%|          | 0/3804 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,F1 Score,Precision,Recall,Accuracy,Auc,Prc
500,0.5891,0.538934,0.75191,0.762791,0.741336,0.743886,0.815975,0.802627
1000,0.5602,0.573582,0.713421,0.78845,0.651431,0.726006,0.819054,0.806646
1500,0.5579,0.528534,0.76233,0.752446,0.772476,0.747831,0.825248,0.81547
2000,0.5246,0.511424,0.777326,0.742009,0.816173,0.755193,0.828971,0.816726
2500,0.5286,0.533626,0.787159,0.700039,0.899046,0.745464,0.83248,0.819257
3000,0.5365,0.652343,0.703546,0.808344,0.622803,0.725217,0.816963,0.815115
3500,0.5339,0.5058,0.7904,0.725252,0.868408,0.758875,0.83595,0.824252
4000,0.5288,0.591615,0.739189,0.794457,0.69111,0.744675,0.8348,0.822297
4500,0.5279,0.51822,0.791194,0.709896,0.893521,0.75309,0.837221,0.826648
5000,0.5273,0.540489,0.776975,0.771019,0.783024,0.764659,0.838266,0.828815


[[ 1.5878906  -1.4111328 ]
 [ 0.22290039  0.24694824]
 [ 0.17712402  0.28930664]
 ...
 [ 0.81689453 -0.43408203]
 [ 0.37963867  0.08026123]
 [ 0.6225586  -0.19995117]]
[[ 1.8457031  -1.9941406 ]
 [ 0.09295654 -0.0690918 ]
 [ 0.3955078  -0.40722656]
 ...
 [ 0.55566406 -0.59033203]
 [ 0.2512207  -0.23742676]
 [ 0.58496094 -0.6269531 ]]
[[ 1.6523438  -1.8466797 ]
 [-0.28271484  0.10516357]
 [ 0.02035522 -0.25463867]
 ...
 [ 0.14501953 -0.41333008]
 [-0.23400879  0.04528809]
 [ 0.31591797 -0.6010742 ]]
[[ 1.1074219  -1.3925781 ]
 [-0.2763672   0.2783203 ]
 [ 0.05789185 -0.1373291 ]
 ...
 [ 0.12091064 -0.2253418 ]
 [-0.23535156  0.22875977]
 [ 0.07977295 -0.17004395]]
[[ 0.94433594 -1.1269531 ]
 [-0.45996094  0.86035156]
 [-0.10754395  0.41455078]
 ...
 [-0.13781738  0.4482422 ]
 [-0.32348633  0.6899414 ]
 [-0.11309814  0.4177246 ]]
[[ 1.5732422  -1.8398438 ]
 [ 0.23376465 -0.45263672]
 [ 1.0185547  -1.3896484 ]
 ...
 [ 1.0107422  -1.4003906 ]
 [ 0.453125   -0.74121094]
 [ 0.8808594  -1.248

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/357M [00:00<?, ?B/s]

In [15]:
trainer.predict(ds_tokenized_test)

[[-0.7211914   0.7060547 ]
 [ 2.0644531  -1.9433594 ]
 [-1.4130859   1.3886719 ]
 ...
 [-0.9267578   0.88720703]
 [ 0.17858887 -0.16992188]
 [-1.2646484   1.2451172 ]]


PredictionOutput(predictions=array([[-0.7211914 ,  0.7060547 ],
       [ 2.0644531 , -1.9433594 ],
       [-1.4130859 ,  1.3886719 ],
       ...,
       [-0.9267578 ,  0.88720703],
       [ 0.17858887, -0.16992188],
       [-1.2646484 ,  1.2451172 ]], dtype=float32), label_ids=array([1, 0, 1, ..., 0, 1, 1]), metrics={'test_loss': 0.475721150636673, 'test_f1_score': 0.8197463768115942, 'test_precision': 0.7494824016563147, 'test_recall': 0.9045477261369316, 'test_accuracy': 0.7907465825446898, 'test_auc': 0.868190003722487, 'test_prc': 0.8528215378984432, 'test_runtime': 5.09, 'test_samples_per_second': 747.35, 'test_steps_per_second': 93.517})

In [16]:
from tabulate import tabulate

# Evaluate the model
eval_results = trainer.predict(ds_tokenized_test)


# Print evaluation results in a table format
print(tabulate(eval_results.metrics.items(), headers=["Metric", "Value"]))

[[-0.7211914   0.7060547 ]
 [ 2.0644531  -1.9433594 ]
 [-1.4130859   1.3886719 ]
 ...
 [-0.9267578   0.88720703]
 [ 0.17858887 -0.16992188]
 [-1.2646484   1.2451172 ]]
Metric                        Value
-----------------------  ----------
test_loss                  0.475721
test_f1_score              0.819746
test_precision             0.749482
test_recall                0.904548
test_accuracy              0.790747
test_auc                   0.86819
test_prc                   0.852822
test_runtime               5.0971
test_samples_per_second  746.309
test_steps_per_second     93.387


In [17]:
np.argmax(eval_results.predictions, axis=1)

array([1, 0, 1, ..., 1, 0, 1])