### Installation

In [1]:
#!pip install git+https://www.github.com/huggingface/transformers
#!pip install git+https://github.com/huggingface/accelerate
#!pip install bitsandbytes
#!pip install einops
#!pip install --upgrade torch torchvision
#!pip install scikit-learn
#!pip install matplotlib
#!pip install datasets
#!pip install Bio
#!pip install pybedtools
#!pip install tabulate

## Libraries

In [2]:
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification,BertForSequenceClassification, AutoModel, AutoConfig
from transformers.models.bert.configuration_bert import BertConfig
from datasets import load_dataset, Dataset

from sklearn import metrics 
from sklearn.model_selection import train_test_split

from Bio import SeqIO
from pybedtools import BedTool

import pandas as pd
import numpy as np
import scipy.stats as stats

import torch
from torch import nn

import importlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

sns.set()

2024-10-02 16:49:28.470287: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-02 16:49:28.470317: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-02 16:49:28.471065: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-02 16:49:28.476054: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [65]:
import os

# Point the Hugging Face caches at a directory next to the notebook.
# NOTE(review): `transformers` is imported in an earlier cell; confirm these
# variables are still honoured when they are set after that import.
os.environ.update({
    'HF_HOME': './cache/',
    "TRANSFORMERS_CACHE": './cache/',
})

## Load Transformer Models

In [66]:
def load_models_and_tokenizers(models_names, bios_id, ft_model_type):
    """
    Load the fine-tuned checkpoint and tokenizer for every requested base model.

    Each checkpoint id is built as ``tanoManzo/{model_name}_ft_{bios_id}_{ft_model_type}``
    and the loader class is chosen from substrings of that id
    ('dnabert2', 'Geneformer', 'gena-', anything else).

    Parameters:
        models_names (list[str]): Base model names used to build the checkpoint ids.
        bios_id (str): Biosample identifier inserted into the checkpoint id.
        ft_model_type (str): Fine-tuning suffix appended to the checkpoint id.

    Returns:
        dict: ``{f"{model_name}_ft_{bios_id}": {'model': ..., 'tokenizer': ...}}`` with one
        entry per checkpoint that loaded. A checkpoint that fails to load is reported
        with ``print`` and left out of the dictionary; no exception is propagated.
    """
    # Every Geneformer checkpoint is paired with the tokenizer stored in this repository.
    geneformer_tokenizer_ckpt = 'tanoManzo/Geneformer_ft_Hepg2_1kbpHG19_DHSs_H3K27AC'

    models_tokenizers_dict = {}

    for model_name in models_names:
        model_ckpt = f"tanoManzo/{model_name}_ft_{bios_id}_{ft_model_type}"
        print(f"Loading model and tokenizer for: {model_ckpt}")

        try:
            # Load DNABERT model
            if 'dnabert2' in model_ckpt:
                # `trust_remote_code` is only meaningful for Auto classes; on this
                # concrete class it is ignored with a warning, so it is not passed.
                model = BertForSequenceClassification.from_pretrained(model_ckpt)
                tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

            # Load Geneformer model
            elif 'Geneformer' in model_ckpt:
                tokenizer = AutoTokenizer.from_pretrained(geneformer_tokenizer_ckpt)
                model = AutoModelForSequenceClassification.from_pretrained(model_ckpt)

            # Load Gena models
            elif 'gena-' in model_ckpt:
                # The classification classes live in the remote-code module of the
                # checkpoint; a throw-away AutoModel load is used only to find it.
                probe_model = AutoModel.from_pretrained(model_ckpt, trust_remote_code=True)
                gena_module = importlib.import_module(probe_model.__class__.__module__)
                # Release the probe before the real load so two copies of the
                # weights are not held at the same time.
                del probe_model

                # BigBird model under Gena
                if 'bigbird' in model_ckpt:
                    cls = getattr(gena_module, 'BigBirdForSequenceClassification')
                else:
                    cls = getattr(gena_module, 'BertForSequenceClassification')

                model = cls.from_pretrained(model_ckpt, num_labels=2)
                tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)

            # Load generic model
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_ckpt, trust_remote_code=True)
                model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, trust_remote_code=True)

            # Store the model and tokenizer in a dictionary
            models_tokenizers_dict[f"{model_name}_ft_{bios_id}"] = {'model': model, 'tokenizer': tokenizer}

        except Exception as e:
            # Best effort: report the failure and continue with the remaining models.
            print(f"Error loading {model_ckpt}: {e}")

    return models_tokenizers_dict

# Example usage
#models_tokenizers_dict = load_models_and_tokenizers(models_names, bios_id, ft_model_type)

## Datasets

### Load the hg19/hg38 reference genome (FASTA)

In [67]:
def get_chrom2seq(fasta_file, capitalize=True):
    """
    Read a FASTA file into a dictionary of sequences.

    Parameters:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.
        capitalize (bool): When True every sequence is upper-cased.

    Returns:
        dict: Maps the first whitespace-delimited token of each record's
        description line to that record's sequence.
    """
    def _normalise(sequence):
        # Optional upper-casing, applied record by record
        return sequence.upper() if capitalize else sequence

    return {
        record.description.split()[0]: _normalise(record.seq)
        for record in SeqIO.parse(fasta_file, "fasta")
    }
# Example usage
#chrom2seq = get_chrom2seq(FASTA_FILE_19)

### Get data

In [68]:
def data_preprocessing(type_data,name_data,dataset_path):
    """
    Read one variant table and map it onto a common column layout.

    The file ``{dataset_path}{name_data}`` is read and its columns are copied into
    a new frame with the columns 'Chromosome', 'Position', 'Reference',
    'Alternative', 'Value_Ratio', 'Value_Pvalue_signed', 'P_value' (plus
    'Value_Diff' for the raQTL and GSE87711 layouts).

    Parameters:
        type_data (str): 'raQTL' (tab-separated file) or 'mpra' (comma-separated file).
        name_data (str): File name; substrings of it select the cell type
            ('hepg2', otherwise 'k562') or the MPRA layout
            ('GSE87711', 'SORT1', 'GSE68331').
        dataset_path (str): Prefix concatenated directly in front of ``name_data``.

    Returns:
        pd.DataFrame: The harmonised table. It is empty when ``type_data`` is
        neither 'raQTL' nor 'mpra', or when no MPRA layout matches ``name_data``.
    """
    def _prefix_chr(value):
        return f'chr{value}'

    harmonised = pd.DataFrame()
    source_file = f"{dataset_path}{name_data}"

    if type_data == 'raQTL':
        raw = pd.read_csv(source_file, sep='\t')
        for target, source in (
            ('Chromosome', 'chr'),
            ('Position', 'SNPabspos'),
            ('Reference', 'ref'),
            ('Alternative', 'alt'),
        ):
            harmonised[target] = raw[source]

        # Column names of the measurements are prefixed with the cell type
        cell = 'hepg2' if 'hepg2' in name_data else 'k562'
        alt_mean = raw[f'{cell}.alt.mean']
        ref_mean = raw[f'{cell}.ref.mean']
        harmonised['Value_Ratio'] = alt_mean / ref_mean
        harmonised['Value_Diff'] = alt_mean - ref_mean

        p_values = raw[f'{cell}.wilcox.p.value']
        # -log10(p) carrying the sign of the alt-ref difference
        harmonised['Value_Pvalue_signed'] = -np.log10(p_values) * np.sign(harmonised['Value_Diff'])
        harmonised['P_value'] = p_values
        return harmonised

    if type_data != 'mpra':
        return harmonised

    raw = pd.read_csv(source_file)

    # The three layout checks are independent: each one that matches is applied in turn.
    if 'GSE87711' in name_data:
        harmonised['Chromosome'] = raw['chr'].apply(_prefix_chr)
        harmonised['Position'] = raw['pos']
        harmonised['Reference'] = raw['ref']
        harmonised['Alternative'] = raw['alt']
        harmonised['Value_Ratio'] = raw['CTRL.fc(log2)']
        harmonised['Value_Diff'] = raw['CTRL.padj'] - raw['CTRL.mut.padj']
        mut_p = raw['CTRL.mut.p']
        harmonised['Value_Pvalue_signed'] = -np.log10(mut_p) * np.sign(harmonised['Value_Diff'])
        harmonised['P_value'] = mut_p

    if 'SORT1' in name_data:
        harmonised['Chromosome'] = raw['Chromosome'].apply(_prefix_chr)
        harmonised['Position'] = raw['Position']
        harmonised['Reference'] = raw['Ref']
        harmonised['Alternative'] = raw['Alt']
        harmonised['Value_Ratio'] = raw['VariantExpressionEffect (log2)']
        sort1_p = raw['P-value']
        harmonised['Value_Pvalue_signed'] = -np.log10(sort1_p) * np.sign(harmonised['Value_Ratio'])
        harmonised['P_value'] = sort1_p

    if 'GSE68331' in name_data:
        harmonised['Chromosome'] = raw['chr3']
        harmonised['Position'] = raw['Pos']
        harmonised['Reference'] = raw['Allele0']
        harmonised['Alternative'] = raw['Allele1']
        effect = raw['effect']
        harmonised['Value_Ratio'] = effect
        gse_p = raw['P']
        # Sign taken from log2 of the effect
        harmonised['Value_Pvalue_signed'] = -np.log10(gse_p) * np.sign(np.log2(effect))
        harmonised['P_value'] = gse_p

    return harmonised

# Example usage
#data_df = data_preprocessing(type_data,name_data,dataset_path)

### Extract Sequences

In [69]:
def process_sequences(data_df, chrom2seq, length_bp=999):
    """
    Attach reference- and alternative-allele sequence windows to every variant.

    For each row a window of ``length_bp`` bases is cut from ``chrom2seq`` such that
    the variant (1-based 'Position') sits at index ``length_bp // 2`` of the window.
    The base at that index is then replaced by the row's 'Reference' and
    'Alternative' alleles to build the two sequences. A warning is printed when
    the genome base at the variant matches neither allele.

    Parameters:
        data_df (pd.DataFrame): Needs the columns 'Chromosome', 'Position',
            'Reference' and 'Alternative'. The frame is modified in place.
        chrom2seq (dict): Maps chromosome names to sliceable sequences.
        length_bp (int): Window length in bases (odd or even).

    Returns:
        pd.DataFrame: ``data_df`` itself, with the added columns
        'Seq_Reference' and 'Seq_Alternative'.

    Raises:
        ValueError: If the window would start before the beginning of the
            chromosome or the extracted window is not ``length_bp`` long.
        KeyError: If a chromosome is missing from ``chrom2seq``.
    """
    half_len = length_bp // 2

    seq_ref = []
    seq_alt = []

    # Iterate over the DataFrame rows
    for _, row in data_df.iterrows():
        chromosome = f"{row['Chromosome']}"
        abspos = int(row['Position'])

        # 0-based, end-exclusive window. Deriving the end from the start keeps the
        # window exactly length_bp long for even lengths as well as odd ones.
        start_pos = abspos - half_len - 1
        end_pos = start_pos + length_bp

        # A negative start would make the slice wrap around instead of failing clearly.
        if start_pos < 0:
            raise ValueError(
                f"Window of {length_bp} bp around {chromosome}:{abspos} starts before the beginning of the chromosome."
            )

        # Extract the sequence from the chromosome data
        seq = str(chrom2seq[chromosome][start_pos:end_pos])
        if len(seq) != length_bp:
            raise ValueError(f"Extracted sequence length {len(seq)} does not match the expected length {length_bp}.")

        upstream = seq[:half_len]
        downstream = seq[half_len + 1:]

        # Replace the base at the variant position with each allele
        seq_ref.append(f"{upstream}{row['Reference']}{downstream}")
        seq_alt.append(f"{upstream}{row['Alternative']}{downstream}")

        genome_base = seq[half_len]
        if genome_base != row['Reference'] and genome_base != row['Alternative']:
            print(
                f"Warning: nucleotide '{genome_base}' at {chromosome}:{abspos} does not match "
                f"Ref '{row['Reference']}' or Alt '{row['Alternative']}'"
            )

    data_df['Seq_Reference'] = seq_ref
    data_df['Seq_Alternative'] = seq_alt
    return data_df

## Evaluation 

### Get model predictions

In [70]:
import torch

# Function to get predictions in batches
def get_predictions_raw(models_tokenizers_dict, seq_ref, seq_alt, device="cuda", batch_size=32):
    """
    Run every model on the reference and alternative sequences and collect the logits.

    Parameters:
        models_tokenizers_dict (dict): ``{name: {'model': ..., 'tokenizer': ...}}``.
        seq_ref (list[str]): Reference sequences.
        seq_alt (list[str]): Alternative sequences.
        device (str): Device the model and each batch are moved to.
        batch_size (int): Number of sequences per forward pass.

    Returns:
        dict: ``{name: {'ref': logits, 'alt': logits}}`` where each value is the
        concatenation along dim 0 of the per-batch ``.logits`` moved to the CPU.
    """
    def _batched_logits(model, tokenizer, sequences):
        # The whole list is tokenized in one call (padded together, truncated to
        # 512 tokens); only the forward passes are batched.
        encoded = tokenizer(sequences, return_tensors="pt", padding=True, truncation=True, max_length=512)
        n_rows = encoded['input_ids'].size(0)

        collected = []
        for start in range(0, n_rows, batch_size):
            stop = start + batch_size
            batch = {name: tensor[start:stop].to(device) for name, tensor in encoded.items()}
            with torch.no_grad():
                collected.append(model(**batch).logits.cpu())
            torch.cuda.empty_cache()  # Clear memory after each batch
        return collected

    models_predictions = {}

    for model_name, entry in models_tokenizers_dict.items():
        model = entry['model'].to(device).eval()
        tokenizer = entry['tokenizer']

        print(f"Processing model: {model_name}")

        ref_chunks = _batched_logits(model, tokenizer, seq_ref)
        alt_chunks = _batched_logits(model, tokenizer, seq_alt)

        # Concatenate the batch results and keep them in CPU memory
        models_predictions[model_name] = {
            'ref': torch.cat(ref_chunks, dim=0),
            'alt': torch.cat(alt_chunks, dim=0),
        }

        # Move the model off the device and clear the cache before the next model
        model.to("cpu")
        torch.cuda.empty_cache()

    return models_predictions

# Usage
#models_predictions = get_predictions_raw(models_tokenizers_dict, data_df['Seq_Reference'].to_list(), data_df['Seq_Alternative'].to_list() , batch_size=8)

## Preprocessing Predictions

In [71]:
def compute_delta(outputs_ref_cpu, outputs_alt_cpu, seq_val):
    """
    Compare predicted and measured variant effects on a log2 scale.

    For each sequence the logit margin ``logits[:, 1] - logits[:, 0]`` is passed
    through a sigmoid, and the prediction is ``log2(sigmoid(alt) / sigmoid(ref))``.
    It is evaluated as a difference of log-sigmoids so that very negative margins
    do not underflow to ``log2(0)``.

    Parameters:
        outputs_ref_cpu: Two-column logits for the reference sequences
            (tensor, or anything ``torch.as_tensor`` accepts).
        outputs_alt_cpu: Two-column logits for the alternative sequences.
        seq_val: Measured values; their ``np.log2`` is returned unchanged otherwise.

    Returns:
        tuple: ``(log2_fold_change, log2_variant_expression_effect)`` where the
        first item is a NumPy array and the second is ``np.log2(seq_val)``.
    """
    logits_ref = torch.as_tensor(outputs_ref_cpu)
    logits_alt = torch.as_tensor(outputs_alt_cpu)

    # Margin of column 1 over column 0
    delta_ref = logits_ref[:, 1] - logits_ref[:, 0]
    delta_alt = logits_alt[:, 1] - logits_alt[:, 0]

    # log(sigmoid(alt) / sigmoid(ref)) == logsigmoid(alt) - logsigmoid(ref);
    # dividing by ln(2) converts the natural log to log2.
    log_ratio = torch.nn.functional.logsigmoid(delta_alt) - torch.nn.functional.logsigmoid(delta_ref)
    log2_fold_change = (log_ratio / float(np.log(2.0))).detach().cpu().numpy()

    # log2 of the measured values
    log2_variant_expression_effect = np.log2(seq_val)

    return log2_fold_change, log2_variant_expression_effect

## Linear Regression

In [72]:
def compute_regression_and_correlation(deltas):
    """
    Fit a least-squares line and compute the Spearman correlation of two series.

    Parameters:
        deltas: Pair ``(x, y)`` of equally long sequences; ``deltas[0]`` is used
            as x and ``deltas[1]`` as y.

    Returns:
        tuple: ``(slope, intercept, r_val, p_val, std_err, spearman_corr)`` where the
        first five values come from ``stats.linregress`` and the last one from
        ``stats.spearmanr``.
    """
    x_values, y_values = deltas[0], deltas[1]

    # Pass x and y explicitly: inferring both from a single 2-D argument is
    # deprecated in recent SciPy releases.
    slope, intercept, r_val, p_val, std_err = stats.linregress(x_values, y_values)
    spearman_corr = stats.spearmanr(x_values, y_values).correlation
    return slope, intercept, r_val, p_val, std_err, spearman_corr

## Main

### Parameters (load models for prediction)

In [73]:
# Base model names. Each one is expanded into the Hugging Face checkpoint id
# tanoManzo/{model_name}_ft_{bios_id}_{ft_model_type} when the models are loaded.
models_names = [
 'dnabert2',
 'nucleotide-transformer-v2-50m-multi-species',
 'nucleotide-transformer-v2-100m-multi-species',
 'nucleotide-transformer-v2-250m-multi-species',
 'nucleotide-transformer-v2-500m-multi-species',
 'nucleotide-transformer-500m-1000g',
 'nucleotide-transformer-500m-human-ref',
 'nucleotide-transformer-2.5b-1000g',
 'nucleotide-transformer-2.5b-multi-species',
 'Geneformer',
 'gena-lm-bert-base-t2t',
 'gena-lm-bert-large-t2t',
 'gena-lm-bert-base-t2t-multi',
 'gena-lm-bigbird-base-t2t',
 'hyenadna-small-32k-seqlen-hf',
 'hyenadna-medium-160k-seqlen-hf',
 'hyenadna-medium-450k-seqlen-hf',
 'hyenadna-large-1m-seqlen-hf'
 ]

# Fine-tuning suffix that ends every checkpoint id
ft_model_type = '1kbpHG19_DHSs_H3K27AC'

# Biosamples used for fine-tuning:
# 'BioS2'=Hela, 'BioS45'=neural progenitor cell, 'BioS73'=hepg2, 'BioS74'=k562
bios_ids = ['BioS2', 'BioS45', 'BioS73', 'BioS74']

# Folder holding the reference genomes. Set the AI4GENOMIC_FASTA_DIR environment
# variable to run the notebook on a machine with a different layout.
FASTA_DIR = os.environ.get("AI4GENOMIC_FASTA_DIR", "/data/Dcode/gaetano/repos/fasta_files")
FASTA_FILE_19 = os.path.join(FASTA_DIR, "hg19.fa")
FASTA_FILE_38 = os.path.join(FASTA_DIR, "hg38.fa")

### Run

In [74]:
# Catalogue of the mutagenesis datasets, one row per dataset
data_to_source = pd.read_csv('/data/Dcode/gaetano/repos/AI4Genomic/data/data_mutagenesis.csv')

# The dataset type is the second-to-last '/'-separated component of 'path'
# (presumably the last directory of a path ending in '/' - verify against the CSV)
data_to_source['type_data'] = [
    dataset_dir.split('/')[-2] for dataset_dir in data_to_source['path']
]

In [75]:
# Row of `data_to_source` to evaluate and number of variants to score
idx = 1
n_variants = 100
data_ = data_to_source.iloc[idx]

bios_id = data_['BioS']

# Use the reference genome build given in the catalogue row
fasta_file = FASTA_FILE_19 if data_['hg type'] == 'hg19' else FASTA_FILE_38
chrom2seq = get_chrom2seq(fasta_file)

dataset_path = f"/data/Dcode/gaetano{data_['path']}"
type_data = data_['type_data']
name_data = data_['name']

print(f'{bios_id}, {data_["cell line"]}, {data_["hg type"]}, {type_data}, {name_data}')

models_tokenizers_dict = load_models_and_tokenizers(models_names, bios_id, ft_model_type)

data_df = data_preprocessing(type_data, name_data, dataset_path)
# Subset before extracting sequences so that only the scored variants are
# processed; .copy() gives process_sequences its own frame to add columns to.
data_df = data_df.iloc[:n_variants].copy()
data_df = process_sequences(data_df, chrom2seq)

models_predictions = get_predictions_raw(
    models_tokenizers_dict,
    data_df['Seq_Reference'].to_list(),
    data_df['Seq_Alternative'].to_list(),
    batch_size=8,
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


BioS73, hepg2, hg19, raQTL, hepg2.sign.id.LP190708.txt
Loading model and tokenizer for: tanoManzo/dnabert2_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-v2-50m-multi-species_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-v2-100m-multi-species_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-v2-250m-multi-species_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-v2-500m-multi-species_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-500m-1000g_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-500m-human-ref_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/nucleotide-transformer-2.5b-1000g_ft_BioS73_1kbpHG19_DHSs_H3K27AC


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading model and tokenizer for: tanoManzo/nucleotide-transformer-2.5b-multi-species_ft_BioS73_1kbpHG19_DHSs_H3K27AC


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading model and tokenizer for: tanoManzo/Geneformer_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/gena-lm-bert-base-t2t_ft_BioS73_1kbpHG19_DHSs_H3K27AC


Some weights of BertForMaskedLM were not initialized from the model checkpoint at tanoManzo/gena-lm-bert-base-t2t_ft_BioS73_1kbpHG19_DHSs_H3K27AC and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model and tokenizer for: tanoManzo/gena-lm-bert-large-t2t_ft_BioS73_1kbpHG19_DHSs_H3K27AC


Some weights of BertForMaskedLM were not initialized from the model checkpoint at tanoManzo/gena-lm-bert-large-t2t_ft_BioS73_1kbpHG19_DHSs_H3K27AC and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model and tokenizer for: tanoManzo/gena-lm-bert-base-t2t-multi_ft_BioS73_1kbpHG19_DHSs_H3K27AC


Some weights of BertForMaskedLM were not initialized from the model checkpoint at tanoManzo/gena-lm-bert-base-t2t-multi_ft_BioS73_1kbpHG19_DHSs_H3K27AC and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading model and tokenizer for: tanoManzo/gena-lm-bigbird-base-t2t_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/hyenadna-small-32k-seqlen-hf_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/hyenadna-medium-160k-seqlen-hf_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/hyenadna-medium-450k-seqlen-hf_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Loading model and tokenizer for: tanoManzo/hyenadna-large-1m-seqlen-hf_ft_BioS73_1kbpHG19_DHSs_H3K27AC
Processing model: dnabert2_ft_BioS73
Processing model: nucleotide-transformer-v2-50m-multi-species_ft_BioS73
Processing model: nucleotide-transformer-v2-100m-multi-species_ft_BioS73
Processing model: nucleotide-transformer-v2-250m-multi-species_ft_BioS73
Processing model: nucleotide-transformer-v2-500m-multi-species_ft_BioS73
Processing model: nucleotide-transformer-500m-1000g_ft_BioS73
Processing model: nucleotide-transformer-500m-human-ref_ft_BioS73
Processing model: nucleot



Processing model: gena-lm-bert-large-t2t_ft_BioS73
Processing model: gena-lm-bert-base-t2t-multi_ft_BioS73


Attention type 'block_sparse' is not possible if sequence_length: 174 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Processing model: gena-lm-bigbird-base-t2t_ft_BioS73
Processing model: hyenadna-small-32k-seqlen-hf_ft_BioS73
Processing model: hyenadna-medium-160k-seqlen-hf_ft_BioS73
Processing model: hyenadna-medium-450k-seqlen-hf_ft_BioS73
Processing model: hyenadna-large-1m-seqlen-hf_ft_BioS73


### Save predictions

In [76]:

# Save the predictions dictionary to a pickle file named after the dataset
predictions_path = f'/data/Dcode/gaetano/repos/AI4Genomic/data/predictions/{name_data}.pkl'

# Create the output folder first so the dump cannot fail on a missing
# directory after the long prediction run.
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)

with open(predictions_path, "wb") as pickle_file:
    pickle.dump(models_predictions, pickle_file)

print(f"Done - prediction {name_data} saved")

Done - prediction hepg2.sign.id.LP190708.txt saved
