In [1]:
import data
import torch
import lightning as L
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
from models.test_model import MambaModel
from utils.lightning import LightningMamba
from config import get_config
from torch.optim.lr_scheduler import LambdaLR
from utils.utils import set_seed, model_summary, format_time, handle_wandb_login
import wandb
import time

config = get_config()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
start_time = time.time()
# Set seed for reproducibility
if config["seed"]:
    set_seed(config["seed"])

# Parse config
MODEL_CONFIG = config["model"]
TRAINER_CONFIG = config["trainer"]
DATASET_CONFIG = config["dataset"]
OPTIMIZER_CONFIG = config["optimizer"]
WANDB_CONFIG = config["wandb"]


In [3]:
# ------- Load Dataset and create DataLoaders -------
data = data.get_dataloaders(**DATASET_CONFIG)
train_loader = data["train_loader"]
val_loader = data["val_loader"]
test_loader = data["test_loader"]
num_classes = data["num_classes"]
if TRAINER_CONFIG["max_epochs"] is not None:
    total_steps = len(train_loader) * TRAINER_CONFIG["max_epochs"]
    if TRAINER_CONFIG["max_steps"] is not None:
        total_steps = min(total_steps, TRAINER_CONFIG["max_steps"])
else:
    try:
        total_steps = TRAINER_CONFIG["max_steps"]
    except: 
        raise ValueError("Either max_steps or max_epochs must be defined.")
    
print(f"  ✓ Dataset: {DATASET_CONFIG['dataset_name']}")
print(f"  ✓ Classes: {data['num_classes']}")
print(f"  ✓ Input shape: {data['input_shape']}")
print(f"  ✓ Features: {data['feature_dim']}")
print(f"  ✓ Sequence Length: {data['sequence_length']}")
    

	 Loading sc09 dataset...
	 Creating DataLoaders...
	 Dataloaders created.
  ✓ Dataset: sc09
  ✓ Classes: 10
  ✓ Input shape: torch.Size([128, 107, 64])
  ✓ Features: 64
  ✓ Sequence Length: 107


In [4]:
# ------- Create Model -------
print("Constructing Model...")
model = MambaModel(**MODEL_CONFIG, d_out=num_classes).cuda()
model_summary(model)

Constructing Model...


Layer (type:depth-idx)                   Param #
MambaModel                               --
├─ModuleList: 1-1                        --
│    └─PhotonicMamba: 2-1                --
│    │    └─Mamba: 3-1                   32,640
├─ModuleList: 1-2                        --
│    └─LayerNorm: 2-2                    128
├─LayerNorm: 1-3                         128
├─Sequential: 1-4                        --
│    └─Linear: 2-3                       8,320
│    └─LayerNorm: 2-4                    256
│    └─GELU: 2-5                         --
│    └─Dropout: 2-6                      --
│    └─Linear: 2-7                       1,290
Total params: 42,762
Trainable params: 42,762
Non-trainable params: 0

In [5]:
# ------- W&B Logger -------
print("\n[3/6] Setting up W&B Logger...")
usrname = handle_wandb_login()
wandb_logger = WandbLogger(
    project=WANDB_CONFIG["project"],
    entity=usrname,
    name=WANDB_CONFIG["name"],
    log_model="all",
    save_dir="./wandb_logs"
)

print(f"  ✓ Project:", wandb_logger.experiment.project)
print(f"  ✓ Name:", wandb_logger.experiment.name)
print(f"  ✓ URL:", wandb_logger.experiment.url)


[3/6] Setting up W&B Logger...

--- Weights & Biases (W&B) Configuration ---
W&B username: tmpoulionis-


[34m[1mwandb[0m: Currently logged in as: [33mtmpoulionis[0m ([33mtmpoulionis-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


W&B login successful!


  ✓ Project: lightning_logs
  ✓ Name: splendid-silence-44
  ✓ URL: https://wandb.ai/tmpoulionis-/lightning_logs/runs/n7xq75va


In [7]:
# ------- Callbacks -------
print("\n[4/6] Setting up Callbacks...")

callbacks = [
    LearningRateMonitor(logging_interval='step'),
    ModelCheckpoint(
        dirpath=f"./checkpoints/{wandb_logger.name}",
        filename="best-{epoch:02d}-{val/acc:.4f}",
        monitor="val/acc",
        mode="max",
        save_top_k=1,
        save_last=True
    ),
    EarlyStopping(
        monitor="val/loss",
        patience=20,
        mode="min",
        verbose=True
    )
]

print(f"  ✓ Learning rate monitor")
print(f"  ✓ Model checkpointing (save top 3)")
print(f"  ✓ Early stopping (patience=20)")
    


[4/6] Setting up Callbacks...
  ✓ Learning rate monitor
  ✓ Model checkpointing (save top 3)
  ✓ Early stopping (patience=20)


In [8]:
# ------- Scheduler -------
from train import create_scheduler
warmup_steps = int(0.1 * total_steps)

scheduler_config = {
    "scheduler": create_scheduler,
    "params": {
        "total_steps": total_steps,
        "warmup_steps": warmup_steps
    }
}
    

In [9]:
# ------- Lightning Module -------
print("\n[5/6) Setting up Lightning Module...")
loss_fn = torch.nn.CrossEntropyLoss()

lightning_module = LightningMamba(
    model=model,
    optimizer=torch.optim.AdamW,
    loss_fn=loss_fn,
    opt_hyperparams=OPTIMIZER_CONFIG,
    scheduler_config=scheduler_config
)


[5/6) Setting up Lightning Module...


In [10]:
# ------- Trainer -------
print("\n[6/6] Initializing Trainer...")
trainer = L.Trainer(
    **TRAINER_CONFIG,
    logger=wandb_logger,
    callbacks=callbacks,
)

print(f"  ✓ Max steps: {TRAINER_CONFIG['max_steps'] if TRAINER_CONFIG['max_steps'] else 'N/A'}")
print(f"  ✓ Max epochs: {TRAINER_CONFIG['max_epochs'] if TRAINER_CONFIG['max_epochs'] else 'N/A'}")
print(f"  ✓ Accelerator: {TRAINER_CONFIG['accelerator']}")
print(f"  ✓ Gradient clip: {TRAINER_CONFIG['gradient_clip_val']}")
    

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores



[6/6] Initializing Trainer...
  ✓ Max steps: 200000
  ✓ Max epochs: 30
  ✓ Accelerator: auto
  ✓ Gradient clip: 0.1


In [None]:
!wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p14/hg38.p14.fa.gz
!gunzip hg38.p14.fa.gz

--2025-11-20 17:03:41--  https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p14/hg38.p14.fa.gz
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1012013082 (965M) [application/x-gzip]
Saving to: ‘hg38.p14.fa.gz’


2025-11-20 17:04:42 (16,1 MB/s) - ‘hg38.p14.fa.gz’ saved [1012013082/1012013082]

rm: cannot remove 'hg38.p14.gz': No such file or directory


In [2]:
!nano hg38.p14.fa

[19ANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNplace   [7m^U[m Paste Text[7m^T[m To Spell  [7m^_[m Go To Linenes ][m[H[7m  GNU nano 4.8                      hg38.p14.fa                                 [1;79H[m[22;9H[7m[ line 1/65985253 (0%), col 1/6 (16%), char 0/3365209190 (0%) ][m

In [4]:
from pyfaidx import Fasta


In [5]:
fasta = Fasta("hg38.p14.fa")

In [6]:
fasta.keys()

odict_keys(['chr1', 'chr10', 'chr11', 'chr11_KI270721v1_random', 'chr12', 'chr13', 'chr14', 'chr14_GL000009v2_random', 'chr14_GL000225v1_random', 'chr14_KI270722v1_random', 'chr14_GL000194v1_random', 'chr14_KI270723v1_random', 'chr14_KI270724v1_random', 'chr14_KI270725v1_random', 'chr14_KI270726v1_random', 'chr15', 'chr15_KI270727v1_random', 'chr16', 'chr16_KI270728v1_random', 'chr17', 'chr17_GL000205v2_random', 'chr17_KI270729v1_random', 'chr17_KI270730v1_random', 'chr18', 'chr19', 'chr1_KI270706v1_random', 'chr1_KI270707v1_random', 'chr1_KI270708v1_random', 'chr1_KI270709v1_random', 'chr1_KI270710v1_random', 'chr1_KI270711v1_random', 'chr1_KI270712v1_random', 'chr1_KI270713v1_random', 'chr1_KI270714v1_random', 'chr2', 'chr20', 'chr21', 'chr22', 'chr22_KI270731v1_random', 'chr22_KI270732v1_random', 'chr22_KI270733v1_random', 'chr22_KI270734v1_random', 'chr22_KI270735v1_random', 'chr22_KI270736v1_random', 'chr22_KI270737v1_random', 'chr22_KI270738v1_random', 'chr22_KI270739v1_random', 

In [9]:
fasta['chr1'][0:100]

>chr1:1-100
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

In [20]:
!mkdir -p data/hg38/
!curl https://storage.googleapis.com/basenji_barnyard2/hg38.ml.fa.gz > data/hg38/hg38.ml.fa.gz
!curl https://storage.googleapis.com/basenji_barnyard2/sequences_human.bed > data/hg38/human-sequences.bed

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  839M  100  839M    0     0  11.2M      0  0:01:14  0:01:14 --:--:-- 11.6M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1112k  100 1112k    0     0   854k      0  0:00:01  0:00:01 --:--:--  854k


In [21]:
!gunzip data/hg38/hg38.ml.fa.gz


In [1]:
from dataloaders.datasets.hg38_dataset import HG38Dataset
from dataloaders.datasets.hg38_char_tokenizer import CharacterTokenizer

seqlen = 2**17
bed_file_path = 'data/hg38/human-sequences.bed'
fasta_file_path = 'data/hg38/hg38.ml.fa'
chars = ['A', 'T', 'C', 'G', 'N', 'a', 't', 'c', 'g', 'n', '.']

tokenizer = CharacterTokenizer(
    characters=chars,
    model_max_length=seqlen
)

data = HG38Dataset(
    split='train',
    bed_file=bed_file_path,
    fasta_file=fasta_file_path,
    max_length=seqlen,
    tokenizer_name='char',
    tokenizer=tokenizer,
    add_eos=True
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
len(data)

34021

In [7]:
input, target = data[2]

  chr_name, start, end = (row[0], row[1], row[2])


In [8]:
input

tensor([ 7, 10, 10,  ...,  7,  7,  7])

In [9]:
len(input)

131071

In [10]:
len(target)

131071

In [11]:
data[1]

  chr_name, start, end = (row[0], row[1], row[2])


(tensor([ 8,  7, 10,  ..., 10,  7, 10]),
 tensor([ 7, 10,  9,  ...,  7, 10,  1]))

In [18]:
print(data[1][0][0:20])
print(data[1][1][0:20])

tensor([ 8,  7, 10,  9,  7,  8,  7,  8,  8,  7, 10,  9,  7,  8,  8,  8,  8,  7,
         8, 10])
tensor([ 7, 10,  9,  7,  8,  7,  8,  8,  7, 10,  9,  7,  8,  8,  8,  8,  7,  8,
        10,  8])
