In [None]:
import tokenizers
import torch
from datasets import load_dataset

import deepchopper

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Show the value of every top-level expression in a cell, not only the last
# one; later cells list several bare expressions and depend on this setting.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
class KmerPreTokenizer:
    """Split a sequence string into k-mers by delegating to ``deepchopper``.

    The constructor only stores its configuration; the actual splitting is
    done by ``deepchopper.seq_to_kmers_and_offset`` on each call.
    """

    def __init__(self, kmer_size: int, *, overlap: bool) -> None:
        # Both values are forwarded unchanged to deepchopper on every call.
        self.kmer_size = kmer_size
        self.overlap = overlap

    def pre_tokenize_str(self, sequence: str) -> list[tuple[str, tuple[int, int]]]:
        """Pre-tokenize a sequence into kmers paired with their offsets.

        Delegates to ``deepchopper.seq_to_kmers_and_offset`` using the
        ``kmer_size`` and ``overlap`` values given at construction time.

        Args:
            sequence: The string to split.

        Returns:
            A list of ``(kmer, (start, end))`` tuples, per the return annotation.

        Example (overlapping windows; presumably a stride of 1 -- confirm
        against ``deepchopper``):
            sequence = "ATCGG"
            kmer_size = 3
            overlap = True
            pre_tokenize_str(sequence) -> [("ATC", (0, 3)), ("TCG", (1, 4)), ("CGG", (2, 5))]

        Example (non-overlapping windows; mirrors the expectation in
        ``test_pre_tokenize_str_no_overlap``):
            sequence = "ATCGGCC"
            kmer_size = 3
            overlap = False
            pre_tokenize_str(sequence) -> [("ATC", (0, 3)), ("GGC", (3, 6))]
        """
        return deepchopper.seq_to_kmers_and_offset(sequence, self.kmer_size, self.overlap)


from rich.console import Console
from rich.text import Text


def hight_text(text: str, start: int, end: int):
    """Print ``text`` through rich with one span rendered in bold magenta.

    Args:
        text: Plain string to display.
        start: Offset at which the styled span begins.
        end: Offset at which the styled span ends.
    """
    styled = Text(text)
    styled.stylize("bold magenta", start, end)
    # A fresh Console is created per call, as in the original helper.
    Console().print(styled)

In [None]:
def test_pre_tokenize_str_no_overlap():
    """Check non-overlapping 3-mers of a 7-character sequence.

    Only the two full k-mers are expected; the trailing single character
    must not appear in the result.
    """
    kmer_tokenizer = KmerPreTokenizer(3, overlap=False)
    actual = kmer_tokenizer.pre_tokenize_str("ATCGGCC")
    expected = [("ATC", (0, 3)), ("GGC", (3, 6))]
    assert actual == expected

In [None]:
# A single parquet file is registered as the "train" split; the three datasets
# below are percentage slices of that one split.
data_files = {"train": "../tests/data/test_input.parquet"}
num_proc = 8


def _load_split(split_spec):
    """Load one slice of the parquet data using the shared settings above."""
    return load_dataset(
        "parquet", data_files=data_files, num_proc=num_proc, split=split_spec
    )


# 70% / 20% / 10% partition for training, validation and testing.
train_dataset = _load_split("train[:70%]")
val_dataset = _load_split("train[70%:90%]")
test_dataset = _load_split("train[90%:]")

print(f"train_dataset: {train_dataset}")
print(f"val_dataset: {val_dataset}")
print(f"test_dataset: {test_dataset}")

In [None]:
# Inspect the first training record. Each bare expression below is echoed
# because ast_node_interactivity was set to "all" earlier in the notebook.
train_dataset["seq"][0]
train_dataset["id"][0]
# train_dataset['qual'][0]
train_dataset["target"][0]

In [None]:
# Highlight a region of the first sequence. The "target" entry is unpacked
# into hight_text's (start, end) arguments, so it must hold exactly two
# values -- presumably the span boundaries; confirm against the dataset.
hight_text(train_dataset["seq"][0], *train_dataset["target"][0])

In [None]:
# deepchopper.seq_to_kmers(train_dataset['seq'][0], 5, overlap=False)

In [None]:
# test_dataset.map(lambda x : partial(deepchopper.seq_to_kmers, overlap=False, k=5)(x['seq']))
# Print the "seq" field of every test example. map() is used here only for
# the print side effect: the lambda returns None.
# NOTE(review): a plain loop over the dataset would state that intent more directly.
test_dataset.map(lambda x: print(x["seq"]))

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

# The WordLevel model is created without any arguments, i.e. no vocabulary
# is passed in here.
# NOTE(review): encoding will presumably fail until a vocab (and unknown-token
# entry) is supplied -- confirm before relying on this tokenizer.
tokenizer = Tokenizer(WordLevel())

In [None]:
# 3-mer pre-tokenizer with overlap enabled.
pre_tokenizer = KmerPreTokenizer(3, overlap=True)

In [None]:
# Tokenizer.encode accepts a string, or a list of strings together with
# is_pretokenized=True; it does not accept (kmer, offsets) tuples. Keep only
# the k-mer strings from the pre-tokenizer output before encoding.
# NOTE(review): `tokenizer` wraps a WordLevel model built without a vocabulary,
# so this call likely still needs a vocab to succeed -- confirm.
first_seq_kmers = [
    kmer for kmer, _offsets in pre_tokenizer.pre_tokenize_str(train_dataset["seq"][0])
]
tokenizer.encode(first_seq_kmers, is_pretokenized=True)