In [None]:
import tokenizers
import torch
from datasets import load_dataset

import deepchopper
from deepchopper.models import KmerPreTokenizer

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one (several cells below rely on this to show multiple values).
InteractiveShell.ast_node_interactivity = "all"

In [None]:
from tokenizers import NormalizedString, PreTokenizedString, Regex, Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer

import deepchopper


def compute_tokens_to_ids(kmer_size: int) -> tuple[dict[str, int], list[str]]:
    """Build the k-mer vocabulary and its token -> id lookup.

    The vocabulary is whatever ``deepchopper.generate_kmers`` yields for
    ``deepchopper.default.BASES`` and ``kmer_size`` (presumably every k-mer over
    those bases — confirm against deepchopper), followed by the special tokens
    ``<PAD>``, ``<UNK>``, ``<EOS>``, ``<BOS>``, ``<SEP>`` in that order.

    Args:
        kmer_size: k-mer length forwarded to ``deepchopper.generate_kmers``.

    Returns:
        A pair ``(tokens_to_ids, all_tokens)``: the mapping from each token to
        its position in the vocabulary, and the vocabulary list itself.
    """
    special_tokens = ["<PAD>", "<UNK>", "<EOS>", "<BOS>", "<SEP>"]
    kmer_tokens = deepchopper.generate_kmers(deepchopper.default.BASES, kmer_size)

    # Special tokens are appended, so k-mers occupy the lowest ids.
    vocabulary = kmer_tokens + special_tokens
    token_ids = dict(zip(vocabulary, range(len(vocabulary))))
    return token_ids, vocabulary


class KmerPreTokenizer:
    """Custom pre-tokenizer that cuts a sequence into k-mers.

    Instances are plain Python objects; wrap one with ``PreTokenizer.custom(...)``
    before assigning it to ``Tokenizer.pre_tokenizer``.

    Args:
        kmer_size: Length of each k-mer.
        overlap: Forwarded to ``deepchopper.seq_to_kmers_and_offset`` to select
            overlapping or non-overlapping k-mers.
    """

    def __init__(self, kmer_size: int, *, overlap: bool):
        self.kmer_size = kmer_size
        self.overlap = overlap

    def kmer_split(self, i: int, normalized_string: NormalizedString) -> list[NormalizedString]:
        """Split one normalized string into k-mer slices.

        Args:
            i: Index of the split being processed (unused here).
            normalized_string: The piece of text to cut into k-mers.

        Returns:
            One ``NormalizedString`` slice per ``(start, end)`` offset reported by
            ``deepchopper.seq_to_kmers_and_offset``.
        """
        # The k-mer helper is handed the plain text, while the slices are taken
        # from the NormalizedString itself so the tokenizer keeps its offsets.
        sequence = str(normalized_string)
        return [
            normalized_string[start:end]
            for (_token, (start, end)) in deepchopper.seq_to_kmers_and_offset(
                sequence, self.kmer_size, self.overlap
            )
        ]

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place using :meth:`kmer_split`."""
        pretok.split(self.kmer_split)


class KmerDecoder:
    """Decoder that joins tokens into a single string."""

    def decode(self, tokens: list[str]) -> str:
        """Concatenate ``tokens`` in order with nothing in between.

        Args:
            tokens: Token strings to concatenate.

        Returns:
            The concatenation of ``tokens``; an empty list gives ``""``.
        """
        glue = ""
        return glue.join(tokens)

In [None]:
from rich.console import Console
from rich.text import Text


def hight_text(text: str, start: int, end: int):
    """Print ``text`` to the console with one span highlighted.

    Args:
        text: The string to print.
        start: Start offset of the highlighted span.
        end: End offset of the highlighted span.
    """
    styled = Text(text)
    styled.stylize("bold magenta", start, end)
    Console().print(styled)

In [None]:
def test_pre_tokenize_str_no_overlap():
    """Check non-overlapping 3-mer pre-tokenization of a short sequence."""
    # KmerPreTokenizer only defines `pre_tokenize`; `pre_tokenize_str` is provided by
    # the tokenizers wrapper, so the custom object has to go through PreTokenizer.custom.
    tokenizer = PreTokenizer.custom(KmerPreTokenizer(3, overlap=False))
    sequence = "ATCGGCC"
    expected_output = [("ATC", (0, 3)), ("GGC", (3, 6))]
    res = tokenizer.pre_tokenize_str(sequence)
    assert res == expected_output

In [None]:
# Load one parquet file and carve it into 70% / 20% / 10% train / validation / test slices.
data_files = {"train": "../tests/data/test_input.parquet"}
num_proc = 8

train_dataset, val_dataset, test_dataset = (
    load_dataset("parquet", data_files=data_files, num_proc=num_proc, split=split_spec)
    for split_spec in ("train[:70%]", "train[70%:90%]", "train[90%:]")
)

print(f"train_dataset: {train_dataset}")
print(f"val_dataset: {val_dataset}")
print(f"test_dataset: {test_dataset}")

In [None]:
# Inspect the first training record: its sequence, id and target annotation.
train_dataset[0]["seq"]
train_dataset[0]["id"]
# train_dataset[0]["qual"]
train_dataset[0]["target"]

In [None]:
# Print the first training read with its target span highlighted.
# NOTE(review): assumes `target` holds exactly two ints (start, end) — confirm.
hight_text(train_dataset[0]["seq"], *train_dataset[0]["target"])

In [None]:
# deepchopper.seq_to_kmers(train_dataset['seq'][0], 5, overlap=False)

In [None]:
# test_dataset.map(lambda x : partial(deepchopper.seq_to_kmers, overlap=False, k=5)(x['seq']))
# test_dataset.map(lambda x : print(x['seq']))

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

# Build a word-level tokenizer and plug in the custom k-mer pre-tokenizer
# (k = 3, overlapping). The Python object must be wrapped with PreTokenizer.custom.
tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = PreTokenizer.custom(KmerPreTokenizer(3, overlap=True))

In [None]:
# Sample read used for the tokenizer experiments below: the first training sequence.
ts = train_dataset["seq"][0]

In [None]:
# Run only the pre-tokenization step on the sample read to inspect the k-mer split.
tokenizer.pre_tokenizer.pre_tokenize_str(ts)

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE

# Start over with a BPE model whose unknown token is "[UNK]".
# This rebinds `tokenizer`, discarding the word-level tokenizer built earlier.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

In [None]:
from tokenizers.trainers import BpeTrainer

# BPE trainer configured with five special tokens.
# NOTE(review): `trainer` is not used anywhere in the visible part of this notebook.
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

In [None]:
from tokenizers.pre_tokenizers import ByteLevel

# Use byte-level pre-tokenization for the BPE tokenizer.
tokenizer.pre_tokenizer = ByteLevel()

In [None]:
# IPython help lookup for `Tokenizer.train` (`?` is notebook-only syntax, not plain Python).
tokenizer.train?

In [130]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the pretrained DNABERT-2 tokenizer. This rebinds `tokenizer`, replacing the
# BPE tokenizer configured in the cells above.
# NOTE(security): `trust_remote_code=True` lets code shipped in the model repository run
# locally — keep it only for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

In [132]:
# Tokenize the `seq` field of every training record with the current `tokenizer`.
encode_train_dataset = train_dataset.map(lambda record: tokenizer(record["seq"]))

In [136]:
# Token count of every encoded training read.
input_ids_len = list(map(len, encode_train_dataset["input_ids"]))

In [138]:
# Length, in tokens, of the longest encoded training read.
max(input_ids_len)

2247

In [None]:
# NOTE(review): this cell held only the incomplete expression below (apparently left over
# from tab-completion). It is a SyntaxError when executed, so it is disabled here.
# tokenizer.

In [140]:
# Map the first encoded read's ids back to token strings to inspect how it was segmented.
tokenizer.convert_ids_to_tokens(encode_train_dataset['input_ids'][0])

['[CLS]',
 'GCAGCTA',
 'TGAATG',
 'CAA',
 'GGCCA',
 'CAAGGTG',
 'GATGGAA',
 'GAGTT',
 'GTGGAA',
 'CCAAA',
 'GAGCTG',
 'TCTTCCA',
 'GAGAA',
 'GATT',
 'TCGAGA',
 'TAAGTC',
 'GCC',
 'CATCA',
 'GTGAA',
 'CAAGA',
 'TATTGTT',
 'GGTG',
 'GCATT',
 'TGA',
 'TGAGAA',
 'CGTT',
 'CCAA',
 'GATTATT',
 'GACAGA',
 'TTA',
 'GTGAAAA',
 'GTAA',
 'GATT',
 'GAAA',
 'TCATGA',
 'CTGA',
 'CCGTAA',
 'GTGGCAA',
 'GAAAGG',
 'GCTTTT',
 'GCCTTTG',
 'TAACCTT',
 'TGACGA',
 'CCATGA',
 'CTCC',
 'GTG',
 'GATAA',
 'GATT',
 'GTCA',
 'TTCA',
 'GAA',
 'TACCA',
 'TACTG',
 'TGAATG',
 'GCCACA',
 'TCTTTATT',
 'GTGAA',
 'GTTA',
 'GAAAA',
 'GCCCTG',
 'TCAAA',
 'GCAA',
 'GAGA',
 'TGAA',
 'TCAGTG',
 'CTT',
 'CTCCAGCC',
 'AAA',
 'GAGG',
 'TCGAA',
 'GTG',
 'GTTCTG',
 'GAAA',
 'CTTTG',
 'GTGGTG',
 'GTCGTG',
 'GAGGTG',
 'GTT',
 'TCGGTG',
 'GGAA',
 'TGACAA',
 'CTCGG',
 'TCGTG',
 'GAGGAAA',
 'CTT',
 'CAGTG',
 'GTC',
 'GTGGTG',
 'GCTTTG',
 'GTGGCA',
 'GCC',
 'GTGGTG',
 'GTGGTG',
 'GATATG',
 'GTGGCA',
 'GTGGG',
 'GATG',
 'GCTA',
 'TAATG',

In [62]:
# Write the tokenizer files to ./dnabert2_117M; the resulting tokenizer.json is what
# `tokenizers.Tokenizer.from_file` loads elsewhere in this notebook.
tokenizer.save_pretrained("./dnabert2_117M")

('./dnabert2_117M/tokenizer_config.json',
 './dnabert2_117M/special_tokens_map.json',
 './dnabert2_117M/tokenizer.json')

In [88]:
# Encode the sample read `ts` with the current `tokenizer`.
output = tokenizer(ts)

In [89]:
# Display the encoding of the sample read.
output

{'input_ids': [1, 4085, 513, 27, 229, 2886, 3551, 222, 671, 131, 728, 1403, 145, 73, 1154, 2482, 36, 197, 135, 421, 1310, 103, 183, 23, 430, 134, 76, 1973, 634, 24, 1033, 68, 73, 45, 949, 59, 3153, 2595, 2219, 301, 1697, 2246, 2470, 1008, 78, 30, 250, 73, 67, 115, 25, 268, 236, 513, 460, 2870, 135, 77, 85, 519, 107, 66, 50, 52, 574, 29, 1027, 18, 71, 480, 30, 535, 45, 157, 423, 2855, 719, 31, 1848, 57, 572, 3189, 539, 901, 29, 176, 41, 423, 532, 674, 36, 423, 423, 948, 674, 281, 83, 105, 185, 605, 68, 188, 1689, 86, 719, 671, 105, 27, 703, 200, 24, 361, 22, 161, 2436, 33, 102, 314, 128, 21, 138, 25, 365, 36, 249, 1317, 71, 131, 1543, 124, 34, 409, 76, 103, 772, 247, 82, 349, 918, 772, 515, 32, 330, 182, 24, 128, 49, 263, 595, 191, 212, 1182, 61, 53, 860, 53, 1028, 410, 295, 3818, 801, 176, 30, 253, 628, 62, 1702, 330, 65, 69, 610, 1300, 3883, 1800, 29, 191, 28, 296, 2389, 28, 170, 53, 2356, 31, 319, 42, 45, 2145, 183, 76, 49, 568, 185, 98, 206, 86, 140, 249, 146, 605, 1523, 20, 369, 23

In [None]:
# IPython help lookup for `convert_tokens_to_string` (`?` is notebook-only syntax).
tokenizer.convert_tokens_to_string?

In [None]:
# `output` is the whole encoding returned by `tokenizer(ts)`; `decode` expects the
# token ids themselves, so pass the `input_ids` entry rather than the mapping.
tokenizer.decode(output["input_ids"])

In [90]:
# Tokenize the `seq` field of every test record with the current `tokenizer`.
token_test_dataset = test_dataset.map(lambda record: tokenizer(record["seq"]))

In [91]:
# Display the untokenized test split for comparison with `token_test_dataset`.
test_dataset

Dataset({
    features: ['id', 'seq', 'qual', 'target'],
    num_rows: 500
})

In [93]:
# Display the tokenized test split to see which columns `map` added.
token_test_dataset

Dataset({
    features: ['id', 'seq', 'qual', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 500
})

In [101]:
# Split the sample read at position 1024 so the two parts can be encoded as a pair.
ts1, ts2 = ts[:1024], ts[1024:]

In [110]:
# `file_tokenizer` has to exist before its first use, so load it here from the
# tokenizer.json written by `tokenizer.save_pretrained("./dnabert2_117M")`.
file_tokenizer = tokenizers.Tokenizer.from_file("dnabert2_117M/tokenizer.json")

# Encode the two parts of the sample read as a (sequence, pair) input.
res = file_tokenizer.encode(ts1, ts2)

In [121]:
# IPython help lookup for `Tokenizer.encode` (`?` is notebook-only syntax).
file_tokenizer.encode?

Signature:
file_tokenizer.encode(
    self,
    sequence,
    pair=None,
    is_pretokenized=False,
    add_special_tokens=True,
)
Docstring:
Encode the given sequence and pair. This method can process raw text sequences
as well as already pre-tokenized sequences.

Example:
    Here are some examples of the inputs that are accepted::

        encode("A single sequence")`
        encode("A sequence", "And its pair")`
        encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
        encode(
            [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
            is_p

In [118]:
# Number of tokens in the pair encoding `res`.
len(res.tokens)

277

In [103]:
# Display the second part of the split sample read.
ts2

'TCGTGACGCTGAATAAATGTCTTTTTTAAAAAAAAAAAAAAGCTCCCTCCCATCCCCTGCTGCTAACTGATCCCATTATATCTAACCTGCCCCCCCATATCACCTGCTCCCGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGCATGTAGCAAAATAGTGGGAAGATTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCTAGATAGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGTAAAAAATCAACACCCA'

In [122]:
# Short example for comparing pair encoding with encoding each piece on its own.
# NOTE(review): `a1` and `a2` look like consecutive pieces of `full_seq` — confirm that
# a1 + a2 == full_seq; `full_seq` itself is not used in the visible cells below.
full_seq = "TCGTGACGCTGAATAAATGTCTTTTTTAAAAAAAAAAAAAA"
a1, a2 = "TCGTGACGCTGAATAAATGTCTT", "TTTTAAAAAAAAAAAAAA"

In [127]:
# Tokens produced when the two pieces are encoded together as a pair.
file_tokenizer.encode(a1,a2).tokens

['[CLS]',
 'TCGTGA',
 'CGC',
 'TGAA',
 'TAAATG',
 'TCTT',
 '[SEP]',
 'TTTT',
 'AAAAAAAAAAAAAA',
 '[SEP]']

In [128]:
# Tokens produced when the first piece is encoded on its own.
file_tokenizer.encode(a1).tokens

['[CLS]', 'TCGTGA', 'CGC', 'TGAA', 'TAAATG', 'TCTT', '[SEP]']

In [129]:
# Tokens produced when the second piece is encoded on its own.
file_tokenizer.encode(a2).tokens

['[CLS]', 'TTTT', 'AAAAAAAAAAAAAA', '[SEP]']

In [97]:
# Load the raw `tokenizers.Tokenizer` from the saved tokenizer.json.
# NOTE(review): `file_tokenizer` is referenced by cells that appear above this one in the
# notebook, so the cell order does not match the execution order.
file_tokenizer = tokenizers.Tokenizer.from_file("dnabert2_117M/tokenizer.json")


In [98]:
# IPython help lookup for `Tokenizer.add_special_tokens` (`?` is notebook-only syntax).
file_tokenizer.add_special_tokens?

Signature: file_tokenizer.add_special_tokens(self, tokens)
Docstring:
Add the given special tokens to the Tokenizer.

If these tokens are already part of the vocabulary, it just let the Tokenizer know about
them. If they don't exist, the Tokenizer creates them, giving them a new id.

These special tokens will never be processed by the model (ie won't be split into
multiple tokens), and they can be removed from the output when decoding.

Args:
    tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
        The list of special tokens we want to add to the vocabulary. Each token can either
        be a string or an instance of :class:`~tokenizers.AddedToken` for more
        customization.

Returns:
    :obj:`int`: The number of tokens that were created in the vocabulary
Type:      builtin_function_or_method

In [95]:
# Display the test split summary.
test_dataset

Dataset({
    features: ['id', 'seq', 'qual', 'target'],
    num_rows: 500
})

In [96]:
# Encode the full sample read as a single sequence with the file-loaded tokenizer.
file_tokenizer.encode(ts)

Encoding(num_tokens=276, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])